diff --git a/lib/kokkos/.gitignore b/lib/kokkos/.gitignore
new file mode 100644
index 0000000000000000000000000000000000000000..f9d16be1558495fb95e3f5c4b785eefd3b3aa854
--- /dev/null
+++ b/lib/kokkos/.gitignore
@@ -0,0 +1,8 @@
+# Standard ignores
+*~
+*.pyc
+\#*#
+.#*
+.*.swp
+.cproject
+.project
diff --git a/lib/kokkos/CMakeLists.txt b/lib/kokkos/CMakeLists.txt
new file mode 100644
index 0000000000000000000000000000000000000000..1219352f73dc47360555639b1f4c3ddde410e9a5
--- /dev/null
+++ b/lib/kokkos/CMakeLists.txt
@@ -0,0 +1,184 @@
+
+IF(COMMAND TRIBITS_PACKAGE_DECL)
+  SET(KOKKOS_HAS_TRILINOS ON CACHE BOOL "")
+ELSE()
+  SET(KOKKOS_HAS_TRILINOS OFF CACHE BOOL "")
+ENDIF()
+
+IF(NOT KOKKOS_HAS_TRILINOS)
+  CMAKE_MINIMUM_REQUIRED(VERSION 2.8.11 FATAL_ERROR)
+  INCLUDE(cmake/tribits.cmake)
+ENDIF()
+
+#
+# A) Forward declare the package so that certain options are also defined for
+# subpackages
+#
+
+TRIBITS_PACKAGE_DECL(Kokkos) # ENABLE_SHADOWING_WARNINGS)
+
+#------------------------------------------------------------------------------
+#
+# B) Define the common options for Kokkos first so they can be used by
+# subpackages as well.
+#
+
+
+
+# mfh 01 Aug 2016: See Issue #61:
+# https://github.com/kokkos/kokkos/issues/61
+#
+# Don't use TRIBITS_ADD_DEBUG_OPTION() here, because that defines
+# HAVE_KOKKOS_DEBUG.  We define KOKKOS_HAVE_DEBUG here instead,
+# for compatibility with Kokkos' Makefile build system.
+# The default is quoted (like the other expanded defaults in this file) so
+# that an unset project-level flag still passes an (empty) fourth argument.
+TRIBITS_ADD_OPTION_AND_DEFINE(
+  ${PACKAGE_NAME}_ENABLE_DEBUG
+  ${PACKAGE_NAME_UC}_HAVE_DEBUG
+  "Enable run-time debug checks.  These checks may be expensive, so they are disabled by default in a release build."
+  "${${PROJECT_NAME}_ENABLE_DEBUG}"
+)
+
+TRIBITS_ADD_OPTION_AND_DEFINE(
+  Kokkos_ENABLE_SIERRA_BUILD
+  KOKKOS_FOR_SIERRA
+  "Configure Kokkos for building within the Sierra build system."
+  OFF
+  )
+
+TRIBITS_ADD_OPTION_AND_DEFINE(
+  Kokkos_ENABLE_Cuda
+  KOKKOS_HAVE_CUDA
+  "Enable CUDA support in Kokkos."
+  "${TPL_ENABLE_CUDA}"
+  )
+
+TRIBITS_ADD_OPTION_AND_DEFINE(
+  Kokkos_ENABLE_Cuda_UVM
+  KOKKOS_USE_CUDA_UVM
+  "Enable CUDA Unified Virtual Memory support in Kokkos."
+  OFF
+  )
+
+TRIBITS_ADD_OPTION_AND_DEFINE(
+  Kokkos_ENABLE_Pthread
+  KOKKOS_HAVE_PTHREAD
+  "Enable Pthread support in Kokkos."
+  OFF
+  )
+
+ASSERT_DEFINED(TPL_ENABLE_Pthread)
+IF (Kokkos_ENABLE_Pthread AND NOT TPL_ENABLE_Pthread)
+  MESSAGE(FATAL_ERROR "You set Kokkos_ENABLE_Pthread=ON, but Trilinos' support for Pthread(s) is not enabled (TPL_ENABLE_Pthread=OFF).  This is not allowed.  Please enable Pthreads in Trilinos before attempting to enable Kokkos' support for Pthreads.")
+ENDIF ()
+
+TRIBITS_ADD_OPTION_AND_DEFINE(
+  Kokkos_ENABLE_OpenMP
+  KOKKOS_HAVE_OPENMP
+  "Enable OpenMP support in Kokkos."
+  "${${PROJECT_NAME}_ENABLE_OpenMP}"
+  )
+
+TRIBITS_ADD_OPTION_AND_DEFINE(
+  Kokkos_ENABLE_QTHREAD
+  KOKKOS_HAVE_QTHREAD
+  "Enable QTHREAD support in Kokkos."
+  "${TPL_ENABLE_QTHREAD}"
+  )
+
+TRIBITS_ADD_OPTION_AND_DEFINE(
+  Kokkos_ENABLE_CXX11
+  KOKKOS_HAVE_CXX11
+  "Enable C++11 support in Kokkos."
+  "${${PROJECT_NAME}_ENABLE_CXX11}"
+  )
+  
+TRIBITS_ADD_OPTION_AND_DEFINE(
+  Kokkos_ENABLE_HWLOC
+  KOKKOS_HAVE_HWLOC
+  "Enable HWLOC support in Kokkos."
+  "${TPL_ENABLE_HWLOC}"
+  )
+
+TRIBITS_ADD_OPTION_AND_DEFINE(
+  Kokkos_ENABLE_MPI
+  KOKKOS_HAVE_MPI
+  "Enable MPI support in Kokkos."
+  "${TPL_ENABLE_MPI}"
+  )
+
+# Default value of the Kokkos_ENABLE_Debug_Bounds_Check option.
+#
+# CMake variable names are case sensitive, and for backwards
+# compatibility the real option (defined below) keeps its mixed-case
+# spelling.  A user who sets the all-caps Kokkos_ENABLE_DEBUG_BOUNDS_CHECK
+# by mistake is still honored: a true all-caps value turns the default of
+# the mixed-case option ON.  In every other case (all-caps variable
+# undefined or false) the default follows Kokkos_ENABLE_DEBUG
+# (see Issue #367).
+
+ASSERT_DEFINED(${PACKAGE_NAME}_ENABLE_DEBUG)
+
+# Start from the package debug setting ...
+SET(Kokkos_ENABLE_Debug_Bounds_Check_DEFAULT "${${PACKAGE_NAME}_ENABLE_DEBUG}")
+
+# ... and let a true all-caps spelling override it.
+IF(DEFINED Kokkos_ENABLE_DEBUG_BOUNDS_CHECK AND Kokkos_ENABLE_DEBUG_BOUNDS_CHECK)
+  SET(Kokkos_ENABLE_Debug_Bounds_Check_DEFAULT ON)
+ENDIF()
+
+ASSERT_DEFINED(Kokkos_ENABLE_Debug_Bounds_Check_DEFAULT)
+
+TRIBITS_ADD_OPTION_AND_DEFINE(
+  Kokkos_ENABLE_Debug_Bounds_Check
+  KOKKOS_ENABLE_DEBUG_BOUNDS_CHECK
+  "Enable Kokkos::View run-time bounds checking."
+  "${Kokkos_ENABLE_Debug_Bounds_Check_DEFAULT}"
+  )
+
+TRIBITS_ADD_OPTION_AND_DEFINE(
+  Kokkos_ENABLE_Profiling
+  KOKKOS_ENABLE_PROFILING_INTERNAL
+  "Enable KokkosP profiling support for kernel data collections."
+  "${TPL_ENABLE_DLlib}"
+  )
+
+# placeholder for future device...
+TRIBITS_ADD_OPTION_AND_DEFINE(
+  Kokkos_ENABLE_Winthread
+  KOKKOS_HAVE_WINTHREAD
+  "Enable Winthread support in Kokkos."
+  "${TPL_ENABLE_Winthread}"
+  )
+
+# use new/old View
+TRIBITS_ADD_OPTION_AND_DEFINE(
+  Kokkos_USING_DEPRECATED_VIEW
+  KOKKOS_USING_DEPRECATED_VIEW
+  "Choose whether to use the old, deprecated Kokkos::View"
+  OFF
+  )
+
+#------------------------------------------------------------------------------
+#
+# C) Process the subpackages for Kokkos
+#
+
+TRIBITS_PROCESS_SUBPACKAGES()
+
+#
+# D) If Kokkos itself is enabled, process the Kokkos package
+#
+
+TRIBITS_PACKAGE_DEF()
+
+TRIBITS_EXCLUDE_AUTOTOOLS_FILES()
+
+TRIBITS_EXCLUDE_FILES(
+  classic/doc
+  classic/LinAlg/doc/CrsRefactorNotesMay2012
+  )
+
+TRIBITS_PACKAGE_POSTPROCESS()
+
diff --git a/lib/kokkos/Copyright.txt b/lib/kokkos/Copyright.txt
new file mode 100644
index 0000000000000000000000000000000000000000..05980758fa8fe6317bb08fcc6eb70668b5fd1580
--- /dev/null
+++ b/lib/kokkos/Copyright.txt
@@ -0,0 +1,40 @@
+//@HEADER
+// ************************************************************************
+// 
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+// 
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+// 
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact  H. Carter Edwards (hcedwar@sandia.gov)
+// 
+// ************************************************************************
+//@HEADER
diff --git a/lib/kokkos/HOW_TO_SNAPSHOT b/lib/kokkos/HOW_TO_SNAPSHOT
new file mode 100644
index 0000000000000000000000000000000000000000..46bfb4167f9023a8273ebc872ac450c626603bf0
--- /dev/null
+++ b/lib/kokkos/HOW_TO_SNAPSHOT
@@ -0,0 +1,73 @@
+
+Developers of Kokkos (those who commit modifications to Kokkos)
+must maintain the snapshot of Kokkos in the Trilinos repository.
+
+This file contains instructions for how to
+snapshot Kokkos from github.com/kokkos to Trilinos.
+
+------------------------------------------------------------------------
+*** EVERYTHING GOES RIGHT WORKFLOW ***
+
+1) Given a 'git clone' of Kokkos and of Trilinos repositories.
+1.1) Let ${KOKKOS} be the absolute path to the Kokkos clone.
+     This path *must* terminate with the directory name 'kokkos';
+     e.g., ${HOME}/kokkos .
+1.2) Let ${TRILINOS} be the absolute path to the Trilinos directory.
+
+2) Given that the Kokkos build & test is clean and
+   changes are committed to the Kokkos clone.
+
+3) Snapshot the current commit in the Kokkos clone into the Trilinos clone.
+   This overwrites ${TRILINOS}/packages/kokkos with the content of ${KOKKOS}:
+	${KOKKOS}/config/snapshot.py --verbose ${KOKKOS} ${TRILINOS}/packages
+
+4) Verify the snapshot commit happened as expected
+	cd ${TRILINOS}/packages/kokkos
+	git log -1 --name-only
+
+5) Modify, build, and test Trilinos with the Kokkos snapshot.
+
+6) Given that the Trilinos build & test is clean and
+   changes are committed to the Trilinos clone.
+
+7) Attempt push to the Kokkos repository.
+   If push fails then you must 'remove the Kokkos snapshot'
+   from your Trilinos clone.
+   See below.
+
+8) Attempt to push to the Trilinos repository.
+   If updating for a failed push requires you to change Kokkos you must
+   'remove the Kokkos snapshot' from your Trilinos clone.
+   See below.
+
+------------------------------------------------------------------------
+*** WHEN SOMETHING GOES WRONG AND YOU MUST              ***
+*** REMOVE THE KOKKOS SNAPSHOT FROM YOUR TRILINOS CLONE ***
+
+1) Query the Trilinos clone commit log.
+	git log --oneline
+
+2) Note the <SHA1> of the commit to the Trilinos clone
+   immediately BEFORE the Kokkos snapshot commit.
+   Copy this <SHA1> for use in the next command.
+
+3) IF more than one outstanding commit then you can remove just the
+   Kokkos snapshot commit with 'git rebase -i'.  Edit the rebase file.
+   Remove or comment out the Kokkos snapshot commit entry.
+	git rebase -i <SHA1>
+
+4) IF the Kokkos snapshot commit is the one and only
+   outstanding commit then remove just that commit.
+	git reset --hard HEAD~1
+
+------------------------------------------------------------------------
+*** REGARDING 'snapshot.py' TOOL ***
+
+The 'snapshot.py' tool is developed and maintained by the
+Center for Computing Research (CCR)
+Software Engineering, Maintenance, and Support (SEMS) team.
+
+Contact Brent Perschbacher <bmpersc@sandia.gov> for questions.
+
+------------------------------------------------------------------------
+
diff --git a/lib/kokkos/LICENSE b/lib/kokkos/LICENSE
new file mode 100644
index 0000000000000000000000000000000000000000..05980758fa8fe6317bb08fcc6eb70668b5fd1580
--- /dev/null
+++ b/lib/kokkos/LICENSE
@@ -0,0 +1,40 @@
+//@HEADER
+// ************************************************************************
+// 
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+// 
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+// 
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact  H. Carter Edwards (hcedwar@sandia.gov)
+// 
+// ************************************************************************
+//@HEADER
diff --git a/lib/kokkos/Makefile.kokkos b/lib/kokkos/Makefile.kokkos
new file mode 100644
index 0000000000000000000000000000000000000000..c9b6cc464ddd7df1d942a9e38275fa82e96742dc
--- /dev/null
+++ b/lib/kokkos/Makefile.kokkos
@@ -0,0 +1,480 @@
+# Default settings common options
+
+#LAMMPS specific settings:
+KOKKOS_PATH=../../lib/kokkos
+CXXFLAGS=$(CCFLAGS)
+
+#Options: OpenMP,Serial,Pthreads,Cuda
+KOKKOS_DEVICES ?= "OpenMP"
+#KOKKOS_DEVICES ?= "Pthreads"
+#Options: KNC,SNB,HSW,Kepler,Kepler30,Kepler32,Kepler35,Kepler37,Maxwell,Maxwell50,Maxwell52,Maxwell53,Pascal61,ARMv8,BGQ,Power7,Power8,KNL,BDW
+KOKKOS_ARCH ?= ""
+#Options: yes,no
+KOKKOS_DEBUG ?= "no"
+#Options: hwloc,librt,experimental_memkind
+KOKKOS_USE_TPLS ?= ""
+#Options: c++11
+KOKKOS_CXX_STANDARD ?= "c++11"
+#Options: aggressive_vectorization,disable_profiling
+KOKKOS_OPTIONS ?= ""
+
+#Default settings specific options
+#Options: force_uvm,use_ldg,rdc,enable_lambda
+KOKKOS_CUDA_OPTIONS ?= ""
+
+# Check for general settings
+
+KOKKOS_INTERNAL_ENABLE_DEBUG := $(strip $(shell echo $(KOKKOS_DEBUG) | grep "yes" | wc -l))
+KOKKOS_INTERNAL_ENABLE_CXX11 := $(strip $(shell echo $(KOKKOS_CXX_STANDARD) | grep "c++11" | wc -l))
+
+# Check for external libraries
+KOKKOS_INTERNAL_USE_HWLOC := $(strip $(shell echo $(KOKKOS_USE_TPLS) | grep "hwloc" | wc -l))
+KOKKOS_INTERNAL_USE_LIBRT := $(strip $(shell echo $(KOKKOS_USE_TPLS) | grep "librt" | wc -l))
+KOKKOS_INTERNAL_USE_MEMKIND := $(strip $(shell echo $(KOKKOS_USE_TPLS) | grep "experimental_memkind" | wc -l))
+
+# Check for advanced settings
+KOKKOS_INTERNAL_OPT_RANGE_AGGRESSIVE_VECTORIZATION := $(strip $(shell echo $(KOKKOS_OPTIONS) | grep "aggressive_vectorization" | wc -l))
+KOKKOS_INTERNAL_DISABLE_PROFILING := $(strip $(shell echo $(KOKKOS_OPTIONS) | grep "disable_profiling" | wc -l))
+KOKKOS_INTERNAL_CUDA_USE_LDG := $(strip $(shell echo $(KOKKOS_CUDA_OPTIONS) | grep "use_ldg" | wc -l))
+KOKKOS_INTERNAL_CUDA_USE_UVM := $(strip $(shell echo $(KOKKOS_CUDA_OPTIONS) | grep "force_uvm" | wc -l))
+KOKKOS_INTERNAL_CUDA_USE_RELOC := $(strip $(shell echo $(KOKKOS_CUDA_OPTIONS) | grep "rdc" | wc -l))
+KOKKOS_INTERNAL_CUDA_USE_LAMBDA := $(strip $(shell echo $(KOKKOS_CUDA_OPTIONS) | grep "enable_lambda" | wc -l))
+
+# Check for Kokkos Host Execution Spaces one of which must be on
+
+KOKKOS_INTERNAL_USE_OPENMP := $(strip $(shell echo $(KOKKOS_DEVICES) | grep OpenMP | wc -l))
+KOKKOS_INTERNAL_USE_PTHREADS := $(strip $(shell echo $(KOKKOS_DEVICES) | grep Pthread | wc -l))
+KOKKOS_INTERNAL_USE_SERIAL := $(strip $(shell echo $(KOKKOS_DEVICES) | grep Serial | wc -l))
+KOKKOS_INTERNAL_USE_QTHREAD := $(strip $(shell echo $(KOKKOS_DEVICES) | grep Qthread | wc -l))
+
+ifeq ($(KOKKOS_INTERNAL_USE_OPENMP), 0)
+ifeq ($(KOKKOS_INTERNAL_USE_PTHREADS), 0)
+	KOKKOS_INTERNAL_USE_SERIAL := 1
+endif
+endif
+# Compiler/OS probes; strip the counts because some wc implementations pad their output with blanks
+KOKKOS_INTERNAL_COMPILER_INTEL := $(strip $(shell $(CXX) --version        2>&1 | grep "Intel Corporation" | wc -l))
+KOKKOS_INTERNAL_COMPILER_PGI   := $(strip $(shell $(CXX) --version        2>&1 | grep PGI   | wc -l))
+KOKKOS_INTERNAL_COMPILER_XL    := $(strip $(shell $(CXX) -qversion        2>&1 | grep XL    | wc -l))
+KOKKOS_INTERNAL_COMPILER_CRAY  := $(strip $(shell $(CXX) -craype-verbose  2>&1 | grep "CC-" | wc -l))
+KOKKOS_INTERNAL_OS_CYGWIN      := $(strip $(shell uname | grep CYGWIN | wc -l))
+
+ifeq ($(KOKKOS_INTERNAL_COMPILER_PGI), 1)
+  KOKKOS_INTERNAL_OPENMP_FLAG := -mp 
+else
+  ifeq ($(KOKKOS_INTERNAL_COMPILER_XL), 1)
+    KOKKOS_INTERNAL_OPENMP_FLAG := -qsmp=omp
+  else
+    ifeq ($(KOKKOS_INTERNAL_COMPILER_CRAY), 1)
+      # OpenMP is turned on by default in Cray compiler environment
+      KOKKOS_INTERNAL_OPENMP_FLAG :=
+    else
+      KOKKOS_INTERNAL_OPENMP_FLAG := -fopenmp
+    endif
+  endif
+endif
+
+ifeq ($(KOKKOS_INTERNAL_COMPILER_PGI), 1)
+  KOKKOS_INTERNAL_CXX11_FLAG := --c++11
+else
+  ifeq ($(KOKKOS_INTERNAL_COMPILER_XL), 1)
+     KOKKOS_INTERNAL_CXX11_FLAG := -std=c++11
+  else
+    ifeq ($(KOKKOS_INTERNAL_COMPILER_CRAY), 1)
+      KOKKOS_INTERNAL_CXX11_FLAG := -hstd=c++11
+    else
+      KOKKOS_INTERNAL_CXX11_FLAG := --std=c++11
+    endif
+  endif
+endif
+
+# Check for other Execution Spaces
+KOKKOS_INTERNAL_USE_CUDA := $(strip $(shell echo $(KOKKOS_DEVICES) | grep Cuda | wc -l))
+
+# Check for Kokkos Architecture settings
+
+#Intel based
+KOKKOS_INTERNAL_USE_ARCH_KNC := $(strip $(shell echo $(KOKKOS_ARCH) | grep KNC | wc -l))
+KOKKOS_INTERNAL_USE_ARCH_SNB := $(strip $(shell echo $(KOKKOS_ARCH) | grep SNB | wc -l))
+KOKKOS_INTERNAL_USE_ARCH_HSW := $(strip $(shell echo $(KOKKOS_ARCH) | grep HSW | wc -l))
+KOKKOS_INTERNAL_USE_ARCH_BDW := $(strip $(shell echo $(KOKKOS_ARCH) | grep BDW | wc -l))
+KOKKOS_INTERNAL_USE_ARCH_KNL := $(strip $(shell echo $(KOKKOS_ARCH) | grep KNL | wc -l))
+
+#NVIDIA based
+NVCC_WRAPPER :=  $(KOKKOS_PATH)/config/nvcc_wrapper
+KOKKOS_INTERNAL_USE_ARCH_KEPLER30 := $(strip $(shell echo $(KOKKOS_ARCH) | grep Kepler30 | wc -l))
+KOKKOS_INTERNAL_USE_ARCH_KEPLER32 := $(strip $(shell echo $(KOKKOS_ARCH) | grep Kepler32 | wc -l))
+KOKKOS_INTERNAL_USE_ARCH_KEPLER35 := $(strip $(shell echo $(KOKKOS_ARCH) | grep Kepler35 | wc -l))
+KOKKOS_INTERNAL_USE_ARCH_KEPLER37 := $(strip $(shell echo $(KOKKOS_ARCH) | grep Kepler37 | wc -l))
+KOKKOS_INTERNAL_USE_ARCH_MAXWELL50 := $(strip $(shell echo $(KOKKOS_ARCH) | grep Maxwell50 | wc -l))
+KOKKOS_INTERNAL_USE_ARCH_MAXWELL52 := $(strip $(shell echo $(KOKKOS_ARCH) | grep Maxwell52 | wc -l))
+KOKKOS_INTERNAL_USE_ARCH_MAXWELL53 := $(strip $(shell echo $(KOKKOS_ARCH) | grep Maxwell53 | wc -l))
+KOKKOS_INTERNAL_USE_ARCH_PASCAL61 := $(strip $(shell echo $(KOKKOS_ARCH) | grep Pascal61 | wc -l))
+KOKKOS_INTERNAL_USE_ARCH_NVIDIA := $(strip $(shell echo $(KOKKOS_INTERNAL_USE_ARCH_KEPLER30)  \
+                                                      + $(KOKKOS_INTERNAL_USE_ARCH_KEPLER32)  \
+                                                      + $(KOKKOS_INTERNAL_USE_ARCH_KEPLER35)  \
+                                                      + $(KOKKOS_INTERNAL_USE_ARCH_KEPLER37)  \
+                                                      + $(KOKKOS_INTERNAL_USE_ARCH_PASCAL61)  \
+                                                      + $(KOKKOS_INTERNAL_USE_ARCH_MAXWELL50) \
+                                                      + $(KOKKOS_INTERNAL_USE_ARCH_MAXWELL52) \
+                                                      + $(KOKKOS_INTERNAL_USE_ARCH_MAXWELL53) | bc))
+
+ifeq ($(KOKKOS_INTERNAL_USE_ARCH_NVIDIA), 0)
+KOKKOS_INTERNAL_USE_ARCH_MAXWELL50 := $(strip $(shell echo $(KOKKOS_ARCH) | grep Maxwell | wc -l))
+KOKKOS_INTERNAL_USE_ARCH_KEPLER35 := $(strip $(shell echo $(KOKKOS_ARCH) | grep Kepler | wc -l))
+KOKKOS_INTERNAL_USE_ARCH_NVIDIA := $(strip $(shell echo $(KOKKOS_INTERNAL_USE_ARCH_KEPLER30)  \
+                                                      + $(KOKKOS_INTERNAL_USE_ARCH_KEPLER32)  \
+                                                      + $(KOKKOS_INTERNAL_USE_ARCH_KEPLER35)  \
+                                                      + $(KOKKOS_INTERNAL_USE_ARCH_KEPLER37)  \
+                                                      + $(KOKKOS_INTERNAL_USE_ARCH_PASCAL61)  \
+                                                      + $(KOKKOS_INTERNAL_USE_ARCH_MAXWELL50) \
+                                                      + $(KOKKOS_INTERNAL_USE_ARCH_MAXWELL52) \
+                                                      + $(KOKKOS_INTERNAL_USE_ARCH_MAXWELL53) | bc))
+endif
+
+#ARM based
+KOKKOS_INTERNAL_USE_ARCH_ARMV80 := $(strip $(shell echo $(KOKKOS_ARCH) | grep ARMv8 | wc -l))
+
+#IBM based
+KOKKOS_INTERNAL_USE_ARCH_BGQ := $(strip $(shell echo $(KOKKOS_ARCH) | grep BGQ | wc -l))
+KOKKOS_INTERNAL_USE_ARCH_POWER7 := $(strip $(shell echo $(KOKKOS_ARCH) | grep Power7 | wc -l))
+KOKKOS_INTERNAL_USE_ARCH_POWER8 := $(strip $(shell echo $(KOKKOS_ARCH) | grep Power8 | wc -l))
+KOKKOS_INTERNAL_USE_ARCH_IBM := $(strip $(shell echo $(KOKKOS_INTERNAL_USE_ARCH_BGQ)+$(KOKKOS_INTERNAL_USE_ARCH_POWER7)+$(KOKKOS_INTERNAL_USE_ARCH_POWER8) | bc))
+
+#AMD based
+KOKKOS_INTERNAL_USE_ARCH_AMDAVX := $(strip $(shell echo $(KOKKOS_ARCH) | grep AMDAVX | wc -l))
+
+#Any AVX?
+KOKKOS_INTERNAL_USE_ARCH_AVX       := $(strip $(shell echo $(KOKKOS_INTERNAL_USE_ARCH_SNB)+$(KOKKOS_INTERNAL_USE_ARCH_AMDAVX) | bc ))
+KOKKOS_INTERNAL_USE_ARCH_AVX2      := $(strip $(shell echo $(KOKKOS_INTERNAL_USE_ARCH_HSW)+$(KOKKOS_INTERNAL_USE_ARCH_BDW) | bc ))
+KOKKOS_INTERNAL_USE_ARCH_AVX512MIC := $(strip $(shell echo $(KOKKOS_INTERNAL_USE_ARCH_KNL) | bc ))
+
+# Decide what ISA level we are able to support
+KOKKOS_INTERNAL_USE_ISA_X86_64     := $(strip $(shell echo $(KOKKOS_INTERNAL_USE_ARCH_SNB)+$(KOKKOS_INTERNAL_USE_ARCH_HSW)+$(KOKKOS_INTERNAL_USE_ARCH_BDW)+$(KOKKOS_INTERNAL_USE_ARCH_KNL) | bc ))
+KOKKOS_INTERNAL_USE_ISA_KNC        := $(strip $(shell echo $(KOKKOS_INTERNAL_USE_ARCH_KNC) | bc ))
+KOKKOS_INTERNAL_USE_ISA_POWERPCLE  := $(strip $(shell echo $(KOKKOS_INTERNAL_USE_ARCH_POWER8) | bc ))
+
+#Incompatible flags? (AMDAVX is already counted inside ARCH_AVX, so it is not summed a second time here)
+KOKKOS_INTERNAL_USE_ARCH_MULTIHOST := $(strip $(shell echo "$(KOKKOS_INTERNAL_USE_ARCH_AVX)+$(KOKKOS_INTERNAL_USE_ARCH_AVX2)+$(KOKKOS_INTERNAL_USE_ARCH_KNC)+$(KOKKOS_INTERNAL_USE_ARCH_IBM)+$(KOKKOS_INTERNAL_USE_ARCH_ARMV80)>1" | bc ))
+KOKKOS_INTERNAL_USE_ARCH_MULTIGPU := $(strip $(shell echo "$(KOKKOS_INTERNAL_USE_ARCH_NVIDIA)>1" | bc))
+
+ifeq ($(KOKKOS_INTERNAL_USE_ARCH_MULTIHOST), 1)
+  $(error Defined Multiple Host architectures: KOKKOS_ARCH=$(KOKKOS_ARCH) )
+endif
+ifeq ($(KOKKOS_INTERNAL_USE_ARCH_MULTIGPU), 1)
+  $(error Defined Multiple GPU architectures: KOKKOS_ARCH=$(KOKKOS_ARCH) )
+endif
+
+#Generating the list of Flags
+
+KOKKOS_CPPFLAGS = -I./ -I$(KOKKOS_PATH)/core/src -I$(KOKKOS_PATH)/containers/src -I$(KOKKOS_PATH)/algorithms/src
+
+# No warnings:
+KOKKOS_CXXFLAGS =
+# INTEL and CLANG warnings:
+#KOKKOS_CXXFLAGS = -Wall -Wshadow -pedantic -Wsign-compare -Wtype-limits -Wuninitialized
+# GCC warnings:
+#KOKKOS_CXXFLAGS = -Wall -Wshadow -pedantic -Wsign-compare -Wtype-limits -Wuninitialized -Wignored-qualifiers -Wempty-body -Wclobbered
+
+KOKKOS_LIBS = -lkokkos -ldl
+KOKKOS_LDFLAGS = -L$(shell pwd)
+KOKKOS_SRC = 
+KOKKOS_HEADERS =
+
+#Generating the KokkosCore_config.h file
+
+tmp := $(shell echo "/* ---------------------------------------------" > KokkosCore_config.tmp)
+tmp := $(shell echo "Makefile constructed configuration:" >> KokkosCore_config.tmp)
+tmp := $(shell date >> KokkosCore_config.tmp)
+tmp := $(shell echo "----------------------------------------------*/" >> KokkosCore_config.tmp)
+
+
+tmp := $(shell echo "/* Execution Spaces */" >> KokkosCore_config.tmp)
+ifeq ($(KOKKOS_INTERNAL_USE_OPENMP), 1)
+	tmp := $(shell echo '\#define KOKKOS_HAVE_OPENMP 1' >> KokkosCore_config.tmp) 
+endif
+
+ifeq ($(KOKKOS_INTERNAL_USE_PTHREADS), 1)
+	tmp := $(shell echo "\#define KOKKOS_HAVE_PTHREAD 1" >> KokkosCore_config.tmp )
+endif
+
+ifeq ($(KOKKOS_INTERNAL_USE_SERIAL), 1)
+	tmp := $(shell echo "\#define KOKKOS_HAVE_SERIAL 1" >> KokkosCore_config.tmp )
+endif
+
+ifeq ($(KOKKOS_INTERNAL_USE_CUDA), 1)
+	tmp := $(shell echo "\#define KOKKOS_HAVE_CUDA 1" >> KokkosCore_config.tmp )
+endif
+
+ifeq ($(KOKKOS_INTERNAL_USE_ISA_X86_64), 1)
+  	tmp := $(shell echo "\#define KOKKOS_USE_ISA_X86_64" >> KokkosCore_config.tmp )
+endif
+
+ifeq ($(KOKKOS_INTERNAL_USE_ISA_KNC), 1)
+  	tmp := $(shell echo "\#define KOKKOS_USE_ISA_KNC" >> KokkosCore_config.tmp )
+endif
+
+ifeq ($(KOKKOS_INTERNAL_USE_ISA_POWERPCLE), 1)
+  	tmp := $(shell echo "\#define KOKKOS_USE_ISA_POWERPCLE" >> KokkosCore_config.tmp )
+endif
+
+ifeq ($(KOKKOS_INTERNAL_USE_QTHREAD), 1)
+	KOKKOS_CPPFLAGS += -I$(QTHREAD_PATH)/include
+	KOKKOS_LDFLAGS += -L$(QTHREAD_PATH)/lib 
+	tmp := $(shell echo "\#define KOKKOS_HAVE_QTHREAD 1" >> KokkosCore_config.tmp )
+endif
+
+tmp := $(shell echo "/* General Settings */" >> KokkosCore_config.tmp)
+ifeq ($(KOKKOS_INTERNAL_ENABLE_CXX11), 1)
+	KOKKOS_CXXFLAGS += $(KOKKOS_INTERNAL_CXX11_FLAG)
+	tmp := $(shell echo "\#define KOKKOS_HAVE_CXX11 1" >> KokkosCore_config.tmp )
+endif
+
+ifeq ($(KOKKOS_INTERNAL_ENABLE_DEBUG), 1)
+ifeq ($(KOKKOS_INTERNAL_USE_CUDA), 1)
+	KOKKOS_CXXFLAGS += -G
+endif
+	KOKKOS_CXXFLAGS += -g 
+	KOKKOS_LDFLAGS += -g -ldl
+	tmp := $(shell echo "\#define KOKKOS_ENABLE_DEBUG_BOUNDS_CHECK 1" >> KokkosCore_config.tmp )
+	tmp := $(shell echo "\#define KOKKOS_HAVE_DEBUG 1" >> KokkosCore_config.tmp )
+endif
+
+ifeq ($(KOKKOS_INTERNAL_USE_HWLOC), 1)
+	KOKKOS_CPPFLAGS += -I$(HWLOC_PATH)/include
+	KOKKOS_LDFLAGS += -L$(HWLOC_PATH)/lib 
+        KOKKOS_LIBS += -lhwloc
+	tmp := $(shell echo "\#define KOKKOS_HAVE_HWLOC 1" >> KokkosCore_config.tmp )
+endif
+
+ifeq ($(KOKKOS_INTERNAL_USE_LIBRT), 1)
+	tmp := $(shell echo "\#define KOKKOS_USE_LIBRT 1" >> KokkosCore_config.tmp )
+	tmp := $(shell echo "\#define PREC_TIMER 1" >> KokkosCore_config.tmp )
+  tmp := $(shell echo "\#define KOKKOSP_ENABLE_RTLIB 1" >> KokkosCore_config.tmp )
+	KOKKOS_LIBS += -lrt
+endif
+
+ifeq ($(KOKKOS_INTERNAL_USE_MEMKIND), 1)
+  KOKKOS_CPPFLAGS += -I$(MEMKIND_PATH)/include
+  KOKKOS_LDFLAGS += -L$(MEMKIND_PATH)/lib 
+        KOKKOS_LIBS += -lmemkind
+  tmp := $(shell echo "\#define KOKKOS_HAVE_HBWSPACE 1" >> KokkosCore_config.tmp )
+endif
+
+ifeq ($(KOKKOS_INTERNAL_DISABLE_PROFILING), 1)
+  tmp := $(shell echo "\#define KOKKOS_ENABLE_PROFILING 0" >> KokkosCore_config.tmp )
+endif
+
+tmp := $(shell echo "/* Optimization Settings */" >> KokkosCore_config.tmp)
+
+ifeq ($(KOKKOS_INTERNAL_OPT_RANGE_AGGRESSIVE_VECTORIZATION), 1)
+  tmp := $(shell echo "\#define KOKKOS_OPT_RANGE_AGGRESSIVE_VECTORIZATION 1" >> KokkosCore_config.tmp )
+endif
+
+tmp := $(shell echo "/* Cuda Settings */" >> KokkosCore_config.tmp)
+
+ifeq ($(KOKKOS_INTERNAL_CUDA_USE_LDG), 1)
+	tmp := $(shell echo "\#define KOKKOS_CUDA_USE_LDG_INTRINSIC 1" >> KokkosCore_config.tmp )
+endif
+
+ifeq ($(KOKKOS_INTERNAL_CUDA_USE_UVM), 1)
+	tmp := $(shell echo "\#define KOKKOS_CUDA_USE_UVM 1" >> KokkosCore_config.tmp )
+  tmp := $(shell echo "\#define KOKKOS_USE_CUDA_UVM 1" >> KokkosCore_config.tmp )
+endif
+
+ifeq ($(KOKKOS_INTERNAL_CUDA_USE_RELOC), 1)
+	tmp := $(shell echo "\#define KOKKOS_CUDA_USE_RELOCATABLE_DEVICE_CODE 1" >> KokkosCore_config.tmp )
+	KOKKOS_CXXFLAGS += --relocatable-device-code=true
+	KOKKOS_LDFLAGS += --relocatable-device-code=true
+endif
+
+ifeq ($(KOKKOS_INTERNAL_CUDA_USE_LAMBDA), 1)
+  tmp := $(shell echo "\#define KOKKOS_CUDA_USE_LAMBDA 1" >> KokkosCore_config.tmp )
+  KOKKOS_CXXFLAGS += -expt-extended-lambda
+endif
+
+#Add Architecture flags
+
+ifeq ($(KOKKOS_INTERNAL_USE_ARCH_AVX), 1)
+    tmp := $(shell echo "\#define KOKKOS_ARCH_AVX 1" >> KokkosCore_config.tmp )
+    ifeq ($(KOKKOS_INTERNAL_COMPILER_CRAY), 1)
+	KOKKOS_CXXFLAGS +=
+	KOKKOS_LDFLAGS +=
+    else	
+	KOKKOS_CXXFLAGS += -mavx
+	KOKKOS_LDFLAGS += -mavx
+    endif
+endif
+
+ifeq ($(KOKKOS_INTERNAL_USE_ARCH_POWER8), 1)
+    tmp := $(shell echo "\#define KOKKOS_ARCH_POWER8 1" >> KokkosCore_config.tmp )
+	KOKKOS_CXXFLAGS += -mcpu=power8 -mtune=power8
+	KOKKOS_LDFLAGS  += -mcpu=power8 -mtune=power8
+endif
+
+ifeq ($(KOKKOS_INTERNAL_USE_ARCH_AVX2), 1)
+    tmp := $(shell echo "\#define KOKKOS_ARCH_AVX2 1" >> KokkosCore_config.tmp )
+	ifeq ($(KOKKOS_INTERNAL_COMPILER_INTEL), 1)
+		KOKKOS_CXXFLAGS += -xCORE-AVX2
+		KOKKOS_LDFLAGS  += -xCORE-AVX2
+	else
+		ifeq ($(KOKKOS_INTERNAL_COMPILER_CRAY), 1)
+
+		else
+			ifeq ($(KOKKOS_INTERNAL_COMPILER_PGI), 1) 
+
+			else
+				# Assume that this is really a GNU compiler
+				KOKKOS_CXXFLAGS += -march=core-avx2 -mtune=core-avx2
+				KOKKOS_LDFLAGS  += -march=core-avx2 -mtune=core-avx2
+			endif
+		endif
+	endif
+endif
+
+ifeq ($(KOKKOS_INTERNAL_USE_ARCH_AVX512MIC), 1)
+    tmp := $(shell echo "\#define KOKKOS_ARCH_AVX512MIC 1" >> KokkosCore_config.tmp )
+	ifeq ($(KOKKOS_INTERNAL_COMPILER_INTEL), 1)
+		KOKKOS_CXXFLAGS += -xMIC-AVX512
+		KOKKOS_LDFLAGS  += -xMIC-AVX512
+	else
+		ifeq ($(KOKKOS_INTERNAL_COMPILER_CRAY), 1)
+
+		else
+			ifeq ($(KOKKOS_INTERNAL_COMPILER_PGI), 1)
+
+			else
+				# Assume that this is really a GNU compiler
+				KOKKOS_CXXFLAGS += -march=knl
+				KOKKOS_LDFLAGS  += -march=knl
+			endif
+		endif
+	endif
+endif
+
+ifeq ($(KOKKOS_INTERNAL_USE_ARCH_KNC), 1)
+    tmp := $(shell echo "\#define KOKKOS_ARCH_KNC 1" >> KokkosCore_config.tmp )
+	KOKKOS_CXXFLAGS += -mmic
+	KOKKOS_LDFLAGS += -mmic
+endif
+
+ifeq ($(KOKKOS_INTERNAL_USE_CUDA), 1)
+ifeq ($(KOKKOS_INTERNAL_USE_ARCH_KEPLER30), 1)
+    tmp := $(shell echo "\#define KOKKOS_ARCH_KEPLER 1" >> KokkosCore_config.tmp )
+    tmp := $(shell echo "\#define KOKKOS_ARCH_KEPLER30 1" >> KokkosCore_config.tmp )
+	KOKKOS_CXXFLAGS += -arch=sm_30
+endif
+ifeq ($(KOKKOS_INTERNAL_USE_ARCH_KEPLER32), 1)
+    tmp := $(shell echo "\#define KOKKOS_ARCH_KEPLER 1" >> KokkosCore_config.tmp )
+    tmp := $(shell echo "\#define KOKKOS_ARCH_KEPLER32 1" >> KokkosCore_config.tmp )
+	KOKKOS_CXXFLAGS += -arch=sm_32
+endif
+ifeq ($(KOKKOS_INTERNAL_USE_ARCH_KEPLER35), 1)
+    tmp := $(shell echo "\#define KOKKOS_ARCH_KEPLER 1" >> KokkosCore_config.tmp )
+    tmp := $(shell echo "\#define KOKKOS_ARCH_KEPLER35 1" >> KokkosCore_config.tmp )
+	KOKKOS_CXXFLAGS += -arch=sm_35
+endif
+ifeq ($(KOKKOS_INTERNAL_USE_ARCH_KEPLER37), 1)
+    tmp := $(shell echo "\#define KOKKOS_ARCH_KEPLER 1" >> KokkosCore_config.tmp )
+    tmp := $(shell echo "\#define KOKKOS_ARCH_KEPLER37 1" >> KokkosCore_config.tmp )
+	KOKKOS_CXXFLAGS += -arch=sm_37
+endif
+ifeq ($(KOKKOS_INTERNAL_USE_ARCH_MAXWELL50), 1)
+    tmp := $(shell echo "\#define KOKKOS_ARCH_MAXWELL 1" >> KokkosCore_config.tmp )
+    tmp := $(shell echo "\#define KOKKOS_ARCH_MAXWELL50 1" >> KokkosCore_config.tmp )
+	KOKKOS_CXXFLAGS += -arch=sm_50
+endif
+ifeq ($(KOKKOS_INTERNAL_USE_ARCH_MAXWELL52), 1)
+    tmp := $(shell echo "\#define KOKKOS_ARCH_MAXWELL 1" >> KokkosCore_config.tmp )
+    tmp := $(shell echo "\#define KOKKOS_ARCH_MAXWELL52 1" >> KokkosCore_config.tmp )
+	KOKKOS_CXXFLAGS += -arch=sm_52
+endif
+ifeq ($(KOKKOS_INTERNAL_USE_ARCH_MAXWELL53), 1)
+    tmp := $(shell echo "\#define KOKKOS_ARCH_MAXWELL 1" >> KokkosCore_config.tmp )
+    tmp := $(shell echo "\#define KOKKOS_ARCH_MAXWELL53 1" >> KokkosCore_config.tmp )
+	KOKKOS_CXXFLAGS += -arch=sm_53
+endif
+ifeq ($(KOKKOS_INTERNAL_USE_ARCH_PASCAL61), 1)
+    tmp := $(shell echo "\#define KOKKOS_ARCH_PASCAL 1" >> KokkosCore_config.tmp )
+    tmp := $(shell echo "\#define KOKKOS_ARCH_PASCAL61 1" >> KokkosCore_config.tmp )
+        KOKKOS_CXXFLAGS += -arch=sm_61
+endif
+endif
+# Count changed define lines against an existing header; a missing header counts as changed (ls errors are silenced)
+KOKKOS_INTERNAL_LS_CONFIG := $(shell ls KokkosCore_config.h 2>/dev/null)
+ifeq ($(KOKKOS_INTERNAL_LS_CONFIG), KokkosCore_config.h)
+KOKKOS_INTERNAL_NEW_CONFIG := $(strip $(shell diff KokkosCore_config.h KokkosCore_config.tmp | grep define | wc -l))
+else
+KOKKOS_INTERNAL_NEW_CONFIG := 1
+endif
+
+ifneq ($(KOKKOS_INTERNAL_NEW_CONFIG), 0)
+	tmp := $(shell cp KokkosCore_config.tmp KokkosCore_config.h)
+endif
+
+KOKKOS_HEADERS += $(wildcard $(KOKKOS_PATH)/core/src/*.hpp)
+KOKKOS_HEADERS += $(wildcard $(KOKKOS_PATH)/core/src/impl/*.hpp)
+KOKKOS_HEADERS += $(wildcard $(KOKKOS_PATH)/containers/src/*.hpp)
+KOKKOS_HEADERS += $(wildcard $(KOKKOS_PATH)/containers/src/impl/*.hpp)
+KOKKOS_HEADERS += $(wildcard $(KOKKOS_PATH)/algorithms/src/*.hpp)
+
+KOKKOS_SRC += $(wildcard $(KOKKOS_PATH)/core/src/impl/*.cpp)
+KOKKOS_SRC += $(wildcard $(KOKKOS_PATH)/containers/src/impl/*.cpp)
+
+ifeq ($(KOKKOS_INTERNAL_USE_CUDA), 1)
+	KOKKOS_SRC += $(wildcard $(KOKKOS_PATH)/core/src/Cuda/*.cpp)
+	KOKKOS_HEADERS += $(wildcard $(KOKKOS_PATH)/core/src/Cuda/*.hpp)
+	KOKKOS_LDFLAGS += -L$(CUDA_PATH)/lib64 
+	KOKKOS_LIBS += -lcudart -lcuda
+endif
+
+ifeq ($(KOKKOS_INTERNAL_USE_PTHREADS), 1)
+	KOKKOS_LIBS += -lpthread
+	KOKKOS_SRC += $(wildcard $(KOKKOS_PATH)/core/src/Threads/*.cpp)
+	KOKKOS_HEADERS += $(wildcard $(KOKKOS_PATH)/core/src/Threads/*.hpp)
+endif
+
+ifeq ($(KOKKOS_INTERNAL_USE_QTHREAD), 1)
+	KOKKOS_LIBS += -lqthread
+	KOKKOS_SRC += $(wildcard $(KOKKOS_PATH)/core/src/Qthread/*.cpp)
+	KOKKOS_HEADERS += $(wildcard $(KOKKOS_PATH)/core/src/Qthread/*.hpp)
+endif
+
+ifeq ($(KOKKOS_INTERNAL_USE_OPENMP), 1)
+	KOKKOS_SRC += $(wildcard $(KOKKOS_PATH)/core/src/OpenMP/*.cpp)
+	KOKKOS_HEADERS += $(wildcard $(KOKKOS_PATH)/core/src/OpenMP/*.hpp)
+	ifeq ($(KOKKOS_INTERNAL_USE_CUDA), 1)
+		KOKKOS_CXXFLAGS += -Xcompiler $(KOKKOS_INTERNAL_OPENMP_FLAG)
+	else
+		KOKKOS_CXXFLAGS += $(KOKKOS_INTERNAL_OPENMP_FLAG)
+	endif
+	KOKKOS_LDFLAGS += $(KOKKOS_INTERNAL_OPENMP_FLAG)
+endif
+
+#With Cygwin functions such as fdopen and fileno are not defined 
+#when strict ansi is enabled. strict ansi gets enabled with --std=c++11
+#though. So we hard undefine it here. Not sure if that has any bad side effects
+#This is needed for gtest actually, not for Kokkos itself!
+ifeq ($(KOKKOS_INTERNAL_OS_CYGWIN), 1)
+  KOKKOS_CXXFLAGS += -U__STRICT_ANSI__
+endif
+
+# Setting up dependencies
+
+KokkosCore_config.h:
+
+KOKKOS_CPP_DEPENDS := KokkosCore_config.h $(KOKKOS_HEADERS)
+
+KOKKOS_OBJ = $(KOKKOS_SRC:.cpp=.o)
+KOKKOS_OBJ_LINK = $(notdir $(KOKKOS_OBJ))
+
+include $(KOKKOS_PATH)/Makefile.targets
+
+kokkos-clean:
+	rm -f $(KOKKOS_OBJ_LINK) KokkosCore_config.h KokkosCore_config.tmp libkokkos.a
+
+libkokkos.a: $(KOKKOS_OBJ_LINK) $(KOKKOS_SRC) $(KOKKOS_HEADERS)
+	ar cr libkokkos.a $(KOKKOS_OBJ_LINK)
+	ranlib libkokkos.a
+
+KOKKOS_LINK_DEPENDS=libkokkos.a
diff --git a/lib/kokkos/Makefile.targets b/lib/kokkos/Makefile.targets
new file mode 100644
index 0000000000000000000000000000000000000000..86929ea0fe6e9e2158923e6907c7b2a179e5af61
--- /dev/null
+++ b/lib/kokkos/Makefile.targets
@@ -0,0 +1,72 @@
+Kokkos_UnorderedMap_impl.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/containers/src/impl/Kokkos_UnorderedMap_impl.cpp # rebuilt when the source or anything in KOKKOS_CPP_DEPENDS changes
+	$(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/containers/src/impl/Kokkos_UnorderedMap_impl.cpp
+Kokkos_Core.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/impl/Kokkos_Core.cpp
+	$(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/core/src/impl/Kokkos_Core.cpp
+Kokkos_CPUDiscovery.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/impl/Kokkos_CPUDiscovery.cpp
+	$(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/core/src/impl/Kokkos_CPUDiscovery.cpp
+Kokkos_Error.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/impl/Kokkos_Error.cpp
+	$(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/core/src/impl/Kokkos_Error.cpp
+Kokkos_ExecPolicy.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/impl/Kokkos_ExecPolicy.cpp
+	$(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/core/src/impl/Kokkos_ExecPolicy.cpp
+Kokkos_HostSpace.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/impl/Kokkos_HostSpace.cpp
+	$(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/core/src/impl/Kokkos_HostSpace.cpp
+Kokkos_hwloc.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/impl/Kokkos_hwloc.cpp
+	$(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/core/src/impl/Kokkos_hwloc.cpp
+Kokkos_Serial.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/impl/Kokkos_Serial.cpp
+	$(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/core/src/impl/Kokkos_Serial.cpp
+Kokkos_Serial_TaskPolicy.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/impl/Kokkos_Serial_TaskPolicy.cpp
+	$(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/core/src/impl/Kokkos_Serial_TaskPolicy.cpp
+Kokkos_TaskQueue.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/impl/Kokkos_TaskQueue.cpp
+	$(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/core/src/impl/Kokkos_TaskQueue.cpp
+Kokkos_Serial_Task.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/impl/Kokkos_Serial_Task.cpp
+	$(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/core/src/impl/Kokkos_Serial_Task.cpp
+Kokkos_Shape.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/impl/Kokkos_Shape.cpp
+	$(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/core/src/impl/Kokkos_Shape.cpp
+Kokkos_spinwait.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/impl/Kokkos_spinwait.cpp
+	$(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/core/src/impl/Kokkos_spinwait.cpp
+Kokkos_Profiling_Interface.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/impl/Kokkos_Profiling_Interface.cpp
+	$(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/core/src/impl/Kokkos_Profiling_Interface.cpp
+KokkosExp_SharedAlloc.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/impl/KokkosExp_SharedAlloc.cpp
+	$(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/core/src/impl/KokkosExp_SharedAlloc.cpp
+Kokkos_MemoryPool.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/impl/Kokkos_MemoryPool.cpp
+	$(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/core/src/impl/Kokkos_MemoryPool.cpp
+
+ifeq ($(KOKKOS_INTERNAL_USE_CUDA), 1) # object rules for the sources under core/src/Cuda
+Kokkos_Cuda_Impl.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/Cuda/Kokkos_Cuda_Impl.cpp
+	$(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/core/src/Cuda/Kokkos_Cuda_Impl.cpp
+Kokkos_CudaSpace.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/Cuda/Kokkos_CudaSpace.cpp
+	$(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/core/src/Cuda/Kokkos_CudaSpace.cpp
+Kokkos_Cuda_Task.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/Cuda/Kokkos_Cuda_Task.cpp
+	$(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/core/src/Cuda/Kokkos_Cuda_Task.cpp
+Kokkos_Cuda_TaskPolicy.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/Cuda/Kokkos_Cuda_TaskPolicy.cpp
+	$(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/core/src/Cuda/Kokkos_Cuda_TaskPolicy.cpp
+endif # KOKKOS_INTERNAL_USE_CUDA
+
+ifeq ($(KOKKOS_INTERNAL_USE_PTHREADS), 1) # object rules for the sources under core/src/Threads
+Kokkos_ThreadsExec_base.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/Threads/Kokkos_ThreadsExec_base.cpp
+	$(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/core/src/Threads/Kokkos_ThreadsExec_base.cpp
+Kokkos_ThreadsExec.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/Threads/Kokkos_ThreadsExec.cpp
+	$(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/core/src/Threads/Kokkos_ThreadsExec.cpp
+Kokkos_Threads_TaskPolicy.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/Threads/Kokkos_Threads_TaskPolicy.cpp
+	$(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/core/src/Threads/Kokkos_Threads_TaskPolicy.cpp
+endif # KOKKOS_INTERNAL_USE_PTHREADS
+
+ifeq ($(KOKKOS_INTERNAL_USE_QTHREAD), 1) # object rules for the sources under core/src/Qthread
+Kokkos_QthreadExec.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/Qthread/Kokkos_QthreadExec.cpp
+	$(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/core/src/Qthread/Kokkos_QthreadExec.cpp
+Kokkos_Qthread_TaskPolicy.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/Qthread/Kokkos_Qthread_TaskPolicy.cpp
+	$(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/core/src/Qthread/Kokkos_Qthread_TaskPolicy.cpp
+endif # KOKKOS_INTERNAL_USE_QTHREAD
+
+ifeq ($(KOKKOS_INTERNAL_USE_OPENMP), 1) # object rules for the sources under core/src/OpenMP
+Kokkos_OpenMPexec.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/OpenMP/Kokkos_OpenMPexec.cpp
+	$(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/core/src/OpenMP/Kokkos_OpenMPexec.cpp
+Kokkos_OpenMP_Task.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/OpenMP/Kokkos_OpenMP_Task.cpp
+	$(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/core/src/OpenMP/Kokkos_OpenMP_Task.cpp
+endif # KOKKOS_INTERNAL_USE_OPENMP
+
+Kokkos_HBWSpace.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/impl/Kokkos_HBWSpace.cpp
+	$(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/core/src/impl/Kokkos_HBWSpace.cpp
+Kokkos_HBWAllocators.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/impl/Kokkos_HBWAllocators.cpp
+	$(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/core/src/impl/Kokkos_HBWAllocators.cpp
+
diff --git a/lib/kokkos/README b/lib/kokkos/README
new file mode 100644
index 0000000000000000000000000000000000000000..b094578af631b179e9744f744a823a1800bd885b
--- /dev/null
+++ b/lib/kokkos/README
@@ -0,0 +1,152 @@
+Kokkos implements a programming model in C++ for writing performance portable
+applications targeting all major HPC platforms. For that purpose it provides
+abstractions for both parallel execution of code and data management.
+Kokkos is designed to target complex node architectures with N-level memory
+hierarchies and multiple types of execution resources. It currently can use
+OpenMP, Pthreads and CUDA as backend programming models.
+
+The core developers of Kokkos are Carter Edwards and Christian Trott
+at the Computer Science Research Institute of the Sandia National
+Laboratories.
+
+The KokkosP interface and associated tools are developed by the Application
+Performance Team and Kokkos core developers at Sandia National Laboratories.
+
+To learn more about Kokkos consider watching one of our presentations:
+GTC 2015:
+  http://on-demand.gputechconf.com/gtc/2015/video/S5166.html
+  http://on-demand.gputechconf.com/gtc/2015/presentation/S5166-H-Carter-Edwards.pdf
+
+A programming guide can be found under doc/Kokkos_PG.pdf. This is an initial version
+and feedback is greatly appreciated.
+
+A separate repository with extensive tutorial material can be found under 
+https://github.com/kokkos/kokkos-tutorials.
+
+If you have a patch to contribute please feel free to issue a pull request against
+the develop branch. For major contributions it is better to contact us first
+for guidance.
+
+For questions please send an email to
+kokkos-users@software.sandia.gov
+
+For non-public questions send an email to
+hcedwar(at)sandia.gov and crtrott(at)sandia.gov
+
+============================================================================
+====Requirements============================================================
+============================================================================
+
+Primary tested compilers on X86 are:
+  GCC 4.7.2
+  GCC 4.8.4
+  GCC 4.9.2
+  GCC 5.1.0
+  Intel 14.0.4
+  Intel 15.0.2
+  Intel 16.0.1
+  Clang 3.5.2
+  Clang 3.6.1
+
+Primary tested compilers on Power 8 are:
+  IBM XL 13.1.3 (OpenMP,Serial)
+  GCC 4.9.2 (OpenMP,Serial)
+  GCC 5.3.0 (OpenMP,Serial)
+
+Secondary tested compilers are:
+  CUDA 6.5 (with gcc 4.7.2)
+  CUDA 7.0 (with gcc 4.7.2)
+  CUDA 7.5 (with gcc 4.8.4)
+
+Other compilers working:
+  X86:
+   Intel 17.0.042 (the FENL example causes internal compiler error)
+   PGI 15.4
+   Cygwin 2.1.0 64bit with gcc 4.9.3
+  KNL:
+   Intel 16.2.181 (the FENL example causes internal compiler error)
+   Intel 17.0.042 (the FENL example causes internal compiler error)
+
+Known non-working combinations:
+  Power8:
+   GCC 6.1.0
+   Pthreads backend
+
+
+Primary tested compilers are passing in release mode
+with warnings as errors. They also are tested with a comprehensive set of 
+backend combinations (i.e. OpenMP, Pthreads, Serial, OpenMP+Serial, ...).
+We are using the following set of flags:
+GCC:   -Wall -Wshadow -pedantic -Werror -Wsign-compare -Wtype-limits
+       -Wignored-qualifiers -Wempty-body -Wclobbered -Wuninitialized
+Intel: -Wall -Wshadow -pedantic -Werror -Wsign-compare -Wtype-limits -Wuninitialized
+Clang: -Wall -Wshadow -pedantic -Werror -Wsign-compare -Wtype-limits -Wuninitialized
+
+Secondary compilers are passing without -Werror.
+Other compilers are tested occasionally, in particular when pushing from develop to 
+master branch, without -Werror and only for a select set of backends.
+
+============================================================================
+====Getting started=========================================================
+============================================================================
+
+In the 'example/tutorial' directory you will find step by step tutorial
+examples which explain many of the features of Kokkos. They work with
+simple Makefiles. To build with g++ and OpenMP simply type 'make openmp'
+in the 'example/tutorial' directory. This will build all examples in the
+subfolders.
+
+============================================================================
+====Running Unit Tests======================================================
+============================================================================
+
+To run the unit tests create a build directory and run the following commands
+
+KOKKOS_PATH/generate_makefile.bash
+make build-test
+make test
+
+Run KOKKOS_PATH/generate_makefile.bash --help for more detailed options such as
+changing the device type for which to build.
+
+============================================================================
+====Install the library=====================================================
+============================================================================
+
+To install Kokkos as a library create a build directory and run the following
+
+KOKKOS_PATH/generate_makefile.bash --prefix=INSTALL_PATH
+make lib
+make install
+
+Run KOKKOS_PATH/generate_makefile.bash --help for more detailed options such as
+changing the device type for which to build.
+
+============================================================================
+====CMakeFiles==============================================================
+============================================================================
+
+The CMake files contained in this repository require Tribits and are used
+for integration with Trilinos. They do not currently support a standalone
+CMake build.
+
+===========================================================================
+====Kokkos and CUDA UVM====================================================
+===========================================================================
+
+Kokkos does support UVM as a specific memory space called CudaUVMSpace. 
+Allocations made with that space are accessible from host and device. 
+You can tell Kokkos to use that as the default space for Cuda allocations.
+In either case UVM comes with a number of restrictions:
+(i) You can't access allocations on the host while a kernel is potentially 
+running. This will lead to segfaults. To avoid that you either need to 
+call Kokkos::Cuda::fence() (or just Kokkos::fence()), after kernels, or
+you can set the environment variable CUDA_LAUNCH_BLOCKING=1.
+(ii) In multi-socket, multi-GPU machines, UVM defaults to using
+zero copy allocations for technical reasons related to using multiple
+GPUs from the same process. If an executable doesn't do that (e.g. each
+MPI rank of an application uses a single GPU [can be the same GPU for 
+multiple MPI ranks]) you can set CUDA_MANAGED_FORCE_DEVICE_ALLOC=1.
+This will enforce proper UVM allocations, but can lead to errors if 
+more than a single GPU is used by a single process.
+ 
diff --git a/lib/kokkos/algorithms/CMakeLists.txt b/lib/kokkos/algorithms/CMakeLists.txt
new file mode 100644
index 0000000000000000000000000000000000000000..7853184a5418a5a4d3247cc1b64190719c251635
--- /dev/null
+++ b/lib/kokkos/algorithms/CMakeLists.txt
@@ -0,0 +1,10 @@
+
+
+TRIBITS_SUBPACKAGE(Algorithms)  # opens the Algorithms subpackage; paired with the POSTPROCESS call at the end
+
+ADD_SUBDIRECTORY(src)  # the kokkosalgorithms library is defined in src/CMakeLists.txt
+
+TRIBITS_ADD_TEST_DIRECTORIES(unit_tests)
+#TRIBITS_ADD_TEST_DIRECTORIES(performance_tests)
+
+TRIBITS_SUBPACKAGE_POSTPROCESS()  # closes the subpackage opened by TRIBITS_SUBPACKAGE()
diff --git a/lib/kokkos/algorithms/cmake/Dependencies.cmake b/lib/kokkos/algorithms/cmake/Dependencies.cmake
new file mode 100644
index 0000000000000000000000000000000000000000..1d71d8af341181f689a6a8bf63036b67584cb138
--- /dev/null
+++ b/lib/kokkos/algorithms/cmake/Dependencies.cmake
@@ -0,0 +1,5 @@
+TRIBITS_PACKAGE_DEFINE_DEPENDENCIES(
+  LIB_REQUIRED_PACKAGES KokkosCore      # package the library always needs
+  LIB_OPTIONAL_TPLS Pthread CUDA HWLOC  # third-party libraries the library may use
+  TEST_OPTIONAL_TPLS CUSPARSE           # third-party library that only the tests may use
+  )
diff --git a/lib/kokkos/algorithms/cmake/KokkosAlgorithms_config.h.in b/lib/kokkos/algorithms/cmake/KokkosAlgorithms_config.h.in
new file mode 100644
index 0000000000000000000000000000000000000000..67334b70f36b6db55b225f25c91d8a8c4cb3aaab
--- /dev/null
+++ b/lib/kokkos/algorithms/cmake/KokkosAlgorithms_config.h.in
@@ -0,0 +1,4 @@
+#ifndef KOKKOS_ALGORITHMS_CONFIG_H
+#define KOKKOS_ALGORITHMS_CONFIG_H
+
+#endif
diff --git a/lib/kokkos/algorithms/src/CMakeLists.txt b/lib/kokkos/algorithms/src/CMakeLists.txt
new file mode 100644
index 0000000000000000000000000000000000000000..dfbf3323c2d51953a12d8e82371d9f971aaa1e13
--- /dev/null
+++ b/lib/kokkos/algorithms/src/CMakeLists.txt
@@ -0,0 +1,21 @@
+
+TRIBITS_CONFIGURE_FILE(${PACKAGE_NAME}_config.h)  # the header is picked up from CMAKE_CURRENT_BINARY_DIR below
+
+INCLUDE_DIRECTORIES(${CMAKE_CURRENT_BINARY_DIR})
+INCLUDE_DIRECTORIES(${CMAKE_CURRENT_SOURCE_DIR})
+
+# Headers and sources are globbed: re-run CMake after adding or removing files in this directory.
+
+FILE(GLOB HEADERS *.hpp)
+FILE(GLOB SOURCES *.cpp)
+LIST(APPEND HEADERS ${CMAKE_CURRENT_BINARY_DIR}/${PACKAGE_NAME}_config.h)
+
+# Define the kokkosalgorithms library from the lists collected above; DEPLIBS is left empty.
+
+TRIBITS_ADD_LIBRARY(
+    kokkosalgorithms
+    HEADERS ${HEADERS}
+    SOURCES ${SOURCES}
+    DEPLIBS
+    )
+
diff --git a/lib/kokkos/algorithms/src/KokkosAlgorithms_dummy.cpp b/lib/kokkos/algorithms/src/KokkosAlgorithms_dummy.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/lib/kokkos/algorithms/src/Kokkos_Random.hpp b/lib/kokkos/algorithms/src/Kokkos_Random.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..d7c06dc14be99bc63b8f0170843d81067577771e
--- /dev/null
+++ b/lib/kokkos/algorithms/src/Kokkos_Random.hpp
@@ -0,0 +1,1751 @@
+/*
+//@HEADER
+// ************************************************************************
+// 
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+// 
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+// 
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact  H. Carter Edwards (hcedwar@sandia.gov)
+// 
+// ************************************************************************
+//@HEADER
+*/
+
+#ifndef KOKKOS_RANDOM_HPP
+#define KOKKOS_RANDOM_HPP
+
+#include <Kokkos_Core.hpp>
+#include <Kokkos_Complex.hpp>
+#include <cstdio>
+#include <cstdlib>
+#include <cmath>
+
+/// \file Kokkos_Random.hpp
+/// \brief Pseudorandom number generators
+///
+/// These generators are based on Vigna, Sebastiano (2014). "An
+/// experimental exploration of Marsaglia's xorshift generators,
+/// scrambled."  See: http://arxiv.org/abs/1402.6246
+
+namespace Kokkos {
+
+  /*Template functions to get equidistributed random numbers from a generator for a specific Scalar type
+
+       template<class Generator, class Scalar>
+       struct rand{
+
+         //Max value returned by draw(Generator& gen)
+         KOKKOS_INLINE_FUNCTION
+         static Scalar max();
+
+         //Returns a value between zero and max()
+         KOKKOS_INLINE_FUNCTION
+         static Scalar draw(Generator& gen);
+
+         //Returns a value between zero and range()
+         //Note: for floating point values range can be larger than max()
+         KOKKOS_INLINE_FUNCTION
+         static Scalar draw(Generator& gen, const Scalar& range);
+
+         //Return value between start and end
+         KOKKOS_INLINE_FUNCTION
+         static Scalar draw(Generator& gen, const Scalar& start, const Scalar& end);
+      };
+
+    The random number generators themselves have two components: a state-pool and the actual generator.
+    A state-pool manages a number of generators, so that each active thread is able to grab its own.
+    This allows the generation of random numbers which are independent between threads. Note that,
+    in contrast to CuRand, none of the functions of the pool (or the generator) are collectives,
+    i.e. all functions can be called inside conditionals.
+
+    template<class Device>
+    class Pool {
+     public:
+      //The Kokkos device type
+      typedef Device device_type;
+      //The actual generator type
+      typedef Generator<Device> generator_type;
+
+      //Default constructor: does not initialize a pool
+      Pool();
+
+      //Initializing constructor: calls init(seed,Device_Specific_Number);
+      Pool(unsigned int seed);
+
+      //Initialize Pool with seed as a starting seed with a pool_size of num_states
+      //The Random_XorShift64 generator is used in serial to initialize all states,
+      //thus the initialization process is platform independent and deterministic.
+      void init(unsigned int seed, int num_states);
+
+      //Get a generator. This will lock one of the states, guaranteeing that each thread
+      //will have its private generator. Note: on Cuda getting a state involves atomics,
+      //and is thus not deterministic!
+      generator_type get_state();
+
+      //Give a state back to the pool. This unlocks the state, and writes the modified
+      //state of the generator back to the pool.
+      void free_state(generator_type gen);
+
+    }
+
+    template<class Device>
+    class Generator {
+     public:
+     //The Kokkos device type
+    typedef Device device_type;
+
+    //Max return values of respective [X]rand[S]() functions
+    enum {MAX_URAND = 0xffffffffU};
+    enum {MAX_URAND64 = 0xffffffffffffffffULL-1};
+    enum {MAX_RAND = static_cast<int>(0xffffffffU/2)};
+    enum {MAX_RAND64 = static_cast<int64_t>(0xffffffffffffffffULL/2-1)};
+
+
+    //Init with a state and the idx with respect to pool. Note: in serial the
+    //Generator can be used by just giving it the necessary state arguments
+    KOKKOS_INLINE_FUNCTION
+    Generator (STATE_ARGUMENTS, int state_idx = 0);
+
+    //Draw a equidistributed uint32_t in the range (0,MAX_URAND]
+    KOKKOS_INLINE_FUNCTION
+    uint32_t urand();
+
+    //Draw a equidistributed uint64_t in the range (0,MAX_URAND64]
+    KOKKOS_INLINE_FUNCTION
+    uint64_t urand64();
+
+    //Draw a equidistributed uint32_t in the range (0,range]
+    KOKKOS_INLINE_FUNCTION
+    uint32_t urand(const uint32_t& range);
+
+    //Draw a equidistributed uint32_t in the range (start,end]
+    KOKKOS_INLINE_FUNCTION
+    uint32_t urand(const uint32_t& start, const uint32_t& end );
+
+    //Draw a equidistributed uint64_t in the range (0,range]
+    KOKKOS_INLINE_FUNCTION
+    uint64_t urand64(const uint64_t& range);
+
+    //Draw a equidistributed uint64_t in the range (start,end]
+    KOKKOS_INLINE_FUNCTION
+    uint64_t urand64(const uint64_t& start, const uint64_t& end );
+
+    //Draw a equidistributed int in the range (0,MAX_RAND]
+    KOKKOS_INLINE_FUNCTION
+    int rand();
+
+    //Draw a equidistributed int in the range (0,range]
+    KOKKOS_INLINE_FUNCTION
+    int rand(const int& range);
+
+    //Draw a equidistributed int in the range (start,end]
+    KOKKOS_INLINE_FUNCTION
+    int rand(const int& start, const int& end );
+
+    //Draw a equidistributed int64_t in the range (0,MAX_RAND64]
+    KOKKOS_INLINE_FUNCTION
+    int64_t rand64();
+
+    //Draw a equidistributed int64_t in the range (0,range]
+    KOKKOS_INLINE_FUNCTION
+    int64_t rand64(const int64_t& range);
+
+    //Draw a equidistributed int64_t in the range (start,end]
+    KOKKOS_INLINE_FUNCTION
+    int64_t rand64(const int64_t& start, const int64_t& end );
+
+    //Draw a equidistributed float in the range (0,1.0]
+    KOKKOS_INLINE_FUNCTION
+    float frand();
+
+    //Draw a equidistributed float in the range (0,range]
+    KOKKOS_INLINE_FUNCTION
+    float frand(const float& range);
+
+    //Draw a equidistributed float in the range (start,end]
+    KOKKOS_INLINE_FUNCTION
+    float frand(const float& start, const float& end );
+
+    //Draw a equidistributed double in the range (0,1.0]
+    KOKKOS_INLINE_FUNCTION
+    double drand();
+
+    //Draw a equidistributed double in the range (0,range]
+    KOKKOS_INLINE_FUNCTION
+    double drand(const double& range);
+
+    //Draw a equidistributed double in the range (start,end]
+    KOKKOS_INLINE_FUNCTION
+    double drand(const double& start, const double& end );
+
+    //Draw a standard normal distributed double
+    KOKKOS_INLINE_FUNCTION
+    double normal() ;
+
+    //Draw a normal distributed double with given mean and standard deviation
+    KOKKOS_INLINE_FUNCTION
+    double normal(const double& mean, const double& std_dev=1.0);
+    }
+
+    //Additional Functions:
+
+    //Fills view with random numbers in the range (0,range]
+    template<class ViewType, class PoolType>
+    void fill_random(ViewType view, PoolType pool, ViewType::value_type range);
+
+    //Fills view with random numbers in the range (start,end]
+    template<class ViewType, class PoolType>
+    void fill_random(ViewType view, PoolType pool,
+                     ViewType::value_type start, ViewType::value_type end);
+
+*/
+
+  template<class Generator, class Scalar>
+  struct rand;
+
+
+  // Specialization for char, built on the generator's int rand().
+  // Note that every function returns short, not char.
+  template<class Generator>
+  struct rand<Generator,char> {
+    // Largest value that draw(gen) can return.
+    KOKKOS_INLINE_FUNCTION
+    static short max(){return 127;}
+    // Value in [0, max()]. The old (gen.rand()&0xff+256)%256 parsed as
+    // gen.rand()&(0xff+256) and could yield up to 255, i.e. above max().
+    KOKKOS_INLINE_FUNCTION
+    static short draw(Generator& gen) {return short(gen.rand()&0x7f);}
+    KOKKOS_INLINE_FUNCTION
+    static short draw(Generator& gen, const char& range) {return char(gen.rand(range));}
+    KOKKOS_INLINE_FUNCTION
+    static short draw(Generator& gen, const char& start, const char& end) {return char(gen.rand(start,end));}
+  };
+
+  // Specialization for short, built on the generator's int rand().
+  template<class Generator>
+  struct rand<Generator,short> {
+    // Largest value that draw(gen) can return.
+    KOKKOS_INLINE_FUNCTION
+    static short max() { return 32767; }
+    // Keeps the low 15 bits of gen.rand(), i.e. a value in [0, 32767].
+    KOKKOS_INLINE_FUNCTION
+    static short draw(Generator& gen) { return short(gen.rand() & 0x7fff); }
+    KOKKOS_INLINE_FUNCTION
+    static short draw(Generator& gen, const short& range) { return short(gen.rand(range)); }
+    KOKKOS_INLINE_FUNCTION
+    static short draw(Generator& gen, const short& start, const short& end)
+      { return short(gen.rand(start, end)); }
+  };
+
+  // Specialization for int: max() reports Generator::MAX_RAND and each
+  // draw() forwards to the matching rand() overload of the generator.
+  template<class Generator>
+  struct rand<Generator,int> {
+    KOKKOS_INLINE_FUNCTION
+    static int max () { return Generator::MAX_RAND; }
+    KOKKOS_INLINE_FUNCTION
+    static int draw (Generator& gen) { return gen.rand (); }
+    KOKKOS_INLINE_FUNCTION
+    static int draw (Generator& gen, const int& range) { return gen.rand (range); }
+    KOKKOS_INLINE_FUNCTION
+    static int draw (Generator& gen, const int& start, const int& end) {
+      return gen.rand (start, end);
+    }
+  };
+
+  template<class Generator>
+  struct rand<Generator,unsigned int> { // unsigned int draws, backed by the generator's urand() overloads
+    KOKKOS_INLINE_FUNCTION
+    static unsigned int max () {
+      return Generator::MAX_URAND; // upper bound reported for draw(gen)
+    }
+    KOKKOS_INLINE_FUNCTION
+    static unsigned int draw (Generator& gen) {
+      return gen.urand ();
+    }
+    KOKKOS_INLINE_FUNCTION
+    static unsigned int draw(Generator& gen, const unsigned int& range) {
+      return gen.urand (range); // range is forwarded unchanged
+    }
+    KOKKOS_INLINE_FUNCTION
+    static unsigned int
+    draw (Generator& gen, const unsigned int& start, const unsigned int& end) {
+      return gen.urand (start, end); // start and end are forwarded unchanged
+    }
+  };
+
+  template<class Generator>
+  struct rand<Generator,long> {
+    KOKKOS_INLINE_FUNCTION
+    static long max () {
+      // sizeof (long) picks the generator's 32-bit or 64-bit maximum.
+      // FIXME (mfh 26 Oct 2014) Better: select at compile time, e.g. with enable_if.
+      return sizeof (long) == 4 ?
+        static_cast<long> (Generator::MAX_RAND) :
+        static_cast<long> (Generator::MAX_RAND64);
+    }
+    KOKKOS_INLINE_FUNCTION
+    static long draw (Generator& gen) {
+      // sizeof (long) picks between the generator's rand() and rand64().
+      // FIXME (mfh 26 Oct 2014) Better: select at compile time, e.g. with enable_if.
+      return sizeof (long) == 4 ?
+        static_cast<long> (gen.rand ()) :
+        static_cast<long> (gen.rand64 ());
+    }
+    KOKKOS_INLINE_FUNCTION
+    static long draw (Generator& gen, const long& range) {
+      // With a 4-byte long, range is cast to int for rand(); otherwise rand64() is used.
+      // FIXME (mfh 26 Oct 2014) Better: select at compile time, e.g. with enable_if.
+      return sizeof (long) == 4 ?
+        static_cast<long> (gen.rand (static_cast<int> (range))) :
+        static_cast<long> (gen.rand64 (range));
+    }
+    KOKKOS_INLINE_FUNCTION
+    static long draw (Generator& gen, const long& start, const long& end) {
+      // With a 4-byte long, both bounds are cast to int for rand(); otherwise rand64() is used.
+      // FIXME (mfh 26 Oct 2014) Better: select at compile time, e.g. with enable_if.
+      return sizeof (long) == 4 ?
+        static_cast<long> (gen.rand (static_cast<int> (start),
+                                     static_cast<int> (end))) :
+        static_cast<long> (gen.rand64 (start, end));
+    }
+  };
+
+  template<class Generator>
+  struct rand<Generator,unsigned long> {
+    KOKKOS_INLINE_FUNCTION
+    static unsigned long max () {
+      // sizeof (unsigned long) picks the generator's 32-bit or 64-bit unsigned maximum.
+      // FIXME (mfh 26 Oct 2014) Better: select at compile time, e.g. with enable_if.
+      return sizeof (unsigned long) == 4 ?
+        static_cast<unsigned long> (Generator::MAX_URAND) :
+        static_cast<unsigned long> (Generator::MAX_URAND64);
+    }
+    KOKKOS_INLINE_FUNCTION
+    static unsigned long draw (Generator& gen) {
+      // sizeof (unsigned long) picks between the generator's urand() and urand64().
+      // FIXME (mfh 26 Oct 2014) Better: select at compile time, e.g. with enable_if.
+      return sizeof (unsigned long) == 4 ?
+        static_cast<unsigned long> (gen.urand ()) :
+        static_cast<unsigned long> (gen.urand64 ());
+    }
+    KOKKOS_INLINE_FUNCTION
+    static unsigned long draw(Generator& gen, const unsigned long& range) {
+      // With a 4-byte unsigned long, range is cast to unsigned int for urand(); otherwise urand64() is used.
+      // FIXME (mfh 26 Oct 2014) Better: select at compile time, e.g. with enable_if.
+      return sizeof (unsigned long) == 4 ?
+        static_cast<unsigned long> (gen.urand (static_cast<unsigned int> (range))) :
+        static_cast<unsigned long> (gen.urand64 (range));
+    }
+    KOKKOS_INLINE_FUNCTION
+    static unsigned long
+    draw (Generator& gen, const unsigned long& start, const unsigned long& end) {
+      // With a 4-byte unsigned long, both bounds are cast to unsigned int for urand(); otherwise urand64() is used.
+      // FIXME (mfh 26 Oct 2014) Better: select at compile time, e.g. with enable_if.
+      return sizeof (unsigned long) == 4 ?
+        static_cast<unsigned long> (gen.urand (static_cast<unsigned int> (start),
+                                               static_cast<unsigned int> (end))) :
+        static_cast<unsigned long> (gen.urand64 (start, end));
+    }
+  };
+
+  // NOTE (mfh 26 oct 2014) This is a partial specialization for long
+  // long, a C99 / C++11 signed type which is guaranteed to be at
+  // least 64 bits.  Do NOT write a partial specialization for
+  // int64_t!!!  This is just a typedef!  It could be either long or
+  // long long.  We don't know which a priori, and I've seen both.
+  // The types long and long long are guaranteed to differ, so it's
+  // always safe to specialize for both.
+  template<class Generator>
+  struct rand<Generator, long long> {
+    KOKKOS_INLINE_FUNCTION
+    static long long max () {
+      // FIXME (mfh 26 Oct 2014) It's legal for long long to be > 64 bits; this reports MAX_RAND64 regardless.
+      return Generator::MAX_RAND64;
+    }
+    KOKKOS_INLINE_FUNCTION
+    static long long draw (Generator& gen) {
+      // FIXME (mfh 26 Oct 2014) It's legal for long long to be > 64 bits; this draws via rand64() regardless.
+      return gen.rand64 ();
+    }
+    KOKKOS_INLINE_FUNCTION
+    static long long draw (Generator& gen, const long long& range) {
+      // FIXME (mfh 26 Oct 2014) It's legal for long long to be > 64 bits; range is handed to rand64() as is.
+      return gen.rand64 (range);
+    }
+    KOKKOS_INLINE_FUNCTION
+    static long long draw (Generator& gen, const long long& start, const long long& end) {
+      // FIXME (mfh 26 Oct 2014) It's legal for long long to be > 64 bits; bounds are handed to rand64() as is.
+      return gen.rand64 (start, end);
+    }
+  };
+
+  // NOTE (mfh 26 oct 2014) This is a partial specialization for
+  // unsigned long long, a C99 / C++11 unsigned type which is
+  // guaranteed to be at least 64 bits.  Do NOT write a partial
+  // specialization for uint64_t!!!  This is just a typedef!  It could
+  // be either unsigned long or unsigned long long.  We don't know
+  // which a priori, and I've seen both.  The types unsigned long and
+  // unsigned long long are guaranteed to differ, so it's always safe
+  // to specialize for both.
+  template<class Generator>
+  struct rand<Generator,unsigned long long> {
+    // Every member below forwards to the generator's unsigned 64-bit
+    // interface (MAX_URAND64 / urand64).
+    // FIXME (mfh 26 Oct 2014) It's legal for unsigned long long to be > 64 bits.
+
+    // Upper bound the generator reports for unsigned 64-bit draws.
+    KOKKOS_INLINE_FUNCTION
+    static unsigned long long max () {
+      const unsigned long long upper_bound = Generator::MAX_URAND64;
+      return upper_bound;
+    }
+
+    // Unbounded draw: the result of gen.urand64().
+    KOKKOS_INLINE_FUNCTION
+    static unsigned long long draw (Generator& gen) {
+      const unsigned long long value = gen.urand64 ();
+      return value;
+    }
+
+    // Draw limited by `range`: the result of gen.urand64(range).
+    KOKKOS_INLINE_FUNCTION
+    static unsigned long long draw (Generator& gen, const unsigned long long& range) {
+      const unsigned long long value = gen.urand64 (range);
+      return value;
+    }
+
+    // Draw between `start` and `end`: the result of gen.urand64(start, end).
+    KOKKOS_INLINE_FUNCTION
+    static unsigned long long
+    draw (Generator& gen, const unsigned long long& start, const unsigned long long& end) {
+      const unsigned long long value = gen.urand64 (start, end);
+      return value;
+    }
+  };
+
+  template<class Generator>
+  struct rand<Generator,float> {
+    // Single-precision draws; every member forwards to gen.frand().
+
+    // Upper bound of an unscaled draw.
+    KOKKOS_INLINE_FUNCTION
+    static float max() {
+      return 1.0f;
+    }
+
+    // Unscaled draw: the result of gen.frand().
+    KOKKOS_INLINE_FUNCTION
+    static float draw(Generator& gen) {
+      const float value = gen.frand();
+      return value;
+    }
+
+    // Draw scaled by `range`: the result of gen.frand(range).
+    KOKKOS_INLINE_FUNCTION
+    static float draw(Generator& gen, const float& range) {
+      const float value = gen.frand(range);
+      return value;
+    }
+
+    // Draw between `start` and `end`: the result of gen.frand(start, end).
+    KOKKOS_INLINE_FUNCTION
+    static float draw(Generator& gen, const float& start, const float& end) {
+      const float value = gen.frand(start,end);
+      return value;
+    }
+  };
+
+  template<class Generator>
+  struct rand<Generator,double> {
+    // Double-precision draws; every member forwards to gen.drand().
+
+    // Upper bound of an unscaled draw.
+    KOKKOS_INLINE_FUNCTION
+    static double max() {
+      return 1.0;
+    }
+
+    // Unscaled draw: the result of gen.drand().
+    KOKKOS_INLINE_FUNCTION
+    static double draw(Generator& gen) {
+      const double value = gen.drand();
+      return value;
+    }
+
+    // Draw scaled by `range`: the result of gen.drand(range).
+    KOKKOS_INLINE_FUNCTION
+    static double draw(Generator& gen, const double& range) {
+      const double value = gen.drand(range);
+      return value;
+    }
+
+    // Draw between `start` and `end`: the result of gen.drand(start, end).
+    KOKKOS_INLINE_FUNCTION
+    static double draw(Generator& gen, const double& start, const double& end) {
+      const double value = gen.drand(start,end);
+      return value;
+    }
+  };
+
+  template<class Generator>
+  struct rand<Generator, ::Kokkos::complex<float> > {
+    // Complex single-precision draws.  The real part is always drawn
+    // before the imaginary part, each with its own call to gen.frand().
+
+    // Upper bound of an unscaled draw: one in both components.
+    KOKKOS_INLINE_FUNCTION
+    static ::Kokkos::complex<float> max () {
+      return ::Kokkos::complex<float> (1.0, 1.0);
+    }
+
+    // Unscaled draw for both components.
+    KOKKOS_INLINE_FUNCTION
+    static ::Kokkos::complex<float> draw (Generator& gen) {
+      const float real_part = gen.frand ();
+      const float imag_part = gen.frand ();
+      return ::Kokkos::complex<float> (real_part, imag_part);
+    }
+
+    // Each component is scaled by the matching component of `range`.
+    KOKKOS_INLINE_FUNCTION
+    static ::Kokkos::complex<float> draw (Generator& gen, const ::Kokkos::complex<float>& range) {
+      const float real_part = gen.frand (real (range));
+      const float imag_part = gen.frand (imag (range));
+      return ::Kokkos::complex<float> (real_part, imag_part);
+    }
+
+    // Each component is drawn between the matching components of
+    // `start` and `end`.
+    KOKKOS_INLINE_FUNCTION
+    static ::Kokkos::complex<float> draw (Generator& gen, const ::Kokkos::complex<float>& start, const ::Kokkos::complex<float>& end) {
+      const float real_part = gen.frand (real (start), real (end));
+      const float imag_part = gen.frand (imag (start), imag (end));
+      return ::Kokkos::complex<float> (real_part, imag_part);
+    }
+  };
+
+  template<class Generator>
+  struct rand<Generator, ::Kokkos::complex<double> > {
+    // Complex double-precision draws.  The real part is always drawn
+    // before the imaginary part, each with its own call to gen.drand().
+
+    // Upper bound of an unscaled draw: one in both components.
+    KOKKOS_INLINE_FUNCTION
+    static ::Kokkos::complex<double> max () {
+      return ::Kokkos::complex<double> (1.0, 1.0);
+    }
+
+    // Unscaled draw for both components.
+    KOKKOS_INLINE_FUNCTION
+    static ::Kokkos::complex<double> draw (Generator& gen) {
+      const double real_part = gen.drand ();
+      const double imag_part = gen.drand ();
+      return ::Kokkos::complex<double> (real_part, imag_part);
+    }
+
+    // Each component is scaled by the matching component of `range`.
+    KOKKOS_INLINE_FUNCTION
+    static ::Kokkos::complex<double> draw (Generator& gen, const ::Kokkos::complex<double>& range) {
+      const double real_part = gen.drand (real (range));
+      const double imag_part = gen.drand (imag (range));
+      return ::Kokkos::complex<double> (real_part, imag_part);
+    }
+
+    // Each component is drawn between the matching components of
+    // `start` and `end`.
+    KOKKOS_INLINE_FUNCTION
+    static ::Kokkos::complex<double> draw (Generator& gen, const ::Kokkos::complex<double>& start, const ::Kokkos::complex<double>& end) {
+      const double real_part = gen.drand (real (start), real (end));
+      const double imag_part = gen.drand (imag (start), imag (end));
+      return ::Kokkos::complex<double> (real_part, imag_part);
+    }
+  };
+
+  template<class DeviceType>
+  class Random_XorShift64_Pool;
+
+  // Random number generator with a single 64-bit xorshift state.
+  //
+  // Instances are handed out by Random_XorShift64_Pool::get_state() and
+  // given back with free_state(), which stores the advanced state in
+  // the pool slot identified by state_idx_.
+  template<class DeviceType>
+  class Random_XorShift64 {
+  private:
+    // Current generator state; advanced by every raw draw.
+    uint64_t state_;
+    // Pool slot this generator was created from.
+    const int state_idx_;
+    friend class Random_XorShift64_Pool<DeviceType>;
+  public:
+
+    typedef DeviceType device_type;
+
+    // Upper bounds used by the bounded and floating-point draws below.
+    enum {MAX_URAND = 0xffffffffU};
+    enum {MAX_URAND64 = 0xffffffffffffffffULL-1};
+    enum {MAX_RAND = static_cast<int>(0xffffffff/2)};
+    enum {MAX_RAND64 = static_cast<int64_t>(0xffffffffffffffffLL/2-1)};
+
+    // Creates a generator from a raw state word and the index of the
+    // pool slot it belongs to.
+    KOKKOS_INLINE_FUNCTION
+    Random_XorShift64 (uint64_t state, int state_idx = 0)
+     : state_(state),state_idx_(state_idx){}
+
+    // Advances the state and returns bits 16..47 of the scrambled
+    // (multiplied) state as a 32-bit value.
+    KOKKOS_INLINE_FUNCTION
+    uint32_t urand() {
+      state_ ^= state_ >> 12;
+      state_ ^= state_ << 25;
+      state_ ^= state_ >> 27;
+
+      uint64_t tmp = state_ * 2685821657736338717ULL;
+      tmp = tmp>>16;
+      return static_cast<uint32_t>(tmp&MAX_URAND);
+    }
+
+    // Advances the state and returns the scrambled state minus one.
+    KOKKOS_INLINE_FUNCTION
+    uint64_t urand64() {
+      state_ ^= state_ >> 12;
+      state_ ^= state_ << 25;
+      state_ ^= state_ >> 27;
+      return (state_ * 2685821657736338717ULL) - 1;
+    }
+
+    // Returns a value in [0, range).  Draws at or above the largest
+    // multiple of `range` not exceeding MAX_URAND are rejected so that
+    // the modulo does not favour small results.  `range` must be nonzero.
+    KOKKOS_INLINE_FUNCTION
+    uint32_t urand(const uint32_t& range) {
+      const uint32_t max_val = (MAX_URAND/range)*range;
+      uint32_t tmp = urand();
+      while(tmp>=max_val)
+        tmp = urand();
+      return tmp%range;
+    }
+
+    // Returns a value in [start, end).
+    KOKKOS_INLINE_FUNCTION
+    uint32_t urand(const uint32_t& start, const uint32_t& end ) {
+      return urand(end-start)+start;
+    }
+
+    // 64-bit counterpart of urand(range).
+    KOKKOS_INLINE_FUNCTION
+    uint64_t urand64(const uint64_t& range) {
+      const uint64_t max_val = (MAX_URAND64/range)*range;
+      uint64_t tmp = urand64();
+      while(tmp>=max_val)
+        tmp = urand64();
+      return tmp%range;
+    }
+
+    // Returns a value in [start, end).
+    KOKKOS_INLINE_FUNCTION
+    uint64_t urand64(const uint64_t& start, const uint64_t& end ) {
+      return urand64(end-start)+start;
+    }
+
+    // Non-negative int: half of a urand() draw.
+    KOKKOS_INLINE_FUNCTION
+    int rand() {
+      return static_cast<int>(urand()/2);
+    }
+
+    // Returns a value in [0, range) using the same rejection scheme as
+    // urand(range).  `range` must be nonzero.
+    KOKKOS_INLINE_FUNCTION
+    int rand(const int& range) {
+      const int max_val = (MAX_RAND/range)*range;
+      int tmp = rand();
+      while(tmp>=max_val)
+        tmp = rand();
+      return tmp%range;
+    }
+
+    // Returns a value in [start, end).
+    KOKKOS_INLINE_FUNCTION
+    int rand(const int& start, const int& end ) {
+      return rand(end-start)+start;
+    }
+
+    // Non-negative int64_t: half of a urand64() draw.
+    KOKKOS_INLINE_FUNCTION
+    int64_t rand64() {
+      return static_cast<int64_t>(urand64()/2);
+    }
+
+    // 64-bit counterpart of rand(range).
+    KOKKOS_INLINE_FUNCTION
+    int64_t rand64(const int64_t& range) {
+      const int64_t max_val = (MAX_RAND64/range)*range;
+      int64_t tmp = rand64();
+      while(tmp>=max_val)
+        tmp = rand64();
+      return tmp%range;
+    }
+
+    // Returns a value in [start, end).
+    KOKKOS_INLINE_FUNCTION
+    int64_t rand64(const int64_t& start, const int64_t& end ) {
+      return rand64(end-start)+start;
+    }
+
+    // A urand64() draw divided by MAX_URAND64, in single precision.
+    KOKKOS_INLINE_FUNCTION
+    float frand() {
+      return 1.0f * urand64()/MAX_URAND64;
+    }
+
+    // frand() scaled by `range`.
+    KOKKOS_INLINE_FUNCTION
+    float frand(const float& range) {
+      return range * urand64()/MAX_URAND64;
+    }
+
+    // frand() scaled and shifted to lie between `start` and `end`.
+    KOKKOS_INLINE_FUNCTION
+    float frand(const float& start, const float& end ) {
+      return frand(end-start)+start;
+    }
+
+    // A urand64() draw divided by MAX_URAND64, in double precision.
+    KOKKOS_INLINE_FUNCTION
+    double drand() {
+      return 1.0 * urand64()/MAX_URAND64;
+    }
+
+    // drand() scaled by `range`.
+    KOKKOS_INLINE_FUNCTION
+    double drand(const double& range) {
+      return range * urand64()/MAX_URAND64;
+    }
+
+    // drand() scaled and shifted to lie between `start` and `end`.
+    KOKKOS_INLINE_FUNCTION
+    double drand(const double& start, const double& end ) {
+      return drand(end-start)+start;
+    }
+
+    //Marsaglia polar method for drawing a standard normal distributed random number
+    KOKKOS_INLINE_FUNCTION
+    double normal() {
+      double S = 2.0;
+      double U;
+      // The polar method needs a point (U,V) uniform on the square
+      // [-1,1]x[-1,1] that falls strictly inside the unit circle.
+      // Sampling only [0,1] would yield non-negative results exclusively.
+      // S == 0 is rejected as well because log(S)/S is undefined there.
+      while(S>=1.0 || S==0.0) {
+        U = 2.0*drand() - 1.0;
+        const double V = 2.0*drand() - 1.0;
+        S = U*U+V*V;
+      }
+      return U*sqrt(-2.0*log(S)/S);
+    }
+
+    // Normal deviate with the given mean and standard deviation.
+    KOKKOS_INLINE_FUNCTION
+    double normal(const double& mean, const double& std_dev=1.0) {
+      return mean + normal()*std_dev;
+    }
+
+  };
+
+  // Pool of Random_XorShift64 states, one 64-bit word per slot.
+  //
+  // A kernel obtains a generator with get_state(), draws from it, and
+  // returns it with free_state() so the advanced state is stored back.
+  // This generic implementation selects the slot by
+  // DeviceType::hardware_thread_id() and does not consult locks_.
+  template<class DeviceType = Kokkos::DefaultExecutionSpace>
+  class Random_XorShift64_Pool {
+  private:
+    typedef View<int*,DeviceType> lock_type;
+    typedef View<uint64_t*,DeviceType> state_data_type;
+    lock_type locks_;        // one int per slot, initialized to 0 by init()
+    state_data_type state_;  // one generator state word per slot
+    int num_states_;         // number of slots
+
+  public:
+    typedef Random_XorShift64<DeviceType> generator_type;
+    typedef DeviceType device_type;
+
+    // Creates an empty pool; call init() before use.
+    Random_XorShift64_Pool() {
+      num_states_ = 0;
+    }
+    // Creates a pool with DeviceType::max_hardware_threads() slots.
+    Random_XorShift64_Pool(uint64_t seed) {
+      num_states_ = 0;
+      init(seed,DeviceType::max_hardware_threads());
+    }
+
+    // Copies share the same lock and state views as `src`.
+    Random_XorShift64_Pool(const Random_XorShift64_Pool& src):
+      locks_(src.locks_),
+      state_(src.state_),
+      num_states_(src.num_states_)
+    {}
+
+    Random_XorShift64_Pool operator = (const Random_XorShift64_Pool& src) {
+      locks_ = src.locks_;
+      state_ = src.state_;
+      num_states_ = src.num_states_;
+      return *this;
+    }
+
+    // Allocates `num_states` slots and fills them from a host-side
+    // Random_XorShift64 seeded with `seed`.
+    void init(uint64_t seed, int num_states) {
+      // A xorshift state of zero never changes, so a zero seed would
+      // make every slot zero and every generator constant (bounded
+      // draws would then never terminate).  Substitute a fixed
+      // nonzero seed instead.
+      if(seed==0)
+        seed = uint64_t(1318319);
+
+      num_states_ = num_states;
+
+      locks_ = lock_type("Kokkos::Random_XorShift64::locks",num_states_);
+      state_ = state_data_type("Kokkos::Random_XorShift64::state",num_states_);
+
+      typename state_data_type::HostMirror h_state = create_mirror_view(state_);
+      typename lock_type::HostMirror h_lock = create_mirror_view(locks_);
+
+      // Execute on the HostMirror's default execution space.
+      Random_XorShift64<typename state_data_type::HostMirror::execution_space> gen(seed,0);
+      // Discard the first draws before using the seeding generator.
+      for(int i = 0; i < 17; i++)
+        gen.rand();
+      for(int i = 0; i < num_states_; i++) {
+        // Assemble each 64-bit state from the low 16 bits of four draws.
+        int n1 = gen.rand();
+        int n2 = gen.rand();
+        int n3 = gen.rand();
+        int n4 = gen.rand();
+        h_state(i) = (((static_cast<uint64_t>(n1)) & 0xffff)<<00) |
+                     (((static_cast<uint64_t>(n2)) & 0xffff)<<16) |
+                     (((static_cast<uint64_t>(n3)) & 0xffff)<<32) |
+                     (((static_cast<uint64_t>(n4)) & 0xffff)<<48);
+        h_lock(i) = 0;
+      }
+      deep_copy(state_,h_state);
+      deep_copy(locks_,h_lock);
+    }
+
+    // Returns a generator initialized from the slot of the calling
+    // hardware thread.
+    KOKKOS_INLINE_FUNCTION
+    Random_XorShift64<DeviceType> get_state() const {
+      const int i = DeviceType::hardware_thread_id();
+      return Random_XorShift64<DeviceType>(state_(i),i);
+    }
+
+    // Stores the generator's advanced state back into its slot.
+    KOKKOS_INLINE_FUNCTION
+    void free_state(const Random_XorShift64<DeviceType>& state) const {
+      state_(state.state_idx_) = state.state_;
+    }
+  };
+
+
+  template<class DeviceType>
+  class Random_XorShift1024_Pool;
+
+  // Random number generator with sixteen 64-bit xorshift state words.
+  //
+  // The constructor copies the sixteen words of one pool slot into a
+  // local array; Random_XorShift1024_Pool::free_state() writes them
+  // (and the position p_) back.
+  template<class DeviceType>
+  class Random_XorShift1024 {
+  private:
+    // Index (0..15) of the state word used by the next draw.
+    int p_;
+    // Pool slot this generator was created from.
+    const int state_idx_;
+    // Local copy of the slot's sixteen state words.
+    uint64_t state_[16];
+    friend class Random_XorShift1024_Pool<DeviceType>;
+  public:
+
+    typedef Random_XorShift1024_Pool<DeviceType> pool_type;
+    typedef DeviceType device_type;
+
+    // Upper bounds used by the bounded and floating-point draws below.
+    enum {MAX_URAND = 0xffffffffU};
+    enum {MAX_URAND64 = 0xffffffffffffffffULL-1};
+    enum {MAX_RAND = static_cast<int>(0xffffffffU/2)};
+    enum {MAX_RAND64 = static_cast<int64_t>(0xffffffffffffffffULL/2-1)};
+
+    // Copies row `state_idx` of the pool's state view and starts at
+    // position `p`.
+    KOKKOS_INLINE_FUNCTION
+    Random_XorShift1024 (const typename pool_type::state_data_type& state, int p, int state_idx = 0):
+      p_(p),state_idx_(state_idx){
+      for(int i=0 ; i<16; i++)
+        state_[i] = state(state_idx,i);
+    }
+
+    // Advances the state and returns bits 16..47 of the scrambled
+    // (multiplied) new state word as a 32-bit value.
+    KOKKOS_INLINE_FUNCTION
+    uint32_t urand() {
+      uint64_t state_0 = state_[ p_ ];
+      uint64_t state_1 = state_[ p_ = ( p_ + 1 ) & 15 ];
+      state_1 ^= state_1 << 31;
+      state_1 ^= state_1 >> 11;
+      state_0 ^= state_0 >> 30;
+      uint64_t tmp = ( state_[ p_ ] = state_0 ^ state_1 ) * 1181783497276652981ULL;
+      tmp = tmp>>16;
+      return static_cast<uint32_t>(tmp&MAX_URAND);
+    }
+
+    // Advances the state and returns the scrambled new state word
+    // minus one.
+    KOKKOS_INLINE_FUNCTION
+    uint64_t urand64() {
+      uint64_t state_0 = state_[ p_ ];
+      uint64_t state_1 = state_[ p_ = ( p_ + 1 ) & 15 ];
+      state_1 ^= state_1 << 31;
+      state_1 ^= state_1 >> 11;
+      state_0 ^= state_0 >> 30;
+      // Unsigned multiplier, matching urand(); the product is computed
+      // modulo 2^64 either way.
+      return (( state_[ p_ ] = state_0 ^ state_1 ) * 1181783497276652981ULL) - 1;
+    }
+
+    // Returns a value in [0, range).  Draws at or above the largest
+    // multiple of `range` not exceeding MAX_URAND are rejected so that
+    // the modulo does not favour small results.  `range` must be nonzero.
+    KOKKOS_INLINE_FUNCTION
+    uint32_t urand(const uint32_t& range) {
+      const uint32_t max_val = (MAX_URAND/range)*range;
+      uint32_t tmp = urand();
+      while(tmp>=max_val)
+        tmp = urand();
+      return tmp%range;
+    }
+
+    // Returns a value in [start, end).
+    KOKKOS_INLINE_FUNCTION
+    uint32_t urand(const uint32_t& start, const uint32_t& end ) {
+      return urand(end-start)+start;
+    }
+
+    // 64-bit counterpart of urand(range).
+    KOKKOS_INLINE_FUNCTION
+    uint64_t urand64(const uint64_t& range) {
+      const uint64_t max_val = (MAX_URAND64/range)*range;
+      uint64_t tmp = urand64();
+      while(tmp>=max_val)
+        tmp = urand64();
+      return tmp%range;
+    }
+
+    // Returns a value in [start, end).
+    KOKKOS_INLINE_FUNCTION
+    uint64_t urand64(const uint64_t& start, const uint64_t& end ) {
+      return urand64(end-start)+start;
+    }
+
+    // Non-negative int: half of a urand() draw.
+    KOKKOS_INLINE_FUNCTION
+    int rand() {
+      return static_cast<int>(urand()/2);
+    }
+
+    // Returns a value in [0, range) using the same rejection scheme as
+    // urand(range).  `range` must be nonzero.
+    KOKKOS_INLINE_FUNCTION
+    int rand(const int& range) {
+      const int max_val = (MAX_RAND/range)*range;
+      int tmp = rand();
+      while(tmp>=max_val)
+        tmp = rand();
+      return tmp%range;
+    }
+
+    // Returns a value in [start, end).
+    KOKKOS_INLINE_FUNCTION
+    int rand(const int& start, const int& end ) {
+      return rand(end-start)+start;
+    }
+
+    // Non-negative int64_t: half of a urand64() draw.
+    KOKKOS_INLINE_FUNCTION
+    int64_t rand64() {
+      return static_cast<int64_t>(urand64()/2);
+    }
+
+    // 64-bit counterpart of rand(range).
+    KOKKOS_INLINE_FUNCTION
+    int64_t rand64(const int64_t& range) {
+      const int64_t max_val = (MAX_RAND64/range)*range;
+      int64_t tmp = rand64();
+      while(tmp>=max_val)
+        tmp = rand64();
+      return tmp%range;
+    }
+
+    // Returns a value in [start, end).
+    KOKKOS_INLINE_FUNCTION
+    int64_t rand64(const int64_t& start, const int64_t& end ) {
+      return rand64(end-start)+start;
+    }
+
+    // A urand64() draw divided by MAX_URAND64, in single precision.
+    KOKKOS_INLINE_FUNCTION
+    float frand() {
+      return 1.0f * urand64()/MAX_URAND64;
+    }
+
+    // frand() scaled by `range`.
+    KOKKOS_INLINE_FUNCTION
+    float frand(const float& range) {
+      return range * urand64()/MAX_URAND64;
+    }
+
+    // frand() scaled and shifted to lie between `start` and `end`.
+    KOKKOS_INLINE_FUNCTION
+    float frand(const float& start, const float& end ) {
+      return frand(end-start)+start;
+    }
+
+    // A urand64() draw divided by MAX_URAND64, in double precision.
+    KOKKOS_INLINE_FUNCTION
+    double drand() {
+      return 1.0 * urand64()/MAX_URAND64;
+    }
+
+    // drand() scaled by `range`.
+    KOKKOS_INLINE_FUNCTION
+    double drand(const double& range) {
+      return range * urand64()/MAX_URAND64;
+    }
+
+    // drand() scaled and shifted to lie between `start` and `end`.
+    // Uses the double-precision draw; going through frand() would
+    // round both the width and the sample to float.
+    KOKKOS_INLINE_FUNCTION
+    double drand(const double& start, const double& end ) {
+      return drand(end-start)+start;
+    }
+
+    //Marsaglia polar method for drawing a standard normal distributed random number
+    KOKKOS_INLINE_FUNCTION
+    double normal() {
+      double S = 2.0;
+      double U;
+      // The polar method needs a point (U,V) uniform on the square
+      // [-1,1]x[-1,1] that falls strictly inside the unit circle.
+      // Sampling only [0,1] would yield non-negative results exclusively.
+      // S == 0 is rejected as well because log(S)/S is undefined there.
+      while(S>=1.0 || S==0.0) {
+        U = 2.0*drand() - 1.0;
+        const double V = 2.0*drand() - 1.0;
+        S = U*U+V*V;
+      }
+      return U*sqrt(-2.0*log(S)/S);
+    }
+
+    // Normal deviate with the given mean and standard deviation.
+    KOKKOS_INLINE_FUNCTION
+    double normal(const double& mean, const double& std_dev=1.0) {
+      return mean + normal()*std_dev;
+    }
+  };
+
+
+  // Pool of Random_XorShift1024 states: sixteen 64-bit words plus a
+  // position index per slot.
+  //
+  // A kernel obtains a generator with get_state(), draws from it, and
+  // returns it with free_state() so the advanced state is stored back.
+  // This generic implementation selects the slot by
+  // DeviceType::hardware_thread_id() and does not consult locks_.
+  template<class DeviceType = Kokkos::DefaultExecutionSpace>
+  class Random_XorShift1024_Pool {
+  private:
+    typedef View<int*,DeviceType> int_view_type;
+    typedef View<uint64_t*[16],DeviceType> state_data_type;
+
+    int_view_type locks_;    // one int per slot, initialized to 0 by init()
+    state_data_type state_;  // sixteen state words per slot
+    int_view_type p_;        // per-slot position within the sixteen words
+    int num_states_;         // number of slots
+    friend class Random_XorShift1024<DeviceType>;
+
+  public:
+    typedef Random_XorShift1024<DeviceType> generator_type;
+
+    typedef DeviceType device_type;
+
+    // Creates an empty pool; call init() before use.
+    Random_XorShift1024_Pool() {
+      num_states_ = 0;
+    }
+
+    // Creates a pool with DeviceType::max_hardware_threads() slots.
+    inline
+    Random_XorShift1024_Pool(uint64_t seed){
+      num_states_ = 0;
+      init(seed,DeviceType::max_hardware_threads());
+    }
+
+    // Copies share the same lock, state and position views as `src`.
+    Random_XorShift1024_Pool(const Random_XorShift1024_Pool& src):
+      locks_(src.locks_),
+      state_(src.state_),
+      p_(src.p_),
+      num_states_(src.num_states_)
+    {}
+
+    Random_XorShift1024_Pool operator = (const Random_XorShift1024_Pool& src) {
+      locks_ = src.locks_;
+      state_ = src.state_;
+      p_ = src.p_;
+      num_states_ = src.num_states_;
+      return *this;
+    }
+
+    // Allocates `num_states` slots and fills them from a host-side
+    // Random_XorShift64 seeded with `seed`.
+    inline
+    void init(uint64_t seed, int num_states) {
+      // A xorshift state of zero never changes, so a zero seed would
+      // make every state word zero and every generator constant.
+      // Substitute a fixed nonzero seed instead.
+      if(seed==0)
+        seed = uint64_t(1318319);
+
+      num_states_ = num_states;
+
+      locks_ = int_view_type("Kokkos::Random_XorShift1024::locks",num_states_);
+      state_ = state_data_type("Kokkos::Random_XorShift1024::state",num_states_);
+      p_ = int_view_type("Kokkos::Random_XorShift1024::p",num_states_);
+
+      typename state_data_type::HostMirror h_state = create_mirror_view(state_);
+      typename int_view_type::HostMirror h_lock = create_mirror_view(locks_);
+      typename int_view_type::HostMirror h_p = create_mirror_view(p_);
+
+      // Execute on the HostMirror's default execution space.
+      Random_XorShift64<typename state_data_type::HostMirror::execution_space> gen(seed,0);
+      // Discard the first draws before using the seeding generator.
+      for(int i = 0; i < 17; i++)
+        gen.rand();
+      for(int i = 0; i < num_states_; i++) {
+        for(int j = 0; j < 16 ; j++) {
+          // Assemble each 64-bit word from the low 16 bits of four draws.
+          int n1 = gen.rand();
+          int n2 = gen.rand();
+          int n3 = gen.rand();
+          int n4 = gen.rand();
+          h_state(i,j) = (((static_cast<uint64_t>(n1)) & 0xffff)<<00) |
+                         (((static_cast<uint64_t>(n2)) & 0xffff)<<16) |
+                         (((static_cast<uint64_t>(n3)) & 0xffff)<<32) |
+                         (((static_cast<uint64_t>(n4)) & 0xffff)<<48);
+        }
+        h_p(i) = 0;
+        h_lock(i) = 0;
+      }
+      deep_copy(state_,h_state);
+      deep_copy(locks_,h_lock);
+      // Copy the host-initialized positions too, so p_ does not depend
+      // on how the device view happened to be initialized on allocation.
+      deep_copy(p_,h_p);
+    }
+
+    // Returns a generator initialized from the slot of the calling
+    // hardware thread.
+    KOKKOS_INLINE_FUNCTION
+    Random_XorShift1024<DeviceType> get_state() const {
+      const int i = DeviceType::hardware_thread_id();
+      return Random_XorShift1024<DeviceType>(state_,p_(i),i);
+    }
+
+    // Stores the generator's sixteen state words and its position back
+    // into its slot.
+    KOKKOS_INLINE_FUNCTION
+    void free_state(const Random_XorShift1024<DeviceType>& state) const {
+      for(int i = 0; i<16; i++)
+        state_(state.state_idx_,i) = state.state_[i];
+      p_(state.state_idx_) = state.p_;
+    }
+  };
+
+#if defined(KOKKOS_HAVE_CUDA) && defined(__CUDACC__)
+
+  // CUDA specialization of Random_XorShift1024.
+  //
+  // Unlike the generic version, this class does not copy the sixteen
+  // state words: state_ points at element (state_idx, 0) of the pool's
+  // state view and word k is addressed as state_[k * stride_], so
+  // every draw updates the pool's storage in place.
+  template<>
+  class Random_XorShift1024<Kokkos::Cuda> {
+  private:
+    // Index (0..15) of the state word used by the next draw.
+    int p_;
+    // Pool slot this generator was created from.
+    const int state_idx_;
+    // Address of word 0 of this slot inside the pool's state view.
+    uint64_t* state_;
+    // Distance, in elements, between consecutive words of one slot
+    // (the view's stride_1()).
+    const int stride_;
+    friend class Random_XorShift1024_Pool<Kokkos::Cuda>;
+  public:
+
+    typedef Kokkos::Cuda device_type;
+    typedef Random_XorShift1024_Pool<device_type> pool_type;
+
+    // Upper bounds used by the bounded and floating-point draws below.
+    enum {MAX_URAND = 0xffffffffU};
+    enum {MAX_URAND64 = 0xffffffffffffffffULL-1};
+    enum {MAX_RAND = static_cast<int>(0xffffffffU/2)};
+    enum {MAX_RAND64 = static_cast<int64_t>(0xffffffffffffffffULL/2-1)};
+
+    // Binds the generator to row `state_idx` of the pool's state view
+    // and starts at position `p`.
+    KOKKOS_INLINE_FUNCTION
+    Random_XorShift1024 (const typename pool_type::state_data_type& state, int p, int state_idx = 0):
+      p_(p),state_idx_(state_idx),state_(&state(state_idx,0)),stride_(state.stride_1()){
+    }
+
+    // Advances the state and returns bits 16..47 of the scrambled
+    // (multiplied) new state word as a 32-bit value.
+    KOKKOS_INLINE_FUNCTION
+    uint32_t urand() {
+      uint64_t state_0 = state_[ p_ * stride_ ];
+      uint64_t state_1 = state_[ (p_ = ( p_ + 1 ) & 15) * stride_ ];
+      state_1 ^= state_1 << 31;
+      state_1 ^= state_1 >> 11;
+      state_0 ^= state_0 >> 30;
+      uint64_t tmp = ( state_[ p_ * stride_ ] = state_0 ^ state_1 ) * 1181783497276652981ULL;
+      tmp = tmp>>16;
+      return static_cast<uint32_t>(tmp&MAX_URAND);
+    }
+
+    // Advances the state and returns the scrambled new state word
+    // minus one.
+    KOKKOS_INLINE_FUNCTION
+    uint64_t urand64() {
+      uint64_t state_0 = state_[ p_ * stride_ ];
+      uint64_t state_1 = state_[ (p_ = ( p_ + 1 ) & 15) * stride_ ];
+      state_1 ^= state_1 << 31;
+      state_1 ^= state_1 >> 11;
+      state_0 ^= state_0 >> 30;
+      // Unsigned multiplier, matching urand(); the product is computed
+      // modulo 2^64 either way.
+      return (( state_[ p_ * stride_ ] = state_0 ^ state_1 ) * 1181783497276652981ULL) - 1;
+    }
+
+    // Returns a value in [0, range).  Draws at or above the largest
+    // multiple of `range` not exceeding MAX_URAND are rejected so that
+    // the modulo does not favour small results.  `range` must be nonzero.
+    KOKKOS_INLINE_FUNCTION
+    uint32_t urand(const uint32_t& range) {
+      const uint32_t max_val = (MAX_URAND/range)*range;
+      uint32_t tmp = urand();
+      // The redraw must be stored in tmp; otherwise a single rejected
+      // draw would make this loop spin forever.
+      while(tmp>=max_val)
+        tmp = urand();
+      return tmp%range;
+    }
+
+    // Returns a value in [start, end).
+    KOKKOS_INLINE_FUNCTION
+    uint32_t urand(const uint32_t& start, const uint32_t& end ) {
+      return urand(end-start)+start;
+    }
+
+    // 64-bit counterpart of urand(range).
+    KOKKOS_INLINE_FUNCTION
+    uint64_t urand64(const uint64_t& range) {
+      const uint64_t max_val = (MAX_URAND64/range)*range;
+      uint64_t tmp = urand64();
+      while(tmp>=max_val)
+        tmp = urand64();
+      return tmp%range;
+    }
+
+    // Returns a value in [start, end).
+    KOKKOS_INLINE_FUNCTION
+    uint64_t urand64(const uint64_t& start, const uint64_t& end ) {
+      return urand64(end-start)+start;
+    }
+
+    // Non-negative int: half of a urand() draw.
+    KOKKOS_INLINE_FUNCTION
+    int rand() {
+      return static_cast<int>(urand()/2);
+    }
+
+    // Returns a value in [0, range) using the same rejection scheme as
+    // urand(range).  `range` must be nonzero.
+    KOKKOS_INLINE_FUNCTION
+    int rand(const int& range) {
+      const int max_val = (MAX_RAND/range)*range;
+      int tmp = rand();
+      while(tmp>=max_val)
+        tmp = rand();
+      return tmp%range;
+    }
+
+    // Returns a value in [start, end).
+    KOKKOS_INLINE_FUNCTION
+    int rand(const int& start, const int& end ) {
+      return rand(end-start)+start;
+    }
+
+    // Non-negative int64_t: half of a urand64() draw.
+    KOKKOS_INLINE_FUNCTION
+    int64_t rand64() {
+      return static_cast<int64_t>(urand64()/2);
+    }
+
+    // 64-bit counterpart of rand(range).
+    KOKKOS_INLINE_FUNCTION
+    int64_t rand64(const int64_t& range) {
+      const int64_t max_val = (MAX_RAND64/range)*range;
+      int64_t tmp = rand64();
+      while(tmp>=max_val)
+        tmp = rand64();
+      return tmp%range;
+    }
+
+    // Returns a value in [start, end).
+    KOKKOS_INLINE_FUNCTION
+    int64_t rand64(const int64_t& start, const int64_t& end ) {
+      return rand64(end-start)+start;
+    }
+
+    // A urand64() draw divided by MAX_URAND64, in single precision.
+    KOKKOS_INLINE_FUNCTION
+    float frand() {
+      return 1.0f * urand64()/MAX_URAND64;
+    }
+
+    // frand() scaled by `range`.
+    KOKKOS_INLINE_FUNCTION
+    float frand(const float& range) {
+      return range * urand64()/MAX_URAND64;
+    }
+
+    // frand() scaled and shifted to lie between `start` and `end`.
+    KOKKOS_INLINE_FUNCTION
+    float frand(const float& start, const float& end ) {
+      return frand(end-start)+start;
+    }
+
+    // A urand64() draw divided by MAX_URAND64, in double precision.
+    KOKKOS_INLINE_FUNCTION
+    double drand() {
+      return 1.0 * urand64()/MAX_URAND64;
+    }
+
+    // drand() scaled by `range`.
+    KOKKOS_INLINE_FUNCTION
+    double drand(const double& range) {
+      return range * urand64()/MAX_URAND64;
+    }
+
+    // drand() scaled and shifted to lie between `start` and `end`.
+    // Uses the double-precision draw; going through frand() would
+    // round both the width and the sample to float.
+    KOKKOS_INLINE_FUNCTION
+    double drand(const double& start, const double& end ) {
+      return drand(end-start)+start;
+    }
+
+    //Marsaglia polar method for drawing a standard normal distributed random number
+    KOKKOS_INLINE_FUNCTION
+    double normal() {
+      double S = 2.0;
+      double U;
+      // The polar method needs a point (U,V) uniform on the square
+      // [-1,1]x[-1,1] that falls strictly inside the unit circle.
+      // Sampling only [0,1] would yield non-negative results exclusively.
+      // S == 0 is rejected as well because log(S)/S is undefined there.
+      while(S>=1.0 || S==0.0) {
+        U = 2.0*drand() - 1.0;
+        const double V = 2.0*drand() - 1.0;
+        S = U*U+V*V;
+      }
+      return U*sqrt(-2.0*log(S)/S);
+    }
+
+    // Normal deviate with the given mean and standard deviation.
+    KOKKOS_INLINE_FUNCTION
+    double normal(const double& mean, const double& std_dev=1.0) {
+      return mean + normal()*std_dev;
+    }
+  };
+
+// CUDA constructor: sizes the pool with a fixed number of slots
+// (4*32768) instead of querying the device for a thread count.
+template<>
+inline
+Random_XorShift64_Pool<Kokkos::Cuda>::Random_XorShift64_Pool(uint64_t seed) {
+  const int cuda_slot_count = 4*32768;
+  num_states_ = 0;
+  init(seed,cuda_slot_count);
+}
+
+// CUDA get_state: claims a pool slot for the calling thread.
+//
+// In device code the first candidate slot is the thread's linear index
+// in the grid modulo num_states_.  While the compare-exchange on the
+// slot's lock yields a nonzero value (presumably the previous lock
+// value, i.e. the slot is already taken -- confirm against the Kokkos
+// atomics documentation), the candidate advances by one block's worth
+// of threads, wrapping back to the thread's index within its block.
+// In host code slot 0 is returned without locking.
+template<>
+KOKKOS_INLINE_FUNCTION
+Random_XorShift64<Kokkos::Cuda> Random_XorShift64_Pool<Kokkos::Cuda>::get_state() const {
+#ifdef __CUDA_ARCH__
+  // Kept unsigned so the arithmetic below matches the builtin index types.
+  const unsigned int threads_per_block = blockDim.x*blockDim.y*blockDim.z;
+  const unsigned int block_in_grid =
+    (blockIdx.x*gridDim.y + blockIdx.y)*gridDim.z + blockIdx.z;
+  const int thread_in_block =
+    (threadIdx.x*blockDim.y + threadIdx.y)*blockDim.z + threadIdx.z;
+
+  int slot = (block_in_grid*threads_per_block + thread_in_block)%num_states_;
+  while(Kokkos::atomic_compare_exchange(&locks_(slot),0,1)) {
+    slot += threads_per_block;
+    if(slot>=num_states_) { slot = thread_in_block; }
+  }
+
+  return Random_XorShift64<Kokkos::Cuda>(state_(slot),slot);
+#else
+  return Random_XorShift64<Kokkos::Cuda>(state_(0),0);
+#endif
+}
+
+// CUDA free_state: in device code, stores the generator's state back
+// into its slot and then resets the slot's lock to 0.  In host code
+// this is a no-op.
+template<>
+KOKKOS_INLINE_FUNCTION
+void Random_XorShift64_Pool<Kokkos::Cuda>::free_state(const Random_XorShift64<Kokkos::Cuda> &state) const {
+#ifdef __CUDA_ARCH__
+  const int slot = state.state_idx_;
+  state_(slot) = state.state_;
+  locks_(slot) = 0;
+#endif
+}
+
+
+// CUDA constructor: sizes the pool with a fixed number of slots
+// (4*32768) instead of querying the device for a thread count.
+template<>
+inline
+Random_XorShift1024_Pool<Kokkos::Cuda>::Random_XorShift1024_Pool(uint64_t seed) {
+  const int cuda_slot_count = 4*32768;
+  num_states_ = 0;
+  init(seed,cuda_slot_count);
+}
+
+// CUDA get_state: claims a pool slot for the calling thread.
+//
+// In device code the first candidate slot is the thread's linear index
+// in the grid modulo num_states_.  While the compare-exchange on the
+// slot's lock yields a nonzero value (presumably the previous lock
+// value, i.e. the slot is already taken -- confirm against the Kokkos
+// atomics documentation), the candidate advances by one block's worth
+// of threads, wrapping back to the thread's index within its block.
+// In host code slot 0 is returned without locking.
+template<>
+KOKKOS_INLINE_FUNCTION
+Random_XorShift1024<Kokkos::Cuda> Random_XorShift1024_Pool<Kokkos::Cuda>::get_state() const {
+#ifdef __CUDA_ARCH__
+  // Kept unsigned so the arithmetic below matches the builtin index types.
+  const unsigned int threads_per_block = blockDim.x*blockDim.y*blockDim.z;
+  const unsigned int block_in_grid =
+    (blockIdx.x*gridDim.y + blockIdx.y)*gridDim.z + blockIdx.z;
+  const int thread_in_block =
+    (threadIdx.x*blockDim.y + threadIdx.y)*blockDim.z + threadIdx.z;
+
+  int slot = (block_in_grid*threads_per_block + thread_in_block)%num_states_;
+  while(Kokkos::atomic_compare_exchange(&locks_(slot),0,1)) {
+    slot += threads_per_block;
+    if(slot>=num_states_) { slot = thread_in_block; }
+  }
+
+  return Random_XorShift1024<Kokkos::Cuda>(state_, p_(slot), slot);
+#else
+  return Random_XorShift1024<Kokkos::Cuda>(state_, p_(0), 0);
+#endif
+}
+
+// CUDA free_state: in device code, records the generator's position
+// and resets the slot's lock to 0.  In host code this is a no-op.
+//
+// The CUDA generator holds a pointer into this pool's state view and
+// addresses word k as state_[k * stride_], so the sixteen state words
+// are already up to date in the pool.  They must not be copied from
+// state.state_[i] without the stride: for a stride other than 1 that
+// reads unrelated elements and would overwrite this slot with them.
+// Only the position p_, which the generator keeps by value, has to be
+// written back (as the generic pool does).
+template<>
+KOKKOS_INLINE_FUNCTION
+void Random_XorShift1024_Pool<Kokkos::Cuda>::free_state(const Random_XorShift1024<Kokkos::Cuda> &state) const {
+#ifdef __CUDA_ARCH__
+  p_(state.state_idx_) = state.p_;
+  locks_(state.state_idx_) = 0;
+  return;
+#endif
+}
+
+
+#endif
+
+
+namespace Impl {
+
+template<class ViewType, class RandomPool, int loops, int rank, class IndexType>
+struct fill_random_functor_range;
+template<class ViewType, class RandomPool, int loops, int rank, class IndexType>
+struct fill_random_functor_begin_end;
+
+// Functor that fills a rank-1 view with draws limited by `range`.
+// Work item i handles the `loops` consecutive indices starting at
+// i*loops; indices beyond dimension 0 are skipped.
+template<class ViewType, class RandomPool, int loops, class IndexType>
+struct fill_random_functor_range<ViewType,RandomPool,loops,1,IndexType>{
+  typedef typename ViewType::execution_space execution_space;
+  ViewType a;
+  RandomPool rand_pool;
+  typename ViewType::const_value_type range;
+
+  typedef rand<typename RandomPool::generator_type, typename ViewType::non_const_value_type> Rand;
+
+  fill_random_functor_range(ViewType a_, RandomPool rand_pool_,
+      typename ViewType::const_value_type range_):
+    a(a_),rand_pool(rand_pool_),range(range_) {}
+
+  KOKKOS_INLINE_FUNCTION
+  void operator() (const IndexType& i) const {
+    // One generator is acquired per work item and returned at the end.
+    typename RandomPool::generator_type gen = rand_pool.get_state();
+    const IndexType extent0 = static_cast<IndexType>(a.dimension_0());
+    for(IndexType j=0;j<loops;j++) {
+      const IndexType row = i*loops+j;
+      if(row>=extent0) continue;
+      a(row) = Rand::draw(gen,range);
+    }
+    rand_pool.free_state(gen);
+  }
+};
+
+// Functor that fills a rank-2 view with draws limited by `range`.
+// Work item i handles the `loops` consecutive dimension-0 indices
+// starting at i*loops; indices beyond dimension 0 are skipped.
+template<class ViewType, class RandomPool, int loops, class IndexType>
+struct fill_random_functor_range<ViewType,RandomPool,loops,2,IndexType>{
+  typedef typename ViewType::execution_space execution_space;
+  ViewType a;
+  RandomPool rand_pool;
+  typename ViewType::const_value_type range;
+
+  typedef rand<typename RandomPool::generator_type, typename ViewType::non_const_value_type> Rand;
+
+  fill_random_functor_range(ViewType a_, RandomPool rand_pool_,
+      typename ViewType::const_value_type range_):
+    a(a_),rand_pool(rand_pool_),range(range_) {}
+
+  KOKKOS_INLINE_FUNCTION
+  void operator() (IndexType i) const {
+    // One generator is acquired per work item and returned at the end.
+    typename RandomPool::generator_type gen = rand_pool.get_state();
+    const IndexType extent0 = static_cast<IndexType>(a.dimension_0());
+    const IndexType extent1 = static_cast<IndexType>(a.dimension_1());
+    for(IndexType j=0;j<loops;j++) {
+      const IndexType row = i*loops+j;
+      if(row>=extent0) continue;
+      for(IndexType k=0;k<extent1;k++) {
+        a(row,k) = Rand::draw(gen,range);
+      }
+    }
+    rand_pool.free_state(gen);
+  }
+};
+
+
+// Functor that fills a rank-3 view with draws limited by `range`.
+// Work item i handles the `loops` consecutive dimension-0 indices
+// starting at i*loops; indices beyond dimension 0 are skipped.
+template<class ViewType, class RandomPool, int loops, class IndexType>
+struct fill_random_functor_range<ViewType,RandomPool,loops,3,IndexType>{
+  typedef typename ViewType::execution_space execution_space;
+  ViewType a;
+  RandomPool rand_pool;
+  typename ViewType::const_value_type range;
+
+  typedef rand<typename RandomPool::generator_type, typename ViewType::non_const_value_type> Rand;
+
+  fill_random_functor_range(ViewType a_, RandomPool rand_pool_,
+      typename ViewType::const_value_type range_):
+    a(a_),rand_pool(rand_pool_),range(range_) {}
+
+  KOKKOS_INLINE_FUNCTION
+  void operator() (IndexType i) const {
+    // One generator is acquired per work item and returned at the end.
+    typename RandomPool::generator_type gen = rand_pool.get_state();
+    const IndexType extent0 = static_cast<IndexType>(a.dimension_0());
+    const IndexType extent1 = static_cast<IndexType>(a.dimension_1());
+    const IndexType extent2 = static_cast<IndexType>(a.dimension_2());
+    for(IndexType j=0;j<loops;j++) {
+      const IndexType row = i*loops+j;
+      if(row>=extent0) continue;
+      for(IndexType k=0;k<extent1;k++) {
+        for(IndexType l=0;l<extent2;l++) {
+          a(row,k,l) = Rand::draw(gen,range);
+        }
+      }
+    }
+    rand_pool.free_state(gen);
+  }
+};
+
+// Functor that fills a rank-4 view with draws limited by `range`.
+// Work item i handles the `loops` consecutive dimension-0 indices
+// starting at i*loops; indices beyond dimension 0 are skipped.
+template<class ViewType, class RandomPool, int loops, class IndexType>
+struct fill_random_functor_range<ViewType,RandomPool,loops,4, IndexType>{
+  typedef typename ViewType::execution_space execution_space;
+  ViewType a;
+  RandomPool rand_pool;
+  typename ViewType::const_value_type range;
+
+  typedef rand<typename RandomPool::generator_type, typename ViewType::non_const_value_type> Rand;
+
+  fill_random_functor_range(ViewType a_, RandomPool rand_pool_,
+      typename ViewType::const_value_type range_):
+    a(a_),rand_pool(rand_pool_),range(range_) {}
+
+  KOKKOS_INLINE_FUNCTION
+  void operator() (IndexType i) const {
+    // One generator is acquired per work item and returned at the end.
+    typename RandomPool::generator_type gen = rand_pool.get_state();
+    const IndexType extent0 = static_cast<IndexType>(a.dimension_0());
+    const IndexType extent1 = static_cast<IndexType>(a.dimension_1());
+    const IndexType extent2 = static_cast<IndexType>(a.dimension_2());
+    const IndexType extent3 = static_cast<IndexType>(a.dimension_3());
+    for(IndexType j=0;j<loops;j++) {
+      const IndexType row = i*loops+j;
+      if(row>=extent0) continue;
+      for(IndexType k=0;k<extent1;k++) {
+        for(IndexType l=0;l<extent2;l++) {
+          for(IndexType m=0;m<extent3;m++) {
+            a(row,k,l,m) = Rand::draw(gen,range);
+          }
+        }
+      }
+    }
+    rand_pool.free_state(gen);
+  }
+};
+
+// Functor that fills a rank-5 view with draws limited by `range`.
+// Work item i handles the `loops` consecutive dimension-0 indices
+// starting at i*loops; indices beyond dimension 0 are skipped.
+template<class ViewType, class RandomPool, int loops, class IndexType>
+struct fill_random_functor_range<ViewType,RandomPool,loops,5,IndexType>{
+  typedef typename ViewType::execution_space execution_space;
+  ViewType a;
+  RandomPool rand_pool;
+  typename ViewType::const_value_type range;
+
+  typedef rand<typename RandomPool::generator_type, typename ViewType::non_const_value_type> Rand;
+
+  fill_random_functor_range(ViewType a_, RandomPool rand_pool_,
+      typename ViewType::const_value_type range_):
+    a(a_),rand_pool(rand_pool_),range(range_) {}
+
+  KOKKOS_INLINE_FUNCTION
+  void operator() (IndexType i) const {
+    // One generator is acquired per work item and returned at the end.
+    typename RandomPool::generator_type gen = rand_pool.get_state();
+    const IndexType extent0 = static_cast<IndexType>(a.dimension_0());
+    const IndexType extent1 = static_cast<IndexType>(a.dimension_1());
+    const IndexType extent2 = static_cast<IndexType>(a.dimension_2());
+    const IndexType extent3 = static_cast<IndexType>(a.dimension_3());
+    const IndexType extent4 = static_cast<IndexType>(a.dimension_4());
+    for(IndexType j=0;j<loops;j++) {
+      const IndexType row = i*loops+j;
+      if(row>=extent0) continue;
+      for(IndexType k=0;k<extent1;k++) {
+        for(IndexType l=0;l<extent2;l++) {
+          for(IndexType m=0;m<extent3;m++) {
+            for(IndexType n=0;n<extent4;n++) {
+              a(row,k,l,m,n) = Rand::draw(gen,range);
+            }
+          }
+        }
+      }
+    }
+    rand_pool.free_state(gen);
+  }
+};
+
+// Functor that fills a rank-6 view with draws limited by `range`.
+// Work item i handles the `loops` consecutive dimension-0 indices
+// starting at i*loops; indices beyond dimension 0 are skipped.
+template<class ViewType, class RandomPool, int loops, class IndexType>
+struct fill_random_functor_range<ViewType,RandomPool,loops,6,IndexType>{
+  typedef typename ViewType::execution_space execution_space;
+  ViewType a;
+  RandomPool rand_pool;
+  typename ViewType::const_value_type range;
+
+  typedef rand<typename RandomPool::generator_type, typename ViewType::non_const_value_type> Rand;
+
+  fill_random_functor_range(ViewType a_, RandomPool rand_pool_,
+      typename ViewType::const_value_type range_):
+    a(a_),rand_pool(rand_pool_),range(range_) {}
+
+  KOKKOS_INLINE_FUNCTION
+  void operator() (IndexType i) const {
+    // One generator is acquired per work item and returned at the end.
+    typename RandomPool::generator_type gen = rand_pool.get_state();
+    const IndexType extent0 = static_cast<IndexType>(a.dimension_0());
+    const IndexType extent1 = static_cast<IndexType>(a.dimension_1());
+    const IndexType extent2 = static_cast<IndexType>(a.dimension_2());
+    const IndexType extent3 = static_cast<IndexType>(a.dimension_3());
+    const IndexType extent4 = static_cast<IndexType>(a.dimension_4());
+    const IndexType extent5 = static_cast<IndexType>(a.dimension_5());
+    for(IndexType j=0;j<loops;j++) {
+      const IndexType row = i*loops+j;
+      if(row>=extent0) continue;
+      for(IndexType k=0;k<extent1;k++) {
+        for(IndexType l=0;l<extent2;l++) {
+          for(IndexType m=0;m<extent3;m++) {
+            for(IndexType n=0;n<extent4;n++) {
+              for(IndexType o=0;o<extent5;o++) {
+                a(row,k,l,m,n,o) = Rand::draw(gen,range);
+              }
+            }
+          }
+        }
+      }
+    }
+    rand_pool.free_state(gen);
+  }
+};
+
+// Rank-7 specialization of the range fill functor. Work item i handles indices [i*loops, (i+1)*loops)
+// of dimension 0; every entry under an in-bounds index is set to Rand::draw(gen,range).
+template<class ViewType, class RandomPool, int loops, class IndexType>
+struct fill_random_functor_range<ViewType,RandomPool,loops,7,IndexType>{
+  typedef typename ViewType::execution_space execution_space;
+  typedef rand<typename RandomPool::generator_type, typename ViewType::non_const_value_type> Rand;
+  ViewType a;
+  RandomPool rand_pool;
+  typename ViewType::const_value_type range;
+
+  fill_random_functor_range(ViewType a_, RandomPool rand_pool_,
+                            typename ViewType::const_value_type range_)
+    : a(a_), rand_pool(rand_pool_), range(range_) {}
+
+  KOKKOS_INLINE_FUNCTION
+  void operator() (IndexType i) const {
+    typename RandomPool::generator_type gen = rand_pool.get_state();
+    for(IndexType j=0; j<loops; ++j) {
+      const IndexType row = i*loops+j;
+      if(row >= static_cast<IndexType>(a.dimension_0())) continue;  // index lies past dimension 0
+      for(IndexType i1=0; i1<static_cast<IndexType>(a.dimension_1()); ++i1)
+        for(IndexType i2=0; i2<static_cast<IndexType>(a.dimension_2()); ++i2)
+          for(IndexType i3=0; i3<static_cast<IndexType>(a.dimension_3()); ++i3)
+            for(IndexType i4=0; i4<static_cast<IndexType>(a.dimension_4()); ++i4)
+              for(IndexType i5=0; i5<static_cast<IndexType>(a.dimension_5()); ++i5)
+                for(IndexType i6=0; i6<static_cast<IndexType>(a.dimension_6()); ++i6)
+                  a(row,i1,i2,i3,i4,i5,i6) = Rand::draw(gen,range);
+    }
+    rand_pool.free_state(gen);
+  }
+};
+
+// Rank-8 specialization of the range fill functor. Work item i handles indices [i*loops, (i+1)*loops)
+// of dimension 0; every entry under an in-bounds index is set to Rand::draw(gen,range).
+template<class ViewType, class RandomPool, int loops, class IndexType>
+struct fill_random_functor_range<ViewType,RandomPool,loops,8,IndexType>{
+  typedef typename ViewType::execution_space execution_space;
+  typedef rand<typename RandomPool::generator_type, typename ViewType::non_const_value_type> Rand;
+  ViewType a;
+  RandomPool rand_pool;
+  typename ViewType::const_value_type range;
+
+  fill_random_functor_range(ViewType a_, RandomPool rand_pool_,
+                            typename ViewType::const_value_type range_)
+    : a(a_), rand_pool(rand_pool_), range(range_) {}
+
+  KOKKOS_INLINE_FUNCTION
+  void operator() (IndexType i) const {
+    typename RandomPool::generator_type gen = rand_pool.get_state();
+    for(IndexType j=0; j<loops; ++j) {
+      const IndexType row = i*loops+j;
+      if(row >= static_cast<IndexType>(a.dimension_0())) continue;  // index lies past dimension 0
+      for(IndexType i1=0; i1<static_cast<IndexType>(a.dimension_1()); ++i1)
+        for(IndexType i2=0; i2<static_cast<IndexType>(a.dimension_2()); ++i2)
+          for(IndexType i3=0; i3<static_cast<IndexType>(a.dimension_3()); ++i3)
+            for(IndexType i4=0; i4<static_cast<IndexType>(a.dimension_4()); ++i4)
+              for(IndexType i5=0; i5<static_cast<IndexType>(a.dimension_5()); ++i5)
+                for(IndexType i6=0; i6<static_cast<IndexType>(a.dimension_6()); ++i6)
+                  for(IndexType i7=0; i7<static_cast<IndexType>(a.dimension_7()); ++i7)
+                    a(row,i1,i2,i3,i4,i5,i6,i7) = Rand::draw(gen,range);
+    }
+    rand_pool.free_state(gen);
+  }
+};
+// Rank-1 specialization of the begin/end fill functor. Work item i handles indices
+// [i*loops, (i+1)*loops) of dimension 0; each in-bounds entry is set to Rand::draw(gen,begin,end).
+template<class ViewType, class RandomPool, int loops, class IndexType>
+struct fill_random_functor_begin_end<ViewType,RandomPool,loops,1,IndexType>{
+  typedef typename ViewType::execution_space execution_space;
+  typedef rand<typename RandomPool::generator_type, typename ViewType::non_const_value_type> Rand;
+  ViewType a;
+  RandomPool rand_pool;
+  typename ViewType::const_value_type begin,end;
+
+  fill_random_functor_begin_end(ViewType a_, RandomPool rand_pool_,
+      typename ViewType::const_value_type begin_, typename ViewType::const_value_type end_)
+    : a(a_), rand_pool(rand_pool_), begin(begin_), end(end_) {}
+
+  KOKKOS_INLINE_FUNCTION
+  void operator() (IndexType i) const {
+    typename RandomPool::generator_type gen = rand_pool.get_state();
+    for(IndexType j=0; j<loops; ++j) {
+      const IndexType row = i*loops+j;
+      if(row < static_cast<IndexType>(a.dimension_0())) a(row) = Rand::draw(gen,begin,end);
+    }
+    rand_pool.free_state(gen);
+  }
+};
+
+// Rank-2 specialization of the begin/end fill functor. Work item i handles indices [i*loops, (i+1)*loops)
+// of dimension 0; every entry under an in-bounds index is set to Rand::draw(gen,begin,end).
+template<class ViewType, class RandomPool, int loops, class IndexType>
+struct fill_random_functor_begin_end<ViewType,RandomPool,loops,2,IndexType>{
+  typedef typename ViewType::execution_space execution_space;
+  typedef rand<typename RandomPool::generator_type, typename ViewType::non_const_value_type> Rand;
+  ViewType a;
+  RandomPool rand_pool;
+  typename ViewType::const_value_type begin,end;
+
+  fill_random_functor_begin_end(ViewType a_, RandomPool rand_pool_,
+      typename ViewType::const_value_type begin_, typename ViewType::const_value_type end_)
+    : a(a_), rand_pool(rand_pool_), begin(begin_), end(end_) {}
+
+  KOKKOS_INLINE_FUNCTION
+  void operator() (IndexType i) const {
+    typename RandomPool::generator_type gen = rand_pool.get_state();
+    for(IndexType j=0; j<loops; ++j) {
+      const IndexType row = i*loops+j;
+      if(row >= static_cast<IndexType>(a.dimension_0())) continue;  // index lies past dimension 0
+      for(IndexType i1=0; i1<static_cast<IndexType>(a.dimension_1()); ++i1)
+        a(row,i1) = Rand::draw(gen,begin,end);
+    }
+    rand_pool.free_state(gen);
+  }
+};
+
+
+// Rank-3 specialization of the begin/end fill functor. Work item i handles indices [i*loops, (i+1)*loops)
+// of dimension 0; every entry under an in-bounds index is set to Rand::draw(gen,begin,end).
+template<class ViewType, class RandomPool, int loops, class IndexType>
+struct fill_random_functor_begin_end<ViewType,RandomPool,loops,3,IndexType>{
+  typedef typename ViewType::execution_space execution_space;
+  typedef rand<typename RandomPool::generator_type, typename ViewType::non_const_value_type> Rand;
+  ViewType a;
+  RandomPool rand_pool;
+  typename ViewType::const_value_type begin,end;
+
+  fill_random_functor_begin_end(ViewType a_, RandomPool rand_pool_,
+      typename ViewType::const_value_type begin_, typename ViewType::const_value_type end_)
+    : a(a_), rand_pool(rand_pool_), begin(begin_), end(end_) {}
+
+  KOKKOS_INLINE_FUNCTION
+  void operator() (IndexType i) const {
+    typename RandomPool::generator_type gen = rand_pool.get_state();
+    for(IndexType j=0; j<loops; ++j) {
+      const IndexType row = i*loops+j;
+      if(row >= static_cast<IndexType>(a.dimension_0())) continue;  // index lies past dimension 0
+      for(IndexType i1=0; i1<static_cast<IndexType>(a.dimension_1()); ++i1)
+        for(IndexType i2=0; i2<static_cast<IndexType>(a.dimension_2()); ++i2)
+          a(row,i1,i2) = Rand::draw(gen,begin,end);
+    }
+    rand_pool.free_state(gen);
+  }
+};
+
+// Rank-4 specialization of the begin/end fill functor. Work item i handles indices [i*loops, (i+1)*loops)
+// of dimension 0; every entry under an in-bounds index is set to Rand::draw(gen,begin,end).
+template<class ViewType, class RandomPool, int loops, class IndexType>
+struct fill_random_functor_begin_end<ViewType,RandomPool,loops,4,IndexType>{
+  typedef typename ViewType::execution_space execution_space;
+  typedef rand<typename RandomPool::generator_type, typename ViewType::non_const_value_type> Rand;
+  ViewType a;
+  RandomPool rand_pool;
+  typename ViewType::const_value_type begin,end;
+
+  fill_random_functor_begin_end(ViewType a_, RandomPool rand_pool_,
+      typename ViewType::const_value_type begin_, typename ViewType::const_value_type end_)
+    : a(a_), rand_pool(rand_pool_), begin(begin_), end(end_) {}
+
+  KOKKOS_INLINE_FUNCTION
+  void operator() (IndexType i) const {
+    typename RandomPool::generator_type gen = rand_pool.get_state();
+    for(IndexType j=0; j<loops; ++j) {
+      const IndexType row = i*loops+j;
+      if(row >= static_cast<IndexType>(a.dimension_0())) continue;  // index lies past dimension 0
+      for(IndexType i1=0; i1<static_cast<IndexType>(a.dimension_1()); ++i1)
+        for(IndexType i2=0; i2<static_cast<IndexType>(a.dimension_2()); ++i2)
+          for(IndexType i3=0; i3<static_cast<IndexType>(a.dimension_3()); ++i3)
+            a(row,i1,i2,i3) = Rand::draw(gen,begin,end);
+    }
+    rand_pool.free_state(gen);
+  }
+};
+
+// Rank-5 specialization of the begin/end fill functor. Work item i handles indices [i*loops, (i+1)*loops)
+// of dimension 0; every entry under an in-bounds index is set to Rand::draw(gen,begin,end).
+template<class ViewType, class RandomPool, int loops, class IndexType>
+struct fill_random_functor_begin_end<ViewType,RandomPool,loops,5,IndexType>{
+  typedef typename ViewType::execution_space execution_space;
+  typedef rand<typename RandomPool::generator_type, typename ViewType::non_const_value_type> Rand;
+  ViewType a;
+  RandomPool rand_pool;
+  typename ViewType::const_value_type begin,end;
+
+  fill_random_functor_begin_end(ViewType a_, RandomPool rand_pool_,
+      typename ViewType::const_value_type begin_, typename ViewType::const_value_type end_)
+    : a(a_), rand_pool(rand_pool_), begin(begin_), end(end_) {}
+
+  KOKKOS_INLINE_FUNCTION
+  void operator() (IndexType i) const {
+    typename RandomPool::generator_type gen = rand_pool.get_state();
+    for(IndexType j=0; j<loops; ++j) {
+      const IndexType row = i*loops+j;
+      if(row >= static_cast<IndexType>(a.dimension_0())) continue;  // index lies past dimension 0
+      for(IndexType i1=0; i1<static_cast<IndexType>(a.dimension_1()); ++i1)
+        for(IndexType i2=0; i2<static_cast<IndexType>(a.dimension_2()); ++i2)
+          for(IndexType i3=0; i3<static_cast<IndexType>(a.dimension_3()); ++i3)
+            for(IndexType i4=0; i4<static_cast<IndexType>(a.dimension_4()); ++i4)
+              a(row,i1,i2,i3,i4) = Rand::draw(gen,begin,end);
+    }
+    rand_pool.free_state(gen);
+  }
+};
+
+// Rank-6 specialization of the begin/end fill functor. Work item i handles indices [i*loops, (i+1)*loops)
+// of dimension 0; every entry under an in-bounds index is set to Rand::draw(gen,begin,end).
+template<class ViewType, class RandomPool, int loops, class IndexType>
+struct fill_random_functor_begin_end<ViewType,RandomPool,loops,6,IndexType>{
+  typedef typename ViewType::execution_space execution_space;
+  typedef rand<typename RandomPool::generator_type, typename ViewType::non_const_value_type> Rand;
+  ViewType a;
+  RandomPool rand_pool;
+  typename ViewType::const_value_type begin,end;
+
+  fill_random_functor_begin_end(ViewType a_, RandomPool rand_pool_,
+      typename ViewType::const_value_type begin_, typename ViewType::const_value_type end_)
+    : a(a_), rand_pool(rand_pool_), begin(begin_), end(end_) {}
+
+  KOKKOS_INLINE_FUNCTION
+  void operator() (IndexType i) const {
+    typename RandomPool::generator_type gen = rand_pool.get_state();
+    for(IndexType j=0; j<loops; ++j) {
+      const IndexType row = i*loops+j;
+      if(row >= static_cast<IndexType>(a.dimension_0())) continue;  // index lies past dimension 0
+      for(IndexType i1=0; i1<static_cast<IndexType>(a.dimension_1()); ++i1)
+        for(IndexType i2=0; i2<static_cast<IndexType>(a.dimension_2()); ++i2)
+          for(IndexType i3=0; i3<static_cast<IndexType>(a.dimension_3()); ++i3)
+            for(IndexType i4=0; i4<static_cast<IndexType>(a.dimension_4()); ++i4)
+              for(IndexType i5=0; i5<static_cast<IndexType>(a.dimension_5()); ++i5)
+                a(row,i1,i2,i3,i4,i5) = Rand::draw(gen,begin,end);
+    }
+    rand_pool.free_state(gen);
+  }
+};
+
+
+// Rank-7 specialization of the begin/end fill functor. Work item i handles indices [i*loops, (i+1)*loops)
+// of dimension 0; every entry under an in-bounds index is set to Rand::draw(gen,begin,end).
+template<class ViewType, class RandomPool, int loops, class IndexType>
+struct fill_random_functor_begin_end<ViewType,RandomPool,loops,7,IndexType>{
+  typedef typename ViewType::execution_space execution_space;
+  typedef rand<typename RandomPool::generator_type, typename ViewType::non_const_value_type> Rand;
+  ViewType a;
+  RandomPool rand_pool;
+  typename ViewType::const_value_type begin,end;
+
+  fill_random_functor_begin_end(ViewType a_, RandomPool rand_pool_,
+      typename ViewType::const_value_type begin_, typename ViewType::const_value_type end_)
+    : a(a_), rand_pool(rand_pool_), begin(begin_), end(end_) {}
+
+  KOKKOS_INLINE_FUNCTION
+  void operator() (IndexType i) const {
+    typename RandomPool::generator_type gen = rand_pool.get_state();
+    for(IndexType j=0; j<loops; ++j) {
+      const IndexType row = i*loops+j;
+      if(row >= static_cast<IndexType>(a.dimension_0())) continue;  // index lies past dimension 0
+      for(IndexType i1=0; i1<static_cast<IndexType>(a.dimension_1()); ++i1)
+        for(IndexType i2=0; i2<static_cast<IndexType>(a.dimension_2()); ++i2)
+          for(IndexType i3=0; i3<static_cast<IndexType>(a.dimension_3()); ++i3)
+            for(IndexType i4=0; i4<static_cast<IndexType>(a.dimension_4()); ++i4)
+              for(IndexType i5=0; i5<static_cast<IndexType>(a.dimension_5()); ++i5)
+                for(IndexType i6=0; i6<static_cast<IndexType>(a.dimension_6()); ++i6)
+                  a(row,i1,i2,i3,i4,i5,i6) = Rand::draw(gen,begin,end);
+    }
+    rand_pool.free_state(gen);
+  }
+};
+
+// Rank-8 specialization of the begin/end fill functor. Work item i handles indices [i*loops, (i+1)*loops)
+// of dimension 0; every entry under an in-bounds index is set to Rand::draw(gen,begin,end).
+template<class ViewType, class RandomPool, int loops, class IndexType>
+struct fill_random_functor_begin_end<ViewType,RandomPool,loops,8,IndexType>{
+  typedef typename ViewType::execution_space execution_space;
+  typedef rand<typename RandomPool::generator_type, typename ViewType::non_const_value_type> Rand;
+  ViewType a;
+  RandomPool rand_pool;
+  typename ViewType::const_value_type begin,end;
+
+  fill_random_functor_begin_end(ViewType a_, RandomPool rand_pool_,
+      typename ViewType::const_value_type begin_, typename ViewType::const_value_type end_)
+    : a(a_), rand_pool(rand_pool_), begin(begin_), end(end_) {}
+
+  KOKKOS_INLINE_FUNCTION
+  void operator() (IndexType i) const {
+    typename RandomPool::generator_type gen = rand_pool.get_state();
+    for(IndexType j=0; j<loops; ++j) {
+      const IndexType row = i*loops+j;
+      if(row >= static_cast<IndexType>(a.dimension_0())) continue;  // index lies past dimension 0
+      for(IndexType i1=0; i1<static_cast<IndexType>(a.dimension_1()); ++i1)
+        for(IndexType i2=0; i2<static_cast<IndexType>(a.dimension_2()); ++i2)
+          for(IndexType i3=0; i3<static_cast<IndexType>(a.dimension_3()); ++i3)
+            for(IndexType i4=0; i4<static_cast<IndexType>(a.dimension_4()); ++i4)
+              for(IndexType i5=0; i5<static_cast<IndexType>(a.dimension_5()); ++i5)
+                for(IndexType i6=0; i6<static_cast<IndexType>(a.dimension_6()); ++i6)
+                  for(IndexType i7=0; i7<static_cast<IndexType>(a.dimension_7()); ++i7)
+                    a(row,i1,i2,i3,i4,i5,i6,i7) = Rand::draw(gen,begin,end);
+    }
+    rand_pool.free_state(gen);
+  }
+};
+
+}
+
+// Fills `a` through fill_random_functor_range, one work item per 128 indices of dimension 0 (rounded up); a view with an empty dimension 0 is left untouched.
+template<class ViewType, class RandomPool, class IndexType = int64_t>
+void fill_random(ViewType a, RandomPool g, typename ViewType::const_value_type range) {
+  const int64_t extent0 = a.dimension_0();
+  if(extent0>0) parallel_for((extent0+127)/128,Impl::fill_random_functor_range<ViewType,RandomPool,128,ViewType::Rank,IndexType>(a,g,range));
+}
+
+// Fills `a` through fill_random_functor_begin_end, one work item per 128 indices of dimension 0 (rounded up); a view with an empty dimension 0 is left untouched.
+template<class ViewType, class RandomPool, class IndexType = int64_t>
+void fill_random(ViewType a, RandomPool g, typename ViewType::const_value_type begin,typename ViewType::const_value_type end ) {
+  const int64_t extent0 = a.dimension_0();
+  if(extent0>0) parallel_for((extent0+127)/128,Impl::fill_random_functor_begin_end<ViewType,RandomPool,128,ViewType::Rank,IndexType>(a,g,begin,end));
+}
+}
+
+#endif
diff --git a/lib/kokkos/algorithms/src/Kokkos_Sort.hpp b/lib/kokkos/algorithms/src/Kokkos_Sort.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..6123ce978c8a385a87ac57bdca45a9ff8517757f
--- /dev/null
+++ b/lib/kokkos/algorithms/src/Kokkos_Sort.hpp
@@ -0,0 +1,496 @@
+/*
+//@HEADER
+// ************************************************************************
+// 
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+// 
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+// 
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact  H. Carter Edwards (hcedwar@sandia.gov)
+// 
+// ************************************************************************
+//@HEADER
+*/
+
+
+#ifndef KOKKOS_SORT_HPP_
+#define KOKKOS_SORT_HPP_
+
+#include <Kokkos_Core.hpp>
+
+#include <algorithm>
+
+namespace Kokkos {
+
+  namespace SortImpl {
+
+  template<class ValuesViewType, int Rank=ValuesViewType::Rank>
+  struct CopyOp; // primary template: declaration only, the per-rank specializations provide copy()
+
+  // Rank-1 copy: assigns the single entry src(i_src) to dst(i_dst).
+  template<class ValuesViewType>
+  struct CopyOp<ValuesViewType,1> {
+    template<class DstType, class SrcType>
+    KOKKOS_INLINE_FUNCTION
+    static void copy(DstType& dst, size_t i_dst, SrcType& src, size_t i_src ) {
+      dst(i_dst) = src(i_src);
+    }
+  };
+
+  // Rank-2 copy: assigns src(i_src,j) to dst(i_dst,j) for every j below dst.dimension_1().
+  template<class ValuesViewType>
+  struct CopyOp<ValuesViewType,2> {
+    template<class DstType, class SrcType>
+    KOKKOS_INLINE_FUNCTION
+    static void copy(DstType& dst, size_t i_dst, SrcType& src, size_t i_src ) {
+      const int ncols = (int) dst.dimension_1();
+      for(int col = 0; col < ncols; ++col) dst(i_dst,col) = src(i_src,col);
+    }
+  };
+
+  // Rank-3 copy: assigns src(i_src,j,k) to dst(i_dst,j,k) over dst's dimensions 1 and 2; extents are cast to int (as the rank-2 copy does) so the loop tests do not mix signed and unsigned operands.
+  template<class ValuesViewType>
+  struct CopyOp<ValuesViewType,3> {
+    template<class DstType, class SrcType>
+    KOKKOS_INLINE_FUNCTION
+    static void copy(DstType& dst, size_t i_dst, SrcType& src, size_t i_src ) {
+      for(int j = 0; j< (int) dst.dimension_1(); j++)
+        for(int k = 0; k< (int) dst.dimension_2(); k++)
+          dst(i_dst,j,k) = src(i_src,j,k);
+    }
+  };
+  }
+
+template<class KeyViewType, class BinSortOp, class ExecutionSpace = typename KeyViewType::execution_space,
+         class SizeType = typename KeyViewType::memory_space::size_type>
+class BinSort {
+  // Bins the keys with BinSortOp, builds a permutation (sort_order) that groups key indices by bin and
+  // applies it to value views. The object is also the functor for its own kernels (see the tag overloads).
+public:
+  template<class ValuesViewType, class PermuteViewType, class CopyOp>
+  struct bin_sort_sort_functor {
+    typedef ExecutionSpace execution_space;
+    typedef typename ValuesViewType::non_const_type values_view_type;
+    typedef typename ValuesViewType::const_type const_values_view_type;
+    Kokkos::View<typename values_view_type::const_data_type,typename values_view_type::array_layout,
+                 typename values_view_type::memory_space,Kokkos::MemoryTraits<Kokkos::RandomAccess> > values;
+    values_view_type sorted_values;
+    typename PermuteViewType::const_type sort_order;
+    bin_sort_sort_functor(const_values_view_type values_, values_view_type  sorted_values_, PermuteViewType sort_order_):
+       values(values_),sorted_values(sorted_values_),sort_order(sort_order_) {}
+
+    KOKKOS_INLINE_FUNCTION
+    void operator() (const int& i)  const {
+      // Output entry i receives the input entry selected by the permutation.
+      CopyOp::copy(sorted_values,i,values,sort_order(i));
+    }
+  };
+
+  typedef ExecutionSpace execution_space;
+  typedef BinSortOp bin_op_type;
+  // Tag types selecting which operator() overload a RangePolicy launch invokes.
+  struct bin_count_tag {};
+  struct bin_offset_tag {};
+  struct bin_binning_tag {};
+  struct bin_sort_bins_tag {};
+
+public:
+  typedef SizeType size_type;
+  typedef size_type value_type;
+
+  typedef Kokkos::View<size_type*, execution_space> offset_type;
+  typedef Kokkos::View<const int*, execution_space> bin_count_type;
+
+
+  typedef Kokkos::View<typename KeyViewType::const_data_type,
+                       typename KeyViewType::array_layout,
+                       typename KeyViewType::memory_space> const_key_view_type;
+  typedef Kokkos::View<typename KeyViewType::const_data_type,
+                       typename KeyViewType::array_layout,
+                       typename KeyViewType::memory_space,
+                       Kokkos::MemoryTraits<Kokkos::RandomAccess> > const_rnd_key_view_type;
+
+  typedef typename KeyViewType::non_const_value_type non_const_key_scalar;
+  typedef typename KeyViewType::const_value_type     const_key_scalar;
+
+private:
+  const_key_view_type keys;
+  const_rnd_key_view_type keys_rnd;  // the same keys, viewed with the RandomAccess memory trait
+
+public:
+  BinSortOp bin_op;
+
+  offset_type bin_offsets;           // start position of each bin inside sort_order
+
+  Kokkos::View<int*, ExecutionSpace, Kokkos::MemoryTraits<Kokkos::Atomic> > bin_count_atomic;
+  bin_count_type bin_count_const;    // read-only view assigned from bin_count_atomic in the constructor
+
+  offset_type sort_order;            // permutation: sort_order(p) is the key index stored at position p
+
+  bool sort_within_bins;
+
+public:
+
+  // Constructor: takes the keys, the binning_operator and optionally whether to sort within bins (default false)
+  BinSort(const_key_view_type keys_, BinSortOp bin_op_,
+          bool sort_within_bins_ = false)
+     :keys(keys_),keys_rnd(keys_), bin_op(bin_op_) {
+
+    bin_count_atomic = Kokkos::View<int*, ExecutionSpace >("Kokkos::SortImpl::BinSortFunctor::bin_count",bin_op.max_bins());
+    bin_count_const =  bin_count_atomic;
+    bin_offsets =      offset_type("Kokkos::SortImpl::BinSortFunctor::bin_offsets",bin_op.max_bins());
+    sort_order =       offset_type("PermutationVector",keys.dimension_0());
+    sort_within_bins = sort_within_bins_;
+  }
+
+  // Create the permutation vector, the bin_offset array and the bin_count array. Can be called again if keys changed
+  void create_permute_vector() {
+    Kokkos::deep_copy(bin_count_atomic,0); // drop counts left by a previous call so they are not added twice
+    Kokkos::parallel_for (Kokkos::RangePolicy<ExecutionSpace,bin_count_tag>    (0,keys.dimension_0()),*this);
+    Kokkos::parallel_scan(Kokkos::RangePolicy<ExecutionSpace,bin_offset_tag>   (0,bin_op.max_bins()) ,*this);
+    Kokkos::deep_copy(bin_count_atomic,0); // the binning pass re-counts while using the counters as slot cursors
+    Kokkos::parallel_for (Kokkos::RangePolicy<ExecutionSpace,bin_binning_tag>  (0,keys.dimension_0()),*this);
+
+    if(sort_within_bins)
+      Kokkos::parallel_for (Kokkos::RangePolicy<ExecutionSpace,bin_sort_bins_tag>(0,bin_op.max_bins()) ,*this);
+  }
+
+  // Sort a view with respect to the first dimension using the permutation array
+  template<class ValuesViewType>
+  void sort(ValuesViewType values) {
+    ValuesViewType sorted_values = ValuesViewType("Copy",
+           values.dimension_0(),
+           values.dimension_1(),
+           values.dimension_2(),
+           values.dimension_3(),
+           values.dimension_4(),
+           values.dimension_5(),
+           values.dimension_6(),
+           values.dimension_7());
+
+    parallel_for(values.dimension_0(),
+        bin_sort_sort_functor<ValuesViewType, offset_type,
+                              SortImpl::CopyOp<ValuesViewType> >(values,sorted_values,sort_order));
+
+    deep_copy(values,sorted_values);
+  }
+
+  // Get the permutation vector
+  KOKKOS_INLINE_FUNCTION
+  offset_type get_permute_vector() const { return sort_order;}
+
+  // Get the start offsets for each bin
+  KOKKOS_INLINE_FUNCTION
+  offset_type get_bin_offsets() const { return bin_offsets;}
+
+  // Get the count for each bin
+  KOKKOS_INLINE_FUNCTION
+  bin_count_type get_bin_count() const {return bin_count_const;}
+
+public:
+  // Counting pass: increments the (atomic) counter of the bin that key i falls into.
+  KOKKOS_INLINE_FUNCTION
+  void operator() (const bin_count_tag& tag, const int& i) const {
+    bin_count_atomic(bin_op.bin(keys,i))++;
+  }
+  // Scan pass: running sum of the bin counts; the final sweep stores each bin's start offset.
+  KOKKOS_INLINE_FUNCTION
+  void operator() (const bin_offset_tag& tag, const int& i, value_type& offset, const bool& final)  const {
+    if(final) {
+      bin_offsets(i) = offset;
+    }
+    offset+=bin_count_const(i);
+  }
+  // Binning pass: claims the next free slot of key i's bin and records i there.
+  KOKKOS_INLINE_FUNCTION
+  void operator() (const bin_binning_tag& tag, const int& i)  const {
+    const int bin = bin_op.bin(keys,i);
+    const int count = bin_count_atomic(bin)++;
+
+    sort_order(bin_offsets(bin) + count) = i;
+  }
+  // In-bin sort: bubble-sorts the slots of bin i inside sort_order, comparing keys with bin_op.
+  KOKKOS_INLINE_FUNCTION
+  void operator() (const bin_sort_bins_tag& tag, const int&i )  const {
+    // Fewer than two entries: nothing to order, and sort_order(bin_offsets(i)) may not exist for an empty bin.
+    if(bin_count_const(i) < 2) return;
+    bool sorted = false;
+    int upper_bound = bin_offsets(i)+bin_count_const(i);
+    while(!sorted) {
+      sorted = true;
+      int old_idx = sort_order(bin_offsets(i));
+      for(int k=bin_offsets(i)+1; k<upper_bound; k++) {
+        int new_idx = sort_order(k);
+        if(!bin_op(keys_rnd,old_idx,new_idx)) {
+          sort_order(k-1) = new_idx;
+          sort_order(k) = old_idx;
+          sorted = false;
+        } else {
+          old_idx = new_idx;
+        }
+      }
+      upper_bound--;
+    }
+  }
+};
+
+namespace SortImpl {
+
+// Bin operator for one-dimensional keys: bin() scales (key - min) by mul_, operator() orders keys inside a bin.
+template<class KeyViewType>
+struct DefaultBinOp1D {
+  const int max_bins_;
+  const double mul_;
+  typename KeyViewType::const_value_type range_;
+  typename KeyViewType::const_value_type min_;
+
+  //Construct BinOp with number of bins, minimum value and maximum value.
+  //A degenerate range (max<=min) would divide by zero, so mul_ becomes 0 and every key maps to bin 0.
+  DefaultBinOp1D(int max_bins__, typename KeyViewType::const_value_type min,
+                               typename KeyViewType::const_value_type max )
+     :max_bins_(max_bins__+1),mul_((max>min) ? 1.0*max_bins__/(max-min) : 0.0),range_(max-min),min_(min) {}
+
+  //Determine bin index from key value
+  template<class ViewType>
+  KOKKOS_INLINE_FUNCTION
+  int bin(ViewType& keys, const int& i) const {
+    return int(mul_*(keys(i)-min_));
+  }
+
+  //Return maximum bin index + 1
+  KOKKOS_INLINE_FUNCTION
+  int max_bins() const { return max_bins_; }
+
+  //Compare two keys within a bin: returns true when keys(i1) is less than keys(i2)
+  template<class ViewType, typename iType1, typename iType2>
+  KOKKOS_INLINE_FUNCTION
+  bool operator()(ViewType& keys, iType1& i1, iType2& i2) const {
+    return keys(i1)<keys(i2);
+  }
+};
+
+// Bin operator for keys with three coordinates per entry (keys(i,0), keys(i,1), keys(i,2)): each
+// coordinate is binned separately and the three bin indices are flattened into a single bin index.
+template<class KeyViewType>
+struct DefaultBinOp3D {
+  int max_bins_[3];
+  double mul_[3];
+  typename KeyViewType::non_const_value_type range_[3];
+  typename KeyViewType::non_const_value_type min_[3];
+
+  // Construct from per-coordinate bin counts and key bounds; the first three entries of each array are read.
+  DefaultBinOp3D(int max_bins__[], typename KeyViewType::const_value_type min[],
+                               typename KeyViewType::const_value_type max[] )
+  {
+    for(int d=0; d<3; d++) {
+      max_bins_[d] = max_bins__[d]+1;
+      range_[d] = max[d]-min[d];
+      // A degenerate coordinate (max<=min) would divide by zero; map all of its keys to bin 0 instead.
+      mul_[d] = (max[d]>min[d]) ? 1.0*max_bins__[d]/(max[d]-min[d]) : 0.0;
+      min_[d] = min[d];
+    }
+  }
+
+  // Flattened bin of key i: (b0*max_bins_[1] + b1)*max_bins_[2] + b2 with bd = int(mul_[d]*(keys(i,d)-min_[d])).
+  template<class ViewType>
+  KOKKOS_INLINE_FUNCTION
+  int bin(ViewType& keys, const int& i) const {
+    return int( (((int(mul_[0]*(keys(i,0)-min_[0]))*max_bins_[1]) +
+                   int(mul_[1]*(keys(i,1)-min_[1])))*max_bins_[2]) +
+                   int(mul_[2]*(keys(i,2)-min_[2])));
+  }
+
+  KOKKOS_INLINE_FUNCTION
+  int max_bins() const {
+    return max_bins_[0]*max_bins_[1]*max_bins_[2];  // total number of flattened bins
+  }
+
+  // Strict lexicographic comparison: true when key i1 is greater than key i2; a later coordinate is consulted only when all earlier ones are equal.
+  template<class ViewType, typename iType1, typename iType2>
+  KOKKOS_INLINE_FUNCTION
+  bool operator()(ViewType& keys, iType1& i1 , iType2& i2) const {
+    if (keys(i1,0)>keys(i2,0)) return true;
+    else if (keys(i1,0)==keys(i2,0)) {
+      if (keys(i1,1)>keys(i2,1)) return true;
+      else if (keys(i1,1)==keys(i2,1)) {
+        if (keys(i1,2)>keys(i2,2)) return true;
+      }
+    }
+    return false;
+  }
+};
+
+// Running minimum and maximum of the values folded in with operator+=. `init` records whether any
+// value has been seen, so merging an object that never received a value leaves the target unchanged.
+template<typename Scalar>
+struct min_max {
+  Scalar min;
+  Scalar max;
+  bool init;
+
+  // Starts empty: both extrema zeroed, nothing seen yet.
+  KOKKOS_INLINE_FUNCTION
+  min_max() : min(0), max(0), init(false) {}
+
+  // Copies all three members.
+  KOKKOS_INLINE_FUNCTION
+  min_max (const min_max& val) : min(val.min), max(val.max), init(val.init) {}
+
+  // Copies all three members; the result is returned by value.
+  KOKKOS_INLINE_FUNCTION
+  min_max operator = (const min_max& val) {
+    min = val.min;
+    max = val.max;
+    init = val.init;
+    return *this;
+  }
+
+  // Folds a single value in; the first value seen sets both extrema.
+  KOKKOS_INLINE_FUNCTION
+  void operator+= (const Scalar& val) {
+    if(!init) {
+      min = val;
+      max = val;
+      init = true;
+    } else {
+      min = min<val?min:val;
+      max = max>val?max:val;
+    }
+  }
+
+  // Merges another accumulator; one that has not seen any value is ignored.
+  KOKKOS_INLINE_FUNCTION
+  void operator+= (const min_max& val) {
+    if(!val.init) return;
+    if(!init) {
+      min = val.min;
+      max = val.max;
+      init = true;
+    } else {
+      min = min<val.min?min:val.min;
+      max = max>val.max?max:val.max;
+    }
+  }
+
+  // Volatile overload of the single-value fold: same logic, applied to a volatile
+  // accumulator and a volatile input.
+  KOKKOS_INLINE_FUNCTION
+  void operator+= (volatile const Scalar& val) volatile {
+    if(!init) {
+      min = val;
+      max = val;
+      init = true;
+    } else {
+      min = min<val?min:val;
+      max = max>val?max:val;
+    }
+  }
+
+  // Volatile overload of the accumulator merge: combines the extrema when both sides have seen a
+  // value, adopts val's state when only val has, and otherwise changes nothing.
+  KOKKOS_INLINE_FUNCTION
+  void operator+= (volatile const min_max& val) volatile {
+    if(init && val.init) {
+      min = min<val.min?min:val.min;
+      max = max>val.max?max:val.max;
+    } else if(val.init) {
+      min = val.min;
+      max = val.max;
+      init = true;
+    }
+  }
+};
+
+
+// Reduction functor: folds every entry view(i) into a min_max accumulator.
+template<class ViewType>
+struct min_max_functor {
+  typedef typename ViewType::execution_space execution_space;
+  typedef min_max<typename ViewType::non_const_value_type> value_type;
+  ViewType view;
+  min_max_functor (const ViewType view_) : view(view_) {}
+
+  KOKKOS_INLINE_FUNCTION
+  void operator()(const size_t& i, value_type& val) const {
+    val += view(i);
+  }
+};
+
+// Sorts the view with std::sort when its memory space is HostSpace, its rank is 1 and its stride in
+// dimension 0 is 1. Returns true when the sort was performed, false when the view did not qualify.
+template<class ViewType>
+bool try_std_sort(ViewType view) {
+  // Collect the strides; which accessor is used depends on KOKKOS_USING_EXP_VIEW.
+#if ! KOKKOS_USING_EXP_VIEW
+  size_t stride[8];
+  view.stride(stride);
+#else
+  size_t stride[8] = { view.stride_0(), view.stride_1()
+                     , view.stride_2(), view.stride_3()
+                     , view.stride_4(), view.stride_5()
+                     , view.stride_6(), view.stride_7() };
+#endif
+
+  const bool host_memory = Impl::is_same<typename ViewType::memory_space, HostSpace>::value;
+  const bool rank_one    = (ViewType::Rank == 1);
+  const bool unit_stride = (stride[0] == 1);
+
+  if(!(host_memory && rank_one && unit_stride)) {
+    return false;
+  }
+  std::sort(view.ptr_on_device(),view.ptr_on_device()+view.dimension_0());
+  return true;
+}
+
+}
+
+// Sorts `view`: tries std::sort first unless always_use_kokkos_sort is set, otherwise bin-sorts with DefaultBinOp1D using dimension_0()/2 bins over the observed [min,max] of the entries.
+template<class ViewType>
+void sort(ViewType view, bool always_use_kokkos_sort = false) {
+  if(!always_use_kokkos_sort && SortImpl::try_std_sort(view)) return;
+  typedef SortImpl::DefaultBinOp1D<ViewType> CompType;
+  SortImpl::min_max<typename ViewType::non_const_value_type> val;
+  parallel_reduce(view.dimension_0(),SortImpl::min_max_functor<ViewType>(view),val);
+  // All entries equal (or no entries, val stays zeroed): already sorted, and a zero-width range would make the bin scale divide by zero.
+  if(val.min == val.max) return;
+  BinSort<ViewType, CompType> bin_sort(view,CompType(view.dimension_0()/2,val.min,val.max),true);
+  bin_sort.create_permute_vector();
+  bin_sort.sort(view);
+}
+
+/*template<class ViewType, class Comparator>
+void sort(ViewType view, Comparator comp, bool always_use_kokkos_sort = false) {
+
+}*/
+
+}
+
+#endif
diff --git a/lib/kokkos/algorithms/unit_tests/CMakeLists.txt b/lib/kokkos/algorithms/unit_tests/CMakeLists.txt
new file mode 100644
index 0000000000000000000000000000000000000000..654104b44e7b395c6937f4c1dc35b4933018268e
--- /dev/null
+++ b/lib/kokkos/algorithms/unit_tests/CMakeLists.txt
@@ -0,0 +1,38 @@
+
+# Unit tests for the Kokkos algorithms: a single test executable (linked against kokkos_gtest)
+# assembled from UnitTestMain.cpp plus the test source of every enabled backend.
+INCLUDE_DIRECTORIES(${CMAKE_CURRENT_BINARY_DIR})
+INCLUDE_DIRECTORIES(${CMAKE_CURRENT_SOURCE_DIR})
+INCLUDE_DIRECTORIES(${CMAKE_CURRENT_SOURCE_DIR}/../src )
+
+SET(SOURCES
+  UnitTestMain.cpp
+  )
+
+SET(LIBRARIES kokkoscore)
+
+# Each backend's test source is added only when that backend is enabled, CUDA included.
+IF(Kokkos_ENABLE_Cuda)
+  LIST( APPEND SOURCES TestCuda.cpp )
+ENDIF()
+
+IF(Kokkos_ENABLE_OpenMP)
+  LIST( APPEND SOURCES TestOpenMP.cpp )
+ENDIF()
+
+IF(Kokkos_ENABLE_Serial)
+  LIST( APPEND SOURCES TestSerial.cpp )
+ENDIF()
+
+IF(Kokkos_ENABLE_Pthread)
+  LIST( APPEND SOURCES TestThreads.cpp )
+ENDIF()
+
+TRIBITS_ADD_EXECUTABLE_AND_TEST(
+  UnitTest
+  SOURCES ${SOURCES}
+  COMM serial mpi
+  NUM_MPI_PROCS 1
+  FAIL_REGULAR_EXPRESSION "  FAILED  "
+  TESTONLYLIBS kokkos_gtest
+  )
diff --git a/lib/kokkos/algorithms/unit_tests/Makefile b/lib/kokkos/algorithms/unit_tests/Makefile
new file mode 100644
index 0000000000000000000000000000000000000000..5d79364c52abc7a8a61769d187fc06e5612e203b
--- /dev/null
+++ b/lib/kokkos/algorithms/unit_tests/Makefile
@@ -0,0 +1,92 @@
+KOKKOS_PATH = ../..
+
+GTEST_PATH = ../../TPL/gtest
+
+vpath %.cpp ${KOKKOS_PATH}/algorithms/unit_tests
+
+default: build_all
+	echo "End Build"
+
+
+include $(KOKKOS_PATH)/Makefile.kokkos
+
+ifeq ($(KOKKOS_INTERNAL_USE_CUDA), 1)
+	CXX = $(NVCC_WRAPPER)
+	CXXFLAGS ?= -O3
+	LINK = $(CXX)
+	LDFLAGS ?= -lpthread
+else
+	CXX ?= g++
+	CXXFLAGS ?= -O3
+	LINK ?= $(CXX)
+	LDFLAGS ?= -lpthread
+endif
+
+KOKKOS_CXXFLAGS += -I$(GTEST_PATH) -I${KOKKOS_PATH}/algorithms/unit_tests
+
+TEST_TARGETS = 
+TARGETS = 
+
+ifeq ($(KOKKOS_INTERNAL_USE_CUDA), 1)
+	OBJ_CUDA = TestCuda.o UnitTestMain.o gtest-all.o
+	TARGETS += KokkosAlgorithms_UnitTest_Cuda
+	TEST_TARGETS += test-cuda
+endif
+
+ifeq ($(KOKKOS_INTERNAL_USE_PTHREADS), 1)
+	OBJ_THREADS = TestThreads.o UnitTestMain.o gtest-all.o
+	TARGETS += KokkosAlgorithms_UnitTest_Threads
+	TEST_TARGETS += test-threads
+endif
+
+ifeq ($(KOKKOS_INTERNAL_USE_OPENMP), 1)
+	OBJ_OPENMP = TestOpenMP.o UnitTestMain.o gtest-all.o
+	TARGETS += KokkosAlgorithms_UnitTest_OpenMP
+	TEST_TARGETS += test-openmp
+endif
+
+ifeq ($(KOKKOS_INTERNAL_USE_SERIAL), 1)
+	OBJ_SERIAL = TestSerial.o UnitTestMain.o gtest-all.o
+	TARGETS += KokkosAlgorithms_UnitTest_Serial
+	TEST_TARGETS += test-serial
+endif
+
+KokkosAlgorithms_UnitTest_Cuda: $(OBJ_CUDA) $(KOKKOS_LINK_DEPENDS)
+	$(LINK) $(KOKKOS_LDFLAGS) $(LDFLAGS) $(EXTRA_PATH) $(OBJ_CUDA) $(KOKKOS_LIBS) $(LIB) -o KokkosAlgorithms_UnitTest_Cuda
+
+KokkosAlgorithms_UnitTest_Threads: $(OBJ_THREADS) $(KOKKOS_LINK_DEPENDS)
+	$(LINK) $(KOKKOS_LDFLAGS) $(LDFLAGS) $(EXTRA_PATH) $(OBJ_THREADS) $(KOKKOS_LIBS) $(LIB) -o KokkosAlgorithms_UnitTest_Threads
+
+KokkosAlgorithms_UnitTest_OpenMP: $(OBJ_OPENMP) $(KOKKOS_LINK_DEPENDS)
+	$(LINK) $(KOKKOS_LDFLAGS) $(LDFLAGS) $(EXTRA_PATH) $(OBJ_OPENMP) $(KOKKOS_LIBS) $(LIB) -o KokkosAlgorithms_UnitTest_OpenMP
+
+KokkosAlgorithms_UnitTest_Serial: $(OBJ_SERIAL) $(KOKKOS_LINK_DEPENDS)
+	$(LINK) $(KOKKOS_LDFLAGS) $(LDFLAGS) $(EXTRA_PATH) $(OBJ_SERIAL) $(KOKKOS_LIBS) $(LIB) -o KokkosAlgorithms_UnitTest_Serial
+
+test-cuda: KokkosAlgorithms_UnitTest_Cuda
+	./KokkosAlgorithms_UnitTest_Cuda
+
+test-threads: KokkosAlgorithms_UnitTest_Threads
+	./KokkosAlgorithms_UnitTest_Threads
+
+test-openmp: KokkosAlgorithms_UnitTest_OpenMP
+	./KokkosAlgorithms_UnitTest_OpenMP
+
+test-serial: KokkosAlgorithms_UnitTest_Serial
+	./KokkosAlgorithms_UnitTest_Serial
+
+# Aggregate entry points; declared phony so same-named files in this directory cannot shadow them.
+.PHONY: default build_all test clean test-cuda test-threads test-openmp test-serial
+build_all: $(TARGETS)
+test: $(TEST_TARGETS)
+clean: kokkos-clean
+	rm -f *.o $(TARGETS)
+
+# Compilation rules: every object is compiled with the Kokkos flags followed by the local ones.
+# Test sources (*.cpp) are located through the vpath declared near the top of this Makefile.
+%.o: %.cpp $(KOKKOS_CPP_DEPENDS)
+	$(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) $(EXTRA_INC) -c $<
+# Google Test is compiled from gtest-all.cc under GTEST_PATH.
+gtest-all.o: $(GTEST_PATH)/gtest/gtest-all.cc
+	$(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) $(EXTRA_INC) -c $<
+
diff --git a/lib/kokkos/algorithms/unit_tests/TestCuda.cpp b/lib/kokkos/algorithms/unit_tests/TestCuda.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..d19c778c4663bff82e50037d2d1b6ffaeeff103d
--- /dev/null
+++ b/lib/kokkos/algorithms/unit_tests/TestCuda.cpp
@@ -0,0 +1,110 @@
+/*
+//@HEADER
+// ************************************************************************
+// 
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+// 
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+// 
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact  H. Carter Edwards (hcedwar@sandia.gov)
+// 
+// ************************************************************************
+//@HEADER
+*/
+
+#include <stdint.h>
+#include <iostream>
+#include <iomanip>
+
+#include <gtest/gtest.h>
+
+#include <Kokkos_Core.hpp>
+
+#ifdef KOKKOS_HAVE_CUDA
+
+#include <TestRandom.hpp>
+#include <TestSort.hpp>
+
+namespace Test {
+
+// gtest fixture: initializes the host execution space, then CUDA device 0, once per test case.
+class cuda : public ::testing::Test {
+protected:
+  typedef Kokkos::HostSpace::execution_space host_space;
+  static void SetUpTestCase() {
+    std::cout << std::setprecision(5) << std::scientific;
+    host_space::initialize();
+    Kokkos::Cuda::initialize( Kokkos::Cuda::SelectDevice(0) );
+  }
+  static void TearDownTestCase() {
+    Kokkos::Cuda::finalize();
+    host_space::finalize();
+  }
+};
+
+// Runs the shared random-number checks on a Random_XorShift64_Pool over Kokkos::Cuda.
+void cuda_test_random_xorshift64( int num_draws ) {
+  Impl::test_random< Kokkos::Random_XorShift64_Pool<Kokkos::Cuda> >( num_draws );
+}
+
+// Runs the shared random-number checks on a Random_XorShift1024_Pool over Kokkos::Cuda.
+void cuda_test_random_xorshift1024( int num_draws ) {
+  Impl::test_random< Kokkos::Random_XorShift1024_Pool<Kokkos::Cuda> >( num_draws );
+}
+
+
+#define CUDA_RANDOM_XORSHIFT64( num_draws )                                \
+  TEST_F( cuda, Random_XorShift64 ) {   \
+  cuda_test_random_xorshift64(num_draws);                                   \
+  }
+
+#define CUDA_RANDOM_XORSHIFT1024( num_draws )                                \
+  TEST_F( cuda, Random_XorShift1024 ) {   \
+  cuda_test_random_xorshift1024(num_draws);                                   \
+  }
+
+#define CUDA_SORT_UNSIGNED( size )                                \
+  TEST_F( cuda, SortUnsigned ) {   \
+      Impl::test_sort< Kokkos::Cuda, unsigned >(size);                                   \
+  }
+
+CUDA_RANDOM_XORSHIFT64(  132141141 )
+CUDA_RANDOM_XORSHIFT1024( 52428813 )
+CUDA_SORT_UNSIGNED(171)
+
+#undef CUDA_RANDOM_XORSHIFT64
+#undef CUDA_RANDOM_XORSHIFT1024
+#undef CUDA_SORT_UNSIGNED
+}
+
+#endif  /* #ifdef KOKKOS_HAVE_CUDA */
+
diff --git a/lib/kokkos/algorithms/unit_tests/TestOpenMP.cpp b/lib/kokkos/algorithms/unit_tests/TestOpenMP.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..4b06dffcb6a068503770229091ab15330bf6af89
--- /dev/null
+++ b/lib/kokkos/algorithms/unit_tests/TestOpenMP.cpp
@@ -0,0 +1,102 @@
+/*
+//@HEADER
+// ************************************************************************
+// 
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+// 
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+// 
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact  H. Carter Edwards (hcedwar@sandia.gov)
+// 
+// ************************************************************************
+//@HEADER
+*/
+
+#include <gtest/gtest.h>
+
+#include <Kokkos_Core.hpp>
+
+//----------------------------------------------------------------------------
+#include <TestRandom.hpp>
+#include <TestSort.hpp>
+#include <iomanip>
+
+namespace Test {
+
+#ifdef KOKKOS_HAVE_OPENMP
+// gtest fixture that starts Kokkos::OpenMP once per test case and stops it afterwards.
+class openmp : public ::testing::Test {
+protected:
+  static void SetUpTestCase() {
+    std::cout << std::setprecision(5) << std::scientific;
+
+    // With hwloc available, use (NUMA regions) x (cores per region) threads;
+    // otherwise fall back to the OpenMP runtime's maximum.
+    const unsigned threads_count = Kokkos::hwloc::available()
+      ? Kokkos::hwloc::get_available_numa_count() *
+        Kokkos::hwloc::get_available_cores_per_numa()
+      : static_cast<unsigned>( omp_get_max_threads() );
+
+    Kokkos::OpenMP::initialize( threads_count );
+  }
+
+  // Counterpart of SetUpTestCase().
+  static void TearDownTestCase() {
+    Kokkos::OpenMP::finalize();
+  }
+};
+
+#define OPENMP_RANDOM_XORSHIFT64( num_draws )                                \
+  TEST_F( openmp, Random_XorShift64 ) {   \
+      Impl::test_random<Kokkos::Random_XorShift64_Pool<Kokkos::OpenMP> >(num_draws);                                   \
+  }
+
+#define OPENMP_RANDOM_XORSHIFT1024( num_draws )                                \
+  TEST_F( openmp, Random_XorShift1024 ) {   \
+      Impl::test_random<Kokkos::Random_XorShift1024_Pool<Kokkos::OpenMP> >(num_draws);                                   \
+  }
+
+#define OPENMP_SORT_UNSIGNED( size )                                \
+  TEST_F( openmp, SortUnsigned ) {   \
+      Impl::test_sort< Kokkos::OpenMP, unsigned >(size);                                   \
+  }
+
+OPENMP_RANDOM_XORSHIFT64( 10240000 )
+OPENMP_RANDOM_XORSHIFT1024( 10130144 )
+OPENMP_SORT_UNSIGNED(171)
+
+#undef OPENMP_RANDOM_XORSHIFT64
+#undef OPENMP_RANDOM_XORSHIFT1024
+#undef OPENMP_SORT_UNSIGNED
+#endif
+} // namespace Test
+
diff --git a/lib/kokkos/algorithms/unit_tests/TestRandom.hpp b/lib/kokkos/algorithms/unit_tests/TestRandom.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..c906b9f2cdc69735a225698c2bb5dc0e152160cb
--- /dev/null
+++ b/lib/kokkos/algorithms/unit_tests/TestRandom.hpp
@@ -0,0 +1,481 @@
+//@HEADER
+// ************************************************************************
+// 
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+// 
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+// 
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact  H. Carter Edwards (hcedwar@sandia.gov)
+// 
+// ************************************************************************
+//@HEADER
+
+#ifndef KOKKOS_TEST_DUALVIEW_HPP
+#define KOKKOS_TEST_DUALVIEW_HPP
+
+#include <gtest/gtest.h>
+#include <iostream>
+#include <cstdlib>
+#include <cstdio>
+#include <impl/Kokkos_Timer.hpp>
+#include <Kokkos_Core.hpp>
+#include <Kokkos_Random.hpp>
+#include <cmath>
+#include <chrono>
+
+namespace Test {
+
+namespace Impl{
+
+// This test runs the random number generators and uses some statistical tests to
+// check the 'goodness' of the random numbers:
+//    (i)   mean:         the mean is expected to be 0.5*RAND_MAX
+//    (ii)  variance:     the variance is 1/3*mean*mean
+//    (iii) covariance:   the covariance is 0
+//    (iv)  1-tupledistr: the mean, variance and covariance of a 1D histogram of random numbers
+//    (v)   3-tupledistr: the mean, variance and covariance of a 3D histogram of random numbers
+
+#define HIST_DIM3D 24
+#define HIST_DIM1D (HIST_DIM3D*HIST_DIM3D*HIST_DIM3D)
+
+// Reduction value for the statistics kernels: un-normalized running sums plus observed extremes.
+struct RandomProperties {
+  uint64_t count;
+  double mean;
+  double variance;
+  double covariance;
+  double min;
+  double max;
+
+  // Identity element: zero sums, min seeded at +1e64 and max at -1e64.
+  KOKKOS_INLINE_FUNCTION
+  RandomProperties()
+    : count(0), mean(0.0), variance(0.0), covariance(0.0),
+      min(1e64), max(-1e64)
+  {}
+
+  // Join: add the sums of rhs and widen [min,max] so that it covers rhs.
+  KOKKOS_INLINE_FUNCTION
+  RandomProperties& operator+=(const RandomProperties& rhs) {
+    count      += rhs.count;
+    mean       += rhs.mean;
+    variance   += rhs.variance;
+    covariance += rhs.covariance;
+    if (rhs.min < min) min = rhs.min;
+    if (rhs.max > max) max = rhs.max;
+    return *this;
+  }
+
+  // The same join for volatile operands.
+  KOKKOS_INLINE_FUNCTION
+  void operator+=(const volatile RandomProperties& rhs) volatile {
+    count      += rhs.count;
+    mean       += rhs.mean;
+    variance   += rhs.variance;
+    covariance += rhs.covariance;
+    min         = rhs.min<min ? rhs.min : min;
+    max         = rhs.max>max ? rhs.max : max;
+  }
+};
+// Reduction functor: each work item draws 3*1024 Scalars, sums their statistics into prop and bins them.
+template<class GeneratorPool, class Scalar>
+struct test_random_functor {
+  typedef typename GeneratorPool::generator_type rnd_type;
+
+  typedef RandomProperties value_type;
+  typedef typename GeneratorPool::device_type device_type;
+
+  GeneratorPool rand_pool;
+  const double mean;  // half of Kokkos::rand<rnd_type,Scalar>::max(); set in the constructor
+
+  // NOTE (mfh 03 Nov 2014): Kokkos::rand::max() is supposed to define
+  // an exclusive upper bound on the range of random numbers that
+  // draw() can generate.  However, for the float specialization, some
+  // implementations might violate this upper bound, due to rounding
+  // error.  Just in case, we leave an extra space at the end of each
+  // dimension, in the View types below.
+  typedef Kokkos::View<int[HIST_DIM1D+1],typename GeneratorPool::device_type> type_1d;
+  type_1d density_1d;
+  typedef Kokkos::View<int[HIST_DIM3D+1][HIST_DIM3D+1][HIST_DIM3D+1],typename GeneratorPool::device_type> type_3d;
+  type_3d density_3d;
+
+  test_random_functor (GeneratorPool rand_pool_, type_1d d1d, type_3d d3d) :
+    rand_pool (rand_pool_),
+    mean (0.5*Kokkos::rand<rnd_type,Scalar>::max ()),
+    density_1d (d1d),
+    density_3d (d3d)
+  {}
+
+  KOKKOS_INLINE_FUNCTION
+  void operator() (int i, RandomProperties& prop) const {
+    using Kokkos::atomic_fetch_add;
+    // Take a generator state from the pool; it is handed back with free_state() after the loop.
+    rnd_type rand_gen = rand_pool.get_state();
+    for (int k = 0; k < 1024; ++k) {
+      const Scalar tmp = Kokkos::rand<rnd_type,Scalar>::draw(rand_gen);
+      prop.count++;
+      prop.mean += tmp;
+      prop.variance += (tmp-mean)*(tmp-mean);
+      const Scalar tmp2 = Kokkos::rand<rnd_type,Scalar>::draw(rand_gen);
+      prop.count++;
+      prop.mean += tmp2;
+      prop.variance += (tmp2-mean)*(tmp2-mean);
+      prop.covariance += (tmp-mean)*(tmp2-mean);  // consecutive pair (tmp, tmp2)
+      const Scalar tmp3 = Kokkos::rand<rnd_type,Scalar>::draw(rand_gen);
+      prop.count++;
+      prop.mean += tmp3;
+      prop.variance += (tmp3-mean)*(tmp3-mean);
+      prop.covariance += (tmp2-mean)*(tmp3-mean);  // consecutive pair (tmp2, tmp3)
+
+      // NOTE (mfh 03 Nov 2014): Kokkos::rand::max() is supposed to
+      // define an exclusive upper bound on the range of random
+      // numbers that draw() can generate.  However, for the float
+      // specialization, some implementations might violate this upper
+      // bound, due to rounding error.  Just in case, we have left an
+      // extra space at the end of each dimension of density_1d and
+      // density_3d.
+      //
+      // Please note that those extra entries might not get counted in
+      // the histograms.  However, if Kokkos::rand is broken and only
+      // returns values of max(), the histograms will still catch this
+      // indirectly, since none of the other values will be filled in.
+
+      const Scalar theMax = Kokkos::rand<rnd_type, Scalar>::max ();
+      // Scale each draw by HIST_DIM1D/theMax to obtain its bin index in the 1-D histogram.
+      const uint64_t ind1_1d = static_cast<uint64_t> (1.0 * HIST_DIM1D * tmp / theMax);
+      const uint64_t ind2_1d = static_cast<uint64_t> (1.0 * HIST_DIM1D * tmp2 / theMax);
+      const uint64_t ind3_1d = static_cast<uint64_t> (1.0 * HIST_DIM1D * tmp3 / theMax);
+      // Same scaling at the per-axis resolution (HIST_DIM3D) of the 3-D histogram.
+      const uint64_t ind1_3d = static_cast<uint64_t> (1.0 * HIST_DIM3D * tmp / theMax);
+      const uint64_t ind2_3d = static_cast<uint64_t> (1.0 * HIST_DIM3D * tmp2 / theMax);
+      const uint64_t ind3_3d = static_cast<uint64_t> (1.0 * HIST_DIM3D * tmp3 / theMax);
+      // One entry per draw in 1-D; one entry per (tmp, tmp2, tmp3) triple in 3-D.
+      atomic_fetch_add (&density_1d(ind1_1d), 1);
+      atomic_fetch_add (&density_1d(ind2_1d), 1);
+      atomic_fetch_add (&density_1d(ind3_1d), 1);
+      atomic_fetch_add (&density_3d(ind1_3d, ind2_3d, ind3_3d), 1);
+    }
+    rand_pool.free_state(rand_gen);
+  }
+};
+
+// Reduction functor over the 1-D histogram: accumulates the bin-count sum, the squared deviation
+// from the expected count, the min/max bin count, and the product of neighbouring deviations.
+template<class DeviceType>
+struct test_histogram1d_functor {
+  typedef RandomProperties value_type;
+  typedef typename DeviceType::execution_space execution_space;
+  typedef typename DeviceType::memory_space memory_space;
+
+  // The view carries one slot more than HIST_DIM1D (mfh 03 Nov 2014): the float
+  // specialization of Kokkos::rand may, through rounding, return its nominally
+  // exclusive upper bound max(), and such a draw needs a slot to land in.
+  typedef Kokkos::View<int[HIST_DIM1D+1], memory_space> type_1d;
+  type_1d density_1d;
+  double mean;  // expected count per bin
+
+  test_histogram1d_functor (type_1d d1d, int num_draws)
+    : density_1d (d1d)
+    , mean (1.0*num_draws/HIST_DIM1D*3)
+  {}
+
+  KOKKOS_INLINE_FUNCTION
+  void operator() (const typename memory_space::size_type i,
+                   RandomProperties& prop) const
+  {
+    typedef typename memory_space::size_type size_type;
+    const double bin_count = density_1d(i);
+    const double deviation = bin_count - mean;
+    prop.mean     += bin_count;
+    prop.variance += 1.0 * deviation * deviation;
+    if (bin_count < prop.min) prop.min = bin_count;
+    if (bin_count > prop.max) prop.max = bin_count;
+
+    // The final bin of the reduced range is not paired with the spare slot behind it.
+    if (i < static_cast<size_type> (HIST_DIM1D-1)) {
+      prop.covariance += deviation * (density_1d(i+1) - mean);
+    }
+  }
+};
+
+// Reduction functor over the 3-D histogram, addressed through a flattened index i in [0, HIST_DIM1D).
+template<class DeviceType>
+struct test_histogram3d_functor {
+  typedef RandomProperties value_type;
+  typedef typename DeviceType::execution_space execution_space;
+  typedef typename DeviceType::memory_space memory_space;
+
+  // Each dimension carries one slot more than HIST_DIM3D (mfh 03 Nov 2014): the float
+  // specialization of Kokkos::rand may, through rounding, return its upper bound max().
+  typedef Kokkos::View<int[HIST_DIM3D+1][HIST_DIM3D+1][HIST_DIM3D+1], memory_space> type_3d;
+  type_3d density_3d;
+  double mean;  // expected count per bin
+
+  test_histogram3d_functor (type_3d d3d, int num_draws) :
+    density_3d (d3d),
+    mean (1.0*num_draws/HIST_DIM1D)
+  {}
+
+  KOKKOS_INLINE_FUNCTION void
+  operator() (const typename memory_space::size_type i,
+              RandomProperties& prop) const
+  {
+    typedef typename memory_space::size_type size_type;
+    const double count = density_3d(i/(HIST_DIM3D*HIST_DIM3D),
+                                    (i % (HIST_DIM3D*HIST_DIM3D))/HIST_DIM3D,
+                                    i % HIST_DIM3D);
+    prop.mean += count;
+    prop.variance += (count - mean) * (count - mean);
+    // Track the extremes as the 1-D functor does; the caller prints result.min / result.max.
+    prop.min = count < prop.min ? count : prop.min;
+    prop.max = count > prop.max ? count : prop.max;
+    if (i < static_cast<size_type> (HIST_DIM1D-1)) {
+      const double count_next = density_3d((i+1)/(HIST_DIM3D*HIST_DIM3D),
+                                           ((i+1)%(HIST_DIM3D*HIST_DIM3D))/HIST_DIM3D,
+                                           (i+1)%HIST_DIM3D);
+      prop.covariance += (count - mean) * (count_next - mean);
+    }
+  }
+};
+
+// Templated test that uses the above functors.  It runs the moment, 1-D histogram and
+// 3-D histogram checks for one Scalar type; each verdict is stored as 1 (inside the
+// accepted range) or 0 in a pass_* member, which the caller then asserts on.
+template <class RandomGenerator,class Scalar>
+struct test_random_scalar {
+  typedef typename RandomGenerator::generator_type rnd_type;
+
+  int pass_mean,pass_var,pass_covar;
+  int pass_hist1d_mean,pass_hist1d_var,pass_hist1d_covar;
+  int pass_hist3d_mean,pass_hist3d_var,pass_hist3d_covar;
+
+  test_random_scalar (typename test_random_functor<RandomGenerator,int>::type_1d& density_1d,
+                      typename test_random_functor<RandomGenerator,int>::type_3d& density_3d,
+                      RandomGenerator& pool,
+                      unsigned int num_draws)
+  {
+    using std::cerr;
+    using std::endl;
+    using Kokkos::parallel_reduce;
+
+    {
+      cerr << " -- Testing randomness properties" << endl;
+
+      RandomProperties result;
+      typedef test_random_functor<RandomGenerator, Scalar> functor_type;
+      parallel_reduce (num_draws/1024, functor_type (pool, density_1d, density_3d), result);
+      // result holds raw sums: each of the num_draws/1024 work items contributes 3*1024 draws.
+      //printf("Result: %lf %lf %lf\n",result.mean/num_draws/3,result.variance/num_draws/3,result.covariance/num_draws/2);
+      double tolerance = 1.6*sqrt(1.0/num_draws);
+      double mean_expect = 0.5*Kokkos::rand<rnd_type,Scalar>::max();
+      double variance_expect = 1.0/3.0*mean_expect*mean_expect;
+      double mean_eps = mean_expect/(result.mean/num_draws/3)-1.0;
+      double variance_eps = variance_expect/(result.variance/num_draws/3)-1.0;
+      double covariance_eps = result.covariance/num_draws/2/variance_expect;
+      pass_mean  = ((-tolerance < mean_eps) &&
+                    ( tolerance > mean_eps)) ? 1:0;
+      pass_var   = ((-1.5*tolerance < variance_eps) &&
+                    ( 1.5*tolerance > variance_eps)) ? 1:0;
+      pass_covar = ((-2.0*tolerance < covariance_eps) &&
+                    ( 2.0*tolerance > covariance_eps)) ? 1:0;
+      cerr << "Pass: " << pass_mean
+           << " " << pass_var
+           << " " << mean_eps
+           << " " << variance_eps
+           << " " << covariance_eps
+           << " || " << tolerance << endl;
+    }
+    {
+      cerr << " -- Testing 1-D histogram" << endl;
+
+      RandomProperties result;
+      typedef test_histogram1d_functor<typename RandomGenerator::device_type> functor_type;
+      parallel_reduce (HIST_DIM1D, functor_type (density_1d, num_draws), result);
+      // NOTE(review): tolerance is only printed in this section; the three checks use fixed bounds.
+      double tolerance = 6*sqrt(1.0/HIST_DIM1D);
+      double mean_expect = 1.0*num_draws*3/HIST_DIM1D;
+      double variance_expect = 1.0*num_draws*3/HIST_DIM1D*(1.0-1.0/HIST_DIM1D);
+      double covariance_expect = -1.0*num_draws*3/HIST_DIM1D/HIST_DIM1D;
+      double mean_eps = mean_expect/(result.mean/HIST_DIM1D)-1.0;
+      double variance_eps = variance_expect/(result.variance/HIST_DIM1D)-1.0;
+      double covariance_eps = (result.covariance/HIST_DIM1D - covariance_expect)/mean_expect;
+      pass_hist1d_mean  = ((-0.0001 < mean_eps) &&
+                           ( 0.0001 > mean_eps)) ? 1:0;
+      pass_hist1d_var   = ((-0.07 < variance_eps) &&
+                           ( 0.07 > variance_eps)) ? 1:0;
+      pass_hist1d_covar = ((-0.06 < covariance_eps) &&
+                           ( 0.06 > covariance_eps)) ? 1:0;
+
+      cerr << "Density 1D: " << mean_eps
+           << " " << variance_eps
+           << " " << (result.covariance/HIST_DIM1D/HIST_DIM1D)
+           << " || " << tolerance
+           << " " << result.min
+           << " " << result.max
+           << " || " << result.variance/HIST_DIM1D
+           << " " << 1.0*num_draws*3/HIST_DIM1D*(1.0-1.0/HIST_DIM1D)
+           << " || " << result.covariance/HIST_DIM1D
+           << " " << -1.0*num_draws*3/HIST_DIM1D/HIST_DIM1D
+           << endl;
+    }
+    {
+      cerr << " -- Testing 3-D histogram" << endl;
+
+      RandomProperties result;
+      typedef test_histogram3d_functor<typename RandomGenerator::device_type> functor_type;
+      parallel_reduce (HIST_DIM1D, functor_type (density_3d, num_draws), result);
+      // Each triple of draws adds one 3-D entry, so a bin is expected to hold num_draws/HIST_DIM1D.
+      double tolerance = 6*sqrt(1.0/HIST_DIM1D);
+      double mean_expect = 1.0*num_draws/HIST_DIM1D;
+      double variance_expect = 1.0*num_draws/HIST_DIM1D*(1.0-1.0/HIST_DIM1D);
+      double covariance_expect = -1.0*num_draws/HIST_DIM1D/HIST_DIM1D;
+      double mean_eps = mean_expect/(result.mean/HIST_DIM1D)-1.0;
+      double variance_eps = variance_expect/(result.variance/HIST_DIM1D)-1.0;
+      double covariance_eps = (result.covariance/HIST_DIM1D - covariance_expect)/mean_expect;
+      pass_hist3d_mean  = ((-tolerance < mean_eps) &&
+                           ( tolerance > mean_eps)) ? 1:0;
+      pass_hist3d_var   = ((-1.2*tolerance < variance_eps) &&
+                           ( 1.2*tolerance > variance_eps)) ? 1:0;
+      pass_hist3d_covar = ((-tolerance < covariance_eps) &&
+                           ( tolerance > covariance_eps)) ? 1:0;
+
+      cerr << "Density 3D: " << mean_eps
+           << " " << variance_eps
+           << " " << result.covariance/HIST_DIM1D/HIST_DIM1D
+           << " || " << tolerance
+           << " " << result.min
+           << " " << result.max << endl;
+    }
+  }
+};
+// Runs test_random_scalar for int, unsigned int, int64_t, uint64_t, float and double on one pool.
+template <class RandomGenerator>
+void test_random(unsigned int num_draws)
+{
+  using std::cerr;
+  using std::endl;
+  typename test_random_functor<RandomGenerator,int>::type_1d density_1d("D1d");
+  typename test_random_functor<RandomGenerator,int>::type_3d density_3d("D3d");
+
+  // Seed the pool from the high-resolution clock; the seed is echoed to stderr.
+  uint64_t ticks = std::chrono::high_resolution_clock::now().time_since_epoch().count();
+  cerr << "Test Seed:" << ticks << endl;
+
+  RandomGenerator pool(ticks);
+  // One pass per Scalar type; both histograms are zeroed with deep_copy between passes.
+  cerr << "Test Scalar=int" << endl;
+  test_random_scalar<RandomGenerator,int> test_int(density_1d,density_3d,pool,num_draws);
+  ASSERT_EQ( test_int.pass_mean,1);
+  ASSERT_EQ( test_int.pass_var,1);
+  ASSERT_EQ( test_int.pass_covar,1);
+  ASSERT_EQ( test_int.pass_hist1d_mean,1);
+  ASSERT_EQ( test_int.pass_hist1d_var,1);
+  ASSERT_EQ( test_int.pass_hist1d_covar,1);
+  ASSERT_EQ( test_int.pass_hist3d_mean,1);
+  ASSERT_EQ( test_int.pass_hist3d_var,1);
+  ASSERT_EQ( test_int.pass_hist3d_covar,1);
+  deep_copy(density_1d,0);
+  deep_copy(density_3d,0);
+
+  cerr << "Test Scalar=unsigned int" << endl;
+  test_random_scalar<RandomGenerator,unsigned int> test_uint(density_1d,density_3d,pool,num_draws);
+  ASSERT_EQ( test_uint.pass_mean,1);
+  ASSERT_EQ( test_uint.pass_var,1);
+  ASSERT_EQ( test_uint.pass_covar,1);
+  ASSERT_EQ( test_uint.pass_hist1d_mean,1);
+  ASSERT_EQ( test_uint.pass_hist1d_var,1);
+  ASSERT_EQ( test_uint.pass_hist1d_covar,1);
+  ASSERT_EQ( test_uint.pass_hist3d_mean,1);
+  ASSERT_EQ( test_uint.pass_hist3d_var,1);
+  ASSERT_EQ( test_uint.pass_hist3d_covar,1);
+  deep_copy(density_1d,0);
+  deep_copy(density_3d,0);
+
+  cerr << "Test Scalar=int64_t" << endl;
+  test_random_scalar<RandomGenerator,int64_t> test_int64(density_1d,density_3d,pool,num_draws);
+  ASSERT_EQ( test_int64.pass_mean,1);
+  ASSERT_EQ( test_int64.pass_var,1);
+  ASSERT_EQ( test_int64.pass_covar,1);
+  ASSERT_EQ( test_int64.pass_hist1d_mean,1);
+  ASSERT_EQ( test_int64.pass_hist1d_var,1);
+  ASSERT_EQ( test_int64.pass_hist1d_covar,1);
+  ASSERT_EQ( test_int64.pass_hist3d_mean,1);
+  ASSERT_EQ( test_int64.pass_hist3d_var,1);
+  ASSERT_EQ( test_int64.pass_hist3d_covar,1);
+  deep_copy(density_1d,0);
+  deep_copy(density_3d,0);
+
+  cerr << "Test Scalar=uint64_t" << endl;
+  test_random_scalar<RandomGenerator,uint64_t> test_uint64(density_1d,density_3d,pool,num_draws);
+  ASSERT_EQ( test_uint64.pass_mean,1);
+  ASSERT_EQ( test_uint64.pass_var,1);
+  ASSERT_EQ( test_uint64.pass_covar,1);
+  ASSERT_EQ( test_uint64.pass_hist1d_mean,1);
+  ASSERT_EQ( test_uint64.pass_hist1d_var,1);
+  ASSERT_EQ( test_uint64.pass_hist1d_covar,1);
+  ASSERT_EQ( test_uint64.pass_hist3d_mean,1);
+  ASSERT_EQ( test_uint64.pass_hist3d_var,1);
+  ASSERT_EQ( test_uint64.pass_hist3d_covar,1);
+  deep_copy(density_1d,0);
+  deep_copy(density_3d,0);
+
+  cerr << "Test Scalar=float" << endl;
+  test_random_scalar<RandomGenerator,float> test_float(density_1d,density_3d,pool,num_draws);
+  ASSERT_EQ( test_float.pass_mean,1);
+  ASSERT_EQ( test_float.pass_var,1);
+  ASSERT_EQ( test_float.pass_covar,1);
+  ASSERT_EQ( test_float.pass_hist1d_mean,1);
+  ASSERT_EQ( test_float.pass_hist1d_var,1);
+  ASSERT_EQ( test_float.pass_hist1d_covar,1);
+  ASSERT_EQ( test_float.pass_hist3d_mean,1);
+  ASSERT_EQ( test_float.pass_hist3d_var,1);
+  ASSERT_EQ( test_float.pass_hist3d_covar,1);
+  deep_copy(density_1d,0);
+  deep_copy(density_3d,0);
+
+  cerr << "Test Scalar=double" << endl;
+  test_random_scalar<RandomGenerator,double> test_double(density_1d,density_3d,pool,num_draws);
+  ASSERT_EQ( test_double.pass_mean,1);
+  ASSERT_EQ( test_double.pass_var,1);
+  ASSERT_EQ( test_double.pass_covar,1);
+  ASSERT_EQ( test_double.pass_hist1d_mean,1);
+  ASSERT_EQ( test_double.pass_hist1d_var,1);
+  ASSERT_EQ( test_double.pass_hist1d_covar,1);
+  ASSERT_EQ( test_double.pass_hist3d_mean,1);
+  ASSERT_EQ( test_double.pass_hist3d_var,1);
+  ASSERT_EQ( test_double.pass_hist3d_covar,1);
+}
+}
+
+} // namespace Test
+
+#endif //KOKKOS_TEST_DUALVIEW_HPP
diff --git a/lib/kokkos/algorithms/unit_tests/TestSerial.cpp b/lib/kokkos/algorithms/unit_tests/TestSerial.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..741cf97ae13f245fafeb95078222943afda8ed1d
--- /dev/null
+++ b/lib/kokkos/algorithms/unit_tests/TestSerial.cpp
@@ -0,0 +1,99 @@
+/*
+//@HEADER
+// ************************************************************************
+// 
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+// 
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+// 
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact  H. Carter Edwards (hcedwar@sandia.gov)
+// 
+// ************************************************************************
+//@HEADER
+*/
+
+#include <gtest/gtest.h>
+
+#include <Kokkos_Core.hpp>
+
+#include <TestRandom.hpp>
+#include <TestSort.hpp>
+#include <iomanip>
+
+
+//----------------------------------------------------------------------------
+
+
+namespace Test {
+
+#ifdef KOKKOS_HAVE_SERIAL
+class serial : public ::testing::Test {
+protected:
+  // Suite-level setup: configure std::cout formatting and start Kokkos::Serial.
+  static void SetUpTestCase() {
+    std::cout << std::scientific << std::setprecision (5);
+    Kokkos::Serial::initialize ();
+  }
+
+  // Suite-level teardown: shut Kokkos::Serial down again.
+  static void TearDownTestCase () {
+    Kokkos::Serial::finalize ();
+  }
+};
+// Each helper macro below stamps out one gtest case on the `serial` fixture.
+#define SERIAL_RANDOM_XORSHIFT64( num_draws ) \
+  TEST_F( serial, Random_XorShift64 ) { \
+    Impl::test_random< Kokkos::Random_XorShift64_Pool< Kokkos::Serial > >( num_draws ); \
+  }
+// Same test with the XorShift1024 generator pool.
+#define SERIAL_RANDOM_XORSHIFT1024( num_draws ) \
+  TEST_F( serial, Random_XorShift1024 ) { \
+    Impl::test_random< Kokkos::Random_XorShift1024_Pool< Kokkos::Serial > >( num_draws ); \
+  }
+// Sort tests on unsigned keys; `size` is forwarded to Impl::test_sort.
+#define SERIAL_SORT_UNSIGNED( size ) \
+  TEST_F( serial, SortUnsigned ) { \
+    Impl::test_sort< Kokkos::Serial, unsigned >( size ); \
+  }
+// Instantiate the tests with their workload sizes.
+SERIAL_RANDOM_XORSHIFT64( 10240000 )
+SERIAL_RANDOM_XORSHIFT1024( 10130144 )
+SERIAL_SORT_UNSIGNED( 171 )
+// The helpers are not needed past this point.
+#undef SERIAL_RANDOM_XORSHIFT64
+#undef SERIAL_RANDOM_XORSHIFT1024
+#undef SERIAL_SORT_UNSIGNED
+
+#endif // KOKKOS_HAVE_SERIAL
+} // namespace Test
+
+
diff --git a/lib/kokkos/algorithms/unit_tests/TestSort.hpp b/lib/kokkos/algorithms/unit_tests/TestSort.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..ccbcbdd0011bbc577ac8c39b2f593ed35f2546ac
--- /dev/null
+++ b/lib/kokkos/algorithms/unit_tests/TestSort.hpp
@@ -0,0 +1,206 @@
+//@HEADER
+// ************************************************************************
+// 
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+// 
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+// 
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact  H. Carter Edwards (hcedwar@sandia.gov)
+// 
+// ************************************************************************
+//@HEADER
+
+#ifndef TESTSORT_HPP_
+#define TESTSORT_HPP_
+
+#include <gtest/gtest.h>
+#include<Kokkos_Core.hpp>
+#include<Kokkos_Random.hpp>
+#include<Kokkos_Sort.hpp>
+
+namespace Test {
+
+namespace Impl{
+
+// Reduction functor: counts positions i where keys(i) > keys(i+1).
+template<class ExecutionSpace, class Scalar>
+struct is_sorted_struct {
+  typedef ExecutionSpace execution_space;
+  typedef unsigned int value_type;
+
+  Kokkos::View<Scalar*,ExecutionSpace> keys;
+  is_sorted_struct(Kokkos::View<Scalar*,ExecutionSpace> keys_) : keys(keys_) {}
+  KOKKOS_INLINE_FUNCTION
+  void operator() (int idx, unsigned int& count) const {
+    if (keys(idx) > keys(idx+1)) { ++count; }
+  }
+};
+
+// Reduction functor: accumulates every key into a double-precision total.
+template<class ExecutionSpace, class Scalar>
+struct sum {
+  typedef ExecutionSpace execution_space;
+  typedef double value_type;
+
+  Kokkos::View<Scalar*,ExecutionSpace> keys;
+  sum(Kokkos::View<Scalar*,ExecutionSpace> keys_) : keys(keys_) {}
+  KOKKOS_INLINE_FUNCTION
+  void operator() (int idx, double& total) const {
+    total += keys(idx);
+  }
+};
+
+// Reduction functor: counts adjacent rows (i, i+1) whose 3D bin indices are
+// out of order, comparing the x bin first, then y, then z.
+template<class ExecutionSpace, class Scalar>
+struct bin3d_is_sorted_struct {
+  typedef unsigned int value_type;
+  typedef ExecutionSpace execution_space;
+  Kokkos::View<Scalar*[3],ExecutionSpace> keys;
+  int max_bins;  // number of bins applied to each of the three components
+  Scalar min;    // lower bound of the binned key range
+  Scalar max;    // upper bound of the binned key range
+
+  bin3d_is_sorted_struct(Kokkos::View<Scalar*[3],ExecutionSpace> keys_,int max_bins_,Scalar min_,Scalar max_):
+    keys(keys_),max_bins(max_bins_),min(min_),max(max_) {}
+  KOKKOS_INLINE_FUNCTION
+  void operator() (int i, unsigned int& count) const {
+    // Normalize by the range width (max-min); dividing by max alone gives
+    // wrong bins whenever min != 0.
+    int ix1 = int ((keys(i,0)-min)/(max-min) * max_bins);
+    int iy1 = int ((keys(i,1)-min)/(max-min) * max_bins);
+    int iz1 = int ((keys(i,2)-min)/(max-min) * max_bins);
+    int ix2 = int ((keys(i+1,0)-min)/(max-min) * max_bins);
+    int iy2 = int ((keys(i+1,1)-min)/(max-min) * max_bins);
+    int iz2 = int ((keys(i+1,2)-min)/(max-min) * max_bins);
+    if (ix1>ix2)  count++;
+    else if(ix1==ix2) {
+      if (iy1>iy2)  count++;
+      else if ((iy1==iy2) && (iz1>iz2))  count++;
+    }
+  }
+};
+
+// Reduction functor: accumulates all three components of every row into a
+// double-precision total.
+template<class ExecutionSpace, class Scalar>
+struct sum3D {
+  typedef ExecutionSpace execution_space;
+  typedef double value_type;
+
+  Kokkos::View<Scalar*[3],ExecutionSpace> keys;
+  sum3D(Kokkos::View<Scalar*[3],ExecutionSpace> keys_) : keys(keys_) {}
+  KOKKOS_INLINE_FUNCTION
+  void operator() (int row, double& total) const {
+    // Add component by component so the accumulation order stays 0, 1, 2.
+    for (int dim = 0; dim < 3; ++dim) total += keys(row,dim);
+  }
+};
+
+// Sorts n random keys with Kokkos::sort; asserts order and an unchanged key sum.
+template<class ExecutionSpace, typename KeyType>
+void test_1D_sort(unsigned int n,bool force_kokkos) {
+  typedef Kokkos::View<KeyType*,ExecutionSpace> KeyViewType;
+  typedef Kokkos::Random_XorShift64_Pool<ExecutionSpace> PoolType;
+  KeyViewType keys("Keys",n);
+  PoolType g(1931);
+  Kokkos::fill_random(keys,g,PoolType::generator_type::MAX_URAND);
+
+  // Checksum of the keys before sorting.
+  double sum_before = 0.0;
+  Kokkos::parallel_reduce(n,sum<ExecutionSpace, KeyType>(keys),sum_before);
+
+  Kokkos::sort(keys,force_kokkos);
+
+  // Checksum and count of out-of-order neighbours after sorting.
+  double sum_after = 0.0;
+  unsigned int sort_fails = 0;
+  Kokkos::parallel_reduce(n,sum<ExecutionSpace, KeyType>(keys),sum_after);
+  Kokkos::parallel_reduce(n-1,is_sorted_struct<ExecutionSpace, KeyType>(keys),sort_fails);
+  const double ratio = sum_before/sum_after;
+  const double epsilon = 1e-10;
+  unsigned int equal_sum = (ratio > (1.0-epsilon)) && (ratio < (1.0+epsilon)) ? 1 : 0;
+  ASSERT_EQ(sort_fails,0);
+  ASSERT_EQ(equal_sum,1);
+}
+
+// Bin-sorts n*n*n random 3-component keys; asserts bin order and an unchanged key sum.
+template<class ExecutionSpace, typename KeyType>
+void test_3D_sort(unsigned int n) {
+  typedef Kokkos::View<KeyType*[3],ExecutionSpace > KeyViewType;
+  typedef Kokkos::SortImpl::DefaultBinOp3D< KeyViewType > BinOp;
+
+  KeyViewType keys("Keys",n*n*n);
+  Kokkos::Random_XorShift64_Pool<ExecutionSpace> g(1931);
+  Kokkos::fill_random(keys,g,100.0);
+
+  // Checksum of the keys before sorting.
+  double sum_before = 0.0;
+  Kokkos::parallel_reduce(keys.dimension_0(),sum3D<ExecutionSpace, KeyType>(keys),sum_before);
+
+  // Double the per-dimension bin count until bins^3 * 4 reaches the row count.
+  int bin_1d = 1;
+  while( bin_1d*bin_1d*bin_1d*4< (int) keys.dimension_0() ) bin_1d*=2;
+  int bin_max[3] = {bin_1d,bin_1d,bin_1d};
+  typename KeyViewType::value_type min[3] = {0,0,0};
+  typename KeyViewType::value_type max[3] = {100,100,100};
+
+  BinOp bin_op(bin_max,min,max);
+  Kokkos::BinSort< KeyViewType , BinOp > Sorter(keys,bin_op,false);
+  Sorter.create_permute_vector();
+  Sorter.template sort< KeyViewType >(keys);
+
+  // Checksum and count of out-of-order neighbours after sorting.
+  double sum_after = 0.0;
+  unsigned int sort_fails = 0;
+  Kokkos::parallel_reduce(keys.dimension_0(),sum3D<ExecutionSpace, KeyType>(keys),sum_after);
+  Kokkos::parallel_reduce(keys.dimension_0()-1,bin3d_is_sorted_struct<ExecutionSpace, KeyType>(keys,bin_1d,min[0],max[0]),sort_fails);
+
+  const double ratio = sum_before/sum_after;
+  const double epsilon = 1e-10;
+  unsigned int equal_sum = (ratio > (1.0-epsilon)) && (ratio < (1.0+epsilon)) ? 1 : 0;
+  printf("3D Sort Sum: %f %f Fails: %u\n",sum_before,sum_after,sort_fails);
+  ASSERT_EQ(sort_fails,0);
+  ASSERT_EQ(equal_sum,1);
+}
+// Runs both 1D sort variants on N*N*N keys, then the 3D bin sort with size N.
+template<class ExecutionSpace, typename KeyType>
+void test_sort(unsigned int N) {
+  const unsigned int n_keys = N*N*N;
+  test_1D_sort<ExecutionSpace,KeyType>(n_keys, true);
+  test_1D_sort<ExecutionSpace,KeyType>(n_keys, false);
+  test_3D_sort<ExecutionSpace,KeyType>(N);
+}
+
+}
+}
+#endif /* TESTSORT_HPP_ */
diff --git a/lib/kokkos/algorithms/unit_tests/TestThreads.cpp b/lib/kokkos/algorithms/unit_tests/TestThreads.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..a61d6c8bd59bb9758f7ff30124b048150ac0cb92
--- /dev/null
+++ b/lib/kokkos/algorithms/unit_tests/TestThreads.cpp
@@ -0,0 +1,113 @@
+/*
+//@HEADER
+// ************************************************************************
+// 
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+// 
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+// 
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact  H. Carter Edwards (hcedwar@sandia.gov)
+// 
+// ************************************************************************
+//@HEADER
+*/
+
+#include <gtest/gtest.h>
+
+#include <Kokkos_Core.hpp>
+
+#include <TestRandom.hpp>
+#include <TestSort.hpp>
+#include <iomanip>
+
+
+//----------------------------------------------------------------------------
+
+
+namespace Test {
+
+#ifdef KOKKOS_HAVE_PTHREAD
+class threads : public ::testing::Test {
+protected:
+  // Suite-level setup: pick a thread count and start Kokkos::Threads.
+  static void SetUpTestCase() {
+    std::cout << std::scientific << std::setprecision(5);
+
+    // Fallback thread count used when hwloc is not available.
+    unsigned num_threads = 4;
+
+    if (Kokkos::hwloc::available()) {
+      // NUMA regions x cores per region; hardware threads per core are not
+      // multiplied in.
+      num_threads = Kokkos::hwloc::get_available_numa_count()
+                    * Kokkos::hwloc::get_available_cores_per_numa();
+    }
+
+    std::cout << "Threads: " << num_threads << std::endl;
+
+    Kokkos::Threads::initialize( num_threads );
+  }
+
+  // Suite-level teardown: shut Kokkos::Threads down again.
+  static void TearDownTestCase() {
+    Kokkos::Threads::finalize();
+  }
+};
+// Each helper macro below stamps out one gtest case on the `threads` fixture.
+#define THREADS_RANDOM_XORSHIFT64( num_draws ) \
+  TEST_F( threads, Random_XorShift64 ) { \
+    Impl::test_random< Kokkos::Random_XorShift64_Pool< Kokkos::Threads > >( num_draws ); \
+  }
+// Same test with the XorShift1024 generator pool.
+#define THREADS_RANDOM_XORSHIFT1024( num_draws ) \
+  TEST_F( threads, Random_XorShift1024 ) { \
+    Impl::test_random< Kokkos::Random_XorShift1024_Pool< Kokkos::Threads > >( num_draws ); \
+  }
+// NOTE(review): named "unsigned" but the key type is double -- confirm intent.
+#define THREADS_SORT_UNSIGNED( size ) \
+  TEST_F( threads, SortUnsigned ) { \
+    Impl::test_sort< Kokkos::Threads, double >( size ); \
+  }
+
+// Instantiate the tests with their workload sizes.
+THREADS_RANDOM_XORSHIFT64( 10240000 )
+THREADS_RANDOM_XORSHIFT1024( 10130144 )
+THREADS_SORT_UNSIGNED( 171 )
+// The helpers are not needed past this point.
+#undef THREADS_RANDOM_XORSHIFT64
+#undef THREADS_RANDOM_XORSHIFT1024
+#undef THREADS_SORT_UNSIGNED
+
+#endif
+} // namespace Test
+
+
diff --git a/lib/kokkos/algorithms/unit_tests/UnitTestMain.cpp b/lib/kokkos/algorithms/unit_tests/UnitTestMain.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..f952ab3db51028aff0a0ebfe313b2639e353ab87
--- /dev/null
+++ b/lib/kokkos/algorithms/unit_tests/UnitTestMain.cpp
@@ -0,0 +1,50 @@
+/*
+//@HEADER
+// ************************************************************************
+// 
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+// 
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+// 
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact  H. Carter Edwards (hcedwar@sandia.gov)
+// 
+// ************************************************************************
+//@HEADER
+*/
+
+#include <gtest/gtest.h>
+// Test driver: hand the command line to googletest, then run every test.
+int main(int argc, char** argv) {
+  ::testing::InitGoogleTest(&argc, argv);
+  return RUN_ALL_TESTS();
+}
+
diff --git a/lib/kokkos/cmake/Dependencies.cmake b/lib/kokkos/cmake/Dependencies.cmake
new file mode 100644
index 0000000000000000000000000000000000000000..8c51eab4d78b68f9c01e64f63352a22cf8f2086d
--- /dev/null
+++ b/lib/kokkos/cmake/Dependencies.cmake
@@ -0,0 +1,10 @@
+TRIBITS_PACKAGE_DEFINE_DEPENDENCIES(
+  SUBPACKAGES_DIRS_CLASSIFICATIONS_OPTREQS
+    #SubPackageName       Directory         Class    Req/Opt
+    # One row per subpackage; Core is the only REQUIRED one.
+    # New Kokkos subpackages:
+    Core                  core              PS       REQUIRED
+    Containers            containers        PS       OPTIONAL
+    Algorithms            algorithms        PS       OPTIONAL
+    Example               example           EX       OPTIONAL
+  )
diff --git a/lib/kokkos/cmake/deps/CUDA.cmake b/lib/kokkos/cmake/deps/CUDA.cmake
new file mode 100644
index 0000000000000000000000000000000000000000..801c20067b9195db5ba5e6cd6fdd62a426e6e294
--- /dev/null
+++ b/lib/kokkos/cmake/deps/CUDA.cmake
@@ -0,0 +1,79 @@
+# @HEADER
+# ************************************************************************
+#
+#            Trilinos: An Object-Oriented Solver Framework
+#                 Copyright (2001) Sandia Corporation
+#
+#
+# Copyright (2001) Sandia Corporation. Under the terms of Contract
+# DE-AC04-94AL85000, there is a non-exclusive license for use of this
+# work by or on behalf of the U.S. Government.  Export of this program
+# may require a license from the United States Government.
+#
+# 1. Redistributions of source code must retain the above copyright
+# notice, this list of conditions and the following disclaimer.
+#
+# 2. Redistributions in binary form must reproduce the above copyright
+# notice, this list of conditions and the following disclaimer in the
+# documentation and/or other materials provided with the distribution.
+#
+# 3. Neither the name of the Corporation nor the names of the
+# contributors may be used to endorse or promote products derived from
+# this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+# LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+# NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+# SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+#
+# NOTICE:  The United States Government is granted for itself and others
+# acting on its behalf a paid-up, nonexclusive, irrevocable worldwide
+# license in this data to reproduce, prepare derivative works, and
+# perform publicly and display publicly.  Beginning five (5) years from
+# July 25, 2001, the United States Government is granted for itself and
+# others acting on its behalf a paid-up, nonexclusive, irrevocable
+# worldwide license in this data to reproduce, prepare derivative works,
+# distribute copies to the public, perform publicly and display
+# publicly, and to permit others to do so.
+#
+# NEITHER THE UNITED STATES GOVERNMENT, NOR THE UNITED STATES DEPARTMENT
+# OF ENERGY, NOR SANDIA CORPORATION, NOR ANY OF THEIR EMPLOYEES, MAKES
+# ANY WARRANTY, EXPRESS OR IMPLIED, OR ASSUMES ANY LEGAL LIABILITY OR
+# RESPONSIBILITY FOR THE ACCURACY, COMPLETENESS, OR USEFULNESS OF ANY
+# INFORMATION, APPARATUS, PRODUCT, OR PROCESS DISCLOSED, OR REPRESENTS
+# THAT ITS USE WOULD NOT INFRINGE PRIVATELY OWNED RIGHTS.
+#
+# ************************************************************************
+# @HEADER
+
+# Check for CUDA support
+
+SET(_CUDA_FAILURE OFF)
+
+# Have CMake find CUDA (requesting version 3.2); record a failure if it is not found.
+IF(NOT _CUDA_FAILURE)
+  FIND_PACKAGE(CUDA 3.2)
+  IF (NOT CUDA_FOUND)
+    SET(_CUDA_FAILURE ON)
+  ENDIF()
+ENDIF()
+
+IF(NOT _CUDA_FAILURE)
+  macro(PACKAGE_ADD_CUDA_LIBRARY cuda_target)
+    TRIBITS_ADD_LIBRARY(${cuda_target} ${ARGN} CUDALIBRARY)
+  endmacro()
+  GLOBAL_SET(TPL_CUDA_LIBRARY_DIRS)
+  GLOBAL_SET(TPL_CUDA_INCLUDE_DIRS ${CUDA_TOOLKIT_INCLUDE})
+  GLOBAL_SET(TPL_CUDA_LIBRARIES ${CUDA_CUDART_LIBRARY} ${CUDA_cublas_LIBRARY} ${CUDA_cufft_LIBRARY})
+  # Register the CUDA TPL set up above (was CUSPARSE, which CUSPARSE.cmake registers itself).
+  TIBITS_CREATE_IMPORTED_TPL_LIBRARY(CUDA)
+ELSE()
+  SET(TPL_ENABLE_CUDA OFF)
+ENDIF()
diff --git a/lib/kokkos/cmake/deps/CUSPARSE.cmake b/lib/kokkos/cmake/deps/CUSPARSE.cmake
new file mode 100644
index 0000000000000000000000000000000000000000..205f5e2a98898b8247b0f199afcc2e3ac4bc97b4
--- /dev/null
+++ b/lib/kokkos/cmake/deps/CUSPARSE.cmake
@@ -0,0 +1,64 @@
+# @HEADER
+# ************************************************************************
+#
+#            Trilinos: An Object-Oriented Solver Framework
+#                 Copyright (2001) Sandia Corporation
+#
+#
+# Copyright (2001) Sandia Corporation. Under the terms of Contract
+# DE-AC04-94AL85000, there is a non-exclusive license for use of this
+# work by or on behalf of the U.S. Government.  Export of this program
+# may require a license from the United States Government.
+#
+# 1. Redistributions of source code must retain the above copyright
+# notice, this list of conditions and the following disclaimer.
+#
+# 2. Redistributions in binary form must reproduce the above copyright
+# notice, this list of conditions and the following disclaimer in the
+# documentation and/or other materials provided with the distribution.
+#
+# 3. Neither the name of the Corporation nor the names of the
+# contributors may be used to endorse or promote products derived from
+# this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+# LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+# NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+# SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+#
+# NOTICE:  The United States Government is granted for itself and others
+# acting on its behalf a paid-up, nonexclusive, irrevocable worldwide
+# license in this data to reproduce, prepare derivative works, and
+# perform publicly and display publicly.  Beginning five (5) years from
+# July 25, 2001, the United States Government is granted for itself and
+# others acting on its behalf a paid-up, nonexclusive, irrevocable
+# worldwide license in this data to reproduce, prepare derivative works,
+# distribute copies to the public, perform publicly and display
+# publicly, and to permit others to do so.
+#
+# NEITHER THE UNITED STATES GOVERNMENT, NOR THE UNITED STATES DEPARTMENT
+# OF ENERGY, NOR SANDIA CORPORATION, NOR ANY OF THEIR EMPLOYEES, MAKES
+# ANY WARRANTY, EXPRESS OR IMPLIED, OR ASSUMES ANY LEGAL LIABILITY OR
+# RESPONSIBILITY FOR THE ACCURACY, COMPLETENESS, OR USEFULNESS OF ANY
+# INFORMATION, APPARATUS, PRODUCT, OR PROCESS DISCLOSED, OR REPRESENTS
+# THAT ITS USE WOULD NOT INFRINGE PRIVATELY OWNED RIGHTS.
+#
+# ************************************************************************
+# @HEADER
+
+include(${TRIBITS_DEPS_DIR}/CUDA.cmake)
+# CUSPARSE reuses the CUDA include dirs; CUDA.cmake turns TPL_ENABLE_CUDA off when CUDA is not found.
+IF (TPL_ENABLE_CUDA)
+  GLOBAL_SET(TPL_CUSPARSE_LIBRARY_DIRS)
+  GLOBAL_SET(TPL_CUSPARSE_INCLUDE_DIRS ${TPL_CUDA_INCLUDE_DIRS})
+  GLOBAL_SET(TPL_CUSPARSE_LIBRARIES    ${CUDA_cusparse_LIBRARY})
+  TIBITS_CREATE_IMPORTED_TPL_LIBRARY(CUSPARSE)
+ENDIF()
+
diff --git a/lib/kokkos/cmake/deps/HWLOC.cmake b/lib/kokkos/cmake/deps/HWLOC.cmake
new file mode 100644
index 0000000000000000000000000000000000000000..275abd3a5d4ecfb3ce3b207f978959f6f9019061
--- /dev/null
+++ b/lib/kokkos/cmake/deps/HWLOC.cmake
@@ -0,0 +1,70 @@
+# @HEADER
+# ************************************************************************
+#
+#            Trilinos: An Object-Oriented Solver Framework
+#                 Copyright (2001) Sandia Corporation
+#
+#
+# Copyright (2001) Sandia Corporation. Under the terms of Contract
+# DE-AC04-94AL85000, there is a non-exclusive license for use of this
+# work by or on behalf of the U.S. Government.  Export of this program
+# may require a license from the United States Government.
+#
+# 1. Redistributions of source code must retain the above copyright
+# notice, this list of conditions and the following disclaimer.
+#
+# 2. Redistributions in binary form must reproduce the above copyright
+# notice, this list of conditions and the following disclaimer in the
+# documentation and/or other materials provided with the distribution.
+#
+# 3. Neither the name of the Corporation nor the names of the
+# contributors may be used to endorse or promote products derived from
+# this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+# LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+# NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+# SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+#
+# NOTICE:  The United States Government is granted for itself and others
+# acting on its behalf a paid-up, nonexclusive, irrevocable worldwide
+# license in this data to reproduce, prepare derivative works, and
+# perform publicly and display publicly.  Beginning five (5) years from
+# July 25, 2001, the United States Government is granted for itself and
+# others acting on its behalf a paid-up, nonexclusive, irrevocable
+# worldwide license in this data to reproduce, prepare derivative works,
+# distribute copies to the public, perform publicly and display
+# publicly, and to permit others to do so.
+#
+# NEITHER THE UNITED STATES GOVERNMENT, NOR THE UNITED STATES DEPARTMENT
+# OF ENERGY, NOR SANDIA CORPORATION, NOR ANY OF THEIR EMPLOYEES, MAKES
+# ANY WARRANTY, EXPRESS OR IMPLIED, OR ASSUMES ANY LEGAL LIABILITY OR
+# RESPONSIBILITY FOR THE ACCURACY, COMPLETENESS, OR USEFULNESS OF ANY
+# INFORMATION, APPARATUS, PRODUCT, OR PROCESS DISCLOSED, OR REPRESENTS
+# THAT ITS USE WOULD NOT INFRINGE PRIVATELY OWNED RIGHTS.
+#
+# ************************************************************************
+# @HEADER
+
+
+#-----------------------------------------------------------------------------
+#  Hardware locality detection and control library.
+#
+#  Acquisition information:
+#    Date checked:  November 2011
+#    Checked by:    H. Carter Edwards <hcedwar AT sandia.gov>
+#    Source:        http://www.open-mpi.org/projects/hwloc/
+#    Version:       1.3
+#
+# Locate the hwloc.h header and the hwloc library through the TriBITS TPL helper.
+TRIBITS_TPL_FIND_INCLUDE_DIRS_AND_LIBRARIES( HWLOC
+  REQUIRED_HEADERS hwloc.h
+  REQUIRED_LIBS_NAMES "hwloc"
+  )
diff --git a/lib/kokkos/cmake/deps/Pthread.cmake b/lib/kokkos/cmake/deps/Pthread.cmake
new file mode 100644
index 0000000000000000000000000000000000000000..46d0a939cad0e6c5479cb20da1d37ba5ca509b8c
--- /dev/null
+++ b/lib/kokkos/cmake/deps/Pthread.cmake
@@ -0,0 +1,83 @@
+# @HEADER
+# ************************************************************************
+#
+#            Trilinos: An Object-Oriented Solver Framework
+#                 Copyright (2001) Sandia Corporation
+#
+#
+# Copyright (2001) Sandia Corporation. Under the terms of Contract
+# DE-AC04-94AL85000, there is a non-exclusive license for use of this
+# work by or on behalf of the U.S. Government.  Export of this program
+# may require a license from the United States Government.
+#
+# 1. Redistributions of source code must retain the above copyright
+# notice, this list of conditions and the following disclaimer.
+#
+# 2. Redistributions in binary form must reproduce the above copyright
+# notice, this list of conditions and the following disclaimer in the
+# documentation and/or other materials provided with the distribution.
+#
+# 3. Neither the name of the Corporation nor the names of the
+# contributors may be used to endorse or promote products derived from
+# this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+# LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+# NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+# SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+#
+# NOTICE:  The United States Government is granted for itself and others
+# acting on its behalf a paid-up, nonexclusive, irrevocable worldwide
+# license in this data to reproduce, prepare derivative works, and
+# perform publicly and display publicly.  Beginning five (5) years from
+# July 25, 2001, the United States Government is granted for itself and
+# others acting on its behalf a paid-up, nonexclusive, irrevocable
+# worldwide license in this data to reproduce, prepare derivative works,
+# distribute copies to the public, perform publicly and display
+# publicly, and to permit others to do so.
+#
+# NEITHER THE UNITED STATES GOVERNMENT, NOR THE UNITED STATES DEPARTMENT
+# OF ENERGY, NOR SANDIA CORPORATION, NOR ANY OF THEIR EMPLOYEES, MAKES
+# ANY WARRANTY, EXPRESS OR IMPLIED, OR ASSUMES ANY LEGAL LIABILITY OR
+# RESPONSIBILITY FOR THE ACCURACY, COMPLETENESS, OR USEFULNESS OF ANY
+# INFORMATION, APPARATUS, PRODUCT, OR PROCESS DISCLOSED, OR REPRESENTS
+# THAT ITS USE WOULD NOT INFRINGE PRIVATELY OWNED RIGHTS.
+#
+# ************************************************************************
+# @HEADER
+
+
+# Decide whether the result of CMake's own Threads finder can be used directly.
+set(USE_THREADS FALSE)
+
+# Auto-detect only when no TPL_Pthread_* location variable has been provided.
+if(NOT (TPL_Pthread_INCLUDE_DIRS OR TPL_Pthread_LIBRARY_DIRS OR TPL_Pthread_LIBRARIES))
+  # CMake's Threads finder is a bit smarter at determining whether pthreads is
+  # already built into the compiler and therefore needs no library to link.
+  find_package(Threads)
+  # Take this path only for pthreads results that carry no library file
+  # (empty link string or the bare "-pthread" flag), i.e. the cases the
+  # tribits tpl system cannot handle.
+  if(Threads_FOUND AND CMAKE_USE_PTHREADS_INIT
+     AND (CMAKE_THREAD_LIBS_INIT STREQUAL "" OR CMAKE_THREAD_LIBS_INIT STREQUAL "-pthread"))
+    set(USE_THREADS TRUE)
+  endif()
+endif()
+
+if(NOT USE_THREADS)
+  # Fall back to a conventional header/library search.
+  tribits_tpl_find_include_dirs_and_libraries(Pthread
+    REQUIRED_HEADERS pthread.h REQUIRED_LIBS_NAMES pthread)
+else()
+  set(TPL_Pthread_INCLUDE_DIRS "")
+  set(TPL_Pthread_LIBRARIES "${CMAKE_THREAD_LIBS_INIT}")
+  set(TPL_Pthread_LIBRARY_DIRS "")
+  tibits_create_imported_tpl_library(Pthread)
+endif()
diff --git a/lib/kokkos/cmake/deps/QTHREAD.cmake b/lib/kokkos/cmake/deps/QTHREAD.cmake
new file mode 100644
index 0000000000000000000000000000000000000000..994b72b20096f4462beab51d19e4410cd73bf05b
--- /dev/null
+++ b/lib/kokkos/cmake/deps/QTHREAD.cmake
@@ -0,0 +1,70 @@
+# @HEADER
+# ************************************************************************
+#
+#            Trilinos: An Object-Oriented Solver Framework
+#                 Copyright (2001) Sandia Corporation
+#
+#
+# Copyright (2001) Sandia Corporation. Under the terms of Contract
+# DE-AC04-94AL85000, there is a non-exclusive license for use of this
+# work by or on behalf of the U.S. Government.  Export of this program
+# may require a license from the United States Government.
+#
+# 1. Redistributions of source code must retain the above copyright
+# notice, this list of conditions and the following disclaimer.
+#
+# 2. Redistributions in binary form must reproduce the above copyright
+# notice, this list of conditions and the following disclaimer in the
+# documentation and/or other materials provided with the distribution.
+#
+# 3. Neither the name of the Corporation nor the names of the
+# contributors may be used to endorse or promote products derived from
+# this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+# LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+# NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+# SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+#
+# NOTICE:  The United States Government is granted for itself and others
+# acting on its behalf a paid-up, nonexclusive, irrevocable worldwide
+# license in this data to reproduce, prepare derivative works, and
+# perform publicly and display publicly.  Beginning five (5) years from
+# July 25, 2001, the United States Government is granted for itself and
+# others acting on its behalf a paid-up, nonexclusive, irrevocable
+# worldwide license in this data to reproduce, prepare derivative works,
+# distribute copies to the public, perform publicly and display
+# publicly, and to permit others to do so.
+#
+# NEITHER THE UNITED STATES GOVERNMENT, NOR THE UNITED STATES DEPARTMENT
+# OF ENERGY, NOR SANDIA CORPORATION, NOR ANY OF THEIR EMPLOYEES, MAKES
+# ANY WARRANTY, EXPRESS OR IMPLIED, OR ASSUMES ANY LEGAL LIABILITY OR
+# RESPONSIBILITY FOR THE ACCURACY, COMPLETENESS, OR USEFULNESS OF ANY
+# INFORMATION, APPARATUS, PRODUCT, OR PROCESS DISCLOSED, OR REPRESENTS
+# THAT ITS USE WOULD NOT INFRINGE PRIVATELY OWNED RIGHTS.
+#
+# ************************************************************************
+# @HEADER
+
+
+#-----------------------------------------------------------------------------
+#  Qthreads lightweight threading library.
+#
+#  Acquisition information:
+#    Date checked:  July 2014
+#    Checked by:    H. Carter Edwards <hcedwar AT sandia.gov>
+#    Source:        https://code.google.com/p/qthreads
+#
+
+# Locate the QTHREAD TPL through its header (qthread.h) and library name (qthread).
+tribits_tpl_find_include_dirs_and_libraries(QTHREAD
+  REQUIRED_HEADERS qthread.h REQUIRED_LIBS_NAMES qthread
+)
+
diff --git a/lib/kokkos/cmake/tpls/FindTPLCUSPARSE.cmake b/lib/kokkos/cmake/tpls/FindTPLCUSPARSE.cmake
new file mode 100644
index 0000000000000000000000000000000000000000..aad1e2bad7629f3f43ca91135752253a20ac9523
--- /dev/null
+++ b/lib/kokkos/cmake/tpls/FindTPLCUSPARSE.cmake
@@ -0,0 +1,75 @@
+# @HEADER
+# ************************************************************************
+#
+#            Trilinos: An Object-Oriented Solver Framework
+#                 Copyright (2001) Sandia Corporation
+#
+#
+# Copyright (2001) Sandia Corporation. Under the terms of Contract
+# DE-AC04-94AL85000, there is a non-exclusive license for use of this
+# work by or on behalf of the U.S. Government.  Export of this program
+# may require a license from the United States Government.
+#
+# 1. Redistributions of source code must retain the above copyright
+# notice, this list of conditions and the following disclaimer.
+#
+# 2. Redistributions in binary form must reproduce the above copyright
+# notice, this list of conditions and the following disclaimer in the
+# documentation and/or other materials provided with the distribution.
+#
+# 3. Neither the name of the Corporation nor the names of the
+# contributors may be used to endorse or promote products derived from
+# this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+# LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+# NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+# SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+#
+# NOTICE:  The United States Government is granted for itself and others
+# acting on its behalf a paid-up, nonexclusive, irrevocable worldwide
+# license in this data to reproduce, prepare derivative works, and
+# perform publicly and display publicly.  Beginning five (5) years from
+# July 25, 2001, the United States Government is granted for itself and
+# others acting on its behalf a paid-up, nonexclusive, irrevocable
+# worldwide license in this data to reproduce, prepare derivative works,
+# distribute copies to the public, perform publicly and display
+# publicly, and to permit others to do so.
+#
+# NEITHER THE UNITED STATES GOVERNMENT, NOR THE UNITED STATES DEPARTMENT
+# OF ENERGY, NOR SANDIA CORPORATION, NOR ANY OF THEIR EMPLOYEES, MAKES
+# ANY WARRANTY, EXPRESS OR IMPLIED, OR ASSUMES ANY LEGAL LIABILITY OR
+# RESPONSIBILITY FOR THE ACCURACY, COMPLETENESS, OR USEFULNESS OF ANY
+# INFORMATION, APPARATUS, PRODUCT, OR PROCESS DISCLOSED, OR REPRESENTS
+# THAT ITS USE WOULD NOT INFRINGE PRIVATELY OWNED RIGHTS.
+#
+# ************************************************************************
+# @HEADER
+
+# CUSPARSE needs CUDA: require TPL_ENABLE_CUDA and a CUDA version of at least
+# 4.1, then publish the TPL_CUSPARSE_* variables through GLOBAL_SET.
+IF (NOT TPL_ENABLE_CUDA OR CUDA_VERSION VERSION_LESS "4.1")
+  MESSAGE(FATAL_ERROR "\nCUSPARSE: did not find acceptable version of CUDA libraries (4.1 or greater)")
+ELSE()
+  IF(CMAKE_VERSION VERSION_LESS "2.8.8")
+    # FindCUDA before CMake 2.8.8 does not locate cusparse, so search for it here.
+    find_library(CUDA_cusparse_LIBRARY
+      cusparse
+      HINTS ${CUDA_TOOLKIT_ROOT_DIR}/lib
+      )
+    IF(NOT CUDA_cusparse_LIBRARY)  # true for <VAR>-NOTFOUND as well as for an empty value
+      MESSAGE(FATAL_ERROR "\nCUSPARSE: could not find cusparse library.")
+    ENDIF()
+  ENDIF()
+  GLOBAL_SET(TPL_CUSPARSE_LIBRARY_DIRS)
+  GLOBAL_SET(TPL_CUSPARSE_INCLUDE_DIRS ${TPL_CUDA_INCLUDE_DIRS})
+  GLOBAL_SET(TPL_CUSPARSE_LIBRARIES    ${CUDA_cusparse_LIBRARY})
+ENDIF()
+
diff --git a/lib/kokkos/cmake/tpls/FindTPLHWLOC.cmake b/lib/kokkos/cmake/tpls/FindTPLHWLOC.cmake
new file mode 100644
index 0000000000000000000000000000000000000000..715b3e9bde59379c632fbec7926b425e6189e74d
--- /dev/null
+++ b/lib/kokkos/cmake/tpls/FindTPLHWLOC.cmake
@@ -0,0 +1,71 @@
+# @HEADER
+# ************************************************************************
+#
+#            Trilinos: An Object-Oriented Solver Framework
+#                 Copyright (2001) Sandia Corporation
+#
+#
+# Copyright (2001) Sandia Corporation. Under the terms of Contract
+# DE-AC04-94AL85000, there is a non-exclusive license for use of this
+# work by or on behalf of the U.S. Government.  Export of this program
+# may require a license from the United States Government.
+#
+# 1. Redistributions of source code must retain the above copyright
+# notice, this list of conditions and the following disclaimer.
+#
+# 2. Redistributions in binary form must reproduce the above copyright
+# notice, this list of conditions and the following disclaimer in the
+# documentation and/or other materials provided with the distribution.
+#
+# 3. Neither the name of the Corporation nor the names of the
+# contributors may be used to endorse or promote products derived from
+# this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+# LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+# NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+# SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+#
+# NOTICE:  The United States Government is granted for itself and others
+# acting on its behalf a paid-up, nonexclusive, irrevocable worldwide
+# license in this data to reproduce, prepare derivative works, and
+# perform publicly and display publicly.  Beginning five (5) years from
+# July 25, 2001, the United States Government is granted for itself and
+# others acting on its behalf a paid-up, nonexclusive, irrevocable
+# worldwide license in this data to reproduce, prepare derivative works,
+# distribute copies to the public, perform publicly and display
+# publicly, and to permit others to do so.
+#
+# NEITHER THE UNITED STATES GOVERNMENT, NOR THE UNITED STATES DEPARTMENT
+# OF ENERGY, NOR SANDIA CORPORATION, NOR ANY OF THEIR EMPLOYEES, MAKES
+# ANY WARRANTY, EXPRESS OR IMPLIED, OR ASSUMES ANY LEGAL LIABILITY OR
+# RESPONSIBILITY FOR THE ACCURACY, COMPLETENESS, OR USEFULNESS OF ANY
+# INFORMATION, APPARATUS, PRODUCT, OR PROCESS DISCLOSED, OR REPRESENTS
+# THAT ITS USE WOULD NOT INFRINGE PRIVATELY OWNED RIGHTS.
+#
+# ************************************************************************
+# @HEADER
+
+
+#-----------------------------------------------------------------------------
+#  Hardware locality detection and control library.
+#
+#  Acquisition information:
+#    Date checked:  November 2011
+#    Checked by:    H. Carter Edwards <hcedwar AT sandia.gov>
+#    Source:        http://www.open-mpi.org/projects/hwloc/
+#    Version:       1.3
+#
+
+# Locate the HWLOC TPL through its header (hwloc.h) and library name (hwloc).
+tribits_tpl_find_include_dirs_and_libraries(HWLOC
+  REQUIRED_HEADERS hwloc.h REQUIRED_LIBS_NAMES hwloc
+)
+
diff --git a/lib/kokkos/cmake/tpls/FindTPLPthread.cmake b/lib/kokkos/cmake/tpls/FindTPLPthread.cmake
new file mode 100644
index 0000000000000000000000000000000000000000..fc401d7543357f18d225a33efe0cf3bb489170d7
--- /dev/null
+++ b/lib/kokkos/cmake/tpls/FindTPLPthread.cmake
@@ -0,0 +1,82 @@
+# @HEADER
+# ************************************************************************
+#
+#            Trilinos: An Object-Oriented Solver Framework
+#                 Copyright (2001) Sandia Corporation
+#
+#
+# Copyright (2001) Sandia Corporation. Under the terms of Contract
+# DE-AC04-94AL85000, there is a non-exclusive license for use of this
+# work by or on behalf of the U.S. Government.  Export of this program
+# may require a license from the United States Government.
+#
+# 1. Redistributions of source code must retain the above copyright
+# notice, this list of conditions and the following disclaimer.
+#
+# 2. Redistributions in binary form must reproduce the above copyright
+# notice, this list of conditions and the following disclaimer in the
+# documentation and/or other materials provided with the distribution.
+#
+# 3. Neither the name of the Corporation nor the names of the
+# contributors may be used to endorse or promote products derived from
+# this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+# LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+# NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+# SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+#
+# NOTICE:  The United States Government is granted for itself and others
+# acting on its behalf a paid-up, nonexclusive, irrevocable worldwide
+# license in this data to reproduce, prepare derivative works, and
+# perform publicly and display publicly.  Beginning five (5) years from
+# July 25, 2001, the United States Government is granted for itself and
+# others acting on its behalf a paid-up, nonexclusive, irrevocable
+# worldwide license in this data to reproduce, prepare derivative works,
+# distribute copies to the public, perform publicly and display
+# publicly, and to permit others to do so.
+#
+# NEITHER THE UNITED STATES GOVERNMENT, NOR THE UNITED STATES DEPARTMENT
+# OF ENERGY, NOR SANDIA CORPORATION, NOR ANY OF THEIR EMPLOYEES, MAKES
+# ANY WARRANTY, EXPRESS OR IMPLIED, OR ASSUMES ANY LEGAL LIABILITY OR
+# RESPONSIBILITY FOR THE ACCURACY, COMPLETENESS, OR USEFULNESS OF ANY
+# INFORMATION, APPARATUS, PRODUCT, OR PROCESS DISCLOSED, OR REPRESENTS
+# THAT ITS USE WOULD NOT INFRINGE PRIVATELY OWNED RIGHTS.
+#
+# ************************************************************************
+# @HEADER
+
+
+# Decide whether the result of CMake's own Threads finder can be used directly.
+set(USE_THREADS FALSE)
+
+# Auto-detect only when no TPL_Pthread_* location variable has been provided.
+if(NOT (TPL_Pthread_INCLUDE_DIRS OR TPL_Pthread_LIBRARY_DIRS OR TPL_Pthread_LIBRARIES))
+  # CMake's Threads finder is a bit smarter at determining whether pthreads is
+  # already built into the compiler and therefore needs no library to link.
+  find_package(Threads)
+  # Take this path only for pthreads results that carry no library file
+  # (empty link string or the bare "-pthread" flag), i.e. the cases the
+  # tribits tpl system cannot handle.
+  if(Threads_FOUND AND CMAKE_USE_PTHREADS_INIT
+     AND (CMAKE_THREAD_LIBS_INIT STREQUAL "" OR CMAKE_THREAD_LIBS_INIT STREQUAL "-pthread"))
+    set(USE_THREADS TRUE)
+  endif()
+endif()
+
+if(NOT USE_THREADS)
+  # Fall back to a conventional header/library search.
+  tribits_tpl_find_include_dirs_and_libraries(Pthread
+    REQUIRED_HEADERS pthread.h REQUIRED_LIBS_NAMES pthread)
+else()
+  set(TPL_Pthread_INCLUDE_DIRS "")
+  set(TPL_Pthread_LIBRARIES "${CMAKE_THREAD_LIBS_INIT}")
+  set(TPL_Pthread_LIBRARY_DIRS "")
+endif()
diff --git a/lib/kokkos/cmake/tpls/FindTPLQTHREAD.cmake b/lib/kokkos/cmake/tpls/FindTPLQTHREAD.cmake
new file mode 100644
index 0000000000000000000000000000000000000000..994b72b20096f4462beab51d19e4410cd73bf05b
--- /dev/null
+++ b/lib/kokkos/cmake/tpls/FindTPLQTHREAD.cmake
@@ -0,0 +1,70 @@
+# @HEADER
+# ************************************************************************
+#
+#            Trilinos: An Object-Oriented Solver Framework
+#                 Copyright (2001) Sandia Corporation
+#
+#
+# Copyright (2001) Sandia Corporation. Under the terms of Contract
+# DE-AC04-94AL85000, there is a non-exclusive license for use of this
+# work by or on behalf of the U.S. Government.  Export of this program
+# may require a license from the United States Government.
+#
+# 1. Redistributions of source code must retain the above copyright
+# notice, this list of conditions and the following disclaimer.
+#
+# 2. Redistributions in binary form must reproduce the above copyright
+# notice, this list of conditions and the following disclaimer in the
+# documentation and/or other materials provided with the distribution.
+#
+# 3. Neither the name of the Corporation nor the names of the
+# contributors may be used to endorse or promote products derived from
+# this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+# LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+# NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+# SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+#
+# NOTICE:  The United States Government is granted for itself and others
+# acting on its behalf a paid-up, nonexclusive, irrevocable worldwide
+# license in this data to reproduce, prepare derivative works, and
+# perform publicly and display publicly.  Beginning five (5) years from
+# July 25, 2001, the United States Government is granted for itself and
+# others acting on its behalf a paid-up, nonexclusive, irrevocable
+# worldwide license in this data to reproduce, prepare derivative works,
+# distribute copies to the public, perform publicly and display
+# publicly, and to permit others to do so.
+#
+# NEITHER THE UNITED STATES GOVERNMENT, NOR THE UNITED STATES DEPARTMENT
+# OF ENERGY, NOR SANDIA CORPORATION, NOR ANY OF THEIR EMPLOYEES, MAKES
+# ANY WARRANTY, EXPRESS OR IMPLIED, OR ASSUMES ANY LEGAL LIABILITY OR
+# RESPONSIBILITY FOR THE ACCURACY, COMPLETENESS, OR USEFULNESS OF ANY
+# INFORMATION, APPARATUS, PRODUCT, OR PROCESS DISCLOSED, OR REPRESENTS
+# THAT ITS USE WOULD NOT INFRINGE PRIVATELY OWNED RIGHTS.
+#
+# ************************************************************************
+# @HEADER
+
+
+#-----------------------------------------------------------------------------
+#  Qthreads lightweight threading library.
+#
+#  Acquisition information:
+#    Date checked:  July 2014
+#    Checked by:    H. Carter Edwards <hcedwar AT sandia.gov>
+#    Source:        https://code.google.com/p/qthreads
+#
+
+# Locate the QTHREAD TPL through its header (qthread.h) and library name (qthread).
+tribits_tpl_find_include_dirs_and_libraries(QTHREAD
+  REQUIRED_HEADERS qthread.h REQUIRED_LIBS_NAMES qthread
+)
+
diff --git a/lib/kokkos/cmake/tribits.cmake b/lib/kokkos/cmake/tribits.cmake
new file mode 100644
index 0000000000000000000000000000000000000000..34cd216f810c9a829dbcdc13ed5e9c3be81752ac
--- /dev/null
+++ b/lib/kokkos/cmake/tribits.cmake
@@ -0,0 +1,485 @@
+INCLUDE(CMakeParseArguments)
+INCLUDE(CTest)
+
+function(ASSERT_DEFINED VARS)  # VARS: variable name(s) that must be defined
+  foreach(VAR ${VARS})  # an undefined name is reported and the scan continues
+    if(NOT DEFINED ${VAR})
+      message(SEND_ERROR "Error, the variable ${VAR} is not defined!")
+    endif()
+  endforeach()
+endfunction()
+
+macro(GLOBAL_SET VARNAME)  # store the remaining arguments in VARNAME as an INTERNAL cache entry
+  set(${VARNAME} ${ARGN} CACHE INTERNAL "")
+endmacro()
+
+macro(PREPEND_GLOBAL_SET VARNAME)  # put the extra arguments in front of VARNAME's current value
+  assert_defined(${VARNAME})  # VARNAME must already be defined
+  global_set(${VARNAME} ${ARGN} ${${VARNAME}})
+endmacro()
+
+function(REMOVE_GLOBAL_DUPLICATES VARNAME)  # de-duplicate the list held in VARNAME
+  assert_defined(${VARNAME})
+  if(${VARNAME})  # nothing to do when the value is empty or false
+    set(unique_items ${${VARNAME}})
+    list(REMOVE_DUPLICATES unique_items)
+    global_set(${VARNAME} ${unique_items})
+  endif()
+endfunction()
+
+macro(TRIBITS_ADD_OPTION_AND_DEFINE  USER_OPTION_NAME  MACRO_DEFINE_NAME DOCSTRING  DEFAULT_VALUE)
+  message(STATUS "TRIBITS_ADD_OPTION_AND_DEFINE: '${USER_OPTION_NAME}' '${MACRO_DEFINE_NAME}' '${DEFAULT_VALUE}'")
+  set(${USER_OPTION_NAME} "${DEFAULT_VALUE}" CACHE BOOL "${DOCSTRING}")  # declare the user-facing option
+  if(NOT ${MACRO_DEFINE_NAME} STREQUAL "")  # mirror the option's state as ON/OFF
+    if(NOT ${USER_OPTION_NAME})
+      global_set(${MACRO_DEFINE_NAME} OFF)
+    else()
+      global_set(${MACRO_DEFINE_NAME} ON)
+    endif()
+  endif()
+endmacro()
+
+function(TRIBITS_CONFIGURE_FILE  PACKAGE_NAME_CONFIG_FILE)
+  # Instantiate the template cmake/<name>.in from the package source tree
+  # into the current binary directory under the name <name>.
+
+  configure_file(
+    ${PACKAGE_SOURCE_DIR}/cmake/${PACKAGE_NAME_CONFIG_FILE}.in
+    ${CMAKE_CURRENT_BINARY_DIR}/${PACKAGE_NAME_CONFIG_FILE})
+
+endfunction()
+
+macro(TRIBITS_ADD_DEBUG_OPTION)
+  # Project-wide debug switch: option <PROJECT_NAME>_ENABLE_DEBUG (default OFF)
+  # mirrored into HAVE_<PROJECT_NAME_UC>_DEBUG.
+  tribits_add_option_and_define(
+    ${PROJECT_NAME}_ENABLE_DEBUG  HAVE_${PROJECT_NAME_UC}_DEBUG
+    "Enable a host of runtime debug checking."  OFF
+    )
+endmacro()
+
+
+macro(TRIBITS_ADD_TEST_DIRECTORIES)  # add every listed directory to the build
+  foreach(TEST_DIR ${ARGN})
+    add_subdirectory(${TEST_DIR})
+  endforeach()
+endmacro()
+
+macro(TRIBITS_ADD_EXAMPLE_DIRECTORIES)
+  # Examples are added only when enabled for this package or its parent package.
+  if(${PACKAGE_NAME}_ENABLE_EXAMPLES OR ${PARENT_PACKAGE_NAME}_ENABLE_EXAMPLES)
+    foreach(EXAMPLE_DIR ${ARGN})
+      add_subdirectory(${EXAMPLE_DIR})
+    endforeach()
+  endif()
+
+endmacro()
+
+macro(TARGET_TRANSFER_PROPERTY TARGET_NAME PROP_IN PROP_OUT)
+  set(PROP_VALUES)  # collects one $<TARGET_PROPERTY:...> expression per source target
+  foreach(TARGET_X ${ARGN})
+    list(APPEND PROP_VALUES "$<TARGET_PROPERTY:${TARGET_X},${PROP_IN}>")
+  endforeach()
+  set_target_properties(${TARGET_NAME} PROPERTIES ${PROP_OUT} "${PROP_VALUES}")
+endmacro()
+
+macro(ADD_INTERFACE_LIBRARY LIB_NAME)  # STATIC library built from an empty dummy.cpp
+  file(WRITE ${CMAKE_CURRENT_BINARY_DIR}/dummy.cpp "")
+  add_library(${LIB_NAME} STATIC ${CMAKE_CURRENT_BINARY_DIR}/dummy.cpp)
+  set_target_properties(${LIB_NAME} PROPERTIES INTERFACE TRUE)
+endmacro()
+
+# Older versions of CMake do not make include directories transitive.
+macro(TARGET_LINK_AND_INCLUDE_LIBRARIES TARGET_NAME)
+  target_link_libraries(${TARGET_NAME} LINK_PUBLIC ${ARGN})
+  foreach(DEP_LIB ${ARGN})
+    target_include_directories(${TARGET_NAME} PUBLIC
+      $<TARGET_PROPERTY:${DEP_LIB},INTERFACE_INCLUDE_DIRECTORIES> $<TARGET_PROPERTY:${DEP_LIB},INCLUDE_DIRECTORIES>)
+  endforeach()
+endmacro()
+
+FUNCTION(TRIBITS_ADD_LIBRARY LIBRARY_NAME)
+  # Build library LIBRARY_NAME from HEADERS, NOINSTALLHEADERS and SOURCES, link
+  # it against ${PACKAGE_NAME}_DEPS plus IMPORTEDLIBS/DEPLIBS, and install it
+  # unless TESTONLY or NO_INSTALL_LIB_OR_HEADERS is given.
+  SET(options STATIC SHARED TESTONLY NO_INSTALL_LIB_OR_HEADERS CUDALIBRARY)
+  SET(oneValueArgs)
+  SET(multiValueArgs HEADERS HEADERS_INSTALL_SUBDIR NOINSTALLHEADERS SOURCES DEPLIBS IMPORTEDLIBS DEFINES ADDED_LIB_TARGET_NAME_OUT)
+
+  CMAKE_PARSE_ARGUMENTS(PARSE "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
+
+  IF(PARSE_HEADERS)
+    LIST(REMOVE_DUPLICATES PARSE_HEADERS)
+  ENDIF()
+  IF(PARSE_SOURCES)
+    LIST(REMOVE_DUPLICATES PARSE_SOURCES)
+  ENDIF()
+
+  # Local variable to hold all of the libraries that will be directly linked
+  # to this library.
+  SET(LINK_LIBS ${${PACKAGE_NAME}_DEPS})
+
+  # Add dependent libraries passed directly in
+
+  IF (PARSE_IMPORTEDLIBS)
+    LIST(APPEND LINK_LIBS ${PARSE_IMPORTEDLIBS})
+  ENDIF()
+
+  IF (PARSE_DEPLIBS)
+    LIST(APPEND LINK_LIBS ${PARSE_DEPLIBS})
+  ENDIF()
+
+  # Add the library and all the dependencies
+
+  IF (PARSE_DEFINES)
+    ADD_DEFINITIONS(${PARSE_DEFINES})
+  ENDIF()
+
+  IF (PARSE_STATIC)
+    SET(STATIC_KEYWORD "STATIC")
+  ELSE()
+    SET(STATIC_KEYWORD)
+  ENDIF()
+
+  IF (PARSE_SHARED)
+    SET(SHARED_KEYWORD "SHARED")
+  ELSE()
+    SET(SHARED_KEYWORD)
+  ENDIF()
+
+  IF (PARSE_TESTONLY)
+    SET(EXCLUDE_FROM_ALL_KEYWORD "EXCLUDE_FROM_ALL")
+  ELSE()
+    SET(EXCLUDE_FROM_ALL_KEYWORD)
+  ENDIF()
+  IF (NOT PARSE_CUDALIBRARY)
+    ADD_LIBRARY(
+      ${LIBRARY_NAME}
+      ${STATIC_KEYWORD} ${SHARED_KEYWORD} ${EXCLUDE_FROM_ALL_KEYWORD}
+      ${PARSE_HEADERS}
+      ${PARSE_NOINSTALLHEADERS}
+      ${PARSE_SOURCES}
+      )
+  ELSE()
+    CUDA_ADD_LIBRARY(
+      ${LIBRARY_NAME}
+      ${PARSE_HEADERS}
+      ${PARSE_NOINSTALLHEADERS}
+      ${PARSE_SOURCES}
+      )
+  ENDIF()
+
+  TARGET_LINK_AND_INCLUDE_LIBRARIES(${LIBRARY_NAME} ${LINK_LIBS})
+  # Both TESTONLY and NO_INSTALL_LIB_OR_HEADERS suppress installation.
+  IF (NOT PARSE_TESTONLY AND NOT PARSE_NO_INSTALL_LIB_OR_HEADERS)
+
+    INSTALL(
+      TARGETS ${LIBRARY_NAME}
+      EXPORT ${PROJECT_NAME}
+      RUNTIME DESTINATION bin
+      LIBRARY DESTINATION lib
+      ARCHIVE DESTINATION lib
+      COMPONENT ${PACKAGE_NAME}
+      )
+
+    INSTALL(
+      FILES  ${PARSE_HEADERS}
+      EXPORT ${PROJECT_NAME}
+      DESTINATION include
+      COMPONENT ${PACKAGE_NAME}
+      )
+
+    INSTALL(
+      DIRECTORY  ${PARSE_HEADERS_INSTALL_SUBDIR}
+      EXPORT ${PROJECT_NAME}
+      DESTINATION include
+      COMPONENT ${PACKAGE_NAME}
+      )
+
+  ENDIF()
+
+  IF (NOT PARSE_TESTONLY)
+    PREPEND_GLOBAL_SET(${PACKAGE_NAME}_LIBS ${LIBRARY_NAME})
+    REMOVE_GLOBAL_DUPLICATES(${PACKAGE_NAME}_LIBS)
+  ENDIF()
+
+ENDFUNCTION()
+
+FUNCTION(TRIBITS_ADD_EXECUTABLE EXE_NAME)
+  # Add executable EXE_NAME (prefixed with "${PACKAGE_NAME}_" unless NOEXEPREFIX
+  # is given) from SOURCES, link it against PACKAGE_${PACKAGE_NAME} plus
+  # TESTONLYLIBS/IMPORTEDLIBS, and install it into bin when INSTALLABLE is set.
+  SET(options NOEXEPREFIX NOEXESUFFIX ADD_DIR_TO_NAME INSTALLABLE TESTONLY)
+  SET(oneValueArgs ADDED_EXE_TARGET_NAME_OUT)
+  SET(multiValueArgs SOURCES CATEGORIES HOST XHOST HOSTTYPE XHOSTTYPE DIRECTORY TESTONLYLIBS IMPORTEDLIBS DEPLIBS COMM LINKER_LANGUAGE TARGET_DEFINES DEFINES)
+
+  CMAKE_PARSE_ARGUMENTS(PARSE "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
+
+  SET(LINK_LIBS PACKAGE_${PACKAGE_NAME})
+
+  IF (PARSE_TESTONLYLIBS)
+    LIST(APPEND LINK_LIBS ${PARSE_TESTONLYLIBS})
+  ENDIF()
+
+  IF (PARSE_IMPORTEDLIBS)
+    LIST(APPEND LINK_LIBS ${PARSE_IMPORTEDLIBS})
+  ENDIF()
+
+  # Source paths that are not absolute are taken relative to DIRECTORY, if given.
+  SET (EXE_SOURCES)
+  IF(PARSE_DIRECTORY)
+    FOREACH( SOURCE_FILE ${PARSE_SOURCES} )
+      IF(IS_ABSOLUTE ${SOURCE_FILE})
+        SET (EXE_SOURCES ${EXE_SOURCES} ${SOURCE_FILE})
+      ELSE()
+        SET (EXE_SOURCES ${EXE_SOURCES} ${PARSE_DIRECTORY}/${SOURCE_FILE})
+      ENDIF()
+    ENDFOREACH()
+  ELSE()
+    SET(EXE_SOURCES ${PARSE_SOURCES})
+  ENDIF()
+
+  SET(EXE_BINARY_NAME ${EXE_NAME})
+  IF(DEFINED PACKAGE_NAME AND NOT PARSE_NOEXEPREFIX)
+    SET(EXE_BINARY_NAME ${PACKAGE_NAME}_${EXE_BINARY_NAME})
+  ENDIF()
+
+  IF (PARSE_TESTONLY)
+    SET(EXCLUDE_FROM_ALL_KEYWORD "EXCLUDE_FROM_ALL")
+  ELSE()
+    SET(EXCLUDE_FROM_ALL_KEYWORD)
+  ENDIF()
+  ADD_EXECUTABLE(${EXE_BINARY_NAME} ${EXCLUDE_FROM_ALL_KEYWORD} ${EXE_SOURCES})
+
+  # Compile definitions can only be attached once the target exists, and the
+  # target is named EXE_BINARY_NAME (EXE_NAME plus the optional package prefix).
+  IF (PARSE_TARGET_DEFINES)
+    TARGET_COMPILE_DEFINITIONS(${EXE_BINARY_NAME} PUBLIC ${PARSE_TARGET_DEFINES})
+  ENDIF()
+
+  TARGET_LINK_AND_INCLUDE_LIBRARIES(${EXE_BINARY_NAME} ${LINK_LIBS})
+
+  IF(PARSE_ADDED_EXE_TARGET_NAME_OUT)
+    SET(${PARSE_ADDED_EXE_TARGET_NAME_OUT} ${EXE_BINARY_NAME} PARENT_SCOPE)
+  ENDIF()
+
+  IF(PARSE_INSTALLABLE)
+    INSTALL(TARGETS ${EXE_BINARY_NAME} EXPORT ${PROJECT_NAME}
+      DESTINATION bin)
+  ENDIF()
+ENDFUNCTION()
+
+add_custom_target(check COMMAND ${CMAKE_CTEST_COMMAND} -VV -C ${CMAKE_CFG_INTDIR})  # "check" target: runs ctest with -VV
+
+function(TRIBITS_ADD_EXECUTABLE_AND_TEST EXE_NAME)
+  # Build EXE_NAME as a TESTONLY executable, register it as a CTest test, and
+  # make the "check" target depend on it.  Arguments not consumed here are
+  # forwarded to TRIBITS_ADD_EXECUTABLE.
+
+  set(options STANDARD_PASS_OUTPUT WILL_FAIL)
+  set(oneValueArgs PASS_REGULAR_EXPRESSION FAIL_REGULAR_EXPRESSION ENVIRONMENT TIMEOUT CATEGORIES ADDED_TESTS_NAMES_OUT ADDED_EXE_TARGET_NAME_OUT)
+  set(multiValueArgs)
+
+  cmake_parse_arguments(PARSE "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
+
+  # TEST_NAME receives the name of the executable target that was created.
+  tribits_add_executable(${EXE_NAME} TESTONLY ADDED_EXE_TARGET_NAME_OUT TEST_NAME ${PARSE_UNPARSED_ARGUMENTS})
+
+  # On WIN32 the test is run from LIBRARY_OUTPUT_PATH and the executable suffix
+  # is appended to the command name.
+  if(NOT WIN32)
+    add_test(NAME ${TEST_NAME} COMMAND ${TEST_NAME})
+  else()
+    add_test(NAME ${TEST_NAME} WORKING_DIRECTORY ${LIBRARY_OUTPUT_PATH} COMMAND ${TEST_NAME}${CMAKE_EXECUTABLE_SUFFIX})
+  endif()
+  add_dependencies(check ${TEST_NAME})
+
+  # Forward each test property that was supplied by the caller.
+  foreach(test_prop FAIL_REGULAR_EXPRESSION PASS_REGULAR_EXPRESSION WILL_FAIL)
+    if(PARSE_${test_prop})
+      set_tests_properties(${TEST_NAME} PROPERTIES ${test_prop} ${PARSE_${test_prop}})
+    endif()
+  endforeach()
+
+  # Report the test/target name through whichever output variables were named.
+  foreach(out_arg ADDED_TESTS_NAMES_OUT ADDED_EXE_TARGET_NAME_OUT)
+    if(PARSE_${out_arg})
+      set(${PARSE_${out_arg}} ${TEST_NAME} PARENT_SCOPE)
+    endif()
+  endforeach()
+
+endfunction()
+# Create target TPL_LIB_<TPL_NAME> and attach TPL_<TPL_NAME>_LIBRARIES / TPL_<TPL_NAME>_INCLUDE_DIRS to it. The name is spelled "TIBITS"; the caller in this file uses the same spelling.
+MACRO(TIBITS_CREATE_IMPORTED_TPL_LIBRARY TPL_NAME)
+  ADD_INTERFACE_LIBRARY(TPL_LIB_${TPL_NAME})  # project helper, not a CMake builtin
+  TARGET_LINK_LIBRARIES(TPL_LIB_${TPL_NAME} LINK_PUBLIC ${TPL_${TPL_NAME}_LIBRARIES})
+  TARGET_INCLUDE_DIRECTORIES(TPL_LIB_${TPL_NAME} INTERFACE ${TPL_${TPL_NAME}_INCLUDE_DIRS})
+ENDMACRO()
+
+FUNCTION(TRIBITS_TPL_FIND_INCLUDE_DIRS_AND_LIBRARIES TPL_NAME)
+  # Locate a third-party library (TPL) and, when everything requested is found,
+  # create the TPL_LIB_<TPL_NAME> target for it.
+  #   REQUIRED_LIBS_NAMES -> FIND_LIBRARY into TPL_<TPL_NAME>_LIBRARIES (several names are alternatives: the first one found wins)
+  #   REQUIRED_HEADERS    -> FIND_PATH into TPL_<TPL_NAME>_INCLUDE_DIRS
+  # <TPL_NAME>_LIBRARY_DIRS / <TPL_NAME>_INCLUDE_DIRS, when set by the user, are searched before the default locations.
+  # The three option flags are parsed for compatibility but not acted upon.
+  SET(options MUST_FIND_ALL_LIBS MUST_FIND_ALL_HEADERS NO_PRINT_ENABLE_SUCCESS_FAIL)
+  SET(oneValueArgs)
+  SET(multiValueArgs REQUIRED_HEADERS REQUIRED_LIBS_NAMES)
+  CMAKE_PARSE_ARGUMENTS(PARSE "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
+  SET(_${TPL_NAME}_ENABLE_SUCCESS TRUE)
+  IF (PARSE_REQUIRED_LIBS_NAMES)
+    FIND_LIBRARY(TPL_${TPL_NAME}_LIBRARIES NAMES ${PARSE_REQUIRED_LIBS_NAMES} HINTS ${${TPL_NAME}_LIBRARY_DIRS})
+    IF(NOT TPL_${TPL_NAME}_LIBRARIES)
+      SET(_${TPL_NAME}_ENABLE_SUCCESS FALSE)
+    ENDIF()
+  ENDIF()
+  IF (PARSE_REQUIRED_HEADERS)
+    FIND_PATH(TPL_${TPL_NAME}_INCLUDE_DIRS NAMES ${PARSE_REQUIRED_HEADERS} HINTS ${${TPL_NAME}_INCLUDE_DIRS})
+    IF(NOT TPL_${TPL_NAME}_INCLUDE_DIRS)
+      SET(_${TPL_NAME}_ENABLE_SUCCESS FALSE)
+    ENDIF()
+  ENDIF()
+  IF (_${TPL_NAME}_ENABLE_SUCCESS)
+    TIBITS_CREATE_IMPORTED_TPL_LIBRARY(${TPL_NAME})
+  ENDIF()
+ENDFUNCTION()
+# Include the TPL description TPL_FILE, then report whether it produced target TPL_LIB_<name> (<name> = file name without extension). Sets TPL_NAME and TPL_ENABLE_<name> in the caller's scope.
+MACRO(TRIBITS_PROCESS_TPL_DEP_FILE TPL_FILE)
+  GET_FILENAME_COMPONENT(TPL_NAME ${TPL_FILE} NAME_WE)
+  INCLUDE("${TPL_FILE}")
+  IF(TARGET TPL_LIB_${TPL_NAME})
+    MESSAGE(STATUS "Found tpl library: ${TPL_NAME}")
+    SET(TPL_ENABLE_${TPL_NAME} TRUE)
+  ELSE()
+    MESSAGE(STATUS "Tpl library not found: ${TPL_NAME}")
+    SET(TPL_ENABLE_${TPL_NAME} FALSE)
+  ENDIF()
+ENDMACRO()
+
+MACRO(PREPEND_TARGET_SET VARNAME TARGET_NAME TYPE)
+  # Pass TARGET_NAME to PREPEND_GLOBAL_SET for VARNAME when that target exists.
+  # A missing target is a fatal error when TYPE is REQUIRED and is silently
+  # skipped for any other TYPE.
+  #
+  # TYPE is a macro parameter, i.e. a text substitution and not a variable, so
+  # it must be written as ${TYPE}; a bare TYPE inside IF() never sees the
+  # argument.  No helper variable is set, so nothing leaks into the caller.
+  IF(TARGET ${TARGET_NAME})
+    PREPEND_GLOBAL_SET(${VARNAME} ${TARGET_NAME})
+  ELSEIF("${TYPE}" STREQUAL "REQUIRED")
+    MESSAGE(FATAL_ERROR "Missing dependency ${TARGET_NAME}")
+  ENDIF()
+ENDMACRO()
+# Call PREPEND_GLOBAL_SET(DEP_LIST PACKAGE_<dep>) for every dep in ARGN; TYPE is accepted but not used.
+MACRO(TRIBITS_APPEND_PACKAGE_DEPS DEP_LIST TYPE)
+  FOREACH(DEP ${ARGN})
+    PREPEND_GLOBAL_SET(${DEP_LIST} PACKAGE_${DEP})
+  ENDFOREACH()
+ENDMACRO()
+# Call PREPEND_TARGET_SET(DEP_LIST TPL_LIB_<dep> TYPE) for every dep in ARGN.
+MACRO(TRIBITS_APPEND_TPLS_DEPS DEP_LIST TYPE)
+  FOREACH(DEP ${ARGN})
+    PREPEND_TARGET_SET(${DEP_LIST} TPL_LIB_${DEP} ${TYPE})
+  ENDFOREACH()
+ENDMACRO()
+# For every TPL name in ARGN, GLOBAL_SET <PACKAGE_NAME>_ENABLE_<name> to TRUE when the TPL's target exists and to FALSE otherwise.
+MACRO(TRIBITS_ENABLE_TPLS)
+  FOREACH(TPL ${ARGN})
+    IF(TARGET TPL_LIB_${TPL} OR TARGET ${TPL})  # TPL targets are named TPL_LIB_<name>; the bare name is still accepted
+      GLOBAL_SET(${PACKAGE_NAME}_ENABLE_${TPL} TRUE)
+    ELSE()
+      GLOBAL_SET(${PACKAGE_NAME}_ENABLE_${TPL} FALSE)
+    ENDIF()
+  ENDFOREACH()
+ENDMACRO()
+# Parse a package's dependency declaration and record it in <PACKAGE_NAME>_DEPS, <PACKAGE_NAME>_TEST_DEPS and the <PACKAGE_NAME>_ENABLE_<tpl> flags.
+MACRO(TRIBITS_PACKAGE_DEFINE_DEPENDENCIES)
+  # Being a macro, the helper variables below (options, PARSE_*, ...) are set in the caller's scope.
+  SET(options)
+  SET(oneValueArgs)
+  SET(multiValueArgs 
+    LIB_REQUIRED_PACKAGES
+    LIB_OPTIONAL_PACKAGES
+    TEST_REQUIRED_PACKAGES
+    TEST_OPTIONAL_PACKAGES
+    LIB_REQUIRED_TPLS
+    LIB_OPTIONAL_TPLS
+    TEST_REQUIRED_TPLS
+    TEST_OPTIONAL_TPLS
+    REGRESSION_EMAIL_LIST
+    SUBPACKAGES_DIRS_CLASSIFICATIONS_OPTREQS
+  )
+  CMAKE_PARSE_ARGUMENTS(PARSE "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
+  # Library dependencies: packages first, then TPLs.
+  GLOBAL_SET(${PACKAGE_NAME}_DEPS "")
+  TRIBITS_APPEND_PACKAGE_DEPS(${PACKAGE_NAME}_DEPS REQUIRED ${PARSE_LIB_REQUIRED_PACKAGES})
+  TRIBITS_APPEND_PACKAGE_DEPS(${PACKAGE_NAME}_DEPS OPTIONAL ${PARSE_LIB_OPTIONAL_PACKAGES})
+  TRIBITS_APPEND_TPLS_DEPS(${PACKAGE_NAME}_DEPS REQUIRED ${PARSE_LIB_REQUIRED_TPLS})
+  TRIBITS_APPEND_TPLS_DEPS(${PACKAGE_NAME}_DEPS OPTIONAL ${PARSE_LIB_OPTIONAL_TPLS})
+  # Test dependencies, same layout.
+  GLOBAL_SET(${PACKAGE_NAME}_TEST_DEPS "")
+  TRIBITS_APPEND_PACKAGE_DEPS(${PACKAGE_NAME}_TEST_DEPS REQUIRED ${PARSE_TEST_REQUIRED_PACKAGES})
+  TRIBITS_APPEND_PACKAGE_DEPS(${PACKAGE_NAME}_TEST_DEPS OPTIONAL ${PARSE_TEST_OPTIONAL_PACKAGES})
+  TRIBITS_APPEND_TPLS_DEPS(${PACKAGE_NAME}_TEST_DEPS REQUIRED ${PARSE_TEST_REQUIRED_TPLS})
+  TRIBITS_APPEND_TPLS_DEPS(${PACKAGE_NAME}_TEST_DEPS OPTIONAL ${PARSE_TEST_OPTIONAL_TPLS})
+  # REGRESSION_EMAIL_LIST and SUBPACKAGES_DIRS_CLASSIFICATIONS_OPTREQS are parsed but not used here.
+  TRIBITS_ENABLE_TPLS(${PARSE_LIB_REQUIRED_TPLS} ${PARSE_LIB_OPTIONAL_TPLS} ${PARSE_TEST_REQUIRED_TPLS} ${PARSE_TEST_OPTIONAL_TPLS})
+
+ENDMACRO()
+# Begin subpackage NAME: PACKAGE_NAME becomes <parent name><NAME>, target PACKAGE_<PACKAGE_NAME> is created and the subpackage's cmake/Dependencies.cmake is included.
+MACRO(TRIBITS_SUBPACKAGE NAME)
+  SET(PACKAGE_SOURCE_DIR ${CMAKE_CURRENT_SOURCE_DIR})
+  SET(PARENT_PACKAGE_NAME ${PACKAGE_NAME})
+  SET(PACKAGE_NAME ${PACKAGE_NAME}${NAME})
+  STRING(TOUPPER ${PACKAGE_NAME} PACKAGE_NAME_UC)
+  # ADD_INTERFACE_LIBRARY is a project helper, not a CMake builtin.
+  ADD_INTERFACE_LIBRARY(PACKAGE_${PACKAGE_NAME})
+  # Start with an empty library list; TRIBITS_SUBPACKAGE_POSTPROCESS later reads <PACKAGE_NAME>_LIBS.
+  GLOBAL_SET(${PACKAGE_NAME}_LIBS "")
+
+  INCLUDE(${PACKAGE_SOURCE_DIR}/cmake/Dependencies.cmake)
+
+ENDMACRO(TRIBITS_SUBPACKAGE)
+# Finish a subpackage: pass everything listed in <PACKAGE_NAME>_LIBS to TARGET_LINK_AND_INCLUDE_LIBRARIES for target PACKAGE_<PACKAGE_NAME>.
+MACRO(TRIBITS_SUBPACKAGE_POSTPROCESS)
+  TARGET_LINK_AND_INCLUDE_LIBRARIES(PACKAGE_${PACKAGE_NAME} ${${PACKAGE_NAME}_LIBS})
+ENDMACRO(TRIBITS_SUBPACKAGE_POSTPROCESS)
+# Declare project/package NAME and process every TPL description file found in <project dir>/cmake/deps.
+MACRO(TRIBITS_PACKAGE_DECL NAME)
+  # Sets PROJECT_NAME_UC, PACKAGE_NAME and PACKAGE_NAME_UC in the caller's scope.
+  PROJECT(${NAME})
+  STRING(TOUPPER ${PROJECT_NAME} PROJECT_NAME_UC)
+  SET(PACKAGE_NAME ${PROJECT_NAME})
+  STRING(TOUPPER ${PACKAGE_NAME} PACKAGE_NAME_UC)
+  # PROJECT_SOURCE_DIR rather than CMAKE_SOURCE_DIR: identical for a top-level build, and still correct when this tree is added via ADD_SUBDIRECTORY.
+  SET(TRIBITS_DEPS_DIR "${PROJECT_SOURCE_DIR}/cmake/deps")
+  FILE(GLOB TPLS_FILES "${TRIBITS_DEPS_DIR}/*.cmake")
+  FOREACH(TPL_FILE ${TPLS_FILES})
+    TRIBITS_PROCESS_TPL_DEP_FILE(${TPL_FILE})
+  ENDFOREACH()
+
+ENDMACRO()
+
+# Add every immediate sub-directory of the current source directory that contains cmake/Dependencies.cmake as a subpackage.
+MACRO(TRIBITS_PROCESS_SUBPACKAGES)
+  FILE(GLOB SUBPACKAGES RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}" "${CMAKE_CURRENT_SOURCE_DIR}/*/cmake/Dependencies.cmake")  # relative to where ADD_SUBDIRECTORY resolves paths
+  FOREACH(SUBPACKAGE ${SUBPACKAGES})
+    GET_FILENAME_COMPONENT(SUBPACKAGE_CMAKE ${SUBPACKAGE} PATH)  # PATH, because DIRECTORY needs CMake >= 2.8.12
+    GET_FILENAME_COMPONENT(SUBPACKAGE_DIR ${SUBPACKAGE_CMAKE} PATH)
+    ADD_SUBDIRECTORY(${SUBPACKAGE_DIR})
+  ENDFOREACH()
+ENDMACRO(TRIBITS_PROCESS_SUBPACKAGES)
+# The four macros below have empty bodies in this file: calling them does nothing.
+MACRO(TRIBITS_PACKAGE_DEF)
+ENDMACRO(TRIBITS_PACKAGE_DEF)
+
+MACRO(TRIBITS_EXCLUDE_AUTOTOOLS_FILES)
+ENDMACRO(TRIBITS_EXCLUDE_AUTOTOOLS_FILES)
+
+MACRO(TRIBITS_EXCLUDE_FILES)
+ENDMACRO(TRIBITS_EXCLUDE_FILES)
+
+MACRO(TRIBITS_PACKAGE_POSTPROCESS)
+ENDMACRO(TRIBITS_PACKAGE_POSTPROCESS)
+
diff --git a/lib/kokkos/config/configure_compton_cpu.sh b/lib/kokkos/config/configure_compton_cpu.sh
new file mode 100755
index 0000000000000000000000000000000000000000..17287fb8486977927e4ba29718c79a438378d0a4
--- /dev/null
+++ b/lib/kokkos/config/configure_compton_cpu.sh
@@ -0,0 +1,190 @@
+#!/bin/sh
+#
+# Copy this script, put it outside the Trilinos source directory, and
+# build there.
+#
+# Additional command-line arguments given to this script will be
+# passed directly to CMake.
+#
+
+#
+# Force CMake to re-evaluate build options.
+#
+rm -rf CMake* Trilinos* packages Dart* Testing cmake_install.cmake MakeFile*
+
+#-----------------------------------------------------------------------------
+# Incrementally construct cmake configure options:
+
+CMAKE_CONFIGURE=""
+
+#-----------------------------------------------------------------------------
+# Location of Trilinos source tree:
+
+CMAKE_PROJECT_DIR="${HOME}/Trilinos"
+
+# Location for installation:
+
+CMAKE_CONFIGURE="${CMAKE_CONFIGURE} -D CMAKE_INSTALL_PREFIX=/home/projects/kokkos/host/`date +%F`"
+
+#-----------------------------------------------------------------------------
+# General build options.
+# Use a variable so options can be propagated to CUDA compiler.
+
+CMAKE_VERBOSE_MAKEFILE=OFF
+CMAKE_BUILD_TYPE=RELEASE
+# CMAKE_BUILD_TYPE=DEBUG
+
+#-----------------------------------------------------------------------------
+# Build for CUDA architecture:
+
+CUDA_ARCH=""
+# CUDA_ARCH="20"
+# CUDA_ARCH="30"
+# CUDA_ARCH="35"
+
+# Build with Intel compiler
+
+INTEL=ON
+
+# Build for MIC architecture:
+
+# INTEL_XEON_PHI=ON
+
+# Build with HWLOC at location:
+
+HWLOC_BASE_DIR="/home/projects/libraries/host/hwloc/1.6.2"
+
+# Location for MPI to use in examples:
+
+MPI_BASE_DIR=""
+
+#-----------------------------------------------------------------------------
+# MPI configuration only used for examples:
+#
+# Must have the MPI_BASE_DIR so that the
+# include path can be passed to the Cuda compiler
+
+if [ -n "${MPI_BASE_DIR}" ] ;
+then
+  CMAKE_CONFIGURE="${CMAKE_CONFIGURE} -D TPL_ENABLE_MPI:BOOL=ON"
+  CMAKE_CONFIGURE="${CMAKE_CONFIGURE} -D MPI_BASE_DIR:PATH=${MPI_BASE_DIR}"
+else
+  CMAKE_CONFIGURE="${CMAKE_CONFIGURE} -D TPL_ENABLE_MPI:BOOL=OFF"
+fi
+
+#-----------------------------------------------------------------------------
+# Pthread configuration:
+
+CMAKE_CONFIGURE="${CMAKE_CONFIGURE} -D TPL_ENABLE_Pthread:BOOL=ON"
+# CMAKE_CONFIGURE="${CMAKE_CONFIGURE} -D TPL_ENABLE_Pthread:BOOL=OFF"
+
+#-----------------------------------------------------------------------------
+# OpenMP configuration:
+
+# CMAKE_CONFIGURE="${CMAKE_CONFIGURE} -D Trilinos_ENABLE_OpenMP:BOOL=ON"
+CMAKE_CONFIGURE="${CMAKE_CONFIGURE} -D Trilinos_ENABLE_OpenMP:BOOL=OFF"
+
+#-----------------------------------------------------------------------------
+#-----------------------------------------------------------------------------
+# Configure packages for kokkos-only:
+
+CMAKE_CONFIGURE="${CMAKE_CONFIGURE} -D Trilinos_ENABLE_ALL_PACKAGES:BOOL=OFF"
+CMAKE_CONFIGURE="${CMAKE_CONFIGURE} -D Trilinos_ENABLE_EXAMPLES:BOOL=ON"
+CMAKE_CONFIGURE="${CMAKE_CONFIGURE} -D Trilinos_ENABLE_TESTS:BOOL=ON"
+CMAKE_CONFIGURE="${CMAKE_CONFIGURE} -D Trilinos_ENABLE_KokkosCore:BOOL=ON"
+CMAKE_CONFIGURE="${CMAKE_CONFIGURE} -D Trilinos_ENABLE_KokkosContainers:BOOL=ON"
+CMAKE_CONFIGURE="${CMAKE_CONFIGURE} -D Trilinos_ENABLE_KokkosExample:BOOL=ON"
+
+#-----------------------------------------------------------------------------
+#-----------------------------------------------------------------------------
+# Hardware locality cmake configuration:
+
+if [ -n "${HWLOC_BASE_DIR}" ] ;
+then
+  CMAKE_CONFIGURE="${CMAKE_CONFIGURE} -D TPL_ENABLE_HWLOC:BOOL=ON"
+  CMAKE_CONFIGURE="${CMAKE_CONFIGURE} -D HWLOC_INCLUDE_DIRS:FILEPATH=${HWLOC_BASE_DIR}/include"
+  CMAKE_CONFIGURE="${CMAKE_CONFIGURE} -D HWLOC_LIBRARY_DIRS:FILEPATH=${HWLOC_BASE_DIR}/lib"
+fi
+
+#-----------------------------------------------------------------------------
+# Cuda cmake configuration:
+
+if [ -n "${CUDA_ARCH}" ] ;
+then
+
+  # Options to CUDA_NVCC_FLAGS must be semi-colon delimited,
+  # this is different than the standard CMAKE_CXX_FLAGS syntax.
+
+  CUDA_NVCC_FLAGS="-gencode;arch=compute_${CUDA_ARCH},code=sm_${CUDA_ARCH}"
+  CUDA_NVCC_FLAGS="${CUDA_NVCC_FLAGS};-Xcompiler;-Wall,-ansi"
+
+  if [ "${CMAKE_BUILD_TYPE}" = "DEBUG" ] ;
+  then
+    CUDA_NVCC_FLAGS="${CUDA_NVCC_FLAGS};-g"
+  else
+    CUDA_NVCC_FLAGS="${CUDA_NVCC_FLAGS};-O3"
+  fi
+
+  CMAKE_CONFIGURE="${CMAKE_CONFIGURE} -D TPL_ENABLE_CUDA:BOOL=ON"
+  CMAKE_CONFIGURE="${CMAKE_CONFIGURE} -D TPL_ENABLE_CUSPARSE:BOOL=ON"
+  CMAKE_CONFIGURE="${CMAKE_CONFIGURE} -D CUDA_VERBOSE_BUILD:BOOL=OFF"
+  CMAKE_CONFIGURE="${CMAKE_CONFIGURE} -D CUDA_NVCC_FLAGS:STRING=${CUDA_NVCC_FLAGS}"
+
+fi
+
+#-----------------------------------------------------------------------------
+
+if [ "${INTEL}" = "ON" -o "${INTEL_XEON_PHI}" = "ON" ] ;
+then
+  CMAKE_CONFIGURE="${CMAKE_CONFIGURE} -D CMAKE_C_COMPILER=icc"
+  CMAKE_CONFIGURE="${CMAKE_CONFIGURE} -D CMAKE_CXX_COMPILER=icpc"
+fi
+
+#-----------------------------------------------------------------------------
+
+# Cross-compile for Intel Xeon Phi:
+
+if [ "${INTEL_XEON_PHI}" = "ON" ] ;
+then
+  # Values carry no embedded quotes: ${CMAKE_CONFIGURE} is expanded unquoted below, so quote characters would reach CMake literally.
+  CMAKE_CONFIGURE="${CMAKE_CONFIGURE} -D CMAKE_SYSTEM_NAME=Linux"
+  CMAKE_CONFIGURE="${CMAKE_CONFIGURE} -D CMAKE_CXX_FLAGS:STRING=-mmic"
+  CMAKE_CONFIGURE="${CMAKE_CONFIGURE} -D CMAKE_C_FLAGS:STRING=-mmic"
+  CMAKE_CONFIGURE="${CMAKE_CONFIGURE} -D CMAKE_Fortran_COMPILER:FILEPATH=ifort"
+  CMAKE_CONFIGURE="${CMAKE_CONFIGURE} -D BLAS_LIBRARY_DIRS:FILEPATH=${MKLROOT}/lib/mic"
+  CMAKE_CONFIGURE="${CMAKE_CONFIGURE} -D BLAS_LIBRARY_NAMES=mkl_intel_lp64;mkl_sequential;mkl_core;pthread;m"
+
+  CMAKE_CONFIGURE="${CMAKE_CONFIGURE} -D Trilinos_ENABLE_CHECKED_STL:BOOL=OFF"
+  CMAKE_CONFIGURE="${CMAKE_CONFIGURE} -D Trilinos_WARNINGS_AS_ERRORS_FLAGS:STRING="
+  CMAKE_CONFIGURE="${CMAKE_CONFIGURE} -D BUILD_SHARED_LIBS:BOOL=OFF"
+  CMAKE_CONFIGURE="${CMAKE_CONFIGURE} -D DART_TESTING_TIMEOUT:STRING=600"
+
+  CMAKE_CONFIGURE="${CMAKE_CONFIGURE} -D LAPACK_LIBRARY_NAMES="
+  CMAKE_CONFIGURE="${CMAKE_CONFIGURE} -D TPL_LAPACK_LIBRARIES="
+  CMAKE_CONFIGURE="${CMAKE_CONFIGURE} -D TPL_ENABLE_BinUtils=OFF"
+  CMAKE_CONFIGURE="${CMAKE_CONFIGURE} -D TPL_Pthread_LIBRARIES=pthread"
+
+  # Cannot cross-compile fortran compatibility checks on the MIC:
+  CMAKE_CONFIGURE="${CMAKE_CONFIGURE} -D Trilinos_ENABLE_Fortran:BOOL=OFF"
+
+  # Tell cmake the answers to compile-and-execute tests
+  # to prevent cmake from executing a cross-compiled program.
+  CMAKE_CONFIGURE="${CMAKE_CONFIGURE} -D HAVE_GCC_ABI_DEMANGLE_EXITCODE=0"
+  CMAKE_CONFIGURE="${CMAKE_CONFIGURE} -D HAVE_TEUCHOS_BLASFLOAT_EXITCODE=0"
+  CMAKE_CONFIGURE="${CMAKE_CONFIGURE} -D LAPACK_SLAPY2_WORKS_EXITCODE=0"
+
+fi
+
+#-----------------------------------------------------------------------------
+
+CMAKE_CONFIGURE="${CMAKE_CONFIGURE} -D CMAKE_BUILD_TYPE:STRING=${CMAKE_BUILD_TYPE}"
+CMAKE_CONFIGURE="${CMAKE_CONFIGURE} -D CMAKE_VERBOSE_MAKEFILE:BOOL=${CMAKE_VERBOSE_MAKEFILE}"
+
+#-----------------------------------------------------------------------------
+
+echo "cmake ${CMAKE_CONFIGURE} $* ${CMAKE_PROJECT_DIR}"
+# Extra script arguments are forwarded to CMake, after the generated options so that they can override them.
+cmake ${CMAKE_CONFIGURE} "$@" ${CMAKE_PROJECT_DIR}
+
+#-----------------------------------------------------------------------------
+
diff --git a/lib/kokkos/config/configure_compton_mic.sh b/lib/kokkos/config/configure_compton_mic.sh
new file mode 100755
index 0000000000000000000000000000000000000000..7f9aee13f957d503d7fa5a5b1c8ecf924a80e0d8
--- /dev/null
+++ b/lib/kokkos/config/configure_compton_mic.sh
@@ -0,0 +1,186 @@
+#!/bin/sh
+#
+# Copy this script, put it outside the Trilinos source directory, and
+# build there.
+#
+# Additional command-line arguments given to this script will be
+# passed directly to CMake.
+#
+
+#
+# Force CMake to re-evaluate build options.
+#
+rm -rf CMake* Trilinos* packages Dart* Testing cmake_install.cmake MakeFile*
+
+#-----------------------------------------------------------------------------
+# Incrementally construct cmake configure options:
+
+CMAKE_CONFIGURE=""
+
+#-----------------------------------------------------------------------------
+# Location of Trilinos source tree:
+
+CMAKE_PROJECT_DIR="${HOME}/Trilinos"
+
+# Location for installation:
+
+CMAKE_CONFIGURE="${CMAKE_CONFIGURE} -D CMAKE_INSTALL_PREFIX=/home/projects/kokkos/mic/`date +%F`"
+
+#-----------------------------------------------------------------------------
+# General build options.
+# Use a variable so options can be propagated to CUDA compiler.
+
+CMAKE_VERBOSE_MAKEFILE=OFF
+CMAKE_BUILD_TYPE=RELEASE
+# CMAKE_BUILD_TYPE=DEBUG
+
+#-----------------------------------------------------------------------------
+# Build for CUDA architecture:
+
+CUDA_ARCH=""
+# CUDA_ARCH="20"
+# CUDA_ARCH="30"
+# CUDA_ARCH="35"
+
+# Build for MIC architecture:
+
+INTEL_XEON_PHI=ON
+
+# Build with HWLOC at location:
+
+HWLOC_BASE_DIR="/home/projects/libraries/mic/hwloc/1.6.2"
+
+# Location for MPI to use in examples:
+
+MPI_BASE_DIR=""
+
+#-----------------------------------------------------------------------------
+# MPI configuration only used for examples:
+#
+# Must have the MPI_BASE_DIR so that the
+# include path can be passed to the Cuda compiler
+
+if [ -n "${MPI_BASE_DIR}" ] ;
+then
+  CMAKE_CONFIGURE="${CMAKE_CONFIGURE} -D TPL_ENABLE_MPI:BOOL=ON"
+  CMAKE_CONFIGURE="${CMAKE_CONFIGURE} -D MPI_BASE_DIR:PATH=${MPI_BASE_DIR}"
+else
+  CMAKE_CONFIGURE="${CMAKE_CONFIGURE} -D TPL_ENABLE_MPI:BOOL=OFF"
+fi
+
+#-----------------------------------------------------------------------------
+# Pthread configuration:
+
+CMAKE_CONFIGURE="${CMAKE_CONFIGURE} -D TPL_ENABLE_Pthread:BOOL=ON"
+# CMAKE_CONFIGURE="${CMAKE_CONFIGURE} -D TPL_ENABLE_Pthread:BOOL=OFF"
+
+#-----------------------------------------------------------------------------
+# OpenMP configuration:
+
+# CMAKE_CONFIGURE="${CMAKE_CONFIGURE} -D Trilinos_ENABLE_OpenMP:BOOL=ON"
+CMAKE_CONFIGURE="${CMAKE_CONFIGURE} -D Trilinos_ENABLE_OpenMP:BOOL=OFF"
+
+#-----------------------------------------------------------------------------
+#-----------------------------------------------------------------------------
+# Configure packages for kokkos-only:
+
+CMAKE_CONFIGURE="${CMAKE_CONFIGURE} -D Trilinos_ENABLE_ALL_PACKAGES:BOOL=OFF"
+CMAKE_CONFIGURE="${CMAKE_CONFIGURE} -D Trilinos_ENABLE_EXAMPLES:BOOL=ON"
+CMAKE_CONFIGURE="${CMAKE_CONFIGURE} -D Trilinos_ENABLE_TESTS:BOOL=ON"
+CMAKE_CONFIGURE="${CMAKE_CONFIGURE} -D Trilinos_ENABLE_KokkosCore:BOOL=ON"
+CMAKE_CONFIGURE="${CMAKE_CONFIGURE} -D Trilinos_ENABLE_KokkosContainers:BOOL=ON"
+CMAKE_CONFIGURE="${CMAKE_CONFIGURE} -D Trilinos_ENABLE_KokkosExample:BOOL=ON"
+
+#-----------------------------------------------------------------------------
+#-----------------------------------------------------------------------------
+# Hardware locality cmake configuration:
+
+if [ -n "${HWLOC_BASE_DIR}" ] ;
+then
+  CMAKE_CONFIGURE="${CMAKE_CONFIGURE} -D TPL_ENABLE_HWLOC:BOOL=ON"
+  CMAKE_CONFIGURE="${CMAKE_CONFIGURE} -D HWLOC_INCLUDE_DIRS:FILEPATH=${HWLOC_BASE_DIR}/include"
+  CMAKE_CONFIGURE="${CMAKE_CONFIGURE} -D HWLOC_LIBRARY_DIRS:FILEPATH=${HWLOC_BASE_DIR}/lib"
+fi
+
+#-----------------------------------------------------------------------------
+# Cuda cmake configuration:
+
+if [ -n "${CUDA_ARCH}" ] ;
+then
+
+  # Options to CUDA_NVCC_FLAGS must be semi-colon delimited,
+  # this is different than the standard CMAKE_CXX_FLAGS syntax.
+
+  CUDA_NVCC_FLAGS="-gencode;arch=compute_${CUDA_ARCH},code=sm_${CUDA_ARCH}"
+  CUDA_NVCC_FLAGS="${CUDA_NVCC_FLAGS};-Xcompiler;-Wall,-ansi"
+
+  if [ "${CMAKE_BUILD_TYPE}" = "DEBUG" ] ;
+  then
+    CUDA_NVCC_FLAGS="${CUDA_NVCC_FLAGS};-g"
+  else
+    CUDA_NVCC_FLAGS="${CUDA_NVCC_FLAGS};-O3"
+  fi
+
+  CMAKE_CONFIGURE="${CMAKE_CONFIGURE} -D TPL_ENABLE_CUDA:BOOL=ON"
+  CMAKE_CONFIGURE="${CMAKE_CONFIGURE} -D TPL_ENABLE_CUSPARSE:BOOL=ON"
+  CMAKE_CONFIGURE="${CMAKE_CONFIGURE} -D CUDA_VERBOSE_BUILD:BOOL=OFF"
+  CMAKE_CONFIGURE="${CMAKE_CONFIGURE} -D CUDA_NVCC_FLAGS:STRING=${CUDA_NVCC_FLAGS}"
+
+fi
+
+#-----------------------------------------------------------------------------
+
+if [ "${INTEL}" = "ON" -o "${INTEL_XEON_PHI}" = "ON" ] ;
+then
+  CMAKE_CONFIGURE="${CMAKE_CONFIGURE} -D CMAKE_C_COMPILER=icc"
+  CMAKE_CONFIGURE="${CMAKE_CONFIGURE} -D CMAKE_CXX_COMPILER=icpc"
+fi
+
+#-----------------------------------------------------------------------------
+
+# Cross-compile for Intel Xeon Phi:
+
+if [ "${INTEL_XEON_PHI}" = "ON" ] ;
+then
+  # Values carry no embedded quotes: ${CMAKE_CONFIGURE} is expanded unquoted below, so quote characters would reach CMake literally.
+  CMAKE_CONFIGURE="${CMAKE_CONFIGURE} -D CMAKE_SYSTEM_NAME=Linux"
+  CMAKE_CONFIGURE="${CMAKE_CONFIGURE} -D CMAKE_CXX_FLAGS:STRING=-mmic"
+  CMAKE_CONFIGURE="${CMAKE_CONFIGURE} -D CMAKE_C_FLAGS:STRING=-mmic"
+  CMAKE_CONFIGURE="${CMAKE_CONFIGURE} -D CMAKE_Fortran_COMPILER:FILEPATH=ifort"
+  CMAKE_CONFIGURE="${CMAKE_CONFIGURE} -D BLAS_LIBRARY_DIRS:FILEPATH=${MKLROOT}/lib/mic"
+  CMAKE_CONFIGURE="${CMAKE_CONFIGURE} -D BLAS_LIBRARY_NAMES=mkl_intel_lp64;mkl_sequential;mkl_core;pthread;m"
+
+  CMAKE_CONFIGURE="${CMAKE_CONFIGURE} -D Trilinos_ENABLE_CHECKED_STL:BOOL=OFF"
+  CMAKE_CONFIGURE="${CMAKE_CONFIGURE} -D Trilinos_WARNINGS_AS_ERRORS_FLAGS:STRING="
+  CMAKE_CONFIGURE="${CMAKE_CONFIGURE} -D BUILD_SHARED_LIBS:BOOL=OFF"
+  CMAKE_CONFIGURE="${CMAKE_CONFIGURE} -D DART_TESTING_TIMEOUT:STRING=600"
+
+  CMAKE_CONFIGURE="${CMAKE_CONFIGURE} -D LAPACK_LIBRARY_NAMES="
+  CMAKE_CONFIGURE="${CMAKE_CONFIGURE} -D TPL_LAPACK_LIBRARIES="
+  CMAKE_CONFIGURE="${CMAKE_CONFIGURE} -D TPL_ENABLE_BinUtils=OFF"
+  CMAKE_CONFIGURE="${CMAKE_CONFIGURE} -D TPL_Pthread_LIBRARIES=pthread"
+
+  # Cannot cross-compile fortran compatibility checks on the MIC:
+  CMAKE_CONFIGURE="${CMAKE_CONFIGURE} -D Trilinos_ENABLE_Fortran:BOOL=OFF"
+
+  # Tell cmake the answers to compile-and-execute tests
+  # to prevent cmake from executing a cross-compiled program.
+  CMAKE_CONFIGURE="${CMAKE_CONFIGURE} -D HAVE_GCC_ABI_DEMANGLE_EXITCODE=0"
+  CMAKE_CONFIGURE="${CMAKE_CONFIGURE} -D HAVE_TEUCHOS_BLASFLOAT_EXITCODE=0"
+  CMAKE_CONFIGURE="${CMAKE_CONFIGURE} -D LAPACK_SLAPY2_WORKS_EXITCODE=0"
+
+fi
+
+#-----------------------------------------------------------------------------
+
+CMAKE_CONFIGURE="${CMAKE_CONFIGURE} -D CMAKE_BUILD_TYPE:STRING=${CMAKE_BUILD_TYPE}"
+CMAKE_CONFIGURE="${CMAKE_CONFIGURE} -D CMAKE_VERBOSE_MAKEFILE:BOOL=${CMAKE_VERBOSE_MAKEFILE}"
+
+#-----------------------------------------------------------------------------
+
+echo "cmake ${CMAKE_CONFIGURE} $* ${CMAKE_PROJECT_DIR}"
+# Extra script arguments are forwarded to CMake, after the generated options so that they can override them.
+cmake ${CMAKE_CONFIGURE} "$@" ${CMAKE_PROJECT_DIR}
+
+#-----------------------------------------------------------------------------
+
diff --git a/lib/kokkos/config/configure_kokkos.sh b/lib/kokkos/config/configure_kokkos.sh
new file mode 100755
index 0000000000000000000000000000000000000000..592e7f593639c617385b487183f8ea6111cbd732
--- /dev/null
+++ b/lib/kokkos/config/configure_kokkos.sh
@@ -0,0 +1,293 @@
+#!/bin/sh
+#
+# Copy this script, put it outside the Trilinos source directory, and
+# build there.
+#
+#-----------------------------------------------------------------------------
+# General build options.
+# Use a variable so options can be propagated to CUDA compiler.
+
+CMAKE_BUILD_TYPE=RELEASE
+# CMAKE_BUILD_TYPE=DEBUG
+
+# Source and installation directories:
+
+TRILINOS_SOURCE_DIR=${HOME}/Trilinos
+TRILINOS_INSTALL_DIR=${HOME}/TrilinosInstall/`date +%F`
+
+#-----------------------------------------------------------------------------
+
+USE_CUDA_ARCH=
+USE_THREAD=
+USE_OPENMP=
+USE_INTEL=
+USE_XEON_PHI=
+HWLOC_BASE_DIR=
+MPI_BASE_DIR=
+BLAS_LIB_DIR=
+LAPACK_LIB_DIR=
+
+if [ 1 ] ; then
+  # Platform 'kokkos-dev' with Cuda, OpenMP, hwloc, mpi, gnu
+  USE_CUDA_ARCH="35"
+  USE_OPENMP=ON
+  HWLOC_BASE_DIR="/home/projects/hwloc/1.7.1/host/gnu/4.4.7"
+  MPI_BASE_DIR="/home/projects/mvapich/2.0.0b/gnu/4.4.7"
+  BLAS_LIB_DIR="/home/projects/blas/host/gnu/lib"
+  LAPACK_LIB_DIR="/home/projects/lapack/host/gnu/lib"
+
+elif [ ] ; then
+  # Platform 'kokkos-dev' with Cuda, Threads, hwloc, mpi, gnu
+  USE_CUDA_ARCH="35"
+  USE_THREAD=ON
+  HWLOC_BASE_DIR="/home/projects/hwloc/1.7.1/host/gnu/4.4.7"
+  MPI_BASE_DIR="/home/projects/mvapich/2.0.0b/gnu/4.4.7"
+  BLAS_LIB_DIR="/home/projects/blas/host/gnu/lib"
+  LAPACK_LIB_DIR="/home/projects/lapack/host/gnu/lib"
+
+elif [ ] ; then
+  # Platform 'kokkos-dev' with Xeon Phi and hwloc
+  USE_OPENMP=ON
+  USE_INTEL=ON
+  USE_XEON_PHI=ON
+  HWLOC_BASE_DIR="/home/projects/hwloc/1.7.1/mic/intel/13.SP1.1.106"
+
+elif [ ] ; then
+  # Platform 'kokkos-nvidia' with Cuda, OpenMP, hwloc, mpi, gnu
+  USE_CUDA_ARCH="20"
+  USE_OPENMP=ON
+  HWLOC_BASE_DIR="/home/sems/common/hwloc/current"
+  MPI_BASE_DIR="/home/sems/common/openmpi/current"
+
+elif [ ] ; then
+  # Platform 'kokkos-nvidia' with Cuda, Threads, hwloc, mpi, gnu
+  USE_CUDA_ARCH="20"
+  USE_THREAD=ON
+  HWLOC_BASE_DIR="/home/sems/common/hwloc/current"
+  MPI_BASE_DIR="/home/sems/common/openmpi/current"
+
+fi
+
+#-----------------------------------------------------------------------------
+# Incrementally construct cmake configure command line options:
+
+CMAKE_CONFIGURE=""
+CMAKE_CXX_FLAGS=""
+
+#-----------------------------------------------------------------------------
+# Configure for Kokkos subpackages and tests:
+
+CMAKE_CONFIGURE="${CMAKE_CONFIGURE} -D Trilinos_ENABLE_ALL_PACKAGES:BOOL=OFF"
+CMAKE_CONFIGURE="${CMAKE_CONFIGURE} -D Trilinos_ENABLE_EXAMPLES:BOOL=ON"
+CMAKE_CONFIGURE="${CMAKE_CONFIGURE} -D Trilinos_ENABLE_TESTS:BOOL=ON"
+CMAKE_CONFIGURE="${CMAKE_CONFIGURE} -D Trilinos_ENABLE_KokkosCore:BOOL=ON"
+CMAKE_CONFIGURE="${CMAKE_CONFIGURE} -D Trilinos_ENABLE_KokkosContainers:BOOL=ON"
+CMAKE_CONFIGURE="${CMAKE_CONFIGURE} -D Trilinos_ENABLE_TpetraKernels:BOOL=ON"
+CMAKE_CONFIGURE="${CMAKE_CONFIGURE} -D Trilinos_ENABLE_KokkosExample:BOOL=ON"
+
+#-----------------------------------------------------------------------------
+
+if [ 1 ] ; then
+
+  # Configure for Tpetra/Kokkos:
+
+  CMAKE_CONFIGURE="${CMAKE_CONFIGURE} -D BLAS_LIBRARY_DIRS:FILEPATH=${BLAS_LIB_DIR}"
+  CMAKE_CONFIGURE="${CMAKE_CONFIGURE} -D LAPACK_LIBRARY_DIRS:FILEPATH=${LAPACK_LIB_DIR}"
+
+  CMAKE_CONFIGURE="${CMAKE_CONFIGURE} -D Trilinos_ENABLE_Tpetra:BOOL=ON"
+  CMAKE_CONFIGURE="${CMAKE_CONFIGURE} -D Trilinos_ENABLE_Kokkos:BOOL=ON"
+  CMAKE_CONFIGURE="${CMAKE_CONFIGURE} -D Trilinos_ENABLE_TpetraClassic:BOOL=ON"
+  CMAKE_CONFIGURE="${CMAKE_CONFIGURE} -D Trilinos_ENABLE_TeuchosKokkosCompat:BOOL=ON"
+  CMAKE_CONFIGURE="${CMAKE_CONFIGURE} -D Trilinos_ENABLE_TeuchosKokkosComm:BOOL=ON"
+  CMAKE_CONFIGURE="${CMAKE_CONFIGURE} -D Tpetra_ENABLE_Kokkos_Refactor:BOOL=ON"
+  CMAKE_CONFIGURE="${CMAKE_CONFIGURE} -D KokkosClassic_DefaultNode:STRING=Kokkos::Compat::KokkosOpenMPWrapperNode"
+
+  CMAKE_CXX_FLAGS="${CMAKE_CXX_FLAGS}-DKOKKOS_FAST_COMPILE"
+
+  if [ -n "${USE_CUDA_ARCH}" ] ; then
+
+    CMAKE_CONFIGURE="${CMAKE_CONFIGURE} -D Kokkos_ENABLE_Cuda:BOOL=ON"
+
+  fi
+
+fi
+
+if [ 1 ] ; then
+
+  # Configure for Stokhos:
+
+  CMAKE_CONFIGURE="${CMAKE_CONFIGURE} -D Trilinos_ENABLE_Sacado:BOOL=ON"
+  CMAKE_CONFIGURE="${CMAKE_CONFIGURE} -D Trilinos_ENABLE_Stokhos:BOOL=ON"
+  CMAKE_CONFIGURE="${CMAKE_CONFIGURE} -D Stokhos_ENABLE_Belos:BOOL=ON"
+
+fi
+
+if [ 1 ] ; then
+
+  # Configure for TrilinosCouplings:
+
+  CMAKE_CONFIGURE="${CMAKE_CONFIGURE} -D Trilinos_ENABLE_TrilinosCouplings:BOOL=ON"
+
+fi
+
+#-----------------------------------------------------------------------------
+#-----------------------------------------------------------------------------
+
+CMAKE_CONFIGURE="${CMAKE_CONFIGURE} -D CMAKE_BUILD_TYPE:STRING=${CMAKE_BUILD_TYPE}"
+
+# CMAKE_CONFIGURE="${CMAKE_CONFIGURE} -D CMAKE_VERBOSE_MAKEFILE:BOOL=ON"
+# Debug builds enable Kokkos bounds checking.  Uses '=', the POSIX string comparison; '==' is not portable under #!/bin/sh.
+if [ "${CMAKE_BUILD_TYPE}" = "DEBUG" ] ;
+then
+  CMAKE_CONFIGURE="${CMAKE_CONFIGURE} -D Kokkos_ENABLE_BOUNDS_CHECK:BOOL=ON"
+fi
+
+#-----------------------------------------------------------------------------
+# Location for installation:
+
+CMAKE_CONFIGURE="${CMAKE_CONFIGURE} -D CMAKE_INSTALL_PREFIX=${TRILINOS_INSTALL_DIR}"
+
+#-----------------------------------------------------------------------------
+# MPI configuration only used for examples:
+#
+# Must have the MPI_BASE_DIR so that the
+# include path can be passed to the Cuda compiler
+
+if [ -n "${MPI_BASE_DIR}" ] ;
+then
+  CMAKE_CONFIGURE="${CMAKE_CONFIGURE} -D TPL_ENABLE_MPI:BOOL=ON"
+  CMAKE_CONFIGURE="${CMAKE_CONFIGURE} -D MPI_BASE_DIR:PATH=${MPI_BASE_DIR}"
+else
+  CMAKE_CONFIGURE="${CMAKE_CONFIGURE} -D TPL_ENABLE_MPI:BOOL=OFF"
+fi
+
+#-----------------------------------------------------------------------------
+# Kokkos pthread configuration:
+
+if [ "${USE_THREAD}" = "ON" ] ;
+then
+  CMAKE_CONFIGURE="${CMAKE_CONFIGURE} -D TPL_ENABLE_Pthread:BOOL=ON"
+  CMAKE_CONFIGURE="${CMAKE_CONFIGURE} -D Kokkos_ENABLE_Pthread:BOOL=ON"
+else
+  CMAKE_CONFIGURE="${CMAKE_CONFIGURE} -D Kokkos_ENABLE_Pthread:BOOL=OFF"
+fi
+
+#-----------------------------------------------------------------------------
+# Kokkos OpenMP configuration:
+
+if [ "${USE_OPENMP}" = "ON" ] ;
+then
+  CMAKE_CONFIGURE="${CMAKE_CONFIGURE} -D Trilinos_ENABLE_OpenMP:BOOL=ON"
+  CMAKE_CONFIGURE="${CMAKE_CONFIGURE} -D Kokkos_ENABLE_OpenMP:BOOL=ON"
+else
+  CMAKE_CONFIGURE="${CMAKE_CONFIGURE} -D Kokkos_ENABLE_OpenMP:BOOL=OFF"
+fi
+
+#-----------------------------------------------------------------------------
+# Hardware locality configuration:
+
+if [ -n "${HWLOC_BASE_DIR}" ] ;
+then
+  CMAKE_CONFIGURE="${CMAKE_CONFIGURE} -D TPL_ENABLE_HWLOC:BOOL=ON"
+  CMAKE_CONFIGURE="${CMAKE_CONFIGURE} -D HWLOC_INCLUDE_DIRS:FILEPATH=${HWLOC_BASE_DIR}/include"
+  CMAKE_CONFIGURE="${CMAKE_CONFIGURE} -D HWLOC_LIBRARY_DIRS:FILEPATH=${HWLOC_BASE_DIR}/lib"
+fi
+
+#-----------------------------------------------------------------------------
+# Cuda cmake configuration:
+
+if [ -n "${USE_CUDA_ARCH}" ] ;
+then
+
+  # Options to CUDA_NVCC_FLAGS must be semi-colon delimited,
+  # this is different than the standard CMAKE_CXX_FLAGS syntax.
+
+  CUDA_NVCC_FLAGS="-DKOKKOS_HAVE_CUDA_ARCH=${USE_CUDA_ARCH}0;-gencode;arch=compute_${USE_CUDA_ARCH},code=sm_${USE_CUDA_ARCH}"
+
+  if [ "${USE_OPENMP}" = "ON" ] ;
+  then
+    CUDA_NVCC_FLAGS="${CUDA_NVCC_FLAGS};-Xcompiler;-Wall,-ansi,-fopenmp"
+  else
+    CUDA_NVCC_FLAGS="${CUDA_NVCC_FLAGS};-Xcompiler;-Wall,-ansi"
+  fi
+
+  if [ "${CMAKE_BUILD_TYPE}" = "DEBUG" ] ;
+  then
+    CUDA_NVCC_FLAGS="${CUDA_NVCC_FLAGS};-g"
+  else
+    CUDA_NVCC_FLAGS="${CUDA_NVCC_FLAGS};-O3"
+  fi
+
+  CMAKE_CONFIGURE="${CMAKE_CONFIGURE} -D TPL_ENABLE_CUDA:BOOL=ON"
+  CMAKE_CONFIGURE="${CMAKE_CONFIGURE} -D TPL_ENABLE_CUSPARSE:BOOL=ON"
+  CMAKE_CONFIGURE="${CMAKE_CONFIGURE} -D CUDA_VERBOSE_BUILD:BOOL=OFF"
+  CMAKE_CONFIGURE="${CMAKE_CONFIGURE} -D CUDA_NVCC_FLAGS:STRING=${CUDA_NVCC_FLAGS}"
+
+fi
+
+#-----------------------------------------------------------------------------
+
+if [ "${USE_INTEL}" = "ON" -o "${USE_XEON_PHI}" = "ON" ] ;
+then
+  CMAKE_CONFIGURE="${CMAKE_CONFIGURE} -D CMAKE_C_COMPILER=icc"
+  CMAKE_CONFIGURE="${CMAKE_CONFIGURE} -D CMAKE_CXX_COMPILER=icpc"
+fi
+
+# Cross-compile for Intel Xeon Phi:
+
+if [ "${USE_XEON_PHI}" = "ON" ] ;
+then
+  # Values carry no embedded quotes: ${CMAKE_CONFIGURE} is expanded unquoted below, so quote characters would reach CMake literally.
+  CMAKE_CXX_FLAGS="${CMAKE_CXX_FLAGS} -mmic"
+
+  CMAKE_CONFIGURE="${CMAKE_CONFIGURE} -D CMAKE_SYSTEM_NAME=Linux"
+  CMAKE_CONFIGURE="${CMAKE_CONFIGURE} -D CMAKE_C_FLAGS:STRING=-mmic"
+  CMAKE_CONFIGURE="${CMAKE_CONFIGURE} -D CMAKE_Fortran_COMPILER:FILEPATH=ifort"
+  CMAKE_CONFIGURE="${CMAKE_CONFIGURE} -D BLAS_LIBRARY_DIRS:FILEPATH=${MKLROOT}/lib/mic"
+  CMAKE_CONFIGURE="${CMAKE_CONFIGURE} -D BLAS_LIBRARY_NAMES=mkl_intel_lp64;mkl_sequential;mkl_core;pthread;m"
+
+  CMAKE_CONFIGURE="${CMAKE_CONFIGURE} -D Trilinos_ENABLE_CHECKED_STL:BOOL=OFF"
+  CMAKE_CONFIGURE="${CMAKE_CONFIGURE} -D Trilinos_WARNINGS_AS_ERRORS_FLAGS:STRING="
+  CMAKE_CONFIGURE="${CMAKE_CONFIGURE} -D BUILD_SHARED_LIBS:BOOL=OFF"
+  CMAKE_CONFIGURE="${CMAKE_CONFIGURE} -D DART_TESTING_TIMEOUT:STRING=600"
+
+  CMAKE_CONFIGURE="${CMAKE_CONFIGURE} -D LAPACK_LIBRARY_NAMES="
+  CMAKE_CONFIGURE="${CMAKE_CONFIGURE} -D TPL_LAPACK_LIBRARIES="
+  CMAKE_CONFIGURE="${CMAKE_CONFIGURE} -D TPL_ENABLE_BinUtils=OFF"
+  CMAKE_CONFIGURE="${CMAKE_CONFIGURE} -D TPL_Pthread_LIBRARIES=pthread"
+
+  # Cannot cross-compile fortran compatibility checks on the MIC:
+  CMAKE_CONFIGURE="${CMAKE_CONFIGURE} -D Trilinos_ENABLE_Fortran:BOOL=OFF"
+
+  # Tell cmake the answers to compile-and-execute tests
+  # to prevent cmake from executing a cross-compiled program.
+  CMAKE_CONFIGURE="${CMAKE_CONFIGURE} -D HAVE_GCC_ABI_DEMANGLE_EXITCODE=0"
+  CMAKE_CONFIGURE="${CMAKE_CONFIGURE} -D HAVE_TEUCHOS_BLASFLOAT_EXITCODE=0"
+  CMAKE_CONFIGURE="${CMAKE_CONFIGURE} -D LAPACK_SLAPY2_WORKS_EXITCODE=0"
+
+fi
+
+#-----------------------------------------------------------------------------
+#-----------------------------------------------------------------------------
+
+if [ -n "${CMAKE_CXX_FLAGS}" ] ; then
+
+  CMAKE_CONFIGURE="${CMAKE_CONFIGURE} -D CMAKE_CXX_FLAGS:STRING='${CMAKE_CXX_FLAGS}'"
+
+fi
+
+#-----------------------------------------------------------------------------
+#
+# Remove CMake output files to force reconfigure from scratch.
+#
+
+rm -rf CMake* Trilinos* packages Dart* Testing cmake_install.cmake MakeFile*
+
+#
+
+echo "cmake ${CMAKE_CONFIGURE} ${TRILINOS_SOURCE_DIR}"
+
+cmake ${CMAKE_CONFIGURE} ${TRILINOS_SOURCE_DIR}
+
+#-----------------------------------------------------------------------------
+
diff --git a/lib/kokkos/config/configure_kokkos_bgq.sh b/lib/kokkos/config/configure_kokkos_bgq.sh
new file mode 100755
index 0000000000000000000000000000000000000000..73236937eaa9b311d06027a23aa070a8f62c9153
--- /dev/null
+++ b/lib/kokkos/config/configure_kokkos_bgq.sh
@@ -0,0 +1,88 @@
+#!/bin/sh
+#
+# Copy this script, put it outside the Trilinos source directory, and
+# build there.
+#
+# Additional command-line arguments given to this script will be
+# passed directly to CMake.
+#
+
+# to build:
+# build on bgq-b[1-12]
+# module load sierra-devel
+# run this configure file
+# make
+
+# to run:
+# ssh bgq-login
+# cd /scratch/username/...
+# export OMP_PROC_BIND and XLSMPOPTS environment variables
+# run with srun
+
+# Note: hwloc does not work to get or set cpubindings on bgq.
+# Use the openmp backend and the openmp environment variables.
+#
+# Only the mpi wrappers seem to be setup for cross-compile,
+# so it is important that this configure enables MPI and uses mpigcc wrappers.
+
+
+
+#
+# Force CMake to re-evaluate build options.
+# The generated makefile is spelled "Makefile"; match that exact case.
+rm -rf CMake* Trilinos* packages Dart* Testing cmake_install.cmake Makefile*
+
+#-----------------------------------------------------------------------------
+# Incrementally construct cmake configure options:
+
+CMAKE_CONFIGURE=""
+
+#-----------------------------------------------------------------------------
+# Location of Trilinos source tree:
+
+CMAKE_PROJECT_DIR="../Trilinos"
+
+# Location for installation:
+
+CMAKE_CONFIGURE="${CMAKE_CONFIGURE} -D CMAKE_INSTALL_PREFIX=../TrilinosInstall/`date +%F`"
+
+#-----------------------------------------------------------------------------
+# General build options.
+# Use a variable so options can be propagated to CUDA compiler.
+
+CMAKE_CONFIGURE="${CMAKE_CONFIGURE} -D CMAKE_C_COMPILER=mpigcc-4.7.2"
+CMAKE_CONFIGURE="${CMAKE_CONFIGURE} -D CMAKE_CXX_COMPILER=mpig++-4.7.2"
+
+CMAKE_VERBOSE_MAKEFILE=OFF
+CMAKE_BUILD_TYPE=RELEASE
+# CMAKE_BUILD_TYPE=DEBUG
+
+CMAKE_CONFIGURE="${CMAKE_CONFIGURE} -D TPL_ENABLE_MPI:BOOL=ON"
+CMAKE_CONFIGURE="${CMAKE_CONFIGURE} -D Trilinos_ENABLE_OpenMP:BOOL=ON"
+
+#-----------------------------------------------------------------------------
+#-----------------------------------------------------------------------------
+# Configure packages for kokkos-only:
+
+CMAKE_CONFIGURE="${CMAKE_CONFIGURE} -D Trilinos_ENABLE_ALL_PACKAGES:BOOL=OFF"
+CMAKE_CONFIGURE="${CMAKE_CONFIGURE} -D Trilinos_ENABLE_EXAMPLES:BOOL=ON"
+CMAKE_CONFIGURE="${CMAKE_CONFIGURE} -D Trilinos_ENABLE_TESTS:BOOL=ON"
+CMAKE_CONFIGURE="${CMAKE_CONFIGURE} -D Trilinos_ENABLE_KokkosCore:BOOL=ON"
+CMAKE_CONFIGURE="${CMAKE_CONFIGURE} -D Trilinos_ENABLE_KokkosContainers:BOOL=ON"
+CMAKE_CONFIGURE="${CMAKE_CONFIGURE} -D Trilinos_ENABLE_TpetraKernels:BOOL=ON"
+CMAKE_CONFIGURE="${CMAKE_CONFIGURE} -D Trilinos_ENABLE_KokkosExample:BOOL=ON"
+CMAKE_CONFIGURE="${CMAKE_CONFIGURE} -D Kokkos_ENABLE_Pthread:BOOL=OFF"
+
+#-----------------------------------------------------------------------------
+
+CMAKE_CONFIGURE="${CMAKE_CONFIGURE} -D CMAKE_BUILD_TYPE:STRING=${CMAKE_BUILD_TYPE}"
+CMAKE_CONFIGURE="${CMAKE_CONFIGURE} -D CMAKE_VERBOSE_MAKEFILE:BOOL=${CMAKE_VERBOSE_MAKEFILE}"
+
+#-----------------------------------------------------------------------------
+
+echo "cmake ${CMAKE_CONFIGURE} $* ${CMAKE_PROJECT_DIR}"
+# Forward extra command-line arguments after the defaults built above, as the header promises.
+cmake ${CMAKE_CONFIGURE} "$@" "${CMAKE_PROJECT_DIR}"
+
+#-----------------------------------------------------------------------------
+
diff --git a/lib/kokkos/config/configure_kokkos_dev.sh b/lib/kokkos/config/configure_kokkos_dev.sh
new file mode 100755
index 0000000000000000000000000000000000000000..ac61dec602381b52d96f91a59c0eddbc2d6b5801
--- /dev/null
+++ b/lib/kokkos/config/configure_kokkos_dev.sh
@@ -0,0 +1,216 @@
+#!/bin/sh
+#
+# Copy this script, put it outside the Trilinos source directory, and
+# build there.
+#
+# Additional command-line arguments given to this script will be
+# passed directly to CMake.
+#
+
+#
+# Force CMake to re-evaluate build options.
+# The generated makefile is spelled "Makefile"; match that exact case.
+rm -rf CMake* Trilinos* packages Dart* Testing cmake_install.cmake Makefile*
+
+#-----------------------------------------------------------------------------
+# Incrementally construct cmake configure options:
+
+CMAKE_CONFIGURE=""
+
+#-----------------------------------------------------------------------------
+# Location of Trilinos source tree:
+
+CMAKE_PROJECT_DIR="${HOME}/Trilinos"
+
+# Location for installation:
+
+CMAKE_CONFIGURE="${CMAKE_CONFIGURE} -D CMAKE_INSTALL_PREFIX=${HOME}/TrilinosInstall/`date +%F`"
+
+#-----------------------------------------------------------------------------
+# General build options.
+# Use a variable so options can be propagated to CUDA compiler.
+
+CMAKE_VERBOSE_MAKEFILE=OFF
+CMAKE_BUILD_TYPE=RELEASE
+#CMAKE_BUILD_TYPE=DEBUG
+#CMAKE_CONFIGURE="${CMAKE_CONFIGURE} -D Kokkos_ENABLE_BOUNDS_CHECK:BOOL=ON"
+
+#-----------------------------------------------------------------------------
+# Build for CUDA architecture:
+
+#CUDA_ARCH=""
+#CUDA_ARCH="20"
+#CUDA_ARCH="30"
+CUDA_ARCH="35"
+
+# Build with OpenMP and/or Pthreads:
+
+OPENMP=ON
+PTHREADS=ON
+
+# Build host code with Intel compiler:
+
+INTEL=OFF
+
+# Build for MIC architecture:
+
+INTEL_XEON_PHI=OFF
+
+# Build with HWLOC at location:
+
+#HWLOC_BASE_DIR=""
+#HWLOC_BASE_DIR="/home/projects/hwloc/1.7.1/host/gnu/4.4.7"
+HWLOC_BASE_DIR="/home/projects/hwloc/1.7.1/host/gnu/4.7.3"
+
+# Location for MPI to use in examples:
+
+#MPI_BASE_DIR=""
+#MPI_BASE_DIR="/home/projects/mvapich/2.0.0b/gnu/4.4.7"
+MPI_BASE_DIR="/home/projects/mvapich/2.0.0b/gnu/4.7.3"
+#MPI_BASE_DIR="/home/projects/openmpi/1.7.3/llvm/2013-12-02/"
+
+#-----------------------------------------------------------------------------
+# MPI configuration only used for examples:
+#
+# Must have the MPI_BASE_DIR so that the
+# include path can be passed to the Cuda compiler
+
+if [ -n "${MPI_BASE_DIR}" ] ;
+then
+  CMAKE_CONFIGURE="${CMAKE_CONFIGURE} -D TPL_ENABLE_MPI:BOOL=ON"
+  CMAKE_CONFIGURE="${CMAKE_CONFIGURE} -D MPI_BASE_DIR:PATH=${MPI_BASE_DIR}"
+else
+  CMAKE_CONFIGURE="${CMAKE_CONFIGURE} -D TPL_ENABLE_MPI:BOOL=OFF"
+fi
+
+#-----------------------------------------------------------------------------
+# Pthread configuration:
+
+if [ "${PTHREADS}" = "ON" ] ;
+then
+  CMAKE_CONFIGURE="${CMAKE_CONFIGURE} -D TPL_ENABLE_Pthread:BOOL=ON"
+else
+  CMAKE_CONFIGURE="${CMAKE_CONFIGURE} -D TPL_ENABLE_Pthread:BOOL=OFF"
+fi
+
+#-----------------------------------------------------------------------------
+# OpenMP configuration:
+
+if [ "${OPENMP}" = "ON" ] ;
+then
+  CMAKE_CONFIGURE="${CMAKE_CONFIGURE} -D Trilinos_ENABLE_OpenMP:BOOL=ON"
+else
+  CMAKE_CONFIGURE="${CMAKE_CONFIGURE} -D Trilinos_ENABLE_OpenMP:BOOL=OFF"
+fi
+
+#-----------------------------------------------------------------------------
+#-----------------------------------------------------------------------------
+# Configure packages for kokkos-only:
+
+CMAKE_CONFIGURE="${CMAKE_CONFIGURE} -D Trilinos_ENABLE_ALL_PACKAGES:BOOL=OFF"
+CMAKE_CONFIGURE="${CMAKE_CONFIGURE} -D Trilinos_ENABLE_EXAMPLES:BOOL=ON"
+CMAKE_CONFIGURE="${CMAKE_CONFIGURE} -D Trilinos_ENABLE_TESTS:BOOL=ON"
+CMAKE_CONFIGURE="${CMAKE_CONFIGURE} -D Trilinos_ENABLE_KokkosCore:BOOL=ON"
+CMAKE_CONFIGURE="${CMAKE_CONFIGURE} -D Trilinos_ENABLE_KokkosContainers:BOOL=ON"
+CMAKE_CONFIGURE="${CMAKE_CONFIGURE} -D Trilinos_ENABLE_TpetraKernels:BOOL=ON"
+CMAKE_CONFIGURE="${CMAKE_CONFIGURE} -D Trilinos_ENABLE_KokkosExample:BOOL=ON"
+
+#-----------------------------------------------------------------------------
+#-----------------------------------------------------------------------------
+# Hardware locality cmake configuration:
+
+if [ -n "${HWLOC_BASE_DIR}" ] ;
+then
+  CMAKE_CONFIGURE="${CMAKE_CONFIGURE} -D TPL_ENABLE_HWLOC:BOOL=ON"
+  CMAKE_CONFIGURE="${CMAKE_CONFIGURE} -D HWLOC_INCLUDE_DIRS:FILEPATH=${HWLOC_BASE_DIR}/include"
+  CMAKE_CONFIGURE="${CMAKE_CONFIGURE} -D HWLOC_LIBRARY_DIRS:FILEPATH=${HWLOC_BASE_DIR}/lib"
+fi
+
+#-----------------------------------------------------------------------------
+# Cuda cmake configuration:
+
+if [ -n "${CUDA_ARCH}" ] ;
+then
+
+  # Options to CUDA_NVCC_FLAGS must be semi-colon delimited,
+  # this is different than the standard CMAKE_CXX_FLAGS syntax.
+
+  CUDA_NVCC_FLAGS="-gencode;arch=compute_${CUDA_ARCH},code=sm_${CUDA_ARCH}"
+
+  if [ "${OPENMP}" = "ON" ] ;
+  then
+    CUDA_NVCC_FLAGS="${CUDA_NVCC_FLAGS};-Xcompiler;-Wall,-ansi,-fopenmp"
+  else
+    CUDA_NVCC_FLAGS="${CUDA_NVCC_FLAGS};-Xcompiler;-Wall,-ansi"
+  fi
+
+  if [ "${CMAKE_BUILD_TYPE}" = "DEBUG" ] ;
+  then
+    CUDA_NVCC_FLAGS="${CUDA_NVCC_FLAGS};-g"
+  else
+    CUDA_NVCC_FLAGS="${CUDA_NVCC_FLAGS};-O3"
+  fi
+
+  CMAKE_CONFIGURE="${CMAKE_CONFIGURE} -D TPL_ENABLE_CUDA:BOOL=ON"
+  CMAKE_CONFIGURE="${CMAKE_CONFIGURE} -D TPL_ENABLE_CUSPARSE:BOOL=ON"
+  CMAKE_CONFIGURE="${CMAKE_CONFIGURE} -D CUDA_VERBOSE_BUILD:BOOL=OFF"
+  CMAKE_CONFIGURE="${CMAKE_CONFIGURE} -D CUDA_NVCC_FLAGS:STRING=${CUDA_NVCC_FLAGS}"
+
+fi
+
+#-----------------------------------------------------------------------------
+
+if [ "${INTEL}" = "ON" -o "${INTEL_XEON_PHI}" = "ON" ] ;
+then
+  CMAKE_CONFIGURE="${CMAKE_CONFIGURE} -D CMAKE_C_COMPILER=icc"
+  CMAKE_CONFIGURE="${CMAKE_CONFIGURE} -D CMAKE_CXX_COMPILER=icpc"
+fi
+
+#-----------------------------------------------------------------------------
+
+# Cross-compile for Intel Xeon Phi:
+
+if [ "${INTEL_XEON_PHI}" = "ON" ] ;
+then
+  # No quotes inside values: ${CMAKE_CONFIGURE} is expanded unquoted, so embedded quotes would reach cmake literally.
+  CMAKE_CONFIGURE="${CMAKE_CONFIGURE} -D CMAKE_SYSTEM_NAME=Linux"
+  CMAKE_CONFIGURE="${CMAKE_CONFIGURE} -D CMAKE_CXX_FLAGS:STRING=-mmic"
+  CMAKE_CONFIGURE="${CMAKE_CONFIGURE} -D CMAKE_C_FLAGS:STRING=-mmic"
+  CMAKE_CONFIGURE="${CMAKE_CONFIGURE} -D CMAKE_Fortran_COMPILER:FILEPATH=ifort"
+  CMAKE_CONFIGURE="${CMAKE_CONFIGURE} -D BLAS_LIBRARY_DIRS:FILEPATH=${MKLROOT}/lib/mic"
+  CMAKE_CONFIGURE="${CMAKE_CONFIGURE} -D BLAS_LIBRARY_NAMES=mkl_intel_lp64;mkl_sequential;mkl_core;pthread;m"
+
+  CMAKE_CONFIGURE="${CMAKE_CONFIGURE} -D Trilinos_ENABLE_CHECKED_STL:BOOL=OFF"
+  CMAKE_CONFIGURE="${CMAKE_CONFIGURE} -D Trilinos_WARNINGS_AS_ERRORS_FLAGS:STRING="
+  CMAKE_CONFIGURE="${CMAKE_CONFIGURE} -D BUILD_SHARED_LIBS:BOOL=OFF"
+  CMAKE_CONFIGURE="${CMAKE_CONFIGURE} -D DART_TESTING_TIMEOUT:STRING=600"
+
+  CMAKE_CONFIGURE="${CMAKE_CONFIGURE} -D LAPACK_LIBRARY_NAMES="
+  CMAKE_CONFIGURE="${CMAKE_CONFIGURE} -D TPL_LAPACK_LIBRARIES="
+  CMAKE_CONFIGURE="${CMAKE_CONFIGURE} -D TPL_ENABLE_BinUtils=OFF"
+  CMAKE_CONFIGURE="${CMAKE_CONFIGURE} -D TPL_Pthread_LIBRARIES=pthread"
+
+  # Cannot cross-compile fortran compatibility checks on the MIC:
+  CMAKE_CONFIGURE="${CMAKE_CONFIGURE} -D Trilinos_ENABLE_Fortran:BOOL=OFF"
+
+  # Tell cmake the answers to compile-and-execute tests
+  # to prevent cmake from executing a cross-compiled program.
+  CMAKE_CONFIGURE="${CMAKE_CONFIGURE} -D HAVE_GCC_ABI_DEMANGLE_EXITCODE=0"
+  CMAKE_CONFIGURE="${CMAKE_CONFIGURE} -D HAVE_TEUCHOS_BLASFLOAT_EXITCODE=0"
+  CMAKE_CONFIGURE="${CMAKE_CONFIGURE} -D LAPACK_SLAPY2_WORKS_EXITCODE=0"
+
+fi
+
+#-----------------------------------------------------------------------------
+
+CMAKE_CONFIGURE="${CMAKE_CONFIGURE} -D CMAKE_BUILD_TYPE:STRING=${CMAKE_BUILD_TYPE}"
+CMAKE_CONFIGURE="${CMAKE_CONFIGURE} -D CMAKE_VERBOSE_MAKEFILE:BOOL=${CMAKE_VERBOSE_MAKEFILE}"
+
+#-----------------------------------------------------------------------------
+
+echo "cmake ${CMAKE_CONFIGURE} $* ${CMAKE_PROJECT_DIR}"
+# Forward extra command-line arguments after the defaults built above, as the header promises.
+cmake ${CMAKE_CONFIGURE} "$@" "${CMAKE_PROJECT_DIR}"
+
+#-----------------------------------------------------------------------------
+
diff --git a/lib/kokkos/config/configure_kokkos_nvidia.sh b/lib/kokkos/config/configure_kokkos_nvidia.sh
new file mode 100755
index 0000000000000000000000000000000000000000..f78b7dce7845474402d61793084d59cfff579e4a
--- /dev/null
+++ b/lib/kokkos/config/configure_kokkos_nvidia.sh
@@ -0,0 +1,204 @@
+#!/bin/sh
+#
+# Copy this script, put it outside the Trilinos source directory, and
+# build there.
+#
+# Additional command-line arguments given to this script will be
+# passed directly to CMake.
+#
+
+#
+# Force CMake to re-evaluate build options.
+# The generated makefile is spelled "Makefile"; match that exact case.
+rm -rf CMake* Trilinos* packages Dart* Testing cmake_install.cmake Makefile*
+
+#-----------------------------------------------------------------------------
+# Incrementally construct cmake configure options:
+
+CMAKE_CONFIGURE=""
+
+#-----------------------------------------------------------------------------
+# Location of Trilinos source tree:
+
+CMAKE_PROJECT_DIR="${HOME}/Trilinos"
+
+# Location for installation:
+
+CMAKE_CONFIGURE="${CMAKE_CONFIGURE} -D CMAKE_INSTALL_PREFIX=/home/sems/common/kokkos/`date +%F`"
+
+#-----------------------------------------------------------------------------
+# General build options.
+# Use a variable so options can be propagated to CUDA compiler.
+
+CMAKE_VERBOSE_MAKEFILE=OFF
+CMAKE_BUILD_TYPE=RELEASE
+# CMAKE_BUILD_TYPE=DEBUG
+
+#-----------------------------------------------------------------------------
+# Build for CUDA architecture:
+
+# CUDA_ARCH=""
+CUDA_ARCH="20"
+# CUDA_ARCH="30"
+# CUDA_ARCH="35"
+
+# Build with OpenMP
+
+OPENMP=ON
+
+# Build host code with Intel compiler:
+
+# INTEL=ON
+
+# Build for MIC architecture:
+
+# INTEL_XEON_PHI=ON
+
+# Build with HWLOC at location:
+
+HWLOC_BASE_DIR="/home/sems/common/hwloc/current"
+
+# Location for MPI to use in examples:
+
+MPI_BASE_DIR="/home/sems/common/openmpi/current"
+
+#-----------------------------------------------------------------------------
+# MPI configuration only used for examples:
+#
+# Must have the MPI_BASE_DIR so that the
+# include path can be passed to the Cuda compiler
+
+if [ -n "${MPI_BASE_DIR}" ] ;
+then
+  CMAKE_CONFIGURE="${CMAKE_CONFIGURE} -D TPL_ENABLE_MPI:BOOL=ON"
+  CMAKE_CONFIGURE="${CMAKE_CONFIGURE} -D MPI_BASE_DIR:PATH=${MPI_BASE_DIR}"
+else
+  CMAKE_CONFIGURE="${CMAKE_CONFIGURE} -D TPL_ENABLE_MPI:BOOL=OFF"
+fi
+
+#-----------------------------------------------------------------------------
+# Pthread configuration:
+
+CMAKE_CONFIGURE="${CMAKE_CONFIGURE} -D TPL_ENABLE_Pthread:BOOL=ON"
+# CMAKE_CONFIGURE="${CMAKE_CONFIGURE} -D TPL_ENABLE_Pthread:BOOL=OFF"
+
+#-----------------------------------------------------------------------------
+# OpenMP configuration:
+
+if [ "${OPENMP}" = "ON" ] ;
+then
+  CMAKE_CONFIGURE="${CMAKE_CONFIGURE} -D Trilinos_ENABLE_OpenMP:BOOL=ON"
+else
+  CMAKE_CONFIGURE="${CMAKE_CONFIGURE} -D Trilinos_ENABLE_OpenMP:BOOL=OFF"
+fi
+
+#-----------------------------------------------------------------------------
+#-----------------------------------------------------------------------------
+# Configure packages for kokkos-only:
+
+CMAKE_CONFIGURE="${CMAKE_CONFIGURE} -D Trilinos_ENABLE_ALL_PACKAGES:BOOL=OFF"
+CMAKE_CONFIGURE="${CMAKE_CONFIGURE} -D Trilinos_ENABLE_EXAMPLES:BOOL=ON"
+CMAKE_CONFIGURE="${CMAKE_CONFIGURE} -D Trilinos_ENABLE_TESTS:BOOL=ON"
+CMAKE_CONFIGURE="${CMAKE_CONFIGURE} -D Trilinos_ENABLE_KokkosCore:BOOL=ON"
+CMAKE_CONFIGURE="${CMAKE_CONFIGURE} -D Trilinos_ENABLE_KokkosContainers:BOOL=ON"
+CMAKE_CONFIGURE="${CMAKE_CONFIGURE} -D Trilinos_ENABLE_KokkosExample:BOOL=ON"
+
+#-----------------------------------------------------------------------------
+#-----------------------------------------------------------------------------
+# Hardware locality cmake configuration:
+
+if [ -n "${HWLOC_BASE_DIR}" ] ;
+then
+  CMAKE_CONFIGURE="${CMAKE_CONFIGURE} -D TPL_ENABLE_HWLOC:BOOL=ON"
+  CMAKE_CONFIGURE="${CMAKE_CONFIGURE} -D HWLOC_INCLUDE_DIRS:FILEPATH=${HWLOC_BASE_DIR}/include"
+  CMAKE_CONFIGURE="${CMAKE_CONFIGURE} -D HWLOC_LIBRARY_DIRS:FILEPATH=${HWLOC_BASE_DIR}/lib"
+fi
+
+#-----------------------------------------------------------------------------
+# Cuda cmake configuration:
+
+if [ -n "${CUDA_ARCH}" ] ;
+then
+
+  # Options to CUDA_NVCC_FLAGS must be semi-colon delimited,
+  # this is different than the standard CMAKE_CXX_FLAGS syntax.
+
+  CUDA_NVCC_FLAGS="-gencode;arch=compute_${CUDA_ARCH},code=sm_${CUDA_ARCH}"
+
+  if [ "${OPENMP}" = "ON" ] ;
+  then
+    CUDA_NVCC_FLAGS="${CUDA_NVCC_FLAGS};-Xcompiler;-Wall,-ansi,-fopenmp"
+  else
+    CUDA_NVCC_FLAGS="${CUDA_NVCC_FLAGS};-Xcompiler;-Wall,-ansi"
+  fi
+
+  if [ "${CMAKE_BUILD_TYPE}" = "DEBUG" ] ;
+  then
+    CUDA_NVCC_FLAGS="${CUDA_NVCC_FLAGS};-g"
+  else
+    CUDA_NVCC_FLAGS="${CUDA_NVCC_FLAGS};-O3"
+  fi
+
+  CMAKE_CONFIGURE="${CMAKE_CONFIGURE} -D TPL_ENABLE_CUDA:BOOL=ON"
+  CMAKE_CONFIGURE="${CMAKE_CONFIGURE} -D TPL_ENABLE_CUSPARSE:BOOL=ON"
+  CMAKE_CONFIGURE="${CMAKE_CONFIGURE} -D CUDA_VERBOSE_BUILD:BOOL=OFF"
+  CMAKE_CONFIGURE="${CMAKE_CONFIGURE} -D CUDA_NVCC_FLAGS:STRING=${CUDA_NVCC_FLAGS}"
+
+fi
+
+#-----------------------------------------------------------------------------
+
+if [ "${INTEL}" = "ON" -o "${INTEL_XEON_PHI}" = "ON" ] ;
+then
+  CMAKE_CONFIGURE="${CMAKE_CONFIGURE} -D CMAKE_C_COMPILER=icc"
+  CMAKE_CONFIGURE="${CMAKE_CONFIGURE} -D CMAKE_CXX_COMPILER=icpc"
+fi
+
+#-----------------------------------------------------------------------------
+
+# Cross-compile for Intel Xeon Phi:
+
+if [ "${INTEL_XEON_PHI}" = "ON" ] ;
+then
+  # No quotes inside values: ${CMAKE_CONFIGURE} is expanded unquoted, so embedded quotes would reach cmake literally.
+  CMAKE_CONFIGURE="${CMAKE_CONFIGURE} -D CMAKE_SYSTEM_NAME=Linux"
+  CMAKE_CONFIGURE="${CMAKE_CONFIGURE} -D CMAKE_CXX_FLAGS:STRING=-mmic"
+  CMAKE_CONFIGURE="${CMAKE_CONFIGURE} -D CMAKE_C_FLAGS:STRING=-mmic"
+  CMAKE_CONFIGURE="${CMAKE_CONFIGURE} -D CMAKE_Fortran_COMPILER:FILEPATH=ifort"
+  CMAKE_CONFIGURE="${CMAKE_CONFIGURE} -D BLAS_LIBRARY_DIRS:FILEPATH=${MKLROOT}/lib/mic"
+  CMAKE_CONFIGURE="${CMAKE_CONFIGURE} -D BLAS_LIBRARY_NAMES=mkl_intel_lp64;mkl_sequential;mkl_core;pthread;m"
+
+  CMAKE_CONFIGURE="${CMAKE_CONFIGURE} -D Trilinos_ENABLE_CHECKED_STL:BOOL=OFF"
+  CMAKE_CONFIGURE="${CMAKE_CONFIGURE} -D Trilinos_WARNINGS_AS_ERRORS_FLAGS:STRING="
+  CMAKE_CONFIGURE="${CMAKE_CONFIGURE} -D BUILD_SHARED_LIBS:BOOL=OFF"
+  CMAKE_CONFIGURE="${CMAKE_CONFIGURE} -D DART_TESTING_TIMEOUT:STRING=600"
+
+  CMAKE_CONFIGURE="${CMAKE_CONFIGURE} -D LAPACK_LIBRARY_NAMES="
+  CMAKE_CONFIGURE="${CMAKE_CONFIGURE} -D TPL_LAPACK_LIBRARIES="
+  CMAKE_CONFIGURE="${CMAKE_CONFIGURE} -D TPL_ENABLE_BinUtils=OFF"
+  CMAKE_CONFIGURE="${CMAKE_CONFIGURE} -D TPL_Pthread_LIBRARIES=pthread"
+
+  # Cannot cross-compile fortran compatibility checks on the MIC:
+  CMAKE_CONFIGURE="${CMAKE_CONFIGURE} -D Trilinos_ENABLE_Fortran:BOOL=OFF"
+
+  # Tell cmake the answers to compile-and-execute tests
+  # to prevent cmake from executing a cross-compiled program.
+  CMAKE_CONFIGURE="${CMAKE_CONFIGURE} -D HAVE_GCC_ABI_DEMANGLE_EXITCODE=0"
+  CMAKE_CONFIGURE="${CMAKE_CONFIGURE} -D HAVE_TEUCHOS_BLASFLOAT_EXITCODE=0"
+  CMAKE_CONFIGURE="${CMAKE_CONFIGURE} -D LAPACK_SLAPY2_WORKS_EXITCODE=0"
+
+fi
+
+#-----------------------------------------------------------------------------
+
+CMAKE_CONFIGURE="${CMAKE_CONFIGURE} -D CMAKE_BUILD_TYPE:STRING=${CMAKE_BUILD_TYPE}"
+CMAKE_CONFIGURE="${CMAKE_CONFIGURE} -D CMAKE_VERBOSE_MAKEFILE:BOOL=${CMAKE_VERBOSE_MAKEFILE}"
+
+#-----------------------------------------------------------------------------
+
+echo "cmake ${CMAKE_CONFIGURE} $* ${CMAKE_PROJECT_DIR}"
+# Forward extra command-line arguments after the defaults built above, as the header promises.
+cmake ${CMAKE_CONFIGURE} "$@" "${CMAKE_PROJECT_DIR}"
+
+#-----------------------------------------------------------------------------
+
diff --git a/lib/kokkos/config/configure_shannon.sh b/lib/kokkos/config/configure_shannon.sh
new file mode 100755
index 0000000000000000000000000000000000000000..8bd175b0314ffc74c9c6ffb02188d599a1b5a573
--- /dev/null
+++ b/lib/kokkos/config/configure_shannon.sh
@@ -0,0 +1,190 @@
+#!/bin/sh
+#
+# Copy this script, put it outside the Trilinos source directory, and
+# build there.
+#
+# Additional command-line arguments given to this script will be
+# passed directly to CMake.
+#
+
+#
+# Force CMake to re-evaluate build options.
+# The generated makefile is spelled "Makefile"; match that exact case.
+rm -rf CMake* Trilinos* packages Dart* Testing cmake_install.cmake Makefile*
+
+#-----------------------------------------------------------------------------
+# Incrementally construct cmake configure options:
+
+CMAKE_CONFIGURE=""
+
+#-----------------------------------------------------------------------------
+# Location of Trilinos source tree:
+
+CMAKE_PROJECT_DIR="${HOME}/Trilinos"
+
+# Location for installation:
+
+CMAKE_CONFIGURE="${CMAKE_CONFIGURE} -D CMAKE_INSTALL_PREFIX=/home/projects/kokkos/`date +%F`"
+
+#-----------------------------------------------------------------------------
+# General build options.
+# Use a variable so options can be propagated to CUDA compiler.
+
+CMAKE_VERBOSE_MAKEFILE=OFF
+CMAKE_BUILD_TYPE=RELEASE
+# CMAKE_BUILD_TYPE=DEBUG
+
+#-----------------------------------------------------------------------------
+# Build for CUDA architecture:
+
+# CUDA_ARCH=""
+# CUDA_ARCH="20"
+# CUDA_ARCH="30"
+CUDA_ARCH="35"
+
+# Build host code with Intel compiler:
+
+INTEL=ON
+
+# Build for MIC architecture:
+
+# INTEL_XEON_PHI=ON
+
+# Build with HWLOC at location:
+
+HWLOC_BASE_DIR="/home/projects/hwloc/1.6.2"
+
+# Location for MPI to use in examples:
+
+MPI_BASE_DIR=""
+
+#-----------------------------------------------------------------------------
+# MPI configuration only used for examples:
+#
+# Must have the MPI_BASE_DIR so that the
+# include path can be passed to the Cuda compiler
+
+if [ -n "${MPI_BASE_DIR}" ] ;
+then
+  CMAKE_CONFIGURE="${CMAKE_CONFIGURE} -D TPL_ENABLE_MPI:BOOL=ON"
+  CMAKE_CONFIGURE="${CMAKE_CONFIGURE} -D MPI_BASE_DIR:PATH=${MPI_BASE_DIR}"
+else
+  CMAKE_CONFIGURE="${CMAKE_CONFIGURE} -D TPL_ENABLE_MPI:BOOL=OFF"
+fi
+
+#-----------------------------------------------------------------------------
+# Pthread configuration:
+
+CMAKE_CONFIGURE="${CMAKE_CONFIGURE} -D TPL_ENABLE_Pthread:BOOL=ON"
+# CMAKE_CONFIGURE="${CMAKE_CONFIGURE} -D TPL_ENABLE_Pthread:BOOL=OFF"
+
+#-----------------------------------------------------------------------------
+# OpenMP configuration:
+
+# CMAKE_CONFIGURE="${CMAKE_CONFIGURE} -D Trilinos_ENABLE_OpenMP:BOOL=ON"
+CMAKE_CONFIGURE="${CMAKE_CONFIGURE} -D Trilinos_ENABLE_OpenMP:BOOL=OFF"
+
+#-----------------------------------------------------------------------------
+#-----------------------------------------------------------------------------
+# Configure packages for kokkos-only:
+
+CMAKE_CONFIGURE="${CMAKE_CONFIGURE} -D Trilinos_ENABLE_ALL_PACKAGES:BOOL=OFF"
+CMAKE_CONFIGURE="${CMAKE_CONFIGURE} -D Trilinos_ENABLE_EXAMPLES:BOOL=ON"
+CMAKE_CONFIGURE="${CMAKE_CONFIGURE} -D Trilinos_ENABLE_TESTS:BOOL=ON"
+CMAKE_CONFIGURE="${CMAKE_CONFIGURE} -D Trilinos_ENABLE_KokkosCore:BOOL=ON"
+CMAKE_CONFIGURE="${CMAKE_CONFIGURE} -D Trilinos_ENABLE_KokkosContainers:BOOL=ON"
+CMAKE_CONFIGURE="${CMAKE_CONFIGURE} -D Trilinos_ENABLE_KokkosExample:BOOL=ON"
+
+#-----------------------------------------------------------------------------
+#-----------------------------------------------------------------------------
+# Hardware locality cmake configuration:
+
+if [ -n "${HWLOC_BASE_DIR}" ] ;
+then
+  CMAKE_CONFIGURE="${CMAKE_CONFIGURE} -D TPL_ENABLE_HWLOC:BOOL=ON"
+  CMAKE_CONFIGURE="${CMAKE_CONFIGURE} -D HWLOC_INCLUDE_DIRS:FILEPATH=${HWLOC_BASE_DIR}/include"
+  CMAKE_CONFIGURE="${CMAKE_CONFIGURE} -D HWLOC_LIBRARY_DIRS:FILEPATH=${HWLOC_BASE_DIR}/lib"
+fi
+
+#-----------------------------------------------------------------------------
+# Cuda cmake configuration:
+
+if [ -n "${CUDA_ARCH}" ] ;
+then
+
+  # Options to CUDA_NVCC_FLAGS must be semi-colon delimited,
+  # this is different than the standard CMAKE_CXX_FLAGS syntax.
+
+  CUDA_NVCC_FLAGS="-gencode;arch=compute_${CUDA_ARCH},code=sm_${CUDA_ARCH}"
+  CUDA_NVCC_FLAGS="${CUDA_NVCC_FLAGS};-Xcompiler;-Wall,-ansi"
+
+  if [ "${CMAKE_BUILD_TYPE}" = "DEBUG" ] ;
+  then
+    CUDA_NVCC_FLAGS="${CUDA_NVCC_FLAGS};-g"
+  else
+    CUDA_NVCC_FLAGS="${CUDA_NVCC_FLAGS};-O3"
+  fi
+
+  CMAKE_CONFIGURE="${CMAKE_CONFIGURE} -D TPL_ENABLE_CUDA:BOOL=ON"
+  CMAKE_CONFIGURE="${CMAKE_CONFIGURE} -D TPL_ENABLE_CUSPARSE:BOOL=ON"
+  CMAKE_CONFIGURE="${CMAKE_CONFIGURE} -D CUDA_VERBOSE_BUILD:BOOL=OFF"
+  CMAKE_CONFIGURE="${CMAKE_CONFIGURE} -D CUDA_NVCC_FLAGS:STRING=${CUDA_NVCC_FLAGS}"
+
+fi
+
+#-----------------------------------------------------------------------------
+
+if [ "${INTEL}" = "ON" -o "${INTEL_XEON_PHI}" = "ON" ] ;
+then
+  CMAKE_CONFIGURE="${CMAKE_CONFIGURE} -D CMAKE_C_COMPILER=icc"
+  CMAKE_CONFIGURE="${CMAKE_CONFIGURE} -D CMAKE_CXX_COMPILER=icpc"
+fi
+
+#-----------------------------------------------------------------------------
+
+# Cross-compile for Intel Xeon Phi:
+
+if [ "${INTEL_XEON_PHI}" = "ON" ] ;
+then
+  # No quotes inside values: ${CMAKE_CONFIGURE} is expanded unquoted, so embedded quotes would reach cmake literally.
+  CMAKE_CONFIGURE="${CMAKE_CONFIGURE} -D CMAKE_SYSTEM_NAME=Linux"
+  CMAKE_CONFIGURE="${CMAKE_CONFIGURE} -D CMAKE_CXX_FLAGS:STRING=-mmic"
+  CMAKE_CONFIGURE="${CMAKE_CONFIGURE} -D CMAKE_C_FLAGS:STRING=-mmic"
+  CMAKE_CONFIGURE="${CMAKE_CONFIGURE} -D CMAKE_Fortran_COMPILER:FILEPATH=ifort"
+  CMAKE_CONFIGURE="${CMAKE_CONFIGURE} -D BLAS_LIBRARY_DIRS:FILEPATH=${MKLROOT}/lib/mic"
+  CMAKE_CONFIGURE="${CMAKE_CONFIGURE} -D BLAS_LIBRARY_NAMES=mkl_intel_lp64;mkl_sequential;mkl_core;pthread;m"
+
+  CMAKE_CONFIGURE="${CMAKE_CONFIGURE} -D Trilinos_ENABLE_CHECKED_STL:BOOL=OFF"
+  CMAKE_CONFIGURE="${CMAKE_CONFIGURE} -D Trilinos_WARNINGS_AS_ERRORS_FLAGS:STRING="
+  CMAKE_CONFIGURE="${CMAKE_CONFIGURE} -D BUILD_SHARED_LIBS:BOOL=OFF"
+  CMAKE_CONFIGURE="${CMAKE_CONFIGURE} -D DART_TESTING_TIMEOUT:STRING=600"
+
+  CMAKE_CONFIGURE="${CMAKE_CONFIGURE} -D LAPACK_LIBRARY_NAMES="
+  CMAKE_CONFIGURE="${CMAKE_CONFIGURE} -D TPL_LAPACK_LIBRARIES="
+  CMAKE_CONFIGURE="${CMAKE_CONFIGURE} -D TPL_ENABLE_BinUtils=OFF"
+  CMAKE_CONFIGURE="${CMAKE_CONFIGURE} -D TPL_Pthread_LIBRARIES=pthread"
+
+  # Cannot cross-compile fortran compatibility checks on the MIC:
+  CMAKE_CONFIGURE="${CMAKE_CONFIGURE} -D Trilinos_ENABLE_Fortran:BOOL=OFF"
+
+  # Tell cmake the answers to compile-and-execute tests
+  # to prevent cmake from executing a cross-compiled program.
+  CMAKE_CONFIGURE="${CMAKE_CONFIGURE} -D HAVE_GCC_ABI_DEMANGLE_EXITCODE=0"
+  CMAKE_CONFIGURE="${CMAKE_CONFIGURE} -D HAVE_TEUCHOS_BLASFLOAT_EXITCODE=0"
+  CMAKE_CONFIGURE="${CMAKE_CONFIGURE} -D LAPACK_SLAPY2_WORKS_EXITCODE=0"
+
+fi
+
+#-----------------------------------------------------------------------------
+
+CMAKE_CONFIGURE="${CMAKE_CONFIGURE} -D CMAKE_BUILD_TYPE:STRING=${CMAKE_BUILD_TYPE}"
+CMAKE_CONFIGURE="${CMAKE_CONFIGURE} -D CMAKE_VERBOSE_MAKEFILE:BOOL=${CMAKE_VERBOSE_MAKEFILE}"
+
+#-----------------------------------------------------------------------------
+
+echo "cmake ${CMAKE_CONFIGURE} $* ${CMAKE_PROJECT_DIR}"
+# Forward extra command-line arguments after the defaults built above, as the header promises.
+cmake ${CMAKE_CONFIGURE} "$@" "${CMAKE_PROJECT_DIR}"
+
+#-----------------------------------------------------------------------------
+
diff --git a/lib/kokkos/config/configure_tpetra_kokkos_cuda_nvcc_wrapper.sh b/lib/kokkos/config/configure_tpetra_kokkos_cuda_nvcc_wrapper.sh
new file mode 100755
index 0000000000000000000000000000000000000000..0baa83aefee66f6db0e131c2a0c77ce62d7545b6
--- /dev/null
+++ b/lib/kokkos/config/configure_tpetra_kokkos_cuda_nvcc_wrapper.sh
@@ -0,0 +1,140 @@
+#!/bin/bash
+#
+# This script uses CUDA, OpenMP, and MPI.
+#
+# Before invoking this script, set the OMPI_CXX environment variable
+# to point to nvcc_wrapper, wherever it happens to live.  (If you use
+# an MPI implementation other than OpenMPI, set the corresponding
+# environment variable instead.)
+#
+
+rm -f CMakeCache.txt;
+rm -rf CMakeFiles
+EXTRA_ARGS=$@
+MPI_PATH="/opt/mpi/openmpi/1.8.2/nvcc-gcc/4.8.3-6.5"
+CUDA_PATH="/opt/nvidia/cuda/6.5.14"
+
+#
+# As long as there are any .cu files in Trilinos, we'll need to set
+# CUDA_NVCC_FLAGS.  If Trilinos gets rid of all of its .cu files and
+# lets nvcc_wrapper handle them as .cpp files, then we won't need to
+# set CUDA_NVCC_FLAGS.  As it is, given that we need to set
+# CUDA_NVCC_FLAGS, we must make sure that they are the same flags as
+# nvcc_wrapper passes to nvcc.
+#
+CUDA_NVCC_FLAGS="-gencode;arch=compute_35,code=sm_35;-I${MPI_PATH}/include"
+CUDA_NVCC_FLAGS="${CUDA_NVCC_FLAGS};-Xcompiler;-Wall,-ansi,-fopenmp"
+CUDA_NVCC_FLAGS="${CUDA_NVCC_FLAGS};-O3;-DKOKKOS_USE_CUDA_UVM"
+# Configure Trilinos; this script's own arguments are forwarded verbatim ("$@") after the defaults below.
+cmake \
+  -D CMAKE_INSTALL_PREFIX:PATH="$PWD/../install/" \
+  -D CMAKE_BUILD_TYPE:STRING=DEBUG \
+  -D CMAKE_CXX_FLAGS:STRING="-g -Wall" \
+  -D CMAKE_C_FLAGS:STRING="-g -Wall" \
+  -D CMAKE_Fortran_FLAGS:STRING="" \
+  -D CMAKE_SHARED_LIBRARY_LINK_CXX_FLAGS="" \
+  -D Trilinos_ENABLE_Triutils=OFF \
+  -D Trilinos_ENABLE_INSTALL_CMAKE_CONFIG_FILES:BOOL=OFF \
+  -D Trilinos_ENABLE_DEBUG:BOOL=OFF \
+  -D Trilinos_ENABLE_CHECKED_STL:BOOL=OFF \
+  -D Trilinos_ENABLE_EXPLICIT_INSTANTIATION:BOOL=OFF \
+  -D Trilinos_WARNINGS_AS_ERRORS_FLAGS:STRING="" \
+  -D Trilinos_ENABLE_ALL_PACKAGES:BOOL=OFF \
+  -D Trilinos_ENABLE_ALL_OPTIONAL_PACKAGES:BOOL=OFF \
+  -D BUILD_SHARED_LIBS:BOOL=OFF \
+  -D DART_TESTING_TIMEOUT:STRING=600 \
+  -D CMAKE_VERBOSE_MAKEFILE:BOOL=OFF \
+  \
+  \
+  -D CMAKE_CXX_COMPILER:FILEPATH="${MPI_PATH}/bin/mpicxx" \
+  -D CMAKE_C_COMPILER:FILEPATH="${MPI_PATH}/bin/mpicc" \
+  -D MPI_CXX_COMPILER:FILEPATH="${MPI_PATH}/bin/mpicxx" \
+  -D MPI_C_COMPILER:FILEPATH="${MPI_PATH}/bin/mpicc" \
+  -D CMAKE_Fortran_COMPILER:FILEPATH="${MPI_PATH}/bin/mpif77" \
+  -D MPI_EXEC:FILEPATH="${MPI_PATH}/bin/mpirun" \
+  -D MPI_EXEC_POST_NUMPROCS_FLAGS:STRING="-bind-to;socket;--map-by;socket;env;CUDA_MANAGED_FORCE_DEVICE_ALLOC=1;CUDA_LAUNCH_BLOCKING=1;OMP_NUM_THREADS=2" \
+  \
+  \
+  -D Trilinos_ENABLE_CXX11:BOOL=OFF \
+  -D TPL_ENABLE_MPI:BOOL=ON \
+  -D Trilinos_ENABLE_OpenMP:BOOL=ON \
+  -D Trilinos_ENABLE_ThreadPool:BOOL=ON \
+  \
+  \
+  -D TPL_ENABLE_CUDA:BOOL=ON \
+  -D CUDA_TOOLKIT_ROOT_DIR:FILEPATH="${CUDA_PATH}" \
+  -D CUDA_PROPAGATE_HOST_FLAGS:BOOL=OFF \
+  -D TPL_ENABLE_Thrust:BOOL=OFF \
+  -D Thrust_INCLUDE_DIRS:FILEPATH="${CUDA_PATH}/include" \
+  -D TPL_ENABLE_CUSPARSE:BOOL=OFF \
+  -D TPL_ENABLE_Cusp:BOOL=OFF \
+  -D Cusp_INCLUDE_DIRS="/home/crtrott/Software/cusp" \
+  -D CUDA_VERBOSE_BUILD:BOOL=OFF \
+  -D CUDA_NVCC_FLAGS:STRING="${CUDA_NVCC_FLAGS}" \
+  \
+  \
+  -D TPL_ENABLE_HWLOC=OFF \
+  -D HWLOC_INCLUDE_DIRS="/usr/local/software/hwloc/current/include" \
+  -D HWLOC_LIBRARY_DIRS="/usr/local/software/hwloc/current/lib" \
+  -D TPL_ENABLE_BinUtils=OFF \
+  -D TPL_ENABLE_BLAS:STRING=ON \
+  -D TPL_ENABLE_LAPACK:STRING=ON \
+  -D TPL_ENABLE_MKL:STRING=OFF \
+  -D TPL_ENABLE_HWLOC:STRING=OFF \
+  -D TPL_ENABLE_GTEST:STRING=ON \
+  -D TPL_ENABLE_SuperLU=ON \
+  -D TPL_ENABLE_BLAS=ON \
+  -D TPL_ENABLE_LAPACK=ON \
+  -D TPL_SuperLU_LIBRARIES="/home/crtrott/Software/SuperLU_4.3/lib/libsuperlu_4.3.a" \
+  -D TPL_SuperLU_INCLUDE_DIRS="/home/crtrott/Software/SuperLU_4.3/SRC" \
+  \
+  \
+  -D Trilinos_Enable_Kokkos:BOOL=ON \
+  -D Trilinos_ENABLE_KokkosCore:BOOL=ON \
+  -D Trilinos_ENABLE_TeuchosKokkosCompat:BOOL=ON \
+  -D Trilinos_ENABLE_KokkosContainers:BOOL=ON \
+  -D Trilinos_ENABLE_TpetraKernels:BOOL=ON \
+  -D Trilinos_ENABLE_KokkosAlgorithms:BOOL=ON \
+  -D Trilinos_ENABLE_TeuchosKokkosComm:BOOL=ON \
+  -D Trilinos_ENABLE_KokkosExample:BOOL=ON \
+  -D Kokkos_ENABLE_EXAMPLES:BOOL=ON \
+  -D Kokkos_ENABLE_TESTS:BOOL=OFF \
+  -D KokkosClassic_DefaultNode:STRING="Kokkos::Compat::KokkosCudaWrapperNode" \
+  -D TpetraClassic_ENABLE_OpenMPNode=OFF \
+  -D TpetraClassic_ENABLE_TPINode=OFF \
+  -D TpetraClassic_ENABLE_MKL=OFF \
+  -D Kokkos_ENABLE_Cuda_UVM=ON \
+  \
+  \
+  -D Trilinos_ENABLE_Teuchos:BOOL=ON \
+  -D Teuchos_ENABLE_COMPLEX:BOOL=OFF \
+  \
+  \
+  -D Trilinos_ENABLE_Tpetra:BOOL=ON \
+  -D Tpetra_ENABLE_KokkosCore=ON \
+  -D Tpetra_ENABLE_Kokkos_DistObject=OFF \
+  -D Tpetra_ENABLE_Kokkos_Refactor=ON \
+  -D Tpetra_ENABLE_TESTS=ON \
+  -D Tpetra_ENABLE_EXAMPLES=ON \
+  -D Tpetra_ENABLE_MPI_CUDA_RDMA:BOOL=ON \
+  \
+  \
+  -D Trilinos_ENABLE_Belos=OFF \
+  -D Trilinos_ENABLE_Amesos=OFF \
+  -D Trilinos_ENABLE_Amesos2=OFF \
+  -D Trilinos_ENABLE_Ifpack=OFF \
+  -D Trilinos_ENABLE_Ifpack2=OFF \
+  -D Trilinos_ENABLE_Epetra=OFF \
+  -D Trilinos_ENABLE_EpetraExt=OFF \
+  -D Trilinos_ENABLE_Zoltan=OFF \
+  -D Trilinos_ENABLE_Zoltan2=OFF \
+  -D Trilinos_ENABLE_MueLu=OFF \
+  -D Belos_ENABLE_TESTS=ON \
+  -D Belos_ENABLE_EXAMPLES=ON \
+  -D MueLu_ENABLE_TESTS=ON \
+  -D MueLu_ENABLE_EXAMPLES=ON \
+  -D Ifpack2_ENABLE_TESTS=ON \
+  -D Ifpack2_ENABLE_EXAMPLES=ON \
+  "$@" \
+"${HOME}/Trilinos"
+
diff --git a/lib/kokkos/config/kokkos-trilinos-integration-procedure.txt b/lib/kokkos/config/kokkos-trilinos-integration-procedure.txt
new file mode 100644
index 0000000000000000000000000000000000000000..9f56f2fd48d30da63f28662431711c8b20d1f4a5
--- /dev/null
+++ b/lib/kokkos/config/kokkos-trilinos-integration-procedure.txt
@@ -0,0 +1,153 @@
+// -------------------------------------------------------------------------------- //
+
+The following steps are for workstations/servers with the SEMS environment installed.
+
+// -------------------------------------------------------------------------------- //
+Summary:
+
+- Step 1: Rigorous testing of Kokkos' develop branch for each backend (Serial, OpenMP, Threads, Cuda) with all supported compilers.
+
+- Step 2: Snapshot Kokkos' develop branch into current Trilinos develop branch.
+
+- Step 3: Build and test Trilinos with combinations of compilers, types, backends.
+
+- Step 4: Promote Kokkos develop branch to master if the snapshot does not cause any new tests to fail; else track/fix causes of new failures.
+
+- Step 5: Snapshot Kokkos tagged master branch into Trilinos and push Trilinos.
+// -------------------------------------------------------------------------------- //
+
+
+// -------------------------------------------------------------------------------- //
+
+Step 1:
+  1.1. Update kokkos develop branch (NOT a fork)
+
+         (From kokkos directory):
+         git fetch --all
+         git checkout develop
+         git reset --hard origin/develop
+
+  1.2. Create a testing directory - here the directory is created within the kokkos directory
+
+         mkdir testing
+         cd testing
+
+  1.3. Run the test_all_sandia script; various compiler and build-list options can be specified
+
+         ../config/test_all_sandia
+
+  1.4 Clean repository of untracked files
+
+        cd ../
+        git clean -df
+
+// -------------------------------------------------------------------------------- //
+
+Step 2:
+  2.1 Update Trilinos develop branch
+
+        (From Trilinos directory):
+        git checkout develop
+        git fetch --all
+        git reset --hard origin/develop
+        git clean -df
+
+  2.2 Snapshot Kokkos into Trilinos - this requires python/2.7.9 and that both Trilinos and Kokkos be clean - no untracked or modified files
+
+        module load python/2.7.9
+        python KOKKOS_PATH/config/snapshot.py KOKKOS_PATH TRILINOS_PATH/packages
+
+// -------------------------------------------------------------------------------- //
+
+Step 3:
+  3.1. Build and test Trilinos with 3 different configurations; a configure-all script is provided in Trilinos and should be modified to test each of the following 3 configurations with appropriate environment variable(s):
+
+      - GCC/4.7.2-OpenMP/Complex
+          Run tests with the following environment variable:
+
+            export OMP_NUM_THREADS=2
+
+
+      - Intel/15.0.2-Serial/NoComplex
+
+
+      - GCC/4.8.4/CUDA/7.5.18-Cuda/Serial/NoComplex
+          Run tests with the following environment variables:
+
+            export CUDA_LAUNCH_BLOCKING=1
+            export CUDA_MANAGED_FORCE_DEVICE_ALLOC=1
+
+
+        mkdir Build
+        cd Build
+        cp TRILINOS_PATH/sampleScripts/Sandia-SEMS/configure-all ./
+            ** Set the path to Trilinos appropriately within the configure-all script **
+        source $SEMS_MODULE_ROOT/utils/sems-modules-init.sh kokkos
+        source configure-all
+        make -k  (-k means "keep going" to get past build errors; -j12 can also be specified to build with 12 threads, for example)
+        ctest
+
+  3.2. Compare the failed test output to the test output on the dashboard (testing.sandia.gov/cdash, select Trilinos); investigate and fix problems if new tests fail after the Kokkos snapshot
+
+// -------------------------------------------------------------------------------- //
+
+Step 4:
+  4.1. Once all Trilinos tests pass, promote the Kokkos develop branch to master on GitHub
+
+       - DO NOT fast-forward the merge!!!!
+
+       (From kokkos directory):
+       git checkout master
+       git fetch --all
+       # Ensure we are on the current origin/master
+       git reset --hard origin/master
+       git merge --no-ff origin/develop
+
+  4.2. Update the tag in kokkos/config/master_history.txt
+       Tag description: MajorNumber.MinorNumber.WeeksSinceMinorNumberUpdate
+       Tag format: #.#.##
+
+       # Prepend master_history.txt with the following entry:
+       
+       # tag: #.#.##
+       # date: mm/dd/yyyy
+       # master: sha1
+       # develop: sha1
+       # -----------------------
+
+       git commit --amend -a
+
+       git tag -a #.#.##
+         tag: #.#.##
+         date: mm/dd/yyyy
+         master: sha1
+         develop: sha1
+
+       git push --follow-tags origin master
+
+// -------------------------------------------------------------------------------- //
+
+Step 5:
+  5.1. Make sure Trilinos is up-to-date - chances are other changes have been committed since the integration testing process began. If a substantial change has occurred that may be affected by the snapshot, the testing procedure may need to be repeated.
+
+       (From Trilinos directory):
+       git checkout develop
+       git fetch --all
+       git reset --hard origin/develop
+       git clean -df
+
+  5.2. Snapshot Kokkos master branch into Trilinos
+
+       (From kokkos directory):
+       git fetch --all
+       git checkout tags/#.#.##
+       git clean -df
+
+       python KOKKOS_PATH/config/snapshot.py KOKKOS_PATH TRILINOS_PATH/packages
+       
+  5.3. Push the updated develop branch of Trilinos to GitHub - congratulations!!!
+
+       (From Trilinos directory):
+       git push
+
+// -------------------------------------------------------------------------------- //
diff --git a/lib/kokkos/config/kokkos_dev/config-core-all.sh b/lib/kokkos/config/kokkos_dev/config-core-all.sh
new file mode 100755
index 0000000000000000000000000000000000000000..fa588c778f68330ff130364e9425d5a6aefa357c
--- /dev/null
+++ b/lib/kokkos/config/kokkos_dev/config-core-all.sh
@@ -0,0 +1,113 @@
+#!/bin/sh
+#
+# Copy this script, put it outside the Trilinos source directory, and
+# build there.
+#
+#-----------------------------------------------------------------------------
+# Building on 'kokkos-dev.sandia.gov' with enabled capabilities:
+#
+#   Cuda, OpenMP, Threads, Qthread, hwloc
+#
+# module loaded on 'kokkos-dev.sandia.gov' for this build
+#
+#  module load  cmake/2.8.11.2  gcc/4.8.3  cuda/6.5.14  nvcc-wrapper/gnu
+#
+# The 'nvcc-wrapper' module should load a script that matches
+# kokkos/config/nvcc_wrapper
+#
+#-----------------------------------------------------------------------------
+# Source and installation directories:
+
+TRILINOS_SOURCE_DIR=${HOME}/Trilinos
+TRILINOS_INSTALL_DIR=${HOME}/TrilinosInstall/`date +%F`
+
+CMAKE_CONFIGURE=""
+CMAKE_CONFIGURE="${CMAKE_CONFIGURE} -D CMAKE_INSTALL_PREFIX=${TRILINOS_INSTALL_DIR}"
+
+#-----------------------------------------------------------------------------
+# Debug/optimized
+
+# CMAKE_CONFIGURE="${CMAKE_CONFIGURE} -D CMAKE_BUILD_TYPE:STRING=DEBUG"
+# CMAKE_CONFIGURE="${CMAKE_CONFIGURE} -D Kokkos_ENABLE_BOUNDS_CHECK:BOOL=ON"
+
+CMAKE_CONFIGURE="${CMAKE_CONFIGURE} -D CMAKE_BUILD_TYPE:STRING=RELEASE"
+
+#-----------------------------------------------------------------------------
+
+CMAKE_CONFIGURE="${CMAKE_CONFIGURE} -D CMAKE_CXX_FLAGS:STRING=-Wall"
+CMAKE_CONFIGURE="${CMAKE_CONFIGURE} -D CMAKE_C_COMPILER=gcc"
+
+#-----------------------------------------------------------------------------
+# Cuda using GNU, use the nvcc_wrapper to build CUDA source
+
+# CMAKE_CONFIGURE="${CMAKE_CONFIGURE} -D CMAKE_CXX_COMPILER=g++"
+
+CMAKE_CONFIGURE="${CMAKE_CONFIGURE} -D CMAKE_CXX_COMPILER=nvcc_wrapper"
+CMAKE_CONFIGURE="${CMAKE_CONFIGURE} -D TPL_ENABLE_CUDA:BOOL=ON"
+CMAKE_CONFIGURE="${CMAKE_CONFIGURE} -D TPL_ENABLE_CUSPARSE:BOOL=ON"
+
+#-----------------------------------------------------------------------------
+# Configure for Kokkos subpackages and tests:
+
+CMAKE_CONFIGURE="${CMAKE_CONFIGURE} -D Trilinos_ENABLE_Fortran:BOOL=OFF"
+CMAKE_CONFIGURE="${CMAKE_CONFIGURE} -D Trilinos_ENABLE_ALL_PACKAGES:BOOL=OFF"
+CMAKE_CONFIGURE="${CMAKE_CONFIGURE} -D Trilinos_ENABLE_EXAMPLES:BOOL=ON"
+CMAKE_CONFIGURE="${CMAKE_CONFIGURE} -D Trilinos_ENABLE_TESTS:BOOL=ON"
+
+CMAKE_CONFIGURE="${CMAKE_CONFIGURE} -D Trilinos_ENABLE_KokkosCore:BOOL=ON"
+CMAKE_CONFIGURE="${CMAKE_CONFIGURE} -D Trilinos_ENABLE_KokkosContainers:BOOL=ON"
+CMAKE_CONFIGURE="${CMAKE_CONFIGURE} -D Trilinos_ENABLE_KokkosAlgorithms:BOOL=ON"
+CMAKE_CONFIGURE="${CMAKE_CONFIGURE} -D Trilinos_ENABLE_TpetraKernels:BOOL=ON"
+CMAKE_CONFIGURE="${CMAKE_CONFIGURE} -D Trilinos_ENABLE_KokkosExample:BOOL=ON"
+
+#-----------------------------------------------------------------------------
+# Hardware locality configuration:
+
+HWLOC_BASE_DIR="/home/projects/hwloc/1.7.1/host/gnu/4.7.3"
+
+CMAKE_CONFIGURE="${CMAKE_CONFIGURE} -D TPL_ENABLE_HWLOC:BOOL=ON"
+CMAKE_CONFIGURE="${CMAKE_CONFIGURE} -D HWLOC_INCLUDE_DIRS:FILEPATH=${HWLOC_BASE_DIR}/include"
+CMAKE_CONFIGURE="${CMAKE_CONFIGURE} -D HWLOC_LIBRARY_DIRS:FILEPATH=${HWLOC_BASE_DIR}/lib"
+
+#-----------------------------------------------------------------------------
+# Pthread
+
+CMAKE_CONFIGURE="${CMAKE_CONFIGURE} -D TPL_ENABLE_Pthread:BOOL=ON"
+CMAKE_CONFIGURE="${CMAKE_CONFIGURE} -D Kokkos_ENABLE_Pthread:BOOL=ON"
+
+#-----------------------------------------------------------------------------
+# OpenMP
+
+CMAKE_CONFIGURE="${CMAKE_CONFIGURE} -D Trilinos_ENABLE_OpenMP:BOOL=ON"
+CMAKE_CONFIGURE="${CMAKE_CONFIGURE} -D Kokkos_ENABLE_OpenMP:BOOL=ON"
+
+#-----------------------------------------------------------------------------
+# Qthread
+
+QTHREAD_BASE_DIR="/home/projects/qthreads/2014-07-08/host/gnu/4.7.3"
+
+CMAKE_CONFIGURE="${CMAKE_CONFIGURE} -D TPL_ENABLE_QTHREAD:BOOL=ON"
+CMAKE_CONFIGURE="${CMAKE_CONFIGURE} -D QTHREAD_INCLUDE_DIRS:FILEPATH=${QTHREAD_BASE_DIR}/include"
+CMAKE_CONFIGURE="${CMAKE_CONFIGURE} -D QTHREAD_LIBRARY_DIRS:FILEPATH=${QTHREAD_BASE_DIR}/lib"
+
+#-----------------------------------------------------------------------------
+# C++11
+
+#  CMAKE_CONFIGURE="${CMAKE_CONFIGURE} -D Trilinos_ENABLE_CXX11:BOOL=ON"
+#  CMAKE_CONFIGURE="${CMAKE_CONFIGURE} -D Kokkos_ENABLE_CXX11:BOOL=ON"
+
+#-----------------------------------------------------------------------------
+#
+# Remove CMake output files to force reconfigure from scratch.
+#
+
+rm -rf CMake* Trilinos* packages Dart* Testing cmake_install.cmake MakeFile*
+
+#
+
+echo cmake ${CMAKE_CONFIGURE} ${TRILINOS_SOURCE_DIR}
+
+cmake ${CMAKE_CONFIGURE} ${TRILINOS_SOURCE_DIR}
+
+#-----------------------------------------------------------------------------
+
diff --git a/lib/kokkos/config/kokkos_dev/config-core-cuda-omp-hwloc.sh b/lib/kokkos/config/kokkos_dev/config-core-cuda-omp-hwloc.sh
new file mode 100755
index 0000000000000000000000000000000000000000..c2e17bb9443ad37576b490149d63e1d7b9f9b1ef
--- /dev/null
+++ b/lib/kokkos/config/kokkos_dev/config-core-cuda-omp-hwloc.sh
@@ -0,0 +1,104 @@
+#!/bin/sh
+#
+# Copy this script, put it outside the Trilinos source directory, and
+# build there.
+#
+#-----------------------------------------------------------------------------
+# Building on 'kokkos-dev.sandia.gov' with enabled capabilities:
+#
+#   Cuda, OpenMP, hwloc
+#
+# module loaded on 'kokkos-dev.sandia.gov' for this build
+#
+#  module load  cmake/2.8.11.2  gcc/4.8.3  cuda/6.5.14  nvcc-wrapper/gnu
+#
+# The 'nvcc-wrapper' module should load a script that matches
+# kokkos/config/nvcc_wrapper
+#
+#-----------------------------------------------------------------------------
+# Source and installation directories:
+
+TRILINOS_SOURCE_DIR=${HOME}/Trilinos
+TRILINOS_INSTALL_DIR=${HOME}/TrilinosInstall/`date +%F`
+
+CMAKE_CONFIGURE=""
+CMAKE_CONFIGURE="${CMAKE_CONFIGURE} -D CMAKE_INSTALL_PREFIX=${TRILINOS_INSTALL_DIR}"
+
+#-----------------------------------------------------------------------------
+# Debug/optimized
+
+# CMAKE_CONFIGURE="${CMAKE_CONFIGURE} -D CMAKE_BUILD_TYPE:STRING=DEBUG"
+# CMAKE_CONFIGURE="${CMAKE_CONFIGURE} -D Kokkos_ENABLE_BOUNDS_CHECK:BOOL=ON"
+
+CMAKE_CONFIGURE="${CMAKE_CONFIGURE} -D CMAKE_BUILD_TYPE:STRING=RELEASE"
+
+#-----------------------------------------------------------------------------
+
+CMAKE_CONFIGURE="${CMAKE_CONFIGURE} -D CMAKE_CXX_FLAGS:STRING=-Wall"
+CMAKE_CONFIGURE="${CMAKE_CONFIGURE} -D CMAKE_C_COMPILER=gcc"
+
+#-----------------------------------------------------------------------------
+# Cuda using GNU, use the nvcc_wrapper to build CUDA source
+
+# CMAKE_CONFIGURE="${CMAKE_CONFIGURE} -D CMAKE_CXX_COMPILER=g++"
+
+CMAKE_CONFIGURE="${CMAKE_CONFIGURE} -D CMAKE_CXX_COMPILER=nvcc_wrapper"
+CMAKE_CONFIGURE="${CMAKE_CONFIGURE} -D TPL_ENABLE_CUDA:BOOL=ON"
+CMAKE_CONFIGURE="${CMAKE_CONFIGURE} -D TPL_ENABLE_CUSPARSE:BOOL=ON"
+
+#-----------------------------------------------------------------------------
+# Configure for Kokkos subpackages and tests:
+
+CMAKE_CONFIGURE="${CMAKE_CONFIGURE} -D Trilinos_ENABLE_Fortran:BOOL=OFF"
+CMAKE_CONFIGURE="${CMAKE_CONFIGURE} -D Trilinos_ENABLE_ALL_PACKAGES:BOOL=OFF"
+CMAKE_CONFIGURE="${CMAKE_CONFIGURE} -D Trilinos_ENABLE_EXAMPLES:BOOL=ON"
+CMAKE_CONFIGURE="${CMAKE_CONFIGURE} -D Trilinos_ENABLE_TESTS:BOOL=ON"
+
+CMAKE_CONFIGURE="${CMAKE_CONFIGURE} -D Trilinos_ENABLE_KokkosCore:BOOL=ON"
+CMAKE_CONFIGURE="${CMAKE_CONFIGURE} -D Trilinos_ENABLE_KokkosContainers:BOOL=ON"
+CMAKE_CONFIGURE="${CMAKE_CONFIGURE} -D Trilinos_ENABLE_KokkosAlgorithms:BOOL=ON"
+CMAKE_CONFIGURE="${CMAKE_CONFIGURE} -D Trilinos_ENABLE_TpetraKernels:BOOL=ON"
+CMAKE_CONFIGURE="${CMAKE_CONFIGURE} -D Trilinos_ENABLE_KokkosExample:BOOL=ON"
+
+#-----------------------------------------------------------------------------
+# Hardware locality configuration:
+
+HWLOC_BASE_DIR="/home/projects/hwloc/1.7.1/host/gnu/4.7.3"
+
+CMAKE_CONFIGURE="${CMAKE_CONFIGURE} -D TPL_ENABLE_HWLOC:BOOL=ON"
+CMAKE_CONFIGURE="${CMAKE_CONFIGURE} -D HWLOC_INCLUDE_DIRS:FILEPATH=${HWLOC_BASE_DIR}/include"
+CMAKE_CONFIGURE="${CMAKE_CONFIGURE} -D HWLOC_LIBRARY_DIRS:FILEPATH=${HWLOC_BASE_DIR}/lib"
+
+#-----------------------------------------------------------------------------
+# Pthread explicitly OFF so tribits doesn't automatically turn it on
+
+CMAKE_CONFIGURE="${CMAKE_CONFIGURE} -D TPL_ENABLE_Pthread:BOOL=OFF"
+CMAKE_CONFIGURE="${CMAKE_CONFIGURE} -D Kokkos_ENABLE_Pthread:BOOL=OFF"
+
+#-----------------------------------------------------------------------------
+# OpenMP
+
+CMAKE_CONFIGURE="${CMAKE_CONFIGURE} -D Trilinos_ENABLE_OpenMP:BOOL=ON"
+CMAKE_CONFIGURE="${CMAKE_CONFIGURE} -D Kokkos_ENABLE_OpenMP:BOOL=ON"
+
+#-----------------------------------------------------------------------------
+# C++11
+
+#  CMAKE_CONFIGURE="${CMAKE_CONFIGURE} -D Trilinos_ENABLE_CXX11:BOOL=ON"
+#  CMAKE_CONFIGURE="${CMAKE_CONFIGURE} -D Kokkos_ENABLE_CXX11:BOOL=ON"
+
+#-----------------------------------------------------------------------------
+#
+# Remove CMake output files to force reconfigure from scratch.
+#
+
+rm -rf CMake* Trilinos* packages Dart* Testing cmake_install.cmake MakeFile*
+
+#
+
+echo cmake ${CMAKE_CONFIGURE} ${TRILINOS_SOURCE_DIR}
+
+cmake ${CMAKE_CONFIGURE} ${TRILINOS_SOURCE_DIR}
+
+#-----------------------------------------------------------------------------
+
diff --git a/lib/kokkos/config/kokkos_dev/config-core-cuda.sh b/lib/kokkos/config/kokkos_dev/config-core-cuda.sh
new file mode 100755
index 0000000000000000000000000000000000000000..39b72d5ce136ff2ea00c6e1cc4a049eb02d606ee
--- /dev/null
+++ b/lib/kokkos/config/kokkos_dev/config-core-cuda.sh
@@ -0,0 +1,88 @@
+#!/bin/sh
+#
+# Copy this script, put it outside the Trilinos source directory, and
+# build there.
+#
+#-----------------------------------------------------------------------------
+# Building on 'kokkos-dev.sandia.gov' with enabled capabilities:
+#
+#   Cuda
+#
+# module loaded on 'kokkos-dev.sandia.gov' for this build
+#
+#  module load  cmake/2.8.11.2  gcc/4.8.3  cuda/6.5.14  nvcc-wrapper/gnu
+#
+# The 'nvcc-wrapper' module should load a script that matches
+# kokkos/config/nvcc_wrapper
+#
+#-----------------------------------------------------------------------------
+# Source and installation directories:
+
+TRILINOS_SOURCE_DIR=${HOME}/Trilinos
+TRILINOS_INSTALL_DIR=${HOME}/TrilinosInstall/`date +%F`
+
+CMAKE_CONFIGURE=""
+CMAKE_CONFIGURE="${CMAKE_CONFIGURE} -D CMAKE_INSTALL_PREFIX=${TRILINOS_INSTALL_DIR}"
+
+#-----------------------------------------------------------------------------
+# Debug/optimized
+
+# CMAKE_CONFIGURE="${CMAKE_CONFIGURE} -D CMAKE_BUILD_TYPE:STRING=DEBUG"
+# CMAKE_CONFIGURE="${CMAKE_CONFIGURE} -D Kokkos_ENABLE_BOUNDS_CHECK:BOOL=ON"
+
+CMAKE_CONFIGURE="${CMAKE_CONFIGURE} -D CMAKE_BUILD_TYPE:STRING=RELEASE"
+
+#-----------------------------------------------------------------------------
+
+CMAKE_CONFIGURE="${CMAKE_CONFIGURE} -D CMAKE_CXX_FLAGS:STRING=-Wall"
+CMAKE_CONFIGURE="${CMAKE_CONFIGURE} -D CMAKE_C_COMPILER=gcc"
+
+#-----------------------------------------------------------------------------
+# Cuda using GNU, use the nvcc_wrapper to build CUDA source
+
+# CMAKE_CONFIGURE="${CMAKE_CONFIGURE} -D CMAKE_CXX_COMPILER=g++"
+
+CMAKE_CONFIGURE="${CMAKE_CONFIGURE} -D CMAKE_CXX_COMPILER=nvcc_wrapper"
+CMAKE_CONFIGURE="${CMAKE_CONFIGURE} -D TPL_ENABLE_CUDA:BOOL=ON"
+CMAKE_CONFIGURE="${CMAKE_CONFIGURE} -D TPL_ENABLE_CUSPARSE:BOOL=ON"
+
+# Pthread explicitly OFF, otherwise tribits will automatically turn it on
+
+CMAKE_CONFIGURE="${CMAKE_CONFIGURE} -D TPL_ENABLE_Pthread:BOOL=OFF"
+CMAKE_CONFIGURE="${CMAKE_CONFIGURE} -D Kokkos_ENABLE_Pthread:BOOL=OFF"
+
+#-----------------------------------------------------------------------------
+# Configure for Kokkos subpackages and tests:
+
+CMAKE_CONFIGURE="${CMAKE_CONFIGURE} -D Trilinos_ENABLE_Fortran:BOOL=OFF"
+CMAKE_CONFIGURE="${CMAKE_CONFIGURE} -D Trilinos_ENABLE_ALL_PACKAGES:BOOL=OFF"
+CMAKE_CONFIGURE="${CMAKE_CONFIGURE} -D Trilinos_ENABLE_EXAMPLES:BOOL=ON"
+CMAKE_CONFIGURE="${CMAKE_CONFIGURE} -D Trilinos_ENABLE_TESTS:BOOL=ON"
+
+CMAKE_CONFIGURE="${CMAKE_CONFIGURE} -D Trilinos_ENABLE_KokkosCore:BOOL=ON"
+CMAKE_CONFIGURE="${CMAKE_CONFIGURE} -D Trilinos_ENABLE_KokkosContainers:BOOL=ON"
+CMAKE_CONFIGURE="${CMAKE_CONFIGURE} -D Trilinos_ENABLE_KokkosAlgorithms:BOOL=ON"
+CMAKE_CONFIGURE="${CMAKE_CONFIGURE} -D Trilinos_ENABLE_TpetraKernels:BOOL=ON"
+CMAKE_CONFIGURE="${CMAKE_CONFIGURE} -D Trilinos_ENABLE_KokkosExample:BOOL=ON"
+
+#-----------------------------------------------------------------------------
+# C++11
+
+#  CMAKE_CONFIGURE="${CMAKE_CONFIGURE} -D Trilinos_ENABLE_CXX11:BOOL=ON"
+#  CMAKE_CONFIGURE="${CMAKE_CONFIGURE} -D Kokkos_ENABLE_CXX11:BOOL=ON"
+
+#-----------------------------------------------------------------------------
+#
+# Remove CMake output files to force reconfigure from scratch.
+#
+
+rm -rf CMake* Trilinos* packages Dart* Testing cmake_install.cmake MakeFile*
+
+#
+
+echo cmake ${CMAKE_CONFIGURE} ${TRILINOS_SOURCE_DIR}
+
+cmake ${CMAKE_CONFIGURE} ${TRILINOS_SOURCE_DIR}
+
+#-----------------------------------------------------------------------------
+
diff --git a/lib/kokkos/config/kokkos_dev/config-core-cxx11-omp.sh b/lib/kokkos/config/kokkos_dev/config-core-cxx11-omp.sh
new file mode 100755
index 0000000000000000000000000000000000000000..b83a535416064febc732a4fd0d7dbf34274b1adf
--- /dev/null
+++ b/lib/kokkos/config/kokkos_dev/config-core-cxx11-omp.sh
@@ -0,0 +1,84 @@
+#!/bin/sh
+#
+# Copy this script, put it outside the Trilinos source directory, and
+# build there.
+#
+#-----------------------------------------------------------------------------
+# Building on 'kokkos-dev.sandia.gov' with enabled capabilities:
+#
+#   C++11, OpenMP
+#
+# module loaded on 'kokkos-dev.sandia.gov' for this build
+#
+#  module load  cmake/2.8.11.2  gcc/4.8.3
+#
+#-----------------------------------------------------------------------------
+# Source and installation directories:
+
+TRILINOS_SOURCE_DIR=${HOME}/Trilinos
+TRILINOS_INSTALL_DIR=${HOME}/TrilinosInstall/`date +%F`
+
+CMAKE_CONFIGURE=""
+CMAKE_CONFIGURE="${CMAKE_CONFIGURE} -D CMAKE_INSTALL_PREFIX=${TRILINOS_INSTALL_DIR}"
+
+#-----------------------------------------------------------------------------
+# Debug/optimized
+
+# CMAKE_CONFIGURE="${CMAKE_CONFIGURE} -D CMAKE_BUILD_TYPE:STRING=DEBUG"
+# CMAKE_CONFIGURE="${CMAKE_CONFIGURE} -D Kokkos_ENABLE_BOUNDS_CHECK:BOOL=ON"
+
+CMAKE_CONFIGURE="${CMAKE_CONFIGURE} -D CMAKE_BUILD_TYPE:STRING=RELEASE"
+
+#-----------------------------------------------------------------------------
+
+CMAKE_CONFIGURE="${CMAKE_CONFIGURE} -D CMAKE_CXX_FLAGS:STRING=-Wall"
+CMAKE_CONFIGURE="${CMAKE_CONFIGURE} -D CMAKE_C_COMPILER=gcc"
+CMAKE_CONFIGURE="${CMAKE_CONFIGURE} -D CMAKE_CXX_COMPILER=g++"
+
+#-----------------------------------------------------------------------------
+# Configure for Kokkos subpackages and tests:
+
+CMAKE_CONFIGURE="${CMAKE_CONFIGURE} -D Trilinos_ENABLE_Fortran:BOOL=OFF"
+CMAKE_CONFIGURE="${CMAKE_CONFIGURE} -D Trilinos_ENABLE_ALL_PACKAGES:BOOL=OFF"
+CMAKE_CONFIGURE="${CMAKE_CONFIGURE} -D Trilinos_ENABLE_EXAMPLES:BOOL=ON"
+CMAKE_CONFIGURE="${CMAKE_CONFIGURE} -D Trilinos_ENABLE_TESTS:BOOL=ON"
+
+CMAKE_CONFIGURE="${CMAKE_CONFIGURE} -D Trilinos_ENABLE_KokkosCore:BOOL=ON"
+CMAKE_CONFIGURE="${CMAKE_CONFIGURE} -D Trilinos_ENABLE_KokkosContainers:BOOL=ON"
+CMAKE_CONFIGURE="${CMAKE_CONFIGURE} -D Trilinos_ENABLE_KokkosAlgorithms:BOOL=ON"
+CMAKE_CONFIGURE="${CMAKE_CONFIGURE} -D Trilinos_ENABLE_TpetraKernels:BOOL=ON"
+CMAKE_CONFIGURE="${CMAKE_CONFIGURE} -D Trilinos_ENABLE_KokkosExample:BOOL=ON"
+
+#-----------------------------------------------------------------------------
+# Pthread explicitly OFF so tribits doesn't automatically activate
+
+CMAKE_CONFIGURE="${CMAKE_CONFIGURE} -D TPL_ENABLE_Pthread:BOOL=OFF"
+CMAKE_CONFIGURE="${CMAKE_CONFIGURE} -D Kokkos_ENABLE_Pthread:BOOL=OFF"
+
+#-----------------------------------------------------------------------------
+# OpenMP
+
+CMAKE_CONFIGURE="${CMAKE_CONFIGURE} -D Trilinos_ENABLE_OpenMP:BOOL=ON"
+CMAKE_CONFIGURE="${CMAKE_CONFIGURE} -D Kokkos_ENABLE_OpenMP:BOOL=ON"
+
+#-----------------------------------------------------------------------------
+# C++11
+
+CMAKE_CONFIGURE="${CMAKE_CONFIGURE} -D Trilinos_ENABLE_CXX11:BOOL=ON"
+CMAKE_CONFIGURE="${CMAKE_CONFIGURE} -D Kokkos_ENABLE_CXX11:BOOL=ON"
+
+#-----------------------------------------------------------------------------
+#
+# Remove CMake output files to force reconfigure from scratch.
+#
+
+rm -rf CMake* Trilinos* packages Dart* Testing cmake_install.cmake MakeFile*
+
+#
+
+echo cmake ${CMAKE_CONFIGURE} ${TRILINOS_SOURCE_DIR}
+
+cmake ${CMAKE_CONFIGURE} ${TRILINOS_SOURCE_DIR}
+
+#-----------------------------------------------------------------------------
+
diff --git a/lib/kokkos/config/kokkos_dev/config-core-dbg-none.sh b/lib/kokkos/config/kokkos_dev/config-core-dbg-none.sh
new file mode 100755
index 0000000000000000000000000000000000000000..d2e06a4ebd92080b255754b80b2af6ba93662090
--- /dev/null
+++ b/lib/kokkos/config/kokkos_dev/config-core-dbg-none.sh
@@ -0,0 +1,78 @@
+#!/bin/sh
+#
+# Copy this script, put it outside the Trilinos source directory, and
+# build there.
+#
+#-----------------------------------------------------------------------------
+# Building on 'kokkos-dev.sandia.gov' with enabled capabilities:
+#
+#   <none>
+#
+# module loaded on 'kokkos-dev.sandia.gov' for this build
+#
+#  module load  cmake/2.8.11.2  gcc/4.8.3
+#
+#-----------------------------------------------------------------------------
+# Source and installation directories:
+
+TRILINOS_SOURCE_DIR=${HOME}/Trilinos
+TRILINOS_INSTALL_DIR=${HOME}/TrilinosInstall/`date +%F`
+
+CMAKE_CONFIGURE=""
+CMAKE_CONFIGURE="${CMAKE_CONFIGURE} -D CMAKE_INSTALL_PREFIX=${TRILINOS_INSTALL_DIR}"
+
+#-----------------------------------------------------------------------------
+# Debug/optimized
+
+CMAKE_CONFIGURE="${CMAKE_CONFIGURE} -D CMAKE_BUILD_TYPE:STRING=DEBUG"
+CMAKE_CONFIGURE="${CMAKE_CONFIGURE} -D Kokkos_ENABLE_BOUNDS_CHECK:BOOL=ON"
+
+# CMAKE_CONFIGURE="${CMAKE_CONFIGURE} -D CMAKE_BUILD_TYPE:STRING=RELEASE"
+
+#-----------------------------------------------------------------------------
+
+CMAKE_CONFIGURE="${CMAKE_CONFIGURE} -D CMAKE_CXX_FLAGS:STRING=-Wall"
+CMAKE_CONFIGURE="${CMAKE_CONFIGURE} -D CMAKE_C_COMPILER=gcc"
+CMAKE_CONFIGURE="${CMAKE_CONFIGURE} -D CMAKE_CXX_COMPILER=g++"
+
+#-----------------------------------------------------------------------------
+# Configure for Kokkos subpackages and tests:
+
+CMAKE_CONFIGURE="${CMAKE_CONFIGURE} -D Trilinos_ENABLE_Fortran:BOOL=OFF"
+CMAKE_CONFIGURE="${CMAKE_CONFIGURE} -D Trilinos_ENABLE_ALL_PACKAGES:BOOL=OFF"
+CMAKE_CONFIGURE="${CMAKE_CONFIGURE} -D Trilinos_ENABLE_EXAMPLES:BOOL=ON"
+CMAKE_CONFIGURE="${CMAKE_CONFIGURE} -D Trilinos_ENABLE_TESTS:BOOL=ON"
+
+CMAKE_CONFIGURE="${CMAKE_CONFIGURE} -D Trilinos_ENABLE_KokkosCore:BOOL=ON"
+CMAKE_CONFIGURE="${CMAKE_CONFIGURE} -D Trilinos_ENABLE_KokkosContainers:BOOL=ON"
+CMAKE_CONFIGURE="${CMAKE_CONFIGURE} -D Trilinos_ENABLE_KokkosAlgorithms:BOOL=ON"
+CMAKE_CONFIGURE="${CMAKE_CONFIGURE} -D Trilinos_ENABLE_TpetraKernels:BOOL=ON"
+CMAKE_CONFIGURE="${CMAKE_CONFIGURE} -D Trilinos_ENABLE_KokkosExample:BOOL=ON"
+
+#-----------------------------------------------------------------------------
+# Kokkos Pthread explicitly OFF, TPL Pthread ON for gtest
+
+CMAKE_CONFIGURE="${CMAKE_CONFIGURE} -D TPL_ENABLE_Pthread:BOOL=ON"
+CMAKE_CONFIGURE="${CMAKE_CONFIGURE} -D Kokkos_ENABLE_Pthread:BOOL=OFF"
+
+#-----------------------------------------------------------------------------
+# C++11
+
+#  CMAKE_CONFIGURE="${CMAKE_CONFIGURE} -D Trilinos_ENABLE_CXX11:BOOL=ON"
+#  CMAKE_CONFIGURE="${CMAKE_CONFIGURE} -D Kokkos_ENABLE_CXX11:BOOL=ON"
+
+#-----------------------------------------------------------------------------
+#
+# Remove CMake output files to force reconfigure from scratch.
+#
+
+rm -rf CMake* Trilinos* packages Dart* Testing cmake_install.cmake MakeFile*
+
+#
+
+echo cmake ${CMAKE_CONFIGURE} ${TRILINOS_SOURCE_DIR}
+
+cmake ${CMAKE_CONFIGURE} ${TRILINOS_SOURCE_DIR}
+
+#-----------------------------------------------------------------------------
+
diff --git a/lib/kokkos/config/kokkos_dev/config-core-intel-cuda-omp.sh b/lib/kokkos/config/kokkos_dev/config-core-intel-cuda-omp.sh
new file mode 100755
index 0000000000000000000000000000000000000000..e2ab1f1c00168ed3ea646c9f297dc040e2c5a33f
--- /dev/null
+++ b/lib/kokkos/config/kokkos_dev/config-core-intel-cuda-omp.sh
@@ -0,0 +1,89 @@
+#!/bin/sh
+#
+# Copy this script, put it outside the Trilinos source directory, and
+# build there.
+#
+#-----------------------------------------------------------------------------
+# Building on 'kokkos-dev.sandia.gov' with enabled capabilities:
+#
+#   Intel, OpenMP, Cuda
+#
+# module loaded on 'kokkos-dev.sandia.gov' for this build
+#
+#  module load  cmake/2.8.11.2  cuda/7.0.4  intel/2015.0.090  nvcc-wrapper/intel
+#
+#-----------------------------------------------------------------------------
+# Source and installation directories:
+
+TRILINOS_SOURCE_DIR=${HOME}/Trilinos
+TRILINOS_INSTALL_DIR=${HOME}/TrilinosInstall/`date +%F`
+
+CMAKE_CONFIGURE=""
+CMAKE_CONFIGURE="${CMAKE_CONFIGURE} -D CMAKE_INSTALL_PREFIX=${TRILINOS_INSTALL_DIR}"
+
+#-----------------------------------------------------------------------------
+# Debug/optimized
+
+# CMAKE_CONFIGURE="${CMAKE_CONFIGURE} -D CMAKE_BUILD_TYPE:STRING=DEBUG"
+# CMAKE_CONFIGURE="${CMAKE_CONFIGURE} -D Kokkos_ENABLE_BOUNDS_CHECK:BOOL=ON"
+
+CMAKE_CONFIGURE="${CMAKE_CONFIGURE} -D CMAKE_BUILD_TYPE:STRING=RELEASE"
+
+#-----------------------------------------------------------------------------
+
+CMAKE_CONFIGURE="${CMAKE_CONFIGURE} -D CMAKE_CXX_FLAGS:STRING=-Wall"
+CMAKE_CONFIGURE="${CMAKE_CONFIGURE} -D CMAKE_C_COMPILER=icc"
+
+# CMAKE_CONFIGURE="${CMAKE_CONFIGURE} -D CMAKE_CXX_COMPILER=icpc"
+
+CMAKE_CONFIGURE="${CMAKE_CONFIGURE} -D CMAKE_CXX_COMPILER=nvcc_wrapper"
+CMAKE_CONFIGURE="${CMAKE_CONFIGURE} -D TPL_ENABLE_CUDA:BOOL=ON"
+CMAKE_CONFIGURE="${CMAKE_CONFIGURE} -D TPL_ENABLE_CUSPARSE:BOOL=ON"
+
+#-----------------------------------------------------------------------------
+# Configure for Kokkos subpackages and tests:
+
+CMAKE_CONFIGURE="${CMAKE_CONFIGURE} -D Trilinos_ENABLE_Fortran:BOOL=OFF"
+CMAKE_CONFIGURE="${CMAKE_CONFIGURE} -D Trilinos_ENABLE_ALL_PACKAGES:BOOL=OFF"
+CMAKE_CONFIGURE="${CMAKE_CONFIGURE} -D Trilinos_ENABLE_EXAMPLES:BOOL=ON"
+CMAKE_CONFIGURE="${CMAKE_CONFIGURE} -D Trilinos_ENABLE_TESTS:BOOL=ON"
+
+CMAKE_CONFIGURE="${CMAKE_CONFIGURE} -D Trilinos_ENABLE_KokkosCore:BOOL=ON"
+CMAKE_CONFIGURE="${CMAKE_CONFIGURE} -D Trilinos_ENABLE_KokkosContainers:BOOL=ON"
+CMAKE_CONFIGURE="${CMAKE_CONFIGURE} -D Trilinos_ENABLE_KokkosAlgorithms:BOOL=ON"
+CMAKE_CONFIGURE="${CMAKE_CONFIGURE} -D Trilinos_ENABLE_TpetraKernels:BOOL=ON"
+CMAKE_CONFIGURE="${CMAKE_CONFIGURE} -D Trilinos_ENABLE_KokkosExample:BOOL=ON"
+
+#-----------------------------------------------------------------------------
+# Pthread explicitly OFF
+
+CMAKE_CONFIGURE="${CMAKE_CONFIGURE} -D TPL_ENABLE_Pthread:BOOL=OFF"
+CMAKE_CONFIGURE="${CMAKE_CONFIGURE} -D Kokkos_ENABLE_Pthread:BOOL=OFF"
+
+#-----------------------------------------------------------------------------
+# OpenMP
+
+CMAKE_CONFIGURE="${CMAKE_CONFIGURE} -D Trilinos_ENABLE_OpenMP:BOOL=ON"
+CMAKE_CONFIGURE="${CMAKE_CONFIGURE} -D Kokkos_ENABLE_OpenMP:BOOL=ON"
+
+#-----------------------------------------------------------------------------
+# C++11
+
+#  CMAKE_CONFIGURE="${CMAKE_CONFIGURE} -D Trilinos_ENABLE_CXX11:BOOL=ON"
+#  CMAKE_CONFIGURE="${CMAKE_CONFIGURE} -D Kokkos_ENABLE_CXX11:BOOL=ON"
+
+#-----------------------------------------------------------------------------
+#
+# Remove CMake output files to force reconfigure from scratch.
+#
+
+rm -rf CMake* Trilinos* packages Dart* Testing cmake_install.cmake MakeFile*
+
+#
+
+echo cmake ${CMAKE_CONFIGURE} ${TRILINOS_SOURCE_DIR}
+
+cmake ${CMAKE_CONFIGURE} ${TRILINOS_SOURCE_DIR}
+
+#-----------------------------------------------------------------------------
+
diff --git a/lib/kokkos/config/kokkos_dev/config-core-intel-omp.sh b/lib/kokkos/config/kokkos_dev/config-core-intel-omp.sh
new file mode 100755
index 0000000000000000000000000000000000000000..fd56d41161a567bca1eb3601600cdc9fbe3b0104
--- /dev/null
+++ b/lib/kokkos/config/kokkos_dev/config-core-intel-omp.sh
@@ -0,0 +1,84 @@
+#!/bin/sh
+#
+# Copy this script, put it outside the Trilinos source directory, and
+# build there.
+#
+#-----------------------------------------------------------------------------
+# Building on 'kokkos-dev.sandia.gov' with enabled capabilities:
+#
+#   Intel, OpenMP
+#
+# module loaded on 'kokkos-dev.sandia.gov' for this build
+#
+#  module load  cmake/2.8.11.2  intel/13.SP1.1.106
+#
+#-----------------------------------------------------------------------------
+# Source and installation directories:
+
+TRILINOS_SOURCE_DIR=${HOME}/Trilinos
+TRILINOS_INSTALL_DIR=${HOME}/TrilinosInstall/`date +%F`
+
+CMAKE_CONFIGURE=""
+CMAKE_CONFIGURE="${CMAKE_CONFIGURE} -D CMAKE_INSTALL_PREFIX=${TRILINOS_INSTALL_DIR}"
+
+#-----------------------------------------------------------------------------
+# Debug/optimized
+
+# CMAKE_CONFIGURE="${CMAKE_CONFIGURE} -D CMAKE_BUILD_TYPE:STRING=DEBUG"
+# CMAKE_CONFIGURE="${CMAKE_CONFIGURE} -D Kokkos_ENABLE_BOUNDS_CHECK:BOOL=ON"
+
+CMAKE_CONFIGURE="${CMAKE_CONFIGURE} -D CMAKE_BUILD_TYPE:STRING=RELEASE"
+
+#-----------------------------------------------------------------------------
+
+CMAKE_CONFIGURE="${CMAKE_CONFIGURE} -D CMAKE_CXX_FLAGS:STRING=-Wall"
+CMAKE_CONFIGURE="${CMAKE_CONFIGURE} -D CMAKE_C_COMPILER=icc"
+CMAKE_CONFIGURE="${CMAKE_CONFIGURE} -D CMAKE_CXX_COMPILER=icpc"
+
+#-----------------------------------------------------------------------------
+# Configure for Kokkos subpackages and tests:
+
+CMAKE_CONFIGURE="${CMAKE_CONFIGURE} -D Trilinos_ENABLE_Fortran:BOOL=OFF"
+CMAKE_CONFIGURE="${CMAKE_CONFIGURE} -D Trilinos_ENABLE_ALL_PACKAGES:BOOL=OFF"
+CMAKE_CONFIGURE="${CMAKE_CONFIGURE} -D Trilinos_ENABLE_EXAMPLES:BOOL=ON"
+CMAKE_CONFIGURE="${CMAKE_CONFIGURE} -D Trilinos_ENABLE_TESTS:BOOL=ON"
+
+CMAKE_CONFIGURE="${CMAKE_CONFIGURE} -D Trilinos_ENABLE_KokkosCore:BOOL=ON"
+CMAKE_CONFIGURE="${CMAKE_CONFIGURE} -D Trilinos_ENABLE_KokkosContainers:BOOL=ON"
+CMAKE_CONFIGURE="${CMAKE_CONFIGURE} -D Trilinos_ENABLE_KokkosAlgorithms:BOOL=ON"
+CMAKE_CONFIGURE="${CMAKE_CONFIGURE} -D Trilinos_ENABLE_TpetraKernels:BOOL=ON"
+CMAKE_CONFIGURE="${CMAKE_CONFIGURE} -D Trilinos_ENABLE_KokkosExample:BOOL=ON"
+
+#-----------------------------------------------------------------------------
+# Pthread explicitly OFF
+
+CMAKE_CONFIGURE="${CMAKE_CONFIGURE} -D TPL_ENABLE_Pthread:BOOL=OFF"
+CMAKE_CONFIGURE="${CMAKE_CONFIGURE} -D Kokkos_ENABLE_Pthread:BOOL=OFF"
+
+#-----------------------------------------------------------------------------
+# OpenMP
+
+CMAKE_CONFIGURE="${CMAKE_CONFIGURE} -D Trilinos_ENABLE_OpenMP:BOOL=ON"
+CMAKE_CONFIGURE="${CMAKE_CONFIGURE} -D Kokkos_ENABLE_OpenMP:BOOL=ON"
+
+#-----------------------------------------------------------------------------
+# C++11
+
+#  CMAKE_CONFIGURE="${CMAKE_CONFIGURE} -D Trilinos_ENABLE_CXX11:BOOL=ON"
+#  CMAKE_CONFIGURE="${CMAKE_CONFIGURE} -D Kokkos_ENABLE_CXX11:BOOL=ON"
+
+#-----------------------------------------------------------------------------
+#
+# Remove CMake output files to force reconfigure from scratch.
+#
+
+rm -rf CMake* Trilinos* packages Dart* Testing cmake_install.cmake MakeFile*
+
+#
+
+echo cmake ${CMAKE_CONFIGURE} ${TRILINOS_SOURCE_DIR}
+
+cmake ${CMAKE_CONFIGURE} ${TRILINOS_SOURCE_DIR}
+
+#-----------------------------------------------------------------------------
+
diff --git a/lib/kokkos/config/kokkos_dev/config-core-omp.sh b/lib/kokkos/config/kokkos_dev/config-core-omp.sh
new file mode 100755
index 0000000000000000000000000000000000000000..f91ecd525488c40a1d92c9143e727a4a287dfefb
--- /dev/null
+++ b/lib/kokkos/config/kokkos_dev/config-core-omp.sh
@@ -0,0 +1,77 @@
+#!/bin/sh
+#
+# Copy this script, put it outside the Trilinos source directory, and
+# build there.
+#
+#-----------------------------------------------------------------------------
+# Building on 'kokkos-dev.sandia.gov' with enabled capabilities:
+#
+#   OpenMP
+#
+# module loaded on 'kokkos-dev.sandia.gov' for this build
+#
+#  module load  cmake/2.8.11.2  gcc/4.8.3
+#
+#-----------------------------------------------------------------------------
+# Source and installation directories:
+
+TRILINOS_SOURCE_DIR=${HOME}/Trilinos
+TRILINOS_INSTALL_DIR=${HOME}/TrilinosInstall/$(date +%F)
+# CMAKE_CONFIGURE accumulates every cmake argument in one space-separated string.
+CMAKE_CONFIGURE=""
+CMAKE_CONFIGURE="${CMAKE_CONFIGURE} -D CMAKE_INSTALL_PREFIX=${TRILINOS_INSTALL_DIR}"
+
+#-----------------------------------------------------------------------------
+# Debug/optimized
+
+# CMAKE_CONFIGURE="${CMAKE_CONFIGURE} -D CMAKE_BUILD_TYPE:STRING=DEBUG"
+# CMAKE_CONFIGURE="${CMAKE_CONFIGURE} -D Kokkos_ENABLE_BOUNDS_CHECK:BOOL=ON"
+
+CMAKE_CONFIGURE="${CMAKE_CONFIGURE} -D CMAKE_BUILD_TYPE:STRING=RELEASE"
+
+#-----------------------------------------------------------------------------
+
+CMAKE_CONFIGURE="${CMAKE_CONFIGURE} -D CMAKE_CXX_FLAGS:STRING=-Wall"
+CMAKE_CONFIGURE="${CMAKE_CONFIGURE} -D CMAKE_C_COMPILER=gcc"
+CMAKE_CONFIGURE="${CMAKE_CONFIGURE} -D CMAKE_CXX_COMPILER=g++"
+
+#-----------------------------------------------------------------------------
+# Configure for Kokkos subpackages and tests:
+
+CMAKE_CONFIGURE="${CMAKE_CONFIGURE} -D Trilinos_ENABLE_Fortran:BOOL=OFF"
+CMAKE_CONFIGURE="${CMAKE_CONFIGURE} -D Trilinos_ENABLE_ALL_PACKAGES:BOOL=OFF"
+CMAKE_CONFIGURE="${CMAKE_CONFIGURE} -D Trilinos_ENABLE_EXAMPLES:BOOL=ON"
+CMAKE_CONFIGURE="${CMAKE_CONFIGURE} -D Trilinos_ENABLE_TESTS:BOOL=ON"
+
+CMAKE_CONFIGURE="${CMAKE_CONFIGURE} -D Trilinos_ENABLE_KokkosCore:BOOL=ON"
+CMAKE_CONFIGURE="${CMAKE_CONFIGURE} -D Trilinos_ENABLE_KokkosContainers:BOOL=ON"
+CMAKE_CONFIGURE="${CMAKE_CONFIGURE} -D Trilinos_ENABLE_KokkosAlgorithms:BOOL=ON"
+CMAKE_CONFIGURE="${CMAKE_CONFIGURE} -D Trilinos_ENABLE_TpetraKernels:BOOL=ON"
+CMAKE_CONFIGURE="${CMAKE_CONFIGURE} -D Trilinos_ENABLE_KokkosExample:BOOL=ON"
+
+#-----------------------------------------------------------------------------
+# OpenMP
+
+CMAKE_CONFIGURE="${CMAKE_CONFIGURE} -D Trilinos_ENABLE_OpenMP:BOOL=ON"
+CMAKE_CONFIGURE="${CMAKE_CONFIGURE} -D Kokkos_ENABLE_OpenMP:BOOL=ON"
+
+# Pthread explicitly OFF, otherwise tribits will automatically turn it on
+
+CMAKE_CONFIGURE="${CMAKE_CONFIGURE} -D TPL_ENABLE_Pthread:BOOL=OFF"
+CMAKE_CONFIGURE="${CMAKE_CONFIGURE} -D Kokkos_ENABLE_Pthread:BOOL=OFF"
+
+#-----------------------------------------------------------------------------
+#
+# Remove CMake output files to force reconfigure from scratch.
+# The generated makefile is named 'Makefile', so that is the glob to delete.
+
+rm -rf CMake* Trilinos* packages Dart* Testing cmake_install.cmake Makefile*
+
+# Print the configure command line, then run it.
+
+echo cmake ${CMAKE_CONFIGURE} ${TRILINOS_SOURCE_DIR}
+
+cmake ${CMAKE_CONFIGURE} "${TRILINOS_SOURCE_DIR}"
+
+#-----------------------------------------------------------------------------
+
diff --git a/lib/kokkos/config/kokkos_dev/config-core-threads-hwloc.sh b/lib/kokkos/config/kokkos_dev/config-core-threads-hwloc.sh
new file mode 100755
index 0000000000000000000000000000000000000000..19ab96902340f6ad757fd84546afc2061bdad024
--- /dev/null
+++ b/lib/kokkos/config/kokkos_dev/config-core-threads-hwloc.sh
@@ -0,0 +1,87 @@
+#!/bin/sh
+#
+# Copy this script, put it outside the Trilinos source directory, and
+# build there.
+#
+#-----------------------------------------------------------------------------
+# Building on 'kokkos-dev.sandia.gov' with enabled capabilities:
+#
+#   Threads, hwloc
+#
+# module loaded on 'kokkos-dev.sandia.gov' for this build
+#
+#  module load  cmake/2.8.11.2  gcc/4.8.3
+#
+#-----------------------------------------------------------------------------
+# Source and installation directories:
+
+TRILINOS_SOURCE_DIR=${HOME}/Trilinos
+TRILINOS_INSTALL_DIR=${HOME}/TrilinosInstall/$(date +%F)
+# CMAKE_CONFIGURE accumulates every cmake argument in one space-separated string.
+CMAKE_CONFIGURE=""
+CMAKE_CONFIGURE="${CMAKE_CONFIGURE} -D CMAKE_INSTALL_PREFIX=${TRILINOS_INSTALL_DIR}"
+
+#-----------------------------------------------------------------------------
+# Debug/optimized
+
+# CMAKE_CONFIGURE="${CMAKE_CONFIGURE} -D CMAKE_BUILD_TYPE:STRING=DEBUG"
+# CMAKE_CONFIGURE="${CMAKE_CONFIGURE} -D Kokkos_ENABLE_BOUNDS_CHECK:BOOL=ON"
+
+CMAKE_CONFIGURE="${CMAKE_CONFIGURE} -D CMAKE_BUILD_TYPE:STRING=RELEASE"
+
+#-----------------------------------------------------------------------------
+
+CMAKE_CONFIGURE="${CMAKE_CONFIGURE} -D CMAKE_CXX_FLAGS:STRING=-Wall"
+CMAKE_CONFIGURE="${CMAKE_CONFIGURE} -D CMAKE_C_COMPILER=gcc"
+CMAKE_CONFIGURE="${CMAKE_CONFIGURE} -D CMAKE_CXX_COMPILER=g++"
+
+#-----------------------------------------------------------------------------
+# Configure for Kokkos subpackages and tests:
+
+CMAKE_CONFIGURE="${CMAKE_CONFIGURE} -D Trilinos_ENABLE_Fortran:BOOL=OFF"
+CMAKE_CONFIGURE="${CMAKE_CONFIGURE} -D Trilinos_ENABLE_ALL_PACKAGES:BOOL=OFF"
+CMAKE_CONFIGURE="${CMAKE_CONFIGURE} -D Trilinos_ENABLE_EXAMPLES:BOOL=ON"
+CMAKE_CONFIGURE="${CMAKE_CONFIGURE} -D Trilinos_ENABLE_TESTS:BOOL=ON"
+
+CMAKE_CONFIGURE="${CMAKE_CONFIGURE} -D Trilinos_ENABLE_KokkosCore:BOOL=ON"
+CMAKE_CONFIGURE="${CMAKE_CONFIGURE} -D Trilinos_ENABLE_KokkosContainers:BOOL=ON"
+CMAKE_CONFIGURE="${CMAKE_CONFIGURE} -D Trilinos_ENABLE_KokkosAlgorithms:BOOL=ON"
+CMAKE_CONFIGURE="${CMAKE_CONFIGURE} -D Trilinos_ENABLE_TpetraKernels:BOOL=ON"
+CMAKE_CONFIGURE="${CMAKE_CONFIGURE} -D Trilinos_ENABLE_KokkosExample:BOOL=ON"
+
+#-----------------------------------------------------------------------------
+# Hardware locality configuration:
+
+HWLOC_BASE_DIR="/home/projects/hwloc/1.7.1/host/gnu/4.7.3"
+
+CMAKE_CONFIGURE="${CMAKE_CONFIGURE} -D TPL_ENABLE_HWLOC:BOOL=ON"
+CMAKE_CONFIGURE="${CMAKE_CONFIGURE} -D HWLOC_INCLUDE_DIRS:FILEPATH=${HWLOC_BASE_DIR}/include"
+CMAKE_CONFIGURE="${CMAKE_CONFIGURE} -D HWLOC_LIBRARY_DIRS:FILEPATH=${HWLOC_BASE_DIR}/lib"
+
+#-----------------------------------------------------------------------------
+# Pthread
+
+CMAKE_CONFIGURE="${CMAKE_CONFIGURE} -D TPL_ENABLE_Pthread:BOOL=ON"
+CMAKE_CONFIGURE="${CMAKE_CONFIGURE} -D Kokkos_ENABLE_Pthread:BOOL=ON"
+
+#-----------------------------------------------------------------------------
+# C++11
+
+#  CMAKE_CONFIGURE="${CMAKE_CONFIGURE} -D Trilinos_ENABLE_CXX11:BOOL=ON"
+#  CMAKE_CONFIGURE="${CMAKE_CONFIGURE} -D Kokkos_ENABLE_CXX11:BOOL=ON"
+
+#-----------------------------------------------------------------------------
+#
+# Remove CMake output files to force reconfigure from scratch.
+# The generated makefile is named 'Makefile', so that is the glob to delete.
+
+rm -rf CMake* Trilinos* packages Dart* Testing cmake_install.cmake Makefile*
+
+# Print the configure command line, then run it.
+
+echo cmake ${CMAKE_CONFIGURE} ${TRILINOS_SOURCE_DIR}
+
+cmake ${CMAKE_CONFIGURE} "${TRILINOS_SOURCE_DIR}"
+
+#-----------------------------------------------------------------------------
+
diff --git a/lib/kokkos/config/master_history.txt b/lib/kokkos/config/master_history.txt
new file mode 100644
index 0000000000000000000000000000000000000000..f2eb674578f2c14442376210dfd1080050fe3917
--- /dev/null
+++ b/lib/kokkos/config/master_history.txt
@@ -0,0 +1,3 @@
+tag:  2.01.00    date: 07:21:2016    master: xxxxxxxx    develop: fa6dfcc4
+tag:  2.01.06    date: 09:02:2016    master: 9afaa87f    develop: 555f1a3a
+
diff --git a/lib/kokkos/config/nvcc_wrapper b/lib/kokkos/config/nvcc_wrapper
new file mode 100755
index 0000000000000000000000000000000000000000..6093cb61bdaf5a3f030406b8e149580b818920d0
--- /dev/null
+++ b/lib/kokkos/config/nvcc_wrapper
@@ -0,0 +1,280 @@
+#!/bin/bash
+#
+# This shell script (nvcc_wrapper) wraps both the host compiler and
+# NVCC, if you are building legacy C or C++ code with CUDA enabled.
+# The script remedies some differences between the interface of NVCC
+# and that of the host compiler, in particular for linking.
+# It also means that a legacy code doesn't need separate .cu files;
+# it can just use .cpp files.
+#
+# Default settings: change those according to your machine.  For
+# example, you may have two different wrappers with either icpc
+# or g++ as their back-end compiler.  The defaults can be overwritten
+# by using the usual arguments (e.g., -arch=sm_30 -ccbin icpc).
+
+default_arch="sm_35"
+#default_arch="sm_50"
+
+#
+# The default C++ (host) compiler; the NVCC_WRAPPER_DEFAULT_COMPILER
+# environment variable, when set, replaces the built-in "g++".
+host_compiler=${NVCC_WRAPPER_DEFAULT_COMPILER:-"g++"}
+#host_compiler="icpc"
+#host_compiler="/usr/local/gcc/4.8.3/bin/g++"
+#host_compiler="/usr/local/gcc/4.9.1/bin/g++"
+
+#
+# Internal variables (filled in while the command line is parsed)
+#
+
+# Source files that are compiled as CUDA files
+cpp_files=""
+
+# Host compiler arguments, comma-joined for a single -Xcompiler option
+xcompiler_args=""
+
+# Cuda (NVCC) only arguments
+cuda_args=""
+
+# Arguments for both NVCC and Host compiler
+shared_args=""
+
+# Linker arguments, each one prefixed with -Xlinker
+xlinker_args=""
+
+# Object files passable to NVCC (used when there are no source files)
+object_files=""
+
+# The same files prefixed with -Xlinker (used when sources are compiled with -x cu)
+object_files_xlinker=""
+
+# Shared libraries with version numbers (*.so.*) are not handled correctly by NVCC
+shared_versioned_libraries_host=""
+shared_versioned_libraries=""
+
+# Set to 1 when an -arch* argument is given; otherwise default_arch is used
+arch_set=0
+
+# Set to 1 when -ccbin is given, i.e. the user overrides the host compiler
+ccbin_set=0
+
+# Exit status of the compile command; returned as this script's exit status
+error_code=0
+
+# Do a dry run: print the command without actually compiling
+dry_run=0
+
+# Skip NVCC compilation and use host compiler directly
+host_only=0
+
+# Enable workaround for CUDA 6.5 for pragma ident (--replace-pragma-ident)
+replace_pragma_ident=0
+
+# 1 until the first host compiler argument is stored; later ones are comma-appended
+first_xcompiler_arg=1
+# Directory that receives the rewritten sources of the pragma ident workaround
+temp_dir=${TMPDIR:-/tmp}
+
+# Check if we have an optimization argument already
+optimization_applied=0
+
+#echo "Arguments: $# $@"
+
+while [ $# -gt 0 ]
+do
+  case $1 in
+  #print the command that would be executed instead of running it
+  --show|--nvcc-wrapper-show)
+    dry_run=1
+    ;;
+  #run host compilation only
+  --host-only)
+    host_only=1
+    ;;
+  #replace '#pragma ident' with '#ident'; this is needed to compile OpenMPI due to a configure script bug and a non-standardized behaviour of pragma with macros
+  --replace-pragma-ident)
+    replace_pragma_ident=1
+    ;;
+  #handle source files to be compiled as cuda files
+  *.cpp|*.cxx|*.cc|*.C|*.c++|*.cu)
+    cpp_files="$cpp_files $1"
+    ;;
+   # Ensure we only have one optimization flag because NVCC doesn't allow multiple
+  -O*)
+    if [ $optimization_applied -eq 1 ]; then
+       echo "nvcc_wrapper - *warning* you have set multiple optimization flags (-O*), only the first is used because nvcc can only accept a single optimization setting."
+    else
+       shared_args="$shared_args $1"
+       optimization_applied=1
+    fi
+    ;;
+  #Handle shared args (valid for both nvcc and the host compiler)
+  -D*|-c|-I*|-L*|-l*|-g|--help|--version|-E|-M|-shared)
+    shared_args="$shared_args $1"
+    ;;
+  #Handle shared args that have an argument (the extra shift consumes it)
+  -o|-MT)
+    shared_args="$shared_args $1 $2"
+    shift
+    ;;
+  #Handle known nvcc args
+  -gencode*|--dryrun|--verbose|--keep|--keep-dir*|-G|--relocatable-device-code*|-lineinfo|-expt-extended-lambda|--resource-usage|-Xptxas*)
+    cuda_args="$cuda_args $1"
+    ;;
+  #Handle known nvcc args that have an argument
+  -rdc|-maxrregcount|--default-stream)
+    cuda_args="$cuda_args $1 $2"
+    shift
+    ;;
+  #Handle c++11 setting
+  --std=c++11|-std=c++11)
+    shared_args="$shared_args $1"
+    ;;
+  #strip off -std=c++98 due to nvcc warnings; Tribits will place both -std=c++11 and -std=c++98
+  -std=c++98|--std=c++98)
+    ;;
+  #strip off pedantic because it produces endless warnings about #LINE added by the preprocessor
+  -pedantic|-Wpedantic|-ansi)
+    ;;
+  #strip -Xcompiler because we add it; only its argument is kept
+  -Xcompiler)
+    if [ $first_xcompiler_arg -eq 1 ]; then
+      xcompiler_args="$2"
+      first_xcompiler_arg=0
+    else
+      xcompiler_args="$xcompiler_args,$2"
+    fi
+    shift
+    ;;
+  #strip off "-x cu" because we add that; any other "-x <lang>" goes to the host compiler
+  -x)
+    if [[ $2 != "cu" ]]; then
+      if [ $first_xcompiler_arg -eq 1 ]; then
+        xcompiler_args="-x,$2"
+        first_xcompiler_arg=0
+      else
+        xcompiler_args="$xcompiler_args,-x,$2"
+      fi
+    fi
+    shift
+    ;;
+  #Handle -ccbin (if it's not set we can set it to a default value)
+  -ccbin)
+    cuda_args="$cuda_args $1 $2"
+    ccbin_set=1
+    host_compiler=$2
+    shift
+    ;;
+  #Handle -arch argument (if it's not set, a default is used)
+  -arch*)
+    cuda_args="$cuda_args $1"
+    arch_set=1
+    ;;
+  #Handle -Xcudafe argument
+  -Xcudafe)
+    cuda_args="$cuda_args -Xcudafe $2"
+    shift
+    ;;
+  #Handle args that should be sent to the linker; ${1:4:...} drops the leading "-Wl," (4 characters)
+  -Wl*)
+    xlinker_args="$xlinker_args -Xlinker ${1:4:${#1}}"
+    host_linker_args="$host_linker_args ${1:4:${#1}}"
+    ;;
+  #Handle object files: -x cu applies to all input files, so give them to linker, except if only linking
+  *.a|*.so|*.o|*.obj)
+    object_files="$object_files $1"
+    object_files_xlinker="$object_files_xlinker -Xlinker $1"
+    ;;
+  #Handle object files which always need to use "-Xlinker": -x cu applies to all input files, so give them to linker, except if only linking
+  *.dylib)
+    object_files="$object_files -Xlinker $1"
+    object_files_xlinker="$object_files_xlinker -Xlinker $1"
+    ;;
+  #Handle shared libraries with *.so.* names which nvcc can't do.
+  *.so.*)
+    shared_versioned_libraries_host="$shared_versioned_libraries_host $1"
+    shared_versioned_libraries="$shared_versioned_libraries -Xlinker $1"
+  ;;
+  #All other args are sent to the host compiler
+  *)
+    if [ $first_xcompiler_arg -eq 1 ]; then
+      xcompiler_args=$1
+      first_xcompiler_arg=0
+    else 
+      xcompiler_args="$xcompiler_args,$1"
+    fi
+    ;;
+  esac
+
+  shift
+done
+
+#Add default host compiler if necessary
+if [ $ccbin_set -ne 1 ]; then
+  cuda_args="$cuda_args -ccbin $host_compiler"
+fi
+
+#Add the default architecture unless an -arch* argument was given
+if [ $arch_set -ne 1 ]; then
+  cuda_args="$cuda_args -arch=$default_arch"
+fi
+
+#Compose compilation command; host compiler arguments travel in one -Xcompiler list
+nvcc_command="nvcc $cuda_args $shared_args $xlinker_args $shared_versioned_libraries"
+if [ $first_xcompiler_arg -eq 0 ]; then
+  nvcc_command="$nvcc_command -Xcompiler $xcompiler_args"
+fi
+
+#Compose host only command
+host_command="$host_compiler $shared_args $xcompiler_args $host_linker_args $shared_versioned_libraries_host"
+
+#nvcc does not accept '#pragma ident SOME_MACRO_STRING' but it does accept '#ident SOME_MACRO_STRING'
+if [ $replace_pragma_ident -eq 1 ]; then
+  cpp_files2=""
+  for file in $cpp_files
+  do
+    var=`grep pragma ${file} | grep ident | grep "#"`
+    if [ "${#var}" -gt 0 ]; then
+      #Flatten '/' to '_' so a source given with a directory part still gets its copy directly inside $temp_dir
+      tmp_file="$temp_dir/nvcc_wrapper_tmp_${file//\//_}"
+      sed 's/#[\ \t]*pragma[\ \t]*ident/#ident/g' $file > $tmp_file
+      cpp_files2="$cpp_files2 $tmp_file"
+    else
+      cpp_files2="$cpp_files2 $file"
+    fi
+  done
+  cpp_files=$cpp_files2
+fi
+
+#With sources present, -x cu applies to every input file, so objects are passed through -Xlinker
+if [ "$cpp_files" ]; then
+  nvcc_command="$nvcc_command $object_files_xlinker -x cu $cpp_files"
+else
+  nvcc_command="$nvcc_command $object_files"
+fi
+
+if [ "$cpp_files" ]; then
+  host_command="$host_command $object_files $cpp_files"
+else
+  host_command="$host_command $object_files"
+fi
+
+#Print command for dryrun (nothing is executed)
+if [ $dry_run -eq 1 ]; then
+  if [ $host_only -eq 1 ]; then
+    echo $host_command
+  else
+    echo $nvcc_command
+  fi
+  exit 0
+fi
+
+#Run compilation command (word splitting of the command string is intended)
+if [ $host_only -eq 1 ]; then
+  $host_command
+else
+  $nvcc_command
+fi
+error_code=$?
+
+#Report error code
+exit $error_code
diff --git a/lib/kokkos/config/snapshot.py b/lib/kokkos/config/snapshot.py
new file mode 100755
index 0000000000000000000000000000000000000000..d816cd0c9c1962c0c95c672467319474a72669fb
--- /dev/null
+++ b/lib/kokkos/config/snapshot.py
@@ -0,0 +1,279 @@
+#! /usr/bin/env python
+
+"""
+Snapshot a project into another project and perform the necessary repo actions
+to provide a commit message that can be used to trace back to the exact point
+in the source repository.
+"""
+
+#todo:
+#  Support svn
+#  Allow renaming of the source dir in the destination path
+#  Check if a new snapshot is necessary?
+#
+
+import sys
+
+#check the version number so that there is a good error message when argparse is not available.
+#This checks for exactly 2.7 which is bad, but it is a python 2 script and argparse was introduced
+#in 2.7 which is also the last version of python 2. If this script is updated for python 3 this
+#will need to change, but for now it is not safe to allow 3.x to run this.
+if sys.version_info[:2] != (2, 7):
+  print "Error snapshot requires python 2.7 detected version is %d.%d." % (sys.version_info[0], sys.version_info[1])
+  sys.exit(1)
+
+import subprocess, argparse, re, doctest, os, datetime, traceback
+
+def parse_cmdline(description):  # builds the parser, parses the command line, returns the validated options
+  parser = argparse.ArgumentParser(usage="snapshot.py [options] source destination", description=description)
+  # Optional switches. Note that the long form of -n is spelled "--no-comit" (sic); it clears create_commit.
+  parser.add_argument("-n", "--no-comit", action="store_false", dest="create_commit", default=True,
+                      help="Do not perform a commit or create a commit message.")
+  parser.add_argument("-v", "--verbose", action="store_true", dest="verbose_mode", default=False,
+                      help="Enable verbose mode.")
+  parser.add_argument("-d", "--debug", action="store_true", dest="debug_mode", default=False,
+                      help="Enable debugging output.")
+  parser.add_argument("--no-validate-repo", action="store_true", dest="no_validate_repo", default=False,
+                      help="Reduce the validation that the source and destination repos are clean to a warning.")
+  parser.add_argument("--source-repo", choices=["git","none"], default="",
+                      help="Type of repository of the source, use none to skip all repository operations.")
+  parser.add_argument("--dest-repo", choices=["git","none"], default="",
+                      help="Type of repository of the destination, use none to skip all repository operations.")
+  # Positional arguments.
+  parser.add_argument("source",      help="Source project to snapshot from.")
+  parser.add_argument("destination", help="Destination to snapshot too.")
+  # validate_options() normalizes the paths and resolves the repository types.
+  options = parser.parse_args()
+  options = validate_options(options)
+  return options
+#end parse_cmdline
+
+def validate_options(options):
+  apparent_source_repo_type="none"
+  apparent_dest_repo_type="none"
+
+  #prevent user from accidentally giving us a path that rsync will treat differently than expected.
+  options.source      = options.source.rstrip(os.sep)
+  options.destination = options.destination.rstrip(os.sep)
+
+  options.source      = os.path.abspath(options.source)
+  options.destination = os.path.abspath(options.destination)
+  
+  if os.path.exists(options.source):
+    apparent_source_repo_type, source_root = deterimine_repo_type(options.source)
+  else:
+    raise RuntimeError("Could not find source directory of %s." % options.source)
+  options.source_root = source_root
+
+  if not os.path.exists(options.destination):
+    print "Could not find destination directory of %s so it will be created." % options.destination
+    os.makedirs(options.destination)
+
+  apparent_dest_repo_type, dest_root = deterimine_repo_type(options.destination)
+  options.dest_root = dest_root
+
+  #error on svn repo types for now
+  if apparent_source_repo_type == "svn" or apparent_dest_repo_type == "svn":
+    raise RuntimeError("SVN repositories are not supported at this time.")
+
+  if options.source_repo == "":
+    #source repo type is not specified to just using the apparent type.
+    options.source_repo = apparent_source_repo_type
+  else:
+    if options.source_repo != "none" and options.source_repo != apparent_source_repo_type:
+      raise RuntimeError("Specified source repository type of %s conflicts with determined type of %s" % \
+        (options.source_repo, apparent_source_repo_type))
+
+  if options.dest_repo == "":
+    #destination repo type is not specified to just using the apparent type.
+    options.dest_repo = apparent_dest_repo_type
+  else:
+    if options.dest_repo != "none" and options.dest_repo != apparent_dest_repo_type:
+      raise RuntimeError("Specified destination repository type of %s conflicts with determined type of %s" % \
+        (options.dest_repo, apparent_dest_repo_type))
+
+  return options
+#end validate_options
+
+def run_cmd(cmd, options, working_dir="."):
+  cmd_str = " ".join(cmd)
+  if options.verbose_mode:
+    print "Running command '%s' in dir %s." % (cmd_str, working_dir)
+
+  proc = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE, cwd=working_dir)
+  proc_stdout, proc_stderr = proc.communicate()
+  ret_val = proc.wait()
+
+  if options.debug_mode:
+    print "==== %s stdout start ====" % cmd_str
+    print proc_stdout
+    print "==== %s stdout end ====" % cmd_str
+    print "==== %s stderr ====" % cmd_str
+    print proc_stderr
+    print "==== %s stderr ====" % cmd_str
+ 
+  if ret_val != 0:
+    raise RuntimeError("Command '%s' failed with error code %d. Error message:%s%s%sstdout:%s" % \
+      (cmd_str, ret_val, os.linesep, proc_stderr, os.linesep, proc_stdout))
+
+  return proc_stdout, proc_stderr
+#end run_cmd
+
+def deterimine_repo_type(location):
+  apparent_repo_type = "none"
+
+  while location != "":
+    if os.path.exists(os.path.join(location, ".git")):
+      apparent_repo_type = "git"
+      break
+    elif os.path.exists(os.path.join(location, ".svn")):
+      apparent_repo_type = "svn"
+      break
+    else:
+      location = location[:location.rfind(os.sep)]
+
+  return apparent_repo_type, location
+
+#end deterimine_repo_type
+
+def rsync(source, dest, options):
+  rsync_cmd = ["rsync", "-ar", "--delete"]
+  if options.debug_mode:
+    rsync_cmd.append("-v")
+
+  if options.source_repo == "git":
+    rsync_cmd.append("--exclude=.git")
+
+  rsync_cmd.append(options.source)
+  rsync_cmd.append(options.destination)
+  run_cmd(rsync_cmd, options)
+#end rsync
+
+def create_commit_message(commit_id, commit_log, project_name, project_location):
+  eol = os.linesep
+  message = "Snapshot of %s from commit %s" % (project_name, commit_id)
+  message += eol * 2
+  message += "From repository at %s" % project_location
+  message += eol * 2
+  message += "At commit:" + eol
+  message += commit_log
+  return message
+#end create_commit_message
+
+def find_git_commit_information(options):
+  r"""
+  >>> class fake_options:
+  ...   source="."
+  ...   verbose_mode=False
+  ...   debug_mode=False
+  >>> myoptions = fake_options()
+  >>> find_git_commit_information(myoptions)[2:]
+  ('sems', 'software.sandia.gov:/git/sems')
+  """
+  git_log_cmd = ["git", "log", "-1"]
+  
+  output, error = run_cmd(git_log_cmd, options, options.source)
+  
+  commit_match = re.match("commit ([0-9a-fA-F]+)", output)
+  commit_id = commit_match.group(1)
+  commit_log = output
+  
+  git_remote_cmd = ["git", "remote", "-v"]
+  output, error = run_cmd(git_remote_cmd, options, options.source)
+  
+  remote_match = re.search("origin\s([^ ]*/([^ ]+))", output, re.MULTILINE)
+  if not remote_match:
+    raise RuntimeError("Could not find origin of repo at %s. Consider using none for source repo type." % (options.source))
+
+  source_location = remote_match.group(1)
+  source_name     = remote_match.group(2).strip()
+  
+  if source_name[-1] == "/":
+    source_name = source_name[:-1]
+
+  return commit_id, commit_log, source_name, source_location
+
+#end find_git_commit_information
+
+def do_git_commit(message, options):
+  if options.verbose_mode:
+    print "Commiting to destination repository."
+
+  git_add_cmd = ["git", "add", "-A"]
+  run_cmd(git_add_cmd, options, options.destination)
+  
+  git_commit_cmd = ["git", "commit", "-m%s" % message]
+  run_cmd(git_commit_cmd, options, options.destination)
+  
+  git_log_cmd = ["git", "log", "--format=%h", "-1"]
+  commit_sha1, error = run_cmd(git_log_cmd, options, options.destination)
+
+  print "Commit %s was made to %s." % (commit_sha1.strip(), options.dest_root)
+#end do_git_commit
+
+def verify_git_repo_clean(location, options):
+  git_status_cmd = ["git", "status", "--porcelain"]
+  output, error = run_cmd(git_status_cmd, options, location)
+  
+  if output != "":
+    if options.no_validate_repo == False:
+      raise RuntimeError("%s is not clean.%sPlease commit or stash all changes before running snapshot."
+        % (location, os.linesep))
+    else:
+      print "WARNING: %s is not clean. Proceeding anyway." % location
+      print "WARNING:   This could lead to differences in the source and destination."
+      print "WARNING:   It could also lead to extra files being included in the snapshot commit."
+
+#end verify_git_repo_clean
+
+def main(options):
+  if options.verbose_mode:
+    print "Snapshotting %s to %s." % (options.source, options.destination)
+
+  if options.source_repo == "git":
+    verify_git_repo_clean(options.source, options)
+    commit_id, commit_log, repo_name, repo_location = find_git_commit_information(options)
+  elif options.source_repo == "none":
+    commit_id     = "N/A"
+    commit_log    = "Unknown commit from %s snapshotted at: %s" % (options.source, datetime.datetime.now())
+    repo_name     = options.source
+    repo_location = options.source
+    
+  commit_message = create_commit_message(commit_id, commit_log, repo_name, repo_location) + os.linesep*2
+  
+  if options.dest_repo == "git":
+    verify_git_repo_clean(options.destination, options)
+
+  rsync(options.source, options.destination, options)
+  
+  if options.dest_repo == "git":
+    do_git_commit(commit_message, options)
+  elif options.dest_repo == "none":
+    file_name = "snapshot_message.txt"
+    message_file = open(file_name, "w")
+    message_file.write(commit_message)
+    message_file.close()
+    cwd = os.getcwd()
+    print "No commit done by request. Please use file at:"
+    print "%s%sif you wish to commit this to a repo later." % (cwd+"/"+file_name, os.linesep)
+  
+  
+  
+  
+#end main
+
+if (__name__ == "__main__"):
+  if ("--test" in sys.argv):
+    doctest.testmod()
+    sys.exit(0)
+
+  try:    
+    options = parse_cmdline(__doc__)
+    main(options)
+  except RuntimeError, e:
+    print "Error occured:", e
+    if "--debug" in sys.argv:
+      traceback.print_exc()
+    sys.exit(1)
+  else:  
+    sys.exit(0)
diff --git a/lib/kokkos/config/test_all_sandia b/lib/kokkos/config/test_all_sandia
new file mode 100755
index 0000000000000000000000000000000000000000..aac036a8f37abfedabac7a4849289ecb3cbdfcd0
--- /dev/null
+++ b/lib/kokkos/config/test_all_sandia
@@ -0,0 +1,539 @@
+#!/bin/bash -e
+
+#
+# Global config
+#
+
+set -o pipefail
+
+# Determine current machine
+
+MACHINE=""
+HOSTNAME=$(hostname)
+if [[ "$HOSTNAME" =~ (white|ride).* ]]; then # regex is unanchored: matches anywhere in the hostname
+    MACHINE=white
+elif [[ "$HOSTNAME" =~ .*bowman.* ]]; then
+    MACHINE=bowman
+elif [[ "$HOSTNAME" =~ node.* ]]; then # Warning: very generic name
+    MACHINE=shepard
+elif [ ! -z "$SEMS_MODULEFILES_ROOT" ]; then # otherwise accept any host where this variable is set
+    MACHINE=sems
+else
+    echo "Unrecognized machine" >&2
+    exit 1
+fi
+
+GCC_BUILD_LIST="OpenMP,Pthread,Serial,OpenMP_Serial,Pthread_Serial" # each item is later passed as --with-devices=<item>
+IBM_BUILD_LIST="OpenMP,Serial,OpenMP_Serial"
+INTEL_BUILD_LIST="OpenMP,Pthread,Serial,OpenMP_Serial,Pthread_Serial"
+CLANG_BUILD_LIST="Pthread,Serial,Pthread_Serial"
+CUDA_BUILD_LIST="Cuda_OpenMP,Cuda_Pthread,Cuda_Serial"
+
+GCC_WARNING_FLAGS="-Wall,-Wshadow,-pedantic,-Werror,-Wsign-compare,-Wtype-limits,-Wignored-qualifiers,-Wempty-body,-Wclobbered,-Wuninitialized" # comma-separated; get_compiler_data turns commas into spaces
+IBM_WARNING_FLAGS="-Wall,-Wshadow,-pedantic,-Werror,-Wsign-compare,-Wtype-limits,-Wuninitialized"
+CLANG_WARNING_FLAGS="-Wall,-Wshadow,-pedantic,-Werror,-Wsign-compare,-Wtype-limits,-Wuninitialized"
+INTEL_WARNING_FLAGS="-Wall,-Wshadow,-pedantic,-Werror,-Wsign-compare,-Wtype-limits,-Wuninitialized"
+CUDA_WARNING_FLAGS=""
+
+# Default. Machine specific can override
+DEBUG=False # set by --debug
+ARGS="" # accumulates the non-option arguments (compiler name prefixes)
+CUSTOM_BUILD_LIST="" # set by --build-list=...
+DRYRUN=False # set by --dry-run; run_cmd then only prints
+BUILD_ONLY=False # set by --build-only
+declare -i NUM_JOBS_TO_RUN_IN_PARALLEL=3 # set by --num=N
+TEST_SCRIPT=False # set by --test-script
+SKIP_HWLOC=False # set by --skip-hwloc
+
+ARCH_FLAG="" # extra generate_makefile.bash argument, e.g. --arch=KNL
+
+#
+# Machine specific config
+#
+
+if [ "$MACHINE" = "sems" ]; then
+    source /projects/modulefiles/utils/sems-modules-init.sh
+    source /projects/modulefiles/utils/kokkos-modules-init.sh
+
+    BASE_MODULE_LIST="<COMPILER_NAME>/<COMPILER_VERSION>/base,hwloc/1.10.1/<COMPILER_NAME>/<COMPILER_VERSION>/base"
+    CUDA_MODULE_LIST="<COMPILER_NAME>/<COMPILER_VERSION>,gcc/4.7.2/base"
+    NVCC_WRAPPER="${KOKKOS_PATH:-$(cd "$(dirname "$0")/.." && pwd)}/config/nvcc_wrapper" # KOKKOS_PATH is only assigned after argument parsing, so fall back to this script's repo root when it is unset here
+    # Format: (compiler module-list build-list exe-name warning-flag); entries are split on whitespace later
+    COMPILERS=("gcc/4.7.2 $BASE_MODULE_LIST $GCC_BUILD_LIST g++ $GCC_WARNING_FLAGS"
+               "gcc/4.8.4 $BASE_MODULE_LIST $GCC_BUILD_LIST g++ $GCC_WARNING_FLAGS"
+               "gcc/4.9.2 $BASE_MODULE_LIST $GCC_BUILD_LIST g++ $GCC_WARNING_FLAGS"
+               "gcc/5.1.0 $BASE_MODULE_LIST $GCC_BUILD_LIST g++ $GCC_WARNING_FLAGS"
+               "intel/14.0.4 $BASE_MODULE_LIST $INTEL_BUILD_LIST icpc $INTEL_WARNING_FLAGS"
+               "intel/15.0.2 $BASE_MODULE_LIST $INTEL_BUILD_LIST icpc $INTEL_WARNING_FLAGS"
+               "intel/16.0.1 $BASE_MODULE_LIST $INTEL_BUILD_LIST icpc $INTEL_WARNING_FLAGS"
+               "clang/3.5.2 $BASE_MODULE_LIST $CLANG_BUILD_LIST clang++ $CLANG_WARNING_FLAGS"
+               "clang/3.6.1 $BASE_MODULE_LIST $CLANG_BUILD_LIST clang++ $CLANG_WARNING_FLAGS"
+               "cuda/6.5.14 $CUDA_MODULE_LIST $CUDA_BUILD_LIST $NVCC_WRAPPER $CUDA_WARNING_FLAGS"
+               "cuda/7.0.28 $CUDA_MODULE_LIST $CUDA_BUILD_LIST $NVCC_WRAPPER $CUDA_WARNING_FLAGS"
+               "cuda/7.5.18 $CUDA_MODULE_LIST $CUDA_BUILD_LIST $NVCC_WRAPPER $CUDA_WARNING_FLAGS"
+    )
+
+elif [ "$MACHINE" = "white" ]; then
+    source /etc/profile.d/modules.sh
+    SKIP_HWLOC=True
+    export SLURM_TASKS_PER_NODE=32
+
+    BASE_MODULE_LIST="<COMPILER_NAME>/<COMPILER_VERSION>"
+    IBM_MODULE_LIST="<COMPILER_NAME>/xl/<COMPILER_VERSION>"
+    CUDA_MODULE_LIST="<COMPILER_NAME>/<COMPILER_VERSION>,gcc/4.9.2"
+
+    # Don't do pthread on white
+    GCC_BUILD_LIST="OpenMP,Serial,OpenMP_Serial"
+
+    # Format: (compiler module-list build-list exe-name warning-flag); gcc rows use the override set just above
+    COMPILERS=("gcc/4.9.2 $BASE_MODULE_LIST $GCC_BUILD_LIST g++ $GCC_WARNING_FLAGS"
+               "gcc/5.3.0 $BASE_MODULE_LIST $GCC_BUILD_LIST g++ $GCC_WARNING_FLAGS"
+               "ibm/13.1.3 $IBM_MODULE_LIST $IBM_BUILD_LIST xlC $IBM_WARNING_FLAGS"
+    )
+
+    ARCH_FLAG="--arch=Power8"
+    NUM_JOBS_TO_RUN_IN_PARALLEL=8
+
+elif [ "$MACHINE" = "bowman" ]; then
+    source /etc/profile.d/modules.sh
+    SKIP_HWLOC=True
+    export SLURM_TASKS_PER_NODE=32
+
+    BASE_MODULE_LIST="<COMPILER_NAME>/compilers/<COMPILER_VERSION>"
+
+    OLD_INTEL_BUILD_LIST="Pthread,Serial,Pthread_Serial"
+
+    # Format: (compiler module-list build-list exe-name warning-flag)
+    COMPILERS=("intel/16.2.181 $BASE_MODULE_LIST $OLD_INTEL_BUILD_LIST icpc $INTEL_WARNING_FLAGS"
+               "intel/17.0.064 $BASE_MODULE_LIST $INTEL_BUILD_LIST icpc $INTEL_WARNING_FLAGS"
+    )
+
+    ARCH_FLAG="--arch=KNL"
+    NUM_JOBS_TO_RUN_IN_PARALLEL=8
+
+elif [ "$MACHINE" = "shepard" ]; then
+    source /etc/profile.d/modules.sh
+    SKIP_HWLOC=True
+    export SLURM_TASKS_PER_NODE=32
+
+    BASE_MODULE_LIST="<COMPILER_NAME>/compilers/<COMPILER_VERSION>"
+
+    OLD_INTEL_BUILD_LIST="Pthread,Serial,Pthread_Serial"
+
+    # Format: (compiler module-list build-list exe-name warning-flag)
+    COMPILERS=("intel/16.2.181 $BASE_MODULE_LIST $OLD_INTEL_BUILD_LIST icpc $INTEL_WARNING_FLAGS"
+               "intel/17.0.064 $BASE_MODULE_LIST $INTEL_BUILD_LIST icpc $INTEL_WARNING_FLAGS"
+    )
+
+    ARCH_FLAG="--arch=HSW"
+    NUM_JOBS_TO_RUN_IN_PARALLEL=8
+
+else
+    echo "Unhandled machine $MACHINE" >&2
+    exit 1
+fi
+
+export OMP_NUM_THREADS=4
+
+declare -i NUM_RESULTS_TO_KEEP=7 # get_test_root_dir prunes older result directories down to this many
+
+RESULT_ROOT_PREFIX=TestAll # name prefix of each per-run results directory
+
+SCRIPT_KOKKOS_ROOT=$( cd "$( dirname "$0" )" && cd .. && pwd ) # parent of this script's directory; the default KOKKOS_PATH
+
+#
+# Handle arguments
+#
+
+while [[ $# -gt 0 ]]
+do
+key="$1"
+case $key in
+--kokkos-path*)
+KOKKOS_PATH="${key#*=}" # value is whatever follows the first '='
+;;
+--build-list*)
+CUSTOM_BUILD_LIST="${key#*=}"
+;;
+--debug*)
+DEBUG=True
+;;
+--build-only*)
+BUILD_ONLY=True
+;;
+--test-script*)
+TEST_SCRIPT=True
+;;
+--skip-hwloc*)
+SKIP_HWLOC=True
+;;
+--num*)
+NUM_JOBS_TO_RUN_IN_PARALLEL="${key#*=}"
+;;
+--dry-run*)
+DRYRUN=True
+;;
+--help)
+echo "test_all_sandia <ARGS> <OPTIONS>:"
+echo "--kokkos-path=/Path/To/Kokkos: Path to the Kokkos root directory"
+echo "    Defaults to root repo containing this script"
+echo "--debug: Run tests in debug. Defaults to False"
+echo "--test-script: Test this script, not Kokkos"
+echo "--skip-hwloc: Do not do hwloc tests"
+echo "--num=N: Number of jobs to run in parallel "
+echo "--dry-run: Just print what would be executed"
+echo "--build-only: Just do builds, don't run anything"
+echo "--build-list=BUILD,BUILD,BUILD..."
+echo "    Provide a comma-separated list of builds instead of running all builds"
+echo "    Valid items:"
+echo "      OpenMP, Pthread, Serial, OpenMP_Serial, Pthread_Serial"
+echo "      Cuda_OpenMP, Cuda_Pthread, Cuda_Serial"
+echo ""
+
+echo "ARGS: list of expressions matching compilers to test"
+echo "  supported compilers $MACHINE"
+for COMPILER_DATA in "${COMPILERS[@]}"; do
+    ARR=($COMPILER_DATA)
+    COMPILER=${ARR[0]}
+    echo "    $COMPILER"
+done
+echo ""
+
+echo "Examples:"
+echo "  Run all tests"
+echo "  % test_all_sandia"
+echo ""
+echo "  Run all gcc tests"
+echo "  % test_all_sandia gcc"
+echo ""
+echo "  Run all gcc/4.7.2 and all intel tests"
+echo "  % test_all_sandia gcc/4.7.2 intel"
+echo ""
+echo "  Run all tests in debug"
+echo "  % test_all_sandia --debug"
+echo ""
+echo "  Run gcc/4.7.2 and only do OpenMP and OpenMP_Serial builds"
+echo "  % test_all_sandia gcc/4.7.2 --build-list=OpenMP,OpenMP_Serial"
+echo ""
+echo "If you want to kill the tests, do:"
+echo "  hit ctrl-z"
+echo "  % kill -9 %1"
+echo
+exit 0
+;;
+*)
+# args, just append
+ARGS="$ARGS $1"
+;;
+esac
+shift
+done
+
+# set kokkos path
+if [ -z "$KOKKOS_PATH" ]; then
+    KOKKOS_PATH=$SCRIPT_KOKKOS_ROOT
+else
+    # Ensure KOKKOS_PATH is abs path
+    KOKKOS_PATH=$( cd "$KOKKOS_PATH" && pwd )
+fi
+
+# set build type
+if [ "$DEBUG" = "True" ]; then
+    BUILD_TYPE=debug
+else
+    BUILD_TYPE=release
+fi
+
+# If no args provided, do all compilers (the pattern '?*' below matches every name)
+[ -n "$ARGS" ] || ARGS='?'
+
+# Process args to figure out which compilers to test. ARGS is split with read,
+# not by unquoted expansion, so '?' or a user glob never expands to file names.
+read -r -a ARG_LIST <<< "$ARGS"
+COMPILERS_TO_TEST=""
+for ARG in "${ARG_LIST[@]}"; do
+    for COMPILER_DATA in "${COMPILERS[@]}"; do
+        ARR=($COMPILER_DATA)
+        COMPILER=${ARR[0]}
+        if [[ "$COMPILER" = $ARG* ]]; then
+            if [[ "$COMPILERS_TO_TEST" != *${COMPILER}* ]]; then
+                COMPILERS_TO_TEST="$COMPILERS_TO_TEST $COMPILER"
+            else
+                echo "Tried to add $COMPILER twice"
+            fi
+        fi
+    done
+done
+
+#
+# Functions
+#
+
+# get_compiler_name <COMPILER>: print the first '/'-separated field, e.g. gcc/4.7.2 -> gcc
+get_compiler_name() {
+    echo $1 | cut -d/ -f1
+}
+
+# get_compiler_version <COMPILER>: print the second '/'-separated field, e.g. gcc/4.7.2 -> 4.7.2
+get_compiler_version() {
+    echo $1 | cut -d/ -f2
+}
+
+# Do not call directly
+get_compiler_data() { # usage: get_compiler_data <COMPILER> <FIELD-INDEX>; prints that field of the matching COMPILERS entry
+    local compiler=$1
+    local item=$2
+    local compiler_name=$(get_compiler_name $compiler)
+    local compiler_vers=$(get_compiler_version $compiler)
+
+    local compiler_data
+    for compiler_data in "${COMPILERS[@]}" ; do
+        local arr=($compiler_data) # split the entry into its whitespace-separated fields
+        if [ "$compiler" = "${arr[0]}" ]; then
+            echo "${arr[$item]}" | tr , ' ' | sed -e "s/<COMPILER_NAME>/$compiler_name/g" -e "s/<COMPILER_VERSION>/$compiler_vers/g" # commas -> spaces, then fill in the name/version placeholders
+            return 0
+        fi
+    done
+
+    # Not found
+    echo "Unreconized compiler $compiler" >&2
+    exit 1 # when reached inside $(...), as the callers in this script invoke the getters, this ends only that subshell
+}
+
+#
+# For all getters, usage: <GETTER> <COMPILER>
+#
+
+get_compiler_modules() {
+    get_compiler_data $1 1 # field 1 of the COMPILERS entry: module-list
+}
+
+get_compiler_build_list() {
+    get_compiler_data $1 2 # field 2 of the COMPILERS entry: build-list
+}
+
+get_compiler_exe_name() {
+    get_compiler_data $1 3 # field 3 of the COMPILERS entry: exe-name
+}
+
+get_compiler_warning_flags() {
+    get_compiler_data $1 4 # field 4 of the COMPILERS entry: warning-flag (absent when the flags variable is empty)
+}
+
+run_cmd() { # print the command line, then (unless DRYRUN is True) eval it with stderr merged into stdout
+    echo "RUNNING: $*"
+    if [ "$DRYRUN" != "True" ]; then
+	eval "$* 2>&1" # eval re-parses the joined arguments, so escaped quotes passed by callers (e.g. --cxxflags=\"...\") group words here
+    fi
+}
+
+# report_and_log_test_result <SUCCESS> <DESC> <COMMENT>
+report_and_log_test_result() {
+    # Prints PASSED/FAILED and writes <COMMENT> to a marker file named <DESC> in PASSED_DIR or FAILED_DIR
+    local success=$1; local desc=$2; local comment=$3;
+
+    if [ "$success" = "0" ]; then
+        echo "  PASSED $desc"
+        echo "$comment" > "$PASSED_DIR/$desc"
+    else
+        # For failures, comment should be the name of the phase that failed
+        echo "  FAILED $desc" >&2
+        echo "$comment" > "$FAILED_DIR/$desc"
+        cat "${desc}.${comment}.log"
+    fi
+}
+
+setup_env() { # usage: setup_env <COMPILER>; purge modules, then load each module listed for <COMPILER>; returns 1 if one is not listed afterwards
+    local compiler=$1
+    local compiler_modules=$(get_compiler_modules $compiler)
+
+    module purge
+
+    local mod
+    for mod in $compiler_modules; do
+        echo "Loading module $mod"
+	module load $mod 2>&1
+        # It is ridiculously hard to check for the success of a loaded
+        # module. Module does not return error codes and piping to grep
+        # causes module to run in a subshell.
+        module list 2>&1 | grep "$mod" >& /dev/null || return 1 # success check: the module name must show up in 'module list'
+    done
+
+    return 0
+}
+
+# single_build_and_test <COMPILER> <BUILD> <BUILD_TYPE>
+single_build_and_test() { # configure, build and (unless BUILD_ONLY) test one combination in its own directory; always returns 0, the outcome is recorded via report_and_log_test_result
+    # Use sane var names
+    local compiler=$1; local build=$2; local build_type=$3;
+
+    # set up env
+    mkdir -p $ROOT_DIR/$compiler/"${build}-$build_type"
+    cd $ROOT_DIR/$compiler/"${build}-$build_type"
+    local desc=$(echo "${compiler}-${build}-${build_type}" | sed 's:/:-:g') # e.g. gcc/4.7.2 + OpenMP + release -> gcc-4.7.2-OpenMP-release
+    setup_env $compiler >& ${desc}.configure.log || { report_and_log_test_result 1 ${desc} configure && return 0; }
+
+    # Set up flags
+    local compiler_warning_flags=$(get_compiler_warning_flags $compiler)
+    local compiler_exe=$(get_compiler_exe_name $compiler)
+
+    if [[ "$build_type" = hwloc* ]]; then
+        local extra_args=--with-hwloc=$(dirname $(dirname $(which hwloc-info))) # two directory levels above the hwloc-info binary found on PATH
+    fi
+
+    if [[ "$build_type" = *debug* ]]; then
+        local extra_args="$extra_args --debug"
+        local cxxflags="-g $compiler_warning_flags"
+    else
+        local cxxflags="-O3 $compiler_warning_flags"
+    fi
+
+    if [[ "$compiler" == cuda* ]]; then
+        cxxflags="--keep --keep-dir=$(pwd) $cxxflags"
+        export TMPDIR=$(pwd)
+    fi
+
+    # cxxflags="-DKOKKOS_USING_EXP_VIEW=1 $cxxflags"
+
+    echo "  Starting job $desc"
+
+    local comment="no_comment"
+
+    if [ "$TEST_SCRIPT" = "True" ]; then # script self-test: sleep 1-10 s and fake a configure failure when the draw is above 5
+        local rand=$[ 1 + $[ RANDOM % 10 ]]
+        sleep $rand
+        if [ $rand -gt 5 ]; then
+            run_cmd ls fake_problem >& ${desc}.configure.log || { report_and_log_test_result 1 $desc configure && return 0; }
+        fi
+    else
+        run_cmd ${KOKKOS_PATH}/generate_makefile.bash --with-devices=$build $ARCH_FLAG --compiler=$(which $compiler_exe) --cxxflags=\"$cxxflags\" $extra_args &>> ${desc}.configure.log || { report_and_log_test_result 1 ${desc} configure && return 0; }
+        local -i build_start_time=$(date +%s)
+        run_cmd make build-test >& ${desc}.build.log || { report_and_log_test_result 1 ${desc} build && return 0; }
+        local -i build_end_time=$(date +%s)
+        comment="build_time=$(($build_end_time-$build_start_time))" # elapsed seconds
+        if [[ "$BUILD_ONLY" == False ]]; then
+            run_cmd make test >& ${desc}.test.log || { report_and_log_test_result 1 ${desc} test && return 0; }
+            local -i run_end_time=$(date +%s)
+            comment="$comment run_time=$(($run_end_time-$build_end_time))"
+        fi
+    fi
+
+    report_and_log_test_result 0 $desc "$comment"
+
+    return 0
+}
+
+# wait_for_jobs <NUM-JOBS>
+wait_for_jobs() { # block, polling once per second, until fewer than <NUM-JOBS> lines are listed by 'jobs'
+    local -i max_jobs=$1
+    local -i num_active_jobs=$(jobs | wc -l)
+    while [ $num_active_jobs -ge $max_jobs ]
+    do
+        sleep 1
+        num_active_jobs=$(jobs | wc -l)
+        jobs >& /dev/null # presumably run in the main shell so finished jobs are reported and drop out of later counts -- confirm
+    done
+}
+
+# run_in_background <COMPILER> <BUILD> <BUILD_TYPE>
+# Waits for a free job slot, then starts single_build_and_test in the background.
+run_in_background() {
+    local toolchain=$1
+    local -i job_limit=$NUM_JOBS_TO_RUN_IN_PARALLEL
+
+    # Build-only runs use a fixed limit; otherwise cuda jobs run one at a time.
+    if [[ "$BUILD_ONLY" == True ]]; then
+        job_limit=8
+    elif [[ "$toolchain" == cuda* ]]; then
+        job_limit=1
+    fi
+
+    wait_for_jobs $job_limit
+    single_build_and_test $* &
+}
+
+# build_and_test_all <COMPILER>
+build_and_test_all() { # queue one background job per build (plus a hwloc variant where enabled) for <COMPILER>
+    # Get compiler data
+    local compiler=$1
+    if [ -z "$CUSTOM_BUILD_LIST" ]; then
+	local compiler_build_list=$(get_compiler_build_list $compiler)
+    else
+	local compiler_build_list=$(echo "$CUSTOM_BUILD_LIST" | tr , ' ') # --build-list replaces the compiler's own list
+    fi
+
+    # do builds
+    local build
+    for build in $compiler_build_list
+    do
+	run_in_background $compiler $build $BUILD_TYPE
+
+        # If not cuda, do a hwloc test too
+        if [[ "$compiler" != cuda* && "$SKIP_HWLOC" == False ]]; then
+            run_in_background $compiler $build "hwloc-$BUILD_TYPE" # the hwloc- prefix is what single_build_and_test keys on
+        fi
+    done
+
+    return 0
+}
+
+get_test_root_dir() { # prune old result directories under the current directory, then print the path for a new timestamped one
+    local existing_results=$(find . -maxdepth 1 -name "$RESULT_ROOT_PREFIX*" | sort)
+    local -i num_existing_results=$(echo $existing_results | tr ' ' '\n' | wc -l)
+    local -i num_to_delete=${num_existing_results}-${NUM_RESULTS_TO_KEEP} # -i makes the assignment evaluate this subtraction
+
+    if [ $num_to_delete -gt 0 ]; then
+        /bin/rm -rf $(echo $existing_results | tr ' ' '\n' | head -n $num_to_delete) # removes the first names in sort order; keeps NUM_RESULTS_TO_KEEP existing dirs, not counting the new one
+    fi
+
+    echo $(pwd)/${RESULT_ROOT_PREFIX}_$(date +"%Y-%m-%d_%H.%M.%S") # only prints the path; the caller creates the directory
+}
+
+wait_summarize_and_exit() { # wait for all background jobs, list passed and failed results, exit with the number of failures
+    wait_for_jobs 1 # returns once no background jobs are listed
+
+    echo "#######################################################"
+    echo "PASSED TESTS"
+    echo "#######################################################"
+
+    local passed_test
+    for passed_test in $(\ls -1 $PASSED_DIR | sort)
+    do
+        echo $passed_test $(cat $PASSED_DIR/$passed_test) # marker file content is the timing comment
+    done
+
+    echo "#######################################################"
+    echo "FAILED TESTS"
+    echo "#######################################################"
+
+    local failed_test
+    local -i rv=0
+    for failed_test in $(\ls -1 $FAILED_DIR | sort)
+    do
+        echo $failed_test "("$(cat $FAILED_DIR/$failed_test)" failed)" # marker file content is the failed phase name
+        rv=$rv+1 # evaluated arithmetically because rv is declared with -i
+    done
+
+    exit $rv
+}
+
+#
+# Main
+#
+
+ROOT_DIR=$(get_test_root_dir) # new timestamped results directory under the current directory
+mkdir -p $ROOT_DIR
+cd $ROOT_DIR
+
+PASSED_DIR=$ROOT_DIR/results/passed # one marker file per passed job
+FAILED_DIR=$ROOT_DIR/results/failed # one marker file per failed job
+mkdir -p $PASSED_DIR
+mkdir -p $FAILED_DIR
+
+echo "Going to test compilers: " $COMPILERS_TO_TEST
+for COMPILER in $COMPILERS_TO_TEST; do
+    echo "Testing compiler $COMPILER"
+    build_and_test_all $COMPILER
+done
+
+wait_summarize_and_exit # does not return: exits with the failure count
diff --git a/lib/kokkos/config/testing_scripts/README b/lib/kokkos/config/testing_scripts/README
new file mode 100644
index 0000000000000000000000000000000000000000..455afffd840514e98686dadcd2c46a774590456c
--- /dev/null
+++ b/lib/kokkos/config/testing_scripts/README
@@ -0,0 +1,5 @@
+jenkins_test_driver is designed to be run through Jenkins as a
+multiconfiguration job. It relies on a number of environment variables that will
+only be set when run in that context. It is possible to override these if you
+know the Jenkins job setup. It is not recommended that a non-expert try to run
+this script directly.
diff --git a/lib/kokkos/config/testing_scripts/jenkins_test_driver b/lib/kokkos/config/testing_scripts/jenkins_test_driver
new file mode 100755
index 0000000000000000000000000000000000000000..9cba7fa51856b59706a8c6655aef42b39847766b
--- /dev/null
+++ b/lib/kokkos/config/testing_scripts/jenkins_test_driver
@@ -0,0 +1,83 @@
+#!/bin/bash -x
+
+echo "Building for BUILD_TYPE = ${BUILD_TYPE}"
+echo "Building with HOST_COMPILER = ${HOST_COMPILER}"
+echo "Building in ${WORKSPACE}"
+
+module use /home/projects/modulefiles
+
+BUILD_TYPE=`echo $BUILD_TYPE | tr "~" " "`  # options arrive joined with '~'; split them into words
+build_options=""
+for item in ${BUILD_TYPE}; do
+  build_options="$build_options --with-$item"
+done
+
+kokkos_path=${WORKSPACE}/kokkos
+gtest_path=${WORKSPACE}/kokkos/tpls/gtest
+
+echo ${WORKSPACE}
+pwd
+
+#extract information from the provided parameters.
+host_compiler_brand=`echo $HOST_COMPILER | grep -o "^[a-zA-Z]*"`
+cuda_compiler=`echo $BUILD_TYPE | grep -o "cuda_[^ ]*"`  # empty when no cuda_* option was requested
+
+host_compiler_module=`echo $HOST_COMPILER | tr "_" "/"`
+cuda_compiler_module=`echo $cuda_compiler | tr "_" "/"`
+build_path=`echo $BUILD_TYPE | tr " " "_"`
+
+module load $host_compiler_module
+[ -n "$cuda_compiler_module" ] && module load $cuda_compiler_module  # skip the load entirely for non-cuda builds
+
+case $host_compiler_brand in
+  gcc)
+    module load nvcc-wrapper/gnu
+    compiler=g++
+    ;;
+  intel)
+    module load nvcc-wrapper/intel
+    compiler=icpc
+    ;;
+  *)
+    echo "Unrecognized compiler brand."
+    exit 1
+    ;;
+esac
+
+#if cuda is on we need to set the host compiler for the
+#nvcc wrapper and make the wrapper the compiler.
+if [ -n "$cuda_compiler" ]; then
+  export NVCC_WRAPPER_DEFAULT_COMPILER=$compiler
+  compiler=$kokkos_path/config/nvcc_wrapper
+fi
+
+if [ "$host_compiler_brand" = "intel" ] && [ -n "$cuda_compiler" ]; then
+  echo "Intel compilers are not supported with cuda at this time."
+  exit 0
+fi
+
+rm -rf test-$build_path
+mkdir test-$build_path
+cd test-$build_path
+
+/bin/bash $kokkos_path/generate_makefile.bash $build_options --kokkos-path="$kokkos_path" --with-gtest="$gtest_path" --compiler=$compiler 2>&1 |tee configure.out
+
+if [ ${PIPESTATUS[0]} != 0 ]; then
+  echo "Configure failed."
+  exit 1
+fi
+
+make build-test 2>&1 | tee build.log
+
+if [ ${PIPESTATUS[0]} != 0 ]; then
+  echo "Build failed."
+  exit 1
+fi
+
+make test 2>&1 | tee test.log
+
+grep "FAIL" test.log  # a test failure is detected only by this text match, not by make's exit status
+if [ $? == 0 ]; then
+  echo "Tests failed."
+  exit 1
+fi
diff --git a/lib/kokkos/config/testing_scripts/obj_size_opt_check b/lib/kokkos/config/testing_scripts/obj_size_opt_check
new file mode 100755
index 0000000000000000000000000000000000000000..47c84d1a92a8a288115ecf0d416d57b349fb69b4
--- /dev/null
+++ b/lib/kokkos/config/testing_scripts/obj_size_opt_check
@@ -0,0 +1,287 @@
+#! /usr/bin/env python
+
+"""
+Compute the size at which the current compiler will start to
+significantly scale back optimization.
+
+The CPP file being modified will need the following tags.
+// JGF_DUPLICATE_BEGIN - Put before start of function to duplicate
+// JGF_DUPLICATE_END - Put after end of function to duplicate
+// JGF_DUPE: function_name(args); - Put anywhere where it's legal to
+put a function call but not in your timing section.
+
+The program will need to output the string:
+FOM: <number>
+This will represent the program's performance
+"""
+
+import argparse, sys, os, doctest, subprocess, re, time
+
+VERBOSE = False
+
+###############################################################################
+def parse_command_line(args, description):
+###############################################################################
+    """Parse args (argv-style; args[0] is the program name) and return the option values as a tuple."""
+    parser = argparse.ArgumentParser(
+        usage="""\n%s <cppfile> <build-command> <run-command> [--verbose]
+OR
+%s --help
+OR
+%s --test
+
+\033[1mEXAMPLES:\033[0m
+    > %s foo.cpp 'make -j4' foo
+""" % ((os.path.basename(args[0]), ) * 4),
+description=description,
+
+formatter_class=argparse.ArgumentDefaultsHelpFormatter
+)
+
+    parser.add_argument("cppfile", help="Name of file to modify.")
+
+    parser.add_argument("buildcmd", help="Build command")
+
+    parser.add_argument("execmd", help="Run command")
+
+    parser.add_argument("-v", "--verbose", action="store_true",
+                        help="Print extra information")
+
+    parser.add_argument("-s", "--start", type=int, default=1,
+                        help="Starting number of dupes")
+
+    parser.add_argument("-e", "--end", type=int, default=1000,
+                        help="Ending number of dupes")
+
+    parser.add_argument("-n", "--repeat", type=int, default=10,
+                        help="Number of times to repeat an individial execution. Best value will be taken.")
+
+    parser.add_argument("-t", "--template", action="store_true",
+                        help="Use templating instead of source copying to increase object size")
+
+    parser.add_argument("-c", "--csv", action="store_true",
+                        help="Print results as CSV")
+
+    args = parser.parse_args(args[1:])
+
+    if (args.verbose):  # the flag is recorded in the module-level VERBOSE rather than returned
+        global VERBOSE
+        VERBOSE = True
+
+    return args.cppfile, args.buildcmd, args.execmd, args.start, args.end, args.repeat, args.template, args.csv  # order must match the unpacking in _main_func
+
+###############################################################################
+def verbose_print(msg, override=None):
+###############################################################################
+    if ( (VERBOSE and not override is False) or override):  # override=True always prints, False never prints, None defers to VERBOSE
+        print msg
+
+###############################################################################
+def error_print(msg):
+###############################################################################
+    print >> sys.stderr, msg  # Python 2 print-chevron form: writes msg to stderr
+
+###############################################################################
+def expect(condition, error_msg):
+###############################################################################
+    """
+    Similar to assert except doesn't generate an ugly stacktrace. Useful for
+    checking user error, not programming error.
+    """
+    if (not condition):
+        raise SystemExit("FAIL: %s" % error_msg)  # uncaught, a SystemExit carrying a string prints it to stderr and exits with status 1
+
+###############################################################################
+def run_cmd(cmd, ok_to_fail=False, input_str=None, from_dir=None, verbose=None,
+            arg_stdout=subprocess.PIPE, arg_stderr=subprocess.PIPE):
+###############################################################################
+    """Run cmd through the shell; return stripped stdout, or (status, stdout, stderr) when ok_to_fail is set."""
+    verbose_print("RUN: %s" % cmd, verbose)
+    if (input_str is not None):
+        stdin = subprocess.PIPE
+    else:
+        stdin = None
+
+    proc = subprocess.Popen(cmd,
+                            shell=True,
+                            stdout=arg_stdout,
+                            stderr=arg_stderr,
+                            stdin=stdin,
+                            cwd=from_dir)
+    output, errput = proc.communicate(input_str)
+    output = output.strip() if output is not None else output  # output is None when stdout was not a pipe
+    stat = proc.wait()
+
+    if (ok_to_fail):
+        return stat, output, errput
+    else:
+        if (arg_stderr is not None):
+            errput = errput if errput is not None else open(arg_stderr.name, "r").read()  # stderr went to a file object: read it back by name
+            expect(stat == 0, "Command: '%s' failed with error '%s'" % (cmd, errput))
+        else:
+            expect(stat == 0, "Command: '%s' failed. See terminal output" % cmd)
+        return output
+
+###############################################################################
+def build_and_run(source, cppfile, buildcmd, execmd, repeat):
+###############################################################################
+    open(cppfile, 'w').writelines(source)
+
+    run_cmd(buildcmd)
+
+    best = None
+    for i in xrange(repeat):
+        wait_for_quiet_machine()
+        output = run_cmd(execmd)
+
+        current = None
+        fom_regex = re.compile(r'^FOM: ([0-9.]+)$')
+        for line in output.splitlines():
+            m = fom_regex.match(line)
+            if (m is not None):
+                current = float(m.groups()[0])
+                break
+
+        expect(current is not None, "No lines in output matched FOM regex")
+
+        if (best is None or best < current):
+            best = current
+
+    return best
+
+###############################################################################
+def wait_for_quiet_machine():
+###############################################################################
+    while(True):  # re-check every 2 seconds until the reported idle percentage is at least 95
+        time.sleep(2)
+
+        # The first iteration of top gives garbage results
+        idle_pct_raw = run_cmd("top -bn2 | grep 'Cpu(s)' | tr ',' ' ' | tail -n 1 | awk '{print $5}'")
+
+        idle_pct_re = re.compile(r'^([0-9.]+)%id$')  # NOTE(review): expects the field to look like '97.3%id'; other top output formats fail the expect below -- confirm on target hosts
+        m = idle_pct_re.match(idle_pct_raw)
+
+        expect(m is not None, "top not returning output in expected form")
+
+        idle_pct = float(m.groups()[0])
+        if (idle_pct < 95):
+            error_print("Machine is too busy, waiting for it to become free")
+        else:
+            break
+
+###############################################################################
+def add_n_dupes(curr_lines, num_dupes, template):
+###############################################################################
+    """Modify curr_lines in place: add num_dupes guarded calls and, unless template is set, num_dupes renamed copies of the tagged function."""
+    function_name  = None
+    function_invocation = None
+    function_lines = []
+    function_re = re.compile(r'^.* (\w+) *[(]')  # captures an identifier that directly precedes a '('
+    function_inv_re = re.compile(r'^.*JGF_DUPE: +(.+)$')  # captures the rest of the line after 'JGF_DUPE: '
+
+    # Get function lines
+    record = False
+    definition_insertion_point = None
+    invocation_insertion_point = None
+    for idx, line in enumerate(curr_lines):
+        if ("JGF_DUPLICATE_BEGIN" in line):
+            record = True
+            m = function_re.match(curr_lines[idx+1])  # the function name is taken from the line right after the BEGIN tag
+            expect(m is not None, "Could not find function in line '%s'" % curr_lines[idx+1])
+            function_name = m.groups()[0]
+
+        elif ("JGF_DUPLICATE_END" in line):
+            record = False
+            definition_insertion_point = idx + 1  # copies are inserted directly after the END tag
+
+        elif (record):
+            function_lines.append(line)
+
+        elif ("JGF_DUPE" in line):
+            m = function_inv_re.match(line)
+            expect(m is not None, "Could not find function invocation example in line '%s'" % line)
+            function_invocation = m.groups()[0]
+            invocation_insertion_point = idx + 1
+
+    expect(function_name is not None, "Could not find name of dupe function")
+    expect(function_invocation is not None, "Could not find function invocation point")
+
+    expect(definition_insertion_point < invocation_insertion_point, "fix me")  # the insert order at the end relies on the definition coming first
+
+    dupe_func_defs = []
+    dupe_invocations = ["int jgf_rand = std::rand();\n", "if (false) {}\n"]  # head of the generated else-if chain
+
+    for i in xrange(num_dupes):
+        if (not template):
+            dupe_func = list(function_lines)
+            dupe_func[0] = dupe_func[0].replace(function_name, "%s%d" % (function_name, i))  # only the first recorded line is renamed
+            dupe_func_defs.extend(dupe_func)
+
+        dupe_invocations.append("else if (jgf_rand == %d) " % i)
+        if (template):
+            dupe_call = function_invocation.replace(function_name, "%s<%d>" % (function_name, i)) + "\n"
+        else:
+            dupe_call = function_invocation.replace(function_name, "%s%d" % (function_name, i))  + "\n"
+        dupe_invocations.append(dupe_call)
+
+    curr_lines[invocation_insertion_point:invocation_insertion_point] = dupe_invocations  # later position first, so the earlier index below stays valid
+    curr_lines[definition_insertion_point:definition_insertion_point] = dupe_func_defs
+
+###############################################################################
+def report(num_dupes, curr_lines, object_file, orig_fom, curr_fom, csv=False, is_first_report=False):
+###############################################################################
+    """Print one result record (CSV row or text block) including the FOM change relative to orig_fom, in percent."""
+    fom_change = (curr_fom - orig_fom) / orig_fom  # relative change; raises ZeroDivisionError when orig_fom is 0
+    if (csv):
+        if (is_first_report):
+            print "num_dupes, obj_byte_size, loc, fom, pct_diff"
+
+        print "%s, %s, %s, %s, %s" % (num_dupes, os.path.getsize(object_file), len(curr_lines), curr_fom, fom_change*100)
+    else:
+        print "========================================================"
+        print "For number of dupes:", num_dupes
+        print "Object file size (bytes):", os.path.getsize(object_file)
+        print "Lines of code:", len(curr_lines)
+        print "Field of merit:", curr_fom
+        print "Change pct:", fom_change*100
+
+###############################################################################
+def obj_size_opt_check(cppfile, buildcmd, execmd, start, end, repeat, template, csv=False):
+###############################################################################
+    orig_source_lines = open(cppfile, 'r').readlines()
+
+    backup_file = "%s.orig" % cppfile
+    object_file = "%s.o" % os.path.splitext(cppfile)[0]
+    os.rename(cppfile, backup_file)
+
+    orig_fom = build_and_run(orig_source_lines, cppfile, buildcmd, execmd, repeat)
+    report(0, orig_source_lines, object_file, orig_fom, orig_fom, csv=csv, is_first_report=True)
+
+    i = start
+    while (i < end):
+        curr_lines = list(orig_source_lines)
+        add_n_dupes(curr_lines, i, template)
+
+        curr_fom = build_and_run(curr_lines, cppfile, buildcmd, execmd, repeat)
+
+        report(i, curr_lines, object_file, orig_fom, curr_fom, csv=csv)
+
+        i *= 2 # make growth function configurable?
+
+    os.remove(cppfile)
+    os.rename(backup_file, cppfile)
+
+###############################################################################
+def _main_func(description):
+###############################################################################
+    if ("--test" in sys.argv):  # self-test mode: run doctests and exit 1 if any failed, else 0
+        test_results = doctest.testmod(verbose=True)
+        sys.exit(1 if test_results.failed > 0 else 0)
+
+    cppfile, buildcmd, execmd, start, end, repeat, template, csv = parse_command_line(sys.argv, description)  # same order as parse_command_line returns
+
+    obj_size_opt_check(cppfile, buildcmd, execmd, start, end, repeat, template, csv)
+
+###############################################################################
+if (__name__ == "__main__"):
+    _main_func(__doc__)  # the module docstring is passed on as the argparse description
diff --git a/lib/kokkos/containers/CMakeLists.txt b/lib/kokkos/containers/CMakeLists.txt
new file mode 100644
index 0000000000000000000000000000000000000000..894935fa0110efc132b9a9680bb54c2cf9b11415
--- /dev/null
+++ b/lib/kokkos/containers/CMakeLists.txt
@@ -0,0 +1,10 @@
+
+
+TRIBITS_SUBPACKAGE(Containers)  # begins the Containers subpackage definition
+
+ADD_SUBDIRECTORY(src)
+
+TRIBITS_ADD_TEST_DIRECTORIES(unit_tests)
+TRIBITS_ADD_TEST_DIRECTORIES(performance_tests)
+
+TRIBITS_SUBPACKAGE_POSTPROCESS()  # closes the subpackage begun by TRIBITS_SUBPACKAGE above
diff --git a/lib/kokkos/containers/cmake/Dependencies.cmake b/lib/kokkos/containers/cmake/Dependencies.cmake
new file mode 100644
index 0000000000000000000000000000000000000000..1d71d8af341181f689a6a8bf63036b67584cb138
--- /dev/null
+++ b/lib/kokkos/containers/cmake/Dependencies.cmake
@@ -0,0 +1,5 @@
+TRIBITS_PACKAGE_DEFINE_DEPENDENCIES(
+  LIB_REQUIRED_PACKAGES KokkosCore  # the library always needs KokkosCore
+  LIB_OPTIONAL_TPLS Pthread CUDA HWLOC  # optional third-party libraries for the library
+  TEST_OPTIONAL_TPLS CUSPARSE  # optional third-party library for the tests only
+  )
diff --git a/lib/kokkos/containers/cmake/KokkosContainers_config.h.in b/lib/kokkos/containers/cmake/KokkosContainers_config.h.in
new file mode 100644
index 0000000000000000000000000000000000000000..d91fdda1e353eddb2088ff86327e142676c9a6c9
--- /dev/null
+++ b/lib/kokkos/containers/cmake/KokkosContainers_config.h.in
@@ -0,0 +1,4 @@
+#ifndef KOKKOS_CONTAINERS_CONFIG_H
+#define KOKKOS_CONTAINERS_CONFIG_H
+
+#endif /* KOKKOS_CONTAINERS_CONFIG_H */
diff --git a/lib/kokkos/containers/performance_tests/CMakeLists.txt b/lib/kokkos/containers/performance_tests/CMakeLists.txt
new file mode 100644
index 0000000000000000000000000000000000000000..726d403452bab92dfaab0a3275d9be42af6afa4f
--- /dev/null
+++ b/lib/kokkos/containers/performance_tests/CMakeLists.txt
@@ -0,0 +1,37 @@
+
+# Header search path: current binary dir, current source dir, and the sibling ../src directory.
+include_directories(${CMAKE_CURRENT_BINARY_DIR})
+include_directories(${CMAKE_CURRENT_SOURCE_DIR})
+include_directories(${CMAKE_CURRENT_SOURCE_DIR}/../src)
+
+# Sources that are compiled unconditionally.
+set(SOURCES TestMain.cpp TestCuda.cpp)
+
+# Host back-end tests are appended only when the matching option is on.
+if(Kokkos_ENABLE_Pthread)
+  list(APPEND SOURCES TestThreads.cpp)
+endif()
+if(Kokkos_ENABLE_OpenMP)
+  list(APPEND SOURCES TestOpenMP.cpp)
+endif()
+
+# Per #374, we always want to build this test, but we only want to run
+# it as a PERFORMANCE test.  That's why we separate building the test
+# from running the test.
+
+# Build step: the executable, linked against the test-only kokkos_gtest library.
+TRIBITS_ADD_EXECUTABLE(
+  PerfTestExec
+  SOURCES ${SOURCES}
+  COMM serial mpi
+  TESTONLYLIBS kokkos_gtest
+  )
+# Run step: registered in the PERFORMANCE category; output matching "  FAILED  " fails the test.
+TRIBITS_ADD_TEST(
+  PerformanceTest
+  NAME PerfTestExec
+  COMM serial mpi
+  NUM_MPI_PROCS 1
+  CATEGORIES PERFORMANCE
+  FAIL_REGULAR_EXPRESSION "  FAILED  "
+  )
diff --git a/lib/kokkos/containers/performance_tests/Makefile b/lib/kokkos/containers/performance_tests/Makefile
new file mode 100644
index 0000000000000000000000000000000000000000..e7abaf44ce07fb725bb1947d86b573ac6a15dae4
--- /dev/null
+++ b/lib/kokkos/containers/performance_tests/Makefile
@@ -0,0 +1,81 @@
+KOKKOS_PATH = ../..
+# Bundled GoogleTest sources; gtest/gtest-all.cc from here is compiled by the last rule of this file.
+GTEST_PATH = ../../TPL/gtest
+# Search this directory for .cpp prerequisites.
+vpath %.cpp ${KOKKOS_PATH}/containers/performance_tests
+
+default: build_all
+	echo "End Build"
+# These targets name actions, not files: keep them working even if a same-named file exists.
+.PHONY: default build_all test test-cuda test-threads test-openmp clean
+include $(KOKKOS_PATH)/Makefile.kokkos
+# CUDA builds force CXX/LINK to the nvcc wrapper; otherwise CXX only defaults to g++.
+ifeq ($(KOKKOS_INTERNAL_USE_CUDA), 1)
+	CXX = $(NVCC_WRAPPER)
+	CXXFLAGS ?= -O3
+	LINK = $(CXX)
+	LDFLAGS ?= -lpthread
+else
+	CXX ?= g++
+	CXXFLAGS ?= -O3
+	LINK ?= $(CXX)
+	LDFLAGS ?= -lpthread
+endif
+# Extra include paths: the gtest directory and this directory's test headers.
+KOKKOS_CXXFLAGS += -I$(GTEST_PATH) -I${KOKKOS_PATH}/containers/performance_tests
+# Each enabled back end below contributes one executable and one test-<backend> target.
+TEST_TARGETS =
+TARGETS =
+
+ifeq ($(KOKKOS_INTERNAL_USE_CUDA), 1)
+	OBJ_CUDA = TestCuda.o TestMain.o gtest-all.o
+	TARGETS += KokkosContainers_PerformanceTest_Cuda
+	TEST_TARGETS += test-cuda
+endif
+
+ifeq ($(KOKKOS_INTERNAL_USE_PTHREADS), 1)
+	OBJ_THREADS = TestThreads.o TestMain.o gtest-all.o
+	TARGETS += KokkosContainers_PerformanceTest_Threads
+	TEST_TARGETS += test-threads
+endif
+
+ifeq ($(KOKKOS_INTERNAL_USE_OPENMP), 1)
+	OBJ_OPENMP = TestOpenMP.o TestMain.o gtest-all.o
+	TARGETS += KokkosContainers_PerformanceTest_OpenMP
+	TEST_TARGETS += test-openmp
+endif
+# Link rules: one executable per back end.
+KokkosContainers_PerformanceTest_Cuda: $(OBJ_CUDA) $(KOKKOS_LINK_DEPENDS)
+	$(LINK) $(KOKKOS_LDFLAGS) $(LDFLAGS) $(EXTRA_PATH) $(OBJ_CUDA) $(KOKKOS_LIBS) $(LIB) -o KokkosContainers_PerformanceTest_Cuda
+
+KokkosContainers_PerformanceTest_Threads: $(OBJ_THREADS) $(KOKKOS_LINK_DEPENDS)
+	$(LINK) $(KOKKOS_LDFLAGS) $(LDFLAGS) $(EXTRA_PATH) $(OBJ_THREADS) $(KOKKOS_LIBS) $(LIB) -o KokkosContainers_PerformanceTest_Threads
+
+KokkosContainers_PerformanceTest_OpenMP: $(OBJ_OPENMP) $(KOKKOS_LINK_DEPENDS)
+	$(LINK) $(KOKKOS_LDFLAGS) $(LDFLAGS) $(EXTRA_PATH) $(OBJ_OPENMP) $(KOKKOS_LIBS) $(LIB) -o KokkosContainers_PerformanceTest_OpenMP
+# Run rules: each one builds its executable if needed and then executes it.
+test-cuda: KokkosContainers_PerformanceTest_Cuda
+	./KokkosContainers_PerformanceTest_Cuda
+
+test-threads: KokkosContainers_PerformanceTest_Threads
+	./KokkosContainers_PerformanceTest_Threads
+
+test-openmp: KokkosContainers_PerformanceTest_OpenMP
+	./KokkosContainers_PerformanceTest_OpenMP
+
+# Aggregate targets over every enabled back end.
+build_all: $(TARGETS)
+
+test: $(TEST_TARGETS)
+
+clean: kokkos-clean
+	rm -f *.o $(TARGETS)
+
+# Compilation rules
+
+%.o:%.cpp $(KOKKOS_CPP_DEPENDS)
+	$(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) $(EXTRA_INC) -c $<
+
+gtest-all.o:$(GTEST_PATH)/gtest/gtest-all.cc
+	$(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) $(EXTRA_INC) -c $(GTEST_PATH)/gtest/gtest-all.cc
+
diff --git a/lib/kokkos/containers/performance_tests/TestCuda.cpp b/lib/kokkos/containers/performance_tests/TestCuda.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..8183adaa60b8226fdd5979253cc619ff90e701ba
--- /dev/null
+++ b/lib/kokkos/containers/performance_tests/TestCuda.cpp
@@ -0,0 +1,109 @@
+/*
+//@HEADER
+// ************************************************************************
+// 
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+// 
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+// 
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact  H. Carter Edwards (hcedwar@sandia.gov)
+// 
+// ************************************************************************
+//@HEADER
+*/
+
+#include <stdint.h>
+#include <string>
+#include <iostream>
+#include <iomanip>
+#include <sstream>
+#include <fstream>
+
+#include <gtest/gtest.h>
+
+#include <Kokkos_Core.hpp>
+
+#if defined( KOKKOS_HAVE_CUDA )
+
+#include <TestDynRankView.hpp>
+
+#include <Kokkos_UnorderedMap.hpp>
+
+#include <TestGlobal2LocalIds.hpp>
+
+#include <TestUnorderedMapPerformance.hpp>
+
+namespace Performance {
+
+// Fixture for the CUDA performance tests: host and CUDA execution spaces are set up once per test case.
+class cuda : public ::testing::Test {
+protected:
+  typedef Kokkos::HostSpace::execution_space host_execution_space;
+  static void SetUpTestCase() {
+    std::cout << std::setprecision(5) << std::scientific;
+    host_execution_space::initialize();
+    Kokkos::Cuda::initialize( Kokkos::Cuda::SelectDevice(0) );
+  }
+  // Finalizes in the reverse order of SetUpTestCase.
+  static void TearDownTestCase() {
+    Kokkos::Cuda::finalize();
+    host_execution_space::finalize();
+  }
+};
+
+TEST_F( cuda, dynrankview_perf )
+{
+  std::cout << "Cuda" << std::endl;
+  std::cout << " DynRankView vs View: Initialization Only " << std::endl;
+  test_dynrankview_op_perf<Kokkos::Cuda>( 4096 );
+}
+
+// Multiplies the id count by id_step from begin_id_size up to and including end_id_size.
+TEST_F( cuda, global_2_local )
+{
+  std::cout << "Cuda" << std::endl;
+  std::cout << "size, create, generate, fill, find" << std::endl;
+  for (unsigned num_ids = begin_id_size; num_ids <= end_id_size; num_ids *= id_step) {
+    test_global_to_local_ids<Kokkos::Cuda>(num_ids);
+  }
+}
+
+// UnorderedMap benchmarks: the bool template argument is true for "near" and false for "far".
+TEST_F( cuda, unordered_map_performance_near )
+{ Perf::run_performance_tests<Kokkos::Cuda,true>("cuda-near"); }
+
+TEST_F( cuda, unordered_map_performance_far )
+{ Perf::run_performance_tests<Kokkos::Cuda,false>("cuda-far"); }
+
+} // namespace Performance
+
+#endif  /* #if defined( KOKKOS_HAVE_CUDA ) */
diff --git a/lib/kokkos/containers/performance_tests/TestDynRankView.hpp b/lib/kokkos/containers/performance_tests/TestDynRankView.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..aab6e6988fc847360f02474daab52110a18ef8ef
--- /dev/null
+++ b/lib/kokkos/containers/performance_tests/TestDynRankView.hpp
@@ -0,0 +1,265 @@
+
+//@HEADER
+// ************************************************************************
+// 
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+// 
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+// 
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact  H. Carter Edwards (hcedwar@sandia.gov)
+// 
+// ************************************************************************
+//@HEADER
+
+#ifndef KOKKOS_TEST_DYNRANKVIEW_HPP
+#define KOKKOS_TEST_DYNRANKVIEW_HPP
+
+#include <Kokkos_Core.hpp>
+#include <Kokkos_DynRankView.hpp>
+#include <vector>
+
+#include <impl/Kokkos_Timer.hpp>
+
+// Compare performance of DynRankView to View, specific focus on the parenthesis operators
+
+namespace Performance {
+
+// View functor: fills a rank-3 Kokkos::View; operator()(i) writes every (i,j,k) entry for the given i.
+template <typename DeviceType>
+struct InitViewFunctor {
+  typedef Kokkos::View<double***, DeviceType> inviewtype;
+  inviewtype _inview;
+
+  InitViewFunctor( inviewtype &inview_ ) : _inview(inview_)
+  {}
+  // j and k are unsigned, so the right-hand side below is evaluated in unsigned arithmetic before conversion to double.
+  KOKKOS_INLINE_FUNCTION
+  void operator()(const int i) const {
+    for (unsigned j = 0; j < _inview.dimension(1); ++j) {
+      for (unsigned k = 0; k < _inview.dimension(2); ++k) {
+        _inview(i,j,k) = i/2 -j*j + k/3;
+      }
+    }
+  }
+  // Companion functor: accumulates all (i,j,k) entries of _inview for a given i into _outview(i).
+  struct SumComputationTest
+  {
+    typedef Kokkos::View<double***, DeviceType> inviewtype;
+    inviewtype _inview;
+
+    typedef Kokkos::View<double*, DeviceType> outviewtype;
+    outviewtype _outview;
+
+    KOKKOS_INLINE_FUNCTION
+    SumComputationTest(inviewtype &inview_ , outviewtype &outview_) : _inview(inview_), _outview(outview_) {}
+    // Adds onto the existing _outview(i); nothing in this functor zeroes it first.
+    KOKKOS_INLINE_FUNCTION
+    void operator()(const int i) const {
+      for (unsigned j = 0; j < _inview.dimension(1); ++j) {
+        for (unsigned k = 0; k < _inview.dimension(2); ++k) {
+          _outview(i) += _inview(i,j,k) ;
+        }
+      }
+    }
+  };
+
+};
+// Same fill kernel as InitViewFunctor, but the view type uses Kokkos::LayoutStride.
+template <typename DeviceType>
+struct InitStrideViewFunctor {
+  typedef Kokkos::View<double***, Kokkos::LayoutStride, DeviceType> inviewtype;
+  inviewtype _inview;
+
+  InitStrideViewFunctor( inviewtype &inview_ ) : _inview(inview_)
+  {}
+  // Writes every (i,j,k) entry for the given i; the right-hand side is evaluated in unsigned arithmetic (j, k unsigned).
+  KOKKOS_INLINE_FUNCTION
+  void operator()(const int i) const {
+    for (unsigned j = 0; j < _inview.dimension(1); ++j) {
+      for (unsigned k = 0; k < _inview.dimension(2); ++k) {
+        _inview(i,j,k) = i/2 -j*j + k/3;
+      }
+    }
+  }
+
+};
+// Same fill kernel on a rank-7 View: only the first three indices vary, the last four are always 0.
+template <typename DeviceType>
+struct InitViewRank7Functor {
+  typedef Kokkos::View<double*******, DeviceType> inviewtype;
+  inviewtype _inview;
+
+  InitViewRank7Functor( inviewtype &inview_ ) : _inview(inview_)
+  {}
+  // Loops over dimensions 1 and 2 only; the right-hand side is evaluated in unsigned arithmetic (j, k unsigned).
+  KOKKOS_INLINE_FUNCTION
+  void operator()(const int i) const {
+    for (unsigned j = 0; j < _inview.dimension(1); ++j) {
+      for (unsigned k = 0; k < _inview.dimension(2); ++k) {
+        _inview(i,j,k,0,0,0,0) = i/2 -j*j + k/3;
+      }
+    }
+  }
+
+};
+
+// DynRankView functor: the same fill kernel as InitViewFunctor, applied to a Kokkos::DynRankView indexed with three indices.
+template <typename DeviceType>
+struct InitDynRankViewFunctor {
+  typedef Kokkos::DynRankView<double, DeviceType> inviewtype;
+  inviewtype _inview;
+
+  InitDynRankViewFunctor( inviewtype &inview_ ) : _inview(inview_)
+  {}
+  // j and k are unsigned, so the right-hand side below is evaluated in unsigned arithmetic before conversion to double.
+  KOKKOS_INLINE_FUNCTION
+  void operator()(const int i) const {
+    for (unsigned j = 0; j < _inview.dimension(1); ++j) {
+      for (unsigned k = 0; k < _inview.dimension(2); ++k) {
+        _inview(i,j,k) = i/2 -j*j + k/3;
+      }
+    }
+  }
+  // Companion functor: accumulates all (i,j,k) entries of _inview for a given i into _outview(i).
+  struct SumComputationTest
+  {
+    typedef Kokkos::DynRankView<double, DeviceType> inviewtype;
+    inviewtype _inview;
+
+    typedef Kokkos::DynRankView<double, DeviceType> outviewtype;
+    outviewtype _outview;
+
+    KOKKOS_INLINE_FUNCTION
+    SumComputationTest(inviewtype &inview_ , outviewtype &outview_) : _inview(inview_), _outview(outview_) {}
+    // Adds onto the existing _outview(i); nothing in this functor zeroes it first.
+    KOKKOS_INLINE_FUNCTION
+    void operator()(const int i) const {
+      for (unsigned j = 0; j < _inview.dimension(1); ++j) {
+        for (unsigned k = 0; k < _inview.dimension(2); ++k) {
+          _outview(i) += _inview(i,j,k) ;
+        }
+      }
+    }
+  };
+
+};
+// Times rank-3 View, strided View, rank-7 View and DynRankView kernels on DeviceType and prints timings and ratios.
+// par_size: extent of the leading dimension and length of the parallel range; the other two extents are fixed below.
+template <typename DeviceType>
+void test_dynrankview_op_perf( const int par_size )
+{
+  // Every timed section has the same shape: timer.reset(), launch, fence, timer.seconds().
+  typedef DeviceType execution_space;
+  typedef typename execution_space::size_type size_type;
+  const size_type dim2 = 900;
+  const size_type dim3 = 300;
+
+  double elapsed_time_view = 0;
+  double elapsed_time_compview = 0;
+  double elapsed_time_strideview = 0;
+  double elapsed_time_view_rank7 = 0;
+  double elapsed_time_drview = 0;
+  double elapsed_time_compdrview = 0;
+  Kokkos::Timer timer;
+  {
+    Kokkos::View<double***,DeviceType> testview("testview",par_size,dim2,dim3);
+    typedef InitViewFunctor<DeviceType> FunctorType;
+
+    timer.reset();
+    Kokkos::RangePolicy<DeviceType> policy(0,par_size);
+    Kokkos::parallel_for( policy , FunctorType(testview) );
+    DeviceType::fence();
+    elapsed_time_view = timer.seconds();
+    std::cout << " View time (init only): " << elapsed_time_view << std::endl;
+
+    // The timed region below includes constructing sumview, as does the DynRankView section further down.
+    timer.reset();
+    Kokkos::View<double*,DeviceType> sumview("sumview",par_size);
+    Kokkos::parallel_for( policy , typename FunctorType::SumComputationTest(testview, sumview) );
+    DeviceType::fence();
+    elapsed_time_compview = timer.seconds();
+    std::cout << " View sum computation time: " << elapsed_time_compview << std::endl;
+
+    // Same data, accessed through a LayoutStride subview covering all three dimensions.
+    Kokkos::View<double***,Kokkos::LayoutStride, DeviceType> teststrideview = Kokkos::subview(testview, Kokkos::ALL, Kokkos::ALL,Kokkos::ALL);
+    typedef InitStrideViewFunctor<DeviceType> FunctorStrideType;
+
+    timer.reset();
+    Kokkos::parallel_for( policy , FunctorStrideType(teststrideview) );
+    DeviceType::fence();
+    elapsed_time_strideview = timer.seconds();
+    std::cout << " Strided View time (init only): " << elapsed_time_strideview << std::endl;
+  }
+  {
+    Kokkos::View<double*******,DeviceType> testview("testview",par_size,dim2,dim3,1,1,1,1);
+    typedef InitViewRank7Functor<DeviceType> FunctorType;
+    // Rank-7 view whose four trailing extents are 1, so it holds as many entries as the rank-3 view.
+    timer.reset();
+    Kokkos::RangePolicy<DeviceType> policy(0,par_size);
+    Kokkos::parallel_for( policy , FunctorType(testview) );
+    DeviceType::fence();
+    elapsed_time_view_rank7 = timer.seconds();
+    std::cout << " View Rank7 time (init only): " << elapsed_time_view_rank7 << std::endl;
+  }
+  {
+    Kokkos::DynRankView<double,DeviceType> testdrview("testdrview",par_size,dim2,dim3);
+    typedef InitDynRankViewFunctor<DeviceType> FunctorType;
+
+    timer.reset();
+    Kokkos::RangePolicy<DeviceType> policy(0,par_size);
+    Kokkos::parallel_for( policy , FunctorType(testdrview) );
+    DeviceType::fence();
+    elapsed_time_drview = timer.seconds();
+    std::cout << " DynRankView time (init only): " << elapsed_time_drview << std::endl;
+
+    timer.reset();
+    Kokkos::DynRankView<double,DeviceType> sumview("sumview",par_size);
+    Kokkos::parallel_for( policy , typename FunctorType::SumComputationTest(testdrview, sumview) );
+    DeviceType::fence();
+    elapsed_time_compdrview = timer.seconds();
+    std::cout << " DynRankView sum computation time: " << elapsed_time_compdrview << std::endl;
+
+  }
+
+  std::cout << " Ratio of View to DynRankView time: " << elapsed_time_view / elapsed_time_drview << std::endl; //expect < 1
+  std::cout << " Ratio of View to DynRankView sum computation time: " << elapsed_time_compview / elapsed_time_compdrview << std::endl; //expect < 1
+  std::cout << " Ratio of View to View Rank7  time: " << elapsed_time_view / elapsed_time_view_rank7 << std::endl; //expect < 1
+  std::cout << " Ratio of StrideView to DynRankView time: " << elapsed_time_strideview / elapsed_time_drview << std::endl; //expect < 1
+  std::cout << " Ratio of DynRankView to View Rank7  time: " << elapsed_time_drview / elapsed_time_view_rank7 << std::endl; //expect ?
+
+  timer.reset();
+
+} //end test_dynrankview
+
+
+} //end Performance
+#endif
diff --git a/lib/kokkos/containers/performance_tests/TestGlobal2LocalIds.hpp b/lib/kokkos/containers/performance_tests/TestGlobal2LocalIds.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..66f1fbf092dd4231e359602f4d6850fe757d7333
--- /dev/null
+++ b/lib/kokkos/containers/performance_tests/TestGlobal2LocalIds.hpp
@@ -0,0 +1,231 @@
+//@HEADER
+// ************************************************************************
+// 
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+// 
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+// 
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact  H. Carter Edwards (hcedwar@sandia.gov)
+// 
+// ************************************************************************
+//@HEADER
+
+#ifndef KOKKOS_TEST_GLOBAL_TO_LOCAL_IDS_HPP
+#define KOKKOS_TEST_GLOBAL_TO_LOCAL_IDS_HPP
+
+#include <Kokkos_Core.hpp>
+#include <Kokkos_UnorderedMap.hpp>
+#include <vector>
+#include <algorithm>
+
+#include <impl/Kokkos_Timer.hpp>
+
+// This test will simulate global ids
+
+namespace Performance {
+// Id-count sweep used by the global_2_local tests: begin, begin*step, ... up to and including end.
+static const unsigned begin_id_size = 256u;
+static const unsigned end_id_size = 1u << 22;
+static const unsigned id_step = 2u;
+// Byte-wise access to a 32-bit word; generate_ids uses it to permute the bytes of an index.
+union helper
+{
+  uint32_t word;
+  uint8_t byte[4];
+};
+
+// Functor that writes a 32-bit id derived from i into local_2_global[i]; its constructor launches the kernel.
+template <typename Device>
+struct generate_ids
+{
+  typedef Device execution_space;
+  typedef typename execution_space::size_type size_type;
+  typedef Kokkos::View<uint32_t*,execution_space> local_id_view;
+
+  local_id_view local_2_global;
+
+  generate_ids( local_id_view & ids)
+    : local_2_global(ids)
+  {
+    Kokkos::parallel_for(local_2_global.dimension_0(), *this);
+  }
+
+  // Kernel body: complement the bits of i, then swap byte 3 with byte 1 and byte 2 with byte 0.
+  KOKKOS_INLINE_FUNCTION
+  void operator()(size_type i) const
+  {
+
+    helper x = {static_cast<uint32_t>(i)};
+
+    // shuffle the bytes of i to create a unique, semi-random global_id
+    x.word = ~x.word;
+    // swap bytes 1 and 3
+    uint8_t tmp = x.byte[3];
+    x.byte[3] = x.byte[1];
+    x.byte[1] = tmp;
+    // swap bytes 0 and 2
+    tmp = x.byte[2];
+    x.byte[2] = x.byte[0];
+    x.byte[0] = tmp;
+
+    local_2_global[i] = x.word;
+  }
+
+};
+// Functor that inserts (local_2_global[i], i) into the map for every i; its constructor launches the kernel.
+template <typename Device>
+struct fill_map
+{
+  typedef Device execution_space;
+  typedef typename execution_space::size_type size_type;
+  typedef Kokkos::View<const uint32_t*,execution_space, Kokkos::MemoryRandomAccess> local_id_view;
+  typedef Kokkos::UnorderedMap<uint32_t,size_type,execution_space> global_id_view;
+
+  global_id_view global_2_local;
+  local_id_view local_2_global;
+
+  fill_map( global_id_view gIds, local_id_view lIds)
+    : global_2_local(gIds) , local_2_global(lIds)
+  {
+    Kokkos::parallel_for(local_2_global.dimension_0(), *this);
+  }
+  // NOTE(review): the result of insert() is ignored here -- confirm that a failed insert is caught by find_test.
+  KOKKOS_INLINE_FUNCTION
+  void operator()(size_type i) const
+  {
+    global_2_local.insert( local_2_global[i], i);
+  }
+
+};
+// Reduction functor that counts ids whose map entry is not their own index; its constructor runs the reduction into num_errors.
+template <typename Device>
+struct find_test
+{
+  typedef Device execution_space;
+  typedef typename execution_space::size_type size_type;
+  typedef Kokkos::View<const uint32_t*,execution_space, Kokkos::MemoryRandomAccess> local_id_view;
+  typedef Kokkos::UnorderedMap<const uint32_t, const size_type,execution_space> global_id_view;
+
+  global_id_view global_2_local;
+  local_id_view local_2_global;
+
+  typedef size_t value_type;
+
+  find_test( global_id_view gIds, local_id_view lIds, value_type & num_errors)
+    : global_2_local(gIds) , local_2_global(lIds)
+  {
+    Kokkos::parallel_reduce(local_2_global.dimension_0(), *this, num_errors);
+  }
+  // Reduction identity: each partial error count starts at zero.
+  KOKKOS_INLINE_FUNCTION
+  void init(value_type & v) const
+  { v = 0; }
+  // Combines two partial error counts by addition.
+  KOKKOS_INLINE_FUNCTION
+  void join(volatile value_type & dst, volatile value_type const & src) const
+  { dst += src; }
+
+  KOKKOS_INLINE_FUNCTION
+  void operator()(size_type i, value_type & num_errors) const
+  {
+    uint32_t index = global_2_local.find( local_2_global[i] );
+    // NOTE(review): index is used without a validity check -- confirm value_at() tolerates a failed find().
+    if ( global_2_local.value_at(index) != i) ++num_errors;
+  }
+
+};
+// Benchmarks creating, filling and querying a global-id -> local-id UnorderedMap for num_ids ids.
+// Prints one comma-separated row: size, create, generate, fill, find (times from Kokkos::Timer::seconds()).
+template <typename Device>
+void test_global_to_local_ids(unsigned num_ids)
+{
+
+  typedef Device execution_space;
+  typedef typename execution_space::size_type size_type;
+  typedef Kokkos::View<uint32_t*,execution_space> local_id_view;
+  typedef Kokkos::UnorderedMap<uint32_t,size_type,execution_space> global_id_view;
+
+  // size
+  std::cout << num_ids << ", ";
+
+  Kokkos::Timer stopwatch;
+
+  // create: construct the id view and the map (the map is constructed with 3*num_ids/2).
+  local_id_view local_2_global("local_ids", num_ids);
+  global_id_view global_2_local((3u*num_ids)/2u);
+
+  std::cout << stopwatch.seconds() << ", ";
+  stopwatch.reset();
+
+  // generate: constructing the functor launches the kernel that writes the ids.
+  {
+    generate_ids<Device> gen(local_2_global);
+  }
+  Device::fence();
+
+  std::cout << stopwatch.seconds() << ", ";
+  stopwatch.reset();
+
+  // fill: insert every (global id, local index) pair into the map.
+  {
+    fill_map<Device> fill(global_2_local, local_2_global);
+  }
+  Device::fence();
+
+  std::cout << stopwatch.seconds() << ", ";
+  stopwatch.reset();
+
+  // find: 100 lookup passes over all ids, each reducing into num_errors.
+  // NOTE(review): presumably every pass overwrites num_errors, so only the
+  // final pass is what the assertion below sees -- confirm.
+  size_t num_errors = 0;
+  for (int pass = 0; pass < 100; ++pass)
+  {
+    find_test<Device> find(global_2_local, local_2_global,num_errors);
+  }
+  Device::fence();
+
+  std::cout << stopwatch.seconds() << std::endl;
+
+  // Every id must map back to its own index.
+  ASSERT_EQ( num_errors, 0u);
+
+}
+
+
+
+
+} // namespace Performance
+
+
+#endif //KOKKOS_TEST_GLOBAL_TO_LOCAL_IDS_HPP
+
diff --git a/lib/kokkos/containers/performance_tests/TestMain.cpp b/lib/kokkos/containers/performance_tests/TestMain.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..f952ab3db51028aff0a0ebfe313b2639e353ab87
--- /dev/null
+++ b/lib/kokkos/containers/performance_tests/TestMain.cpp
@@ -0,0 +1,50 @@
+/*
+//@HEADER
+// ************************************************************************
+// 
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+// 
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+// 
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact  H. Carter Edwards (hcedwar@sandia.gov)
+// 
+// ************************************************************************
+//@HEADER
+*/
+
+#include <gtest/gtest.h>
+// Entry point of the performance-test executable.
+int main(int argc, char *argv[]) {
+  ::testing::InitGoogleTest(&argc, argv);   // GoogleTest processes its command-line flags first
+  const int exit_status = RUN_ALL_TESTS();  // runs every test linked into this binary
+  return exit_status;
+}
diff --git a/lib/kokkos/containers/performance_tests/TestOpenMP.cpp b/lib/kokkos/containers/performance_tests/TestOpenMP.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..da74d32ac1fad932f7354d73384ddcb9bec75354
--- /dev/null
+++ b/lib/kokkos/containers/performance_tests/TestOpenMP.cpp
@@ -0,0 +1,140 @@
+/*
+//@HEADER
+// ************************************************************************
+// 
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+// 
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+// 
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact  H. Carter Edwards (hcedwar@sandia.gov)
+// 
+// ************************************************************************
+//@HEADER
+*/
+
+#include <gtest/gtest.h>
+
+#include <Kokkos_Core.hpp>
+
+#include <Kokkos_UnorderedMap.hpp>
+
+#include <TestGlobal2LocalIds.hpp>
+#include <TestUnorderedMapPerformance.hpp>
+
+#include <TestDynRankView.hpp>
+
+#include <iomanip>
+#include <sstream>
+#include <string>
+#include <fstream>
+
+
+namespace Performance {
+
+// Fixture for the OpenMP performance tests: Kokkos::OpenMP is initialized once
+// for the whole test case and finalized after its last test.
+class openmp : public ::testing::Test {
+protected:
+  // Thread count used throughout this file: numa count * cores per numa *
+  // threads per core as reported by hwloc, or 4 when hwloc is unavailable.
+  static unsigned hwloc_thread_count()
+  {
+    if (!Kokkos::hwloc::available()) {
+      return 4;
+    }
+    return Kokkos::hwloc::get_available_numa_count()
+         * Kokkos::hwloc::get_available_cores_per_numa()
+         * Kokkos::hwloc::get_available_threads_per_core();
+  }
+
+  static void SetUpTestCase()
+  {
+    std::cout << std::setprecision(5) << std::scientific;
+
+    const unsigned num_threads = hwloc_thread_count();
+    std::cout << "OpenMP: " << num_threads << std::endl;
+
+    Kokkos::OpenMP::initialize( num_threads );
+
+    std::cout << "available threads: " << omp_get_max_threads() << std::endl;
+  }
+
+  static void TearDownTestCase()
+  {
+    Kokkos::OpenMP::finalize();
+
+    // Drop the OpenMP runtime back to one thread and check that it took effect.
+    omp_set_num_threads(1);
+    ASSERT_EQ( 1 , omp_get_max_threads() );
+  }
+};
+
+// DynRankView versus View kernels over a parallel range of 8192.
+TEST_F( openmp, dynrankview_perf )
+{
+  std::cout << "OpenMP" << std::endl;
+  std::cout << " DynRankView vs View: Initialization Only " << std::endl;
+  test_dynrankview_op_perf<Kokkos::OpenMP>( 8192 );
+}
+
+// Multiplies the id count by id_step from begin_id_size up to and including end_id_size.
+TEST_F( openmp, global_2_local )
+{
+  std::cout << "OpenMP" << std::endl;
+  std::cout << "size, create, generate, fill, find" << std::endl;
+  for (unsigned num_ids = begin_id_size; num_ids <= end_id_size; num_ids *= id_step) {
+    test_global_to_local_ids<Kokkos::OpenMP>(num_ids);
+  }
+}
+
+// UnorderedMap benchmark, "near" variant (bool template argument true).
+TEST_F( openmp, unordered_map_performance_near )
+{
+  const unsigned num_openmp = hwloc_thread_count();
+
+  std::ostringstream base_file_name;
+  base_file_name << "openmp-" << num_openmp << "-near";
+  Perf::run_performance_tests<Kokkos::OpenMP,true>(base_file_name.str());
+}
+
+// UnorderedMap benchmark, "far" variant (bool template argument false).
+TEST_F( openmp, unordered_map_performance_far )
+{
+  const unsigned num_openmp = hwloc_thread_count();
+
+  std::ostringstream base_file_name;
+  base_file_name << "openmp-" << num_openmp << "-far";
+  Perf::run_performance_tests<Kokkos::OpenMP,false>(base_file_name.str());
+}
+
+} // namespace Performance
+
diff --git a/lib/kokkos/containers/performance_tests/TestThreads.cpp b/lib/kokkos/containers/performance_tests/TestThreads.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..4179b7de4c79cc095d83ef4fcdd179593a575f08
--- /dev/null
+++ b/lib/kokkos/containers/performance_tests/TestThreads.cpp
@@ -0,0 +1,135 @@
+/*
+//@HEADER
+// ************************************************************************
+// 
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+// 
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+// 
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact  H. Carter Edwards (hcedwar@sandia.gov)
+// 
+// ************************************************************************
+//@HEADER
+*/
+
+#include <gtest/gtest.h>
+
+#include <Kokkos_Core.hpp>
+
+#include <Kokkos_UnorderedMap.hpp>
+
+#include <iomanip>
+
+#include <TestGlobal2LocalIds.hpp>
+#include <TestUnorderedMapPerformance.hpp>
+
+#include <TestDynRankView.hpp>
+
+#include <iomanip>
+#include <sstream>
+#include <string>
+#include <fstream>
+
+namespace Performance {
+
+class threads : public ::testing::Test {
+protected:
+  // Suite-wide setup: configure std::cout formatting, choose a thread count,
+  // report it, and start the Kokkos::Threads backend with that many threads.
+  static void SetUpTestCase()
+  {
+    std::cout << std::setprecision(5) << std::scientific;
+    // Default to 4 threads; when hwloc is available use
+    // numa count * cores per numa * threads per core instead.
+    unsigned thread_count = 4;
+    if (Kokkos::hwloc::available()) {
+      thread_count  = Kokkos::hwloc::get_available_numa_count();
+      thread_count *= Kokkos::hwloc::get_available_cores_per_numa();
+      thread_count *= Kokkos::hwloc::get_available_threads_per_core();
+    }
+    std::cout << "Threads: " << thread_count << std::endl;
+    Kokkos::Threads::initialize( thread_count );
+  }
+
+  // Suite-wide teardown: shut the Kokkos::Threads backend down.
+  static void TearDownTestCase()
+  {
+    Kokkos::Threads::finalize();
+  }
+};
+
+TEST_F( threads, dynrankview_perf )  // DynRankView-vs-View perf run on the Threads backend
+{
+  std::cout << "Threads" << std::endl
+            << " DynRankView vs View: Initialization Only " << std::endl;
+  test_dynrankview_op_perf<Kokkos::Threads>( 8192 );
+}
+
+TEST_F( threads, global_2_local)  // sweeps id counts, multiplying by id_step each pass
+{
+  std::cout << "Threads" << std::endl << "size, create, generate, fill, find" << std::endl;
+  for (unsigned id_count = Performance::begin_id_size; id_count <= Performance::end_id_size; id_count *= Performance::id_step) {
+    test_global_to_local_ids<Kokkos::Threads>(id_count);
+  }
+}
+
+TEST_F( threads, unordered_map_performance_near)
+{
+  // Thread count used here only to label the output files; 4 when hwloc is unavailable.
+  unsigned thread_count = 4;
+  if (Kokkos::hwloc::available()) {
+    thread_count  = Kokkos::hwloc::get_available_numa_count();
+    thread_count *= Kokkos::hwloc::get_available_cores_per_numa();
+    thread_count *= Kokkos::hwloc::get_available_threads_per_core();
+  }
+  std::ostringstream file_stem;
+  file_stem << "threads-" << thread_count << "-near";
+  Perf::run_performance_tests<Kokkos::Threads,true>(file_stem.str());
+}
+
+TEST_F( threads, unordered_map_performance_far)
+{
+  // Thread count used here only to label the output files; 4 when hwloc is unavailable.
+  unsigned thread_count = 4;
+  if (Kokkos::hwloc::available()) {
+    thread_count  = Kokkos::hwloc::get_available_numa_count();
+    thread_count *= Kokkos::hwloc::get_available_cores_per_numa();
+    thread_count *= Kokkos::hwloc::get_available_threads_per_core();
+  }
+  std::ostringstream file_stem;
+  file_stem << "threads-" << thread_count << "-far";
+  Perf::run_performance_tests<Kokkos::Threads,false>(file_stem.str());
+}
+
+} // namespace Performance
+
+
diff --git a/lib/kokkos/containers/performance_tests/TestUnorderedMapPerformance.hpp b/lib/kokkos/containers/performance_tests/TestUnorderedMapPerformance.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..71d1182cbe0ea1ca13d92833111a612a93f72f1c
--- /dev/null
+++ b/lib/kokkos/containers/performance_tests/TestUnorderedMapPerformance.hpp
@@ -0,0 +1,262 @@
+//@HEADER
+// ************************************************************************
+// 
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+// 
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+// 
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact  H. Carter Edwards (hcedwar@sandia.gov)
+// 
+// ************************************************************************
+//@HEADER
+
+#ifndef KOKKOS_TEST_UNORDERED_MAP_PERFORMANCE_HPP
+#define KOKKOS_TEST_UNORDERED_MAP_PERFORMANCE_HPP
+
+#include <impl/Kokkos_Timer.hpp>
+
+#include <iostream>
+#include <iomanip>
+#include <fstream>
+#include <string>
+#include <sstream>
+
+
+namespace Perf {
+
+template <typename Device, bool Near>
+struct UnorderedMapTest  // benchmark harness and reduction functor in one: the constructor passes *this to parallel_reduce
+{
+  typedef Device execution_space;
+  typedef Kokkos::UnorderedMap<uint32_t, uint32_t, execution_space> map_type;
+  typedef typename map_type::histogram_type histogram_type;
+  // Reduction result of one insert pass: number of failed inserts and the largest list position seen.
+  struct value_type {
+    uint32_t failed_count;
+    uint32_t max_list;
+  };
+  // Test parameters, the measured wall-clock time, and the map under test together with its histogram.
+  uint32_t capacity;
+  uint32_t inserts;
+  uint32_t collisions;
+  double   seconds;
+  map_type map;
+  histogram_type histogram;
+  // Runs the whole experiment: repeats the parallel insert pass, rehashing in between, until no insert fails.
+  UnorderedMapTest( uint32_t arg_capacity, uint32_t arg_inserts, uint32_t arg_collisions)
+    : capacity(arg_capacity)
+    , inserts(arg_inserts)
+    , collisions(arg_collisions)
+    , seconds(0)
+    , map(capacity)
+    , histogram(map.get_histogram())
+  {
+    Kokkos::Timer wall_clock ;
+    wall_clock.reset();
+    // loop_count records how many insert passes were needed.
+    value_type v = {};
+    int loop_count = 0;
+    do {
+      ++loop_count;
+      // Start every pass from a zeroed result, then run this object as the reduction functor over [0, inserts).
+      v = value_type();
+      Kokkos::parallel_reduce(inserts, *this, v);
+      // On failures grow the map by 15% of its current capacity plus failed_count/collisions, then retry.
+      if (v.failed_count > 0u) {
+        const uint32_t new_capacity = map.capacity() + ((map.capacity()*3ull)/20u) + v.failed_count/collisions ;
+        map.rehash( new_capacity );
+      }
+    } while (v.failed_count > 0u);
+    // The measured time covers every pass, including any rehashes in between.
+    seconds = wall_clock.seconds();
+    // Print the pass count wrapped in ANSI color escapes (a different escape per case), then nanoseconds per insert.
+    switch (loop_count)
+    {
+    case 1u: std::cout << " \033[0;32m" << loop_count << "\033[0m "; break;
+    case 2u: std::cout << " \033[1;31m" << loop_count << "\033[0m "; break;
+    default: std::cout << " \033[0;31m" << loop_count << "\033[0m "; break;
+    }
+    std::cout << std::setprecision(2) << std::fixed << std::setw(5) << (1e9*(seconds/(inserts))) << "; " << std::flush;
+    // Compute the histogram and fence the device before the constructor returns.
+    histogram.calculate();
+    Device::fence();
+  }
+  // Appends one row to each of the four streams: the timing metrics and the three histogram dumps.
+  void print(std::ostream & metrics_out, std::ostream & length_out, std::ostream & distance_out, std::ostream & block_distance_out)
+  {
+    metrics_out << map.capacity() << " , ";
+    metrics_out << inserts/collisions << " , ";
+    metrics_out << (100.0 * inserts/collisions) / map.capacity() << " , ";
+    metrics_out << inserts << " , ";
+    metrics_out << (map.failed_insert() ? "true" : "false") << " , ";
+    metrics_out << collisions << " , ";
+    metrics_out << 1e9*(seconds/inserts) << " , ";
+    metrics_out << seconds << std::endl;
+    // Every histogram row starts with three fields: capacity, percent full and the collision count.
+    length_out << map.capacity() << " , ";
+    length_out << ((100.0 *inserts/collisions) / map.capacity()) << " , ";
+    length_out << collisions << " , ";
+    histogram.print_length(length_out);
+
+    distance_out << map.capacity() << " , ";
+    distance_out << ((100.0 *inserts/collisions) / map.capacity()) << " , ";
+    distance_out << collisions << " , ";
+    histogram.print_distance(distance_out);
+
+    block_distance_out << map.capacity() << " , ";
+    block_distance_out << ((100.0 *inserts/collisions) / map.capacity()) << " , ";
+    block_distance_out << collisions << " , ";
+    histogram.print_block_distance(block_distance_out);
+  }
+
+  // init: zero both fields of a reduction value.
+  KOKKOS_INLINE_FUNCTION
+  void init( value_type & v ) const
+  {
+    v.failed_count = 0;
+    v.max_list = 0;
+  }
+  // join: add the failure counts and keep the larger of the two max_list values.
+  KOKKOS_INLINE_FUNCTION
+  void join( volatile value_type & dst, const volatile value_type & src ) const
+  {
+    dst.failed_count += src.failed_count;
+    dst.max_list = src.max_list < dst.max_list ? dst.max_list : src.max_list;
+  }
+  // Inserts (key, i). Near: key = i/collisions, so consecutive i share a key; otherwise key = i % (inserts/collisions).
+  KOKKOS_INLINE_FUNCTION
+  void operator()(uint32_t i, value_type & v) const
+  {
+    const uint32_t key = Near ? i/collisions : i%(inserts/collisions);
+    typename map_type::insert_result result = map.insert(key,i);
+    v.failed_count += !result.failed() ? 0 : 1;
+    v.max_list = result.list_position() < v.max_list ? v.max_list : result.list_position();
+  }
+
+};
+
+//#define KOKKOS_COLLECT_UNORDERED_MAP_METRICS
+// Benchmark driver for UnorderedMapTest; it only runs when the macro above is defined, otherwise it prints "skipping test".
+template <typename Device, bool Near>
+void run_performance_tests(std::string const & base_file_name)
+{
+#if defined(KOKKOS_COLLECT_UNORDERED_MAP_METRICS)
+  std::string metrics_file_name = base_file_name + std::string("-metrics.csv");
+  std::string length_file_name = base_file_name  + std::string("-length.csv");
+  std::string distance_file_name = base_file_name + std::string("-distance.csv");
+  std::string block_distance_file_name = base_file_name + std::string("-block_distance.csv");
+  // One CSV file per output kind, all named after base_file_name.
+  std::ofstream metrics_out( metrics_file_name.c_str(), std::ofstream::out );
+  std::ofstream length_out( length_file_name.c_str(), std::ofstream::out );
+  std::ofstream distance_out( distance_file_name.c_str(), std::ofstream::out );
+  std::ofstream block_distance_out( block_distance_file_name.c_str(), std::ofstream::out );
+  // The commented-out arrays below list wider sweeps; the active arrays
+  // restrict the run to a single fill ratio and a single collision count.
+  /*
+  const double test_ratios[] = {
+     0.50
+   , 0.75
+   , 0.80
+   , 0.85
+   , 0.90
+   , 0.95
+   , 1.00
+   , 1.25
+   , 2.00
+  };
+  */
+
+  const double test_ratios[] = { 1.00 };
+
+  const int num_ratios = sizeof(test_ratios) / sizeof(double);
+
+  /*
+  const uint32_t collisions[] {
+      1
+    , 4
+    , 16
+    , 64
+  };
+  */
+
+  const uint32_t collisions[] = { 16 };
+
+  const int num_collisions = sizeof(collisions) / sizeof(uint32_t);
+
+  // set up file headers; histogram rows carry capacity, percent full and the collision count before the bins, so the headers name all three
+  metrics_out << "Capacity , Unique , Percent Full , Attempted Inserts , Failed Inserts , Collision Ratio , Nanoseconds/Inserts, Seconds" << std::endl;
+  length_out << "Capacity , Percent Full , Collision Ratio , ";
+  distance_out << "Capacity , Percent Full , Collision Ratio , ";
+  block_distance_out << "Capacity , Percent Full , Collision Ratio , ";
+  // bin labels 0..99
+  for (int i=0; i<100; ++i) {
+    length_out << i << " , ";
+    distance_out << i << " , ";
+    block_distance_out << i << " , ";
+  }
+  // backspaces followed by spaces are written after the trailing separator, then the header line is ended
+  length_out << "\b\b\b   " << std::endl;
+  distance_out << "\b\b\b   " << std::endl;
+  block_distance_out << "\b\b\b   " << std::endl;
+  // for each collision count and fill ratio, benchmark capacities 2^14 .. 2^24 (doubling each step)
+  Kokkos::Timer wall_clock ;
+  for (int i=0;  i < num_collisions ; ++i) {
+    wall_clock.reset();
+    std::cout << "Collisions: " << collisions[i] << std::endl;
+    for (int j = 0; j < num_ratios; ++j) {
+      std::cout << std::setprecision(1) << std::fixed << std::setw(5) << (100.0*test_ratios[j]) << "%  " << std::flush;
+      for (uint32_t capacity = 1<<14; capacity < 1<<25; capacity = capacity << 1) {
+        uint32_t inserts = static_cast<uint32_t>(test_ratios[j]*(capacity));
+        std::cout << capacity << std::flush;
+        UnorderedMapTest<Device, Near> test(capacity, inserts*collisions[i], collisions[i]);
+        Device::fence();
+        test.print(metrics_out, length_out, distance_out, block_distance_out);
+      }
+      std::cout << "\b\b  " <<  std::endl;
+
+    }
+    std::cout << "  " << wall_clock.seconds() << " secs" << std::endl;
+  }
+  metrics_out.close();
+  length_out.close();
+  distance_out.close();
+  block_distance_out.close();
+#else
+  (void)base_file_name;
+  std::cout << "skipping test" << std::endl;
+#endif
+}
+
+
+} // namespace Perf
+
+#endif //KOKKOS_TEST_UNORDERED_MAP_PERFORMANCE_HPP
diff --git a/lib/kokkos/containers/src/CMakeLists.txt b/lib/kokkos/containers/src/CMakeLists.txt
new file mode 100644
index 0000000000000000000000000000000000000000..da5a791530fb887a409dbb236c9a512c3f960dd1
--- /dev/null
+++ b/lib/kokkos/containers/src/CMakeLists.txt
@@ -0,0 +1,31 @@
+
+TRIBITS_CONFIGURE_FILE(${PACKAGE_NAME}_config.h)
+# Add the current binary dir (presumably where the configured header lands - confirm) and this source dir as include paths.
+INCLUDE_DIRECTORIES(${CMAKE_CURRENT_BINARY_DIR})
+INCLUDE_DIRECTORIES(${CMAKE_CURRENT_SOURCE_DIR})
+
+#-----------------------------------------------------------------------------
+
+SET(HEADERS "")
+SET(SOURCES "")
+
+SET(HEADERS_IMPL "")
+# Collect top-level headers, impl/ headers and impl/ sources by globbing.
+FILE(GLOB HEADERS *.hpp)
+FILE(GLOB HEADERS_IMPL impl/*.hpp)
+FILE(GLOB SOURCES impl/*.cpp)
+# The impl/ headers are installed explicitly into the impl/ subdirectory of the install include dir.
+SET(TRILINOS_INCDIR ${CMAKE_INSTALL_PREFIX}/${${PROJECT_NAME}_INSTALL_INCLUDE_DIR})
+
+INSTALL(FILES ${HEADERS_IMPL} DESTINATION ${TRILINOS_INCDIR}/impl/)
+# They are passed as NOINSTALLHEADERS below (presumably so TriBITS does not install them a second time - confirm).
+TRIBITS_ADD_LIBRARY(
+    kokkoscontainers
+    HEADERS ${HEADERS}
+    NOINSTALLHEADERS ${HEADERS_IMPL}
+    SOURCES ${SOURCES}
+    DEPLIBS
+    )
+
+#-----------------------------------------------------------------------------
+
diff --git a/lib/kokkos/containers/src/Kokkos_Bitset.hpp b/lib/kokkos/containers/src/Kokkos_Bitset.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..74da5f61b5d1e9506bf426595e0de9574384662b
--- /dev/null
+++ b/lib/kokkos/containers/src/Kokkos_Bitset.hpp
@@ -0,0 +1,437 @@
+/*
+//@HEADER
+// ************************************************************************
+// 
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+// 
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+// 
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact  H. Carter Edwards (hcedwar@sandia.gov)
+// 
+// ************************************************************************
+//@HEADER
+*/
+
+#ifndef KOKKOS_BITSET_HPP
+#define KOKKOS_BITSET_HPP
+
+#include <Kokkos_Core.hpp>
+#include <Kokkos_Functional.hpp>
+
+#include <impl/Kokkos_Bitset_impl.hpp>
+
+#include <stdexcept>
+
+namespace Kokkos {
+
+template <typename Device = Kokkos::DefaultExecutionSpace >
+class Bitset;
+// Both class templates default Device to Kokkos::DefaultExecutionSpace.
+template <typename Device = Kokkos::DefaultExecutionSpace >
+class ConstBitset;
+// deep_copy overloads, declared up front because the classes below name them as friends: Bitset <- Bitset
+template <typename DstDevice, typename SrcDevice>
+void deep_copy( Bitset<DstDevice> & dst, Bitset<SrcDevice> const& src);
+// Bitset <- ConstBitset
+template <typename DstDevice, typename SrcDevice>
+void deep_copy( Bitset<DstDevice> & dst, ConstBitset<SrcDevice> const& src);
+// ConstBitset <- ConstBitset
+template <typename DstDevice, typename SrcDevice>
+void deep_copy( ConstBitset<DstDevice> & dst, ConstBitset<SrcDevice> const& src);
+
+
+/// A thread safe view to a bitset
+template <typename Device>
+class Bitset
+{
+public:
+  typedef Device execution_space;
+  typedef unsigned size_type;
+  // Bit flags for the scan_direction argument: bit 0 selects a reverse bit scan, bit 1 moves the hint backward.
+  enum { BIT_SCAN_REVERSE = 1u };
+  enum { MOVE_HINT_BACKWARD = 2u };
+  // The four named combinations of the two flags above.
+  enum {
+      BIT_SCAN_FORWARD_MOVE_HINT_FORWARD = 0u
+    , BIT_SCAN_REVERSE_MOVE_HINT_FORWARD = BIT_SCAN_REVERSE
+    , BIT_SCAN_FORWARD_MOVE_HINT_BACKWARD = MOVE_HINT_BACKWARD
+    , BIT_SCAN_REVERSE_MOVE_HINT_BACKWARD = BIT_SCAN_REVERSE | MOVE_HINT_BACKWARD
+  };
+
+private:
+  enum { block_size = static_cast<unsigned>(sizeof(unsigned)*CHAR_BIT) };
+  enum { block_mask = block_size-1u };
+  enum { block_shift = Kokkos::Impl::integral_power_of_two(block_size) };
+
+public:
+
+  /// m_last_block_mask gets one bit per used position of a partially filled last block (0 if size is a multiple of block_size)
+  /// constructor
+  /// arg_size := number of bits in the set
+  Bitset(unsigned arg_size = 0u)
+    : m_size(arg_size)
+    , m_last_block_mask(0u)
+    , m_blocks("Bitset", ((m_size + block_mask) >> block_shift) )
+  {
+    for (int i=0, end = static_cast<int>(m_size & block_mask); i < end; ++i) {
+      m_last_block_mask |= 1u << i;
+    }
+  }
+
+  /// assignment
+  Bitset<Device> & operator = (Bitset<Device> const & rhs)
+  {
+    this->m_size = rhs.m_size;
+    this->m_last_block_mask = rhs.m_last_block_mask;
+    this->m_blocks = rhs.m_blocks;
+
+    return *this;
+  }
+
+  /// copy constructor
+  Bitset( Bitset<Device> const & rhs)
+    : m_size( rhs.m_size )
+    , m_last_block_mask( rhs.m_last_block_mask )
+    , m_blocks( rhs.m_blocks )
+  {}
+
+  /// number of bits in the set
+  /// can be called from the host or the device
+  KOKKOS_FORCEINLINE_FUNCTION
+  unsigned size() const
+  { return m_size; }
+
+  /// number of bits which are set to 1
+  /// can only be called from the host
+  unsigned count() const
+  {
+    Impl::BitsetCount< Bitset<Device> > f(*this);
+    return f.apply();
+  }
+
+  /// set all bits to 1
+  /// can only be called from the host
+  void set()
+  {
+    Kokkos::deep_copy(m_blocks, ~0u );
+
+    if (m_last_block_mask) {
+      //clear the unused bits in the last block
+      typedef Kokkos::Impl::DeepCopy< typename execution_space::memory_space, Kokkos::HostSpace > raw_deep_copy;
+      raw_deep_copy( m_blocks.ptr_on_device() + (m_blocks.dimension_0() -1u), &m_last_block_mask, sizeof(unsigned));
+    }
+  }
+
+  /// set all bits to 0
+  /// can only be called from the host
+  void reset()
+  {
+    Kokkos::deep_copy(m_blocks, 0u );
+  }
+
+  /// set all bits to 0
+  /// can only be called from the host
+  void clear()
+  {
+    Kokkos::deep_copy(m_blocks, 0u );
+  }
+
+  /// set i'th bit to 1; returns true only if i < size() and the bit was 0 before this call
+  /// can only be called from the device
+  KOKKOS_FORCEINLINE_FUNCTION
+  bool set( unsigned i ) const
+  {
+    if ( i < m_size ) {
+      unsigned * block_ptr = &m_blocks[ i >> block_shift ];
+      const unsigned mask = 1u << static_cast<int>( i & block_mask );
+
+      return !( atomic_fetch_or( block_ptr, mask ) & mask );
+    }
+    return false;
+  }
+
+  /// set i'th bit to 0; returns true only if i < size() and the bit was 1 before this call
+  /// can only be called from the device
+  KOKKOS_FORCEINLINE_FUNCTION
+  bool reset( unsigned i ) const
+  {
+    if ( i < m_size ) {
+      unsigned * block_ptr = &m_blocks[ i >> block_shift ];
+      const unsigned mask = 1u << static_cast<int>( i & block_mask );
+
+      return atomic_fetch_and( block_ptr, ~mask ) & mask;
+    }
+    return false;
+  }
+
+  /// return true if the i'th bit is set to 1 (false when i >= size())
+  /// can only be called from the device
+  KOKKOS_FORCEINLINE_FUNCTION
+  bool test( unsigned i ) const
+  {
+    if ( i < m_size ) {
+      const unsigned block = volatile_load(&m_blocks[ i >> block_shift ]);
+      const unsigned mask = 1u << static_cast<int>( i & block_mask );
+      return block & mask;
+    }
+    return false;
+  }
+
+  /// used with find_any_set_near or find_any_unset_near functions
+  /// returns the max number of times those functions should be called
+  /// when searching for an available bit
+  KOKKOS_FORCEINLINE_FUNCTION
+  unsigned max_hint() const
+  {
+    return m_blocks.dimension_0();
+  }
+
+  /// find a bit set to 1 near the hint; a hint past the last block restarts the search at block 0
+  /// returns a pair< bool, unsigned> where if result.first is true then result.second is the bit found
+  /// and if result.first is false then result.second is a new hint
+  KOKKOS_INLINE_FUNCTION
+  Kokkos::pair<bool, unsigned> find_any_set_near( unsigned hint , unsigned scan_direction = BIT_SCAN_FORWARD_MOVE_HINT_FORWARD ) const
+  {
+    const unsigned block_idx = (hint >> block_shift) < m_blocks.dimension_0() ? (hint >> block_shift) : 0;
+    const unsigned offset = hint & block_mask;
+    unsigned block = volatile_load(&m_blocks[ block_idx ]);
+    block = !m_last_block_mask || (block_idx < (m_blocks.dimension_0()-1)) ? block : block & m_last_block_mask ;
+
+    return find_any_helper(block_idx, offset, block, scan_direction);
+  }
+
+  /// find a bit set to 0 near the hint; a hint past the last block restarts the search at block 0
+  /// returns a pair< bool, unsigned> where if result.first is true then result.second is the bit found
+  /// and if result.first is false then result.second is a new hint
+  KOKKOS_INLINE_FUNCTION
+  Kokkos::pair<bool, unsigned> find_any_unset_near( unsigned hint , unsigned scan_direction = BIT_SCAN_FORWARD_MOVE_HINT_FORWARD ) const
+  {
+    const unsigned block_idx = (hint >> block_shift) < m_blocks.dimension_0() ? (hint >> block_shift) : 0; // same clamp as find_any_set_near; avoids indexing past the last block
+    const unsigned offset = hint & block_mask;
+    unsigned block = volatile_load(&m_blocks[ block_idx ]);
+    block = !m_last_block_mask || (block_idx < (m_blocks.dimension_0()-1) ) ? ~block : ~block & m_last_block_mask ;
+
+    return find_any_helper(block_idx, offset, block, scan_direction);
+  }
+
+private:
+  // Shared tail of the find_any_*_near functions: scan the block if it has a candidate bit, otherwise produce a new hint.
+  KOKKOS_FORCEINLINE_FUNCTION
+  Kokkos::pair<bool, unsigned> find_any_helper(unsigned block_idx, unsigned offset, unsigned block, unsigned scan_direction) const
+  {
+    Kokkos::pair<bool, unsigned> result( block > 0u, 0);
+
+    if (!result.first) {
+      result.second = update_hint( block_idx, offset, scan_direction );
+    }
+    else {
+      result.second = scan_block(  (block_idx << block_shift)
+                                 , offset
+                                 , block
+                                 , scan_direction
+                                );
+    }
+    return result;
+  }
+
+  // Rotates the block by offset (offset-1 modulo block_size for reverse scans), bit-scans it, and maps the hit back to a global bit index.
+  KOKKOS_FORCEINLINE_FUNCTION
+  unsigned scan_block(unsigned block_start, int offset, unsigned block, unsigned scan_direction ) const
+  {
+    offset = !(scan_direction & BIT_SCAN_REVERSE) ? offset : (offset + block_mask) & block_mask;
+    block = Impl::rotate_right(block, offset);
+    return ((( !(scan_direction & BIT_SCAN_REVERSE) ?
+               Impl::bit_scan_forward(block) :
+               Impl::bit_scan_reverse(block)
+             ) + offset
+            ) & block_mask
+           ) + block_start;
+  }
+  // Moves the hint one block forward or backward, wrapping around at both ends, and keeps the in-block offset.
+  KOKKOS_FORCEINLINE_FUNCTION
+  unsigned update_hint( long long block_idx, unsigned offset, unsigned scan_direction ) const
+  {
+    block_idx += scan_direction & MOVE_HINT_BACKWARD ? -1 : 1;
+    block_idx = block_idx >= 0 ? block_idx : m_blocks.dimension_0() - 1;
+    block_idx = block_idx < static_cast<long long>(m_blocks.dimension_0()) ? block_idx : 0;
+
+    return static_cast<unsigned>(block_idx)*block_size + offset;
+  }
+
+private:
+
+  unsigned m_size;
+  unsigned m_last_block_mask;
+  View< unsigned *, execution_space, MemoryTraits<RandomAccess> > m_blocks;
+
+private:
+  template <typename DDevice>
+  friend class Bitset;
+
+  template <typename DDevice>
+  friend class ConstBitset;
+
+  template <typename Bitset>
+  friend struct Impl::BitsetCount;
+
+  template <typename DstDevice, typename SrcDevice>
+  friend void deep_copy( Bitset<DstDevice> & dst, Bitset<SrcDevice> const& src);
+
+  template <typename DstDevice, typename SrcDevice>
+  friend void deep_copy( Bitset<DstDevice> & dst, ConstBitset<SrcDevice> const& src);
+};
+
+/// a thread-safe view to a const bitset
+/// i.e. can only test bits
+template <typename Device>
+class ConstBitset
+{
+public:
+  typedef Device execution_space;
+  typedef unsigned size_type;
+
+private:
+  // block_size is the number of bits in one unsigned block; block_mask extracts the
+  // bit position inside a block and block_shift turns a bit index into a block index.
+  enum { block_size = static_cast<unsigned>(sizeof(unsigned)*CHAR_BIT) };
+  enum { block_mask = block_size -1u };
+  enum { block_shift = Kokkos::Impl::integral_power_of_two(block_size) };
+
+public:
+  /// default: size zero, block view left default-constructed
+  ConstBitset() : m_size(0) {}
+
+  /// construct from a mutable Bitset of the same Device
+  ConstBitset(Bitset<Device> const& rhs) : m_size(rhs.m_size), m_blocks(rhs.m_blocks) {}
+
+  /// copy constructor
+  ConstBitset(ConstBitset<Device> const& rhs) : m_size(rhs.m_size), m_blocks(rhs.m_blocks) {}
+
+  /// take over the size and blocks of a mutable Bitset
+  ConstBitset<Device> & operator = (Bitset<Device> const & rhs)
+  {
+    m_size = rhs.m_size;
+    m_blocks = rhs.m_blocks;
+    return *this;
+  }
+
+  /// assignment
+  ConstBitset<Device> & operator = (ConstBitset<Device> const & rhs)
+  {
+    m_size = rhs.m_size;
+    m_blocks = rhs.m_blocks;
+    return *this;
+  }
+
+  /// number of bits in the set
+  KOKKOS_FORCEINLINE_FUNCTION
+  unsigned size() const { return m_size; }
+
+  /// number of bits which are set to 1, computed through Impl::BitsetCount
+  /// (not marked as a device function)
+  unsigned count() const
+  {
+    Impl::BitsetCount< ConstBitset<Device> > counter(*this);
+    return counter.apply();
+  }
+
+  /// return true if the i'th bit is set to 1
+  /// indices at or beyond size() yield false
+  KOKKOS_FORCEINLINE_FUNCTION
+  bool test( unsigned i ) const
+  {
+    if ( i >= m_size ) {
+      return false;
+    }
+    const unsigned bit = 1u << static_cast<int>( i & block_mask );
+    const unsigned word = m_blocks[ i >> block_shift ];
+    return word & bit;
+  }
+
+private:
+
+  unsigned m_size;
+  // blocks are held through a view of const unsigned
+  View< const unsigned *, execution_space, MemoryTraits<RandomAccess> > m_blocks;
+
+private:
+  // Friend declarations.
+  template <typename DDevice>
+  friend class ConstBitset;
+
+  template <typename Bitset>
+  friend struct Impl::BitsetCount;
+
+  template <typename DstDevice, typename SrcDevice>
+  friend void deep_copy( Bitset<DstDevice> & dst, ConstBitset<SrcDevice> const& src);
+
+  template <typename DstDevice, typename SrcDevice>
+  friend void deep_copy( ConstBitset<DstDevice> & dst, ConstBitset<SrcDevice> const& src);
+};
+
+
+template <typename DstDevice, typename SrcDevice>
+void deep_copy( Bitset<DstDevice> & dst, Bitset<SrcDevice> const& src)
+{
+  // Sizes must already agree; this routine throws rather than resizing dst.
+  if (src.size() != dst.size()) {
+    throw std::runtime_error("Error: Cannot deep_copy bitsets of different sizes!");
+  }
+  typedef Kokkos::Impl::DeepCopy< typename DstDevice::memory_space, typename SrcDevice::memory_space > block_copy;
+  block_copy(dst.m_blocks.ptr_on_device(), src.m_blocks.ptr_on_device(), sizeof(unsigned)*src.m_blocks.dimension_0());
+}
+
+template <typename DstDevice, typename SrcDevice>
+void deep_copy( Bitset<DstDevice> & dst, ConstBitset<SrcDevice> const& src)
+{
+  // Sizes must already agree; this routine throws rather than resizing dst.
+  if (src.size() != dst.size()) {
+    throw std::runtime_error("Error: Cannot deep_copy bitsets of different sizes!");
+  }
+  typedef Kokkos::Impl::DeepCopy< typename DstDevice::memory_space, typename SrcDevice::memory_space > block_copy;
+  block_copy(dst.m_blocks.ptr_on_device(), src.m_blocks.ptr_on_device(), sizeof(unsigned)*src.m_blocks.dimension_0());
+}
+
+template <typename DstDevice, typename SrcDevice>
+void deep_copy( ConstBitset<DstDevice> & dst, ConstBitset<SrcDevice> const& src)
+{
+  // NOTE(review): dst.m_blocks is a view of const unsigned - confirm this overload can actually be instantiated.
+  if (src.size() != dst.size()) {  // sizes must already agree; nothing is resized here
+    throw std::runtime_error("Error: Cannot deep_copy bitsets of different sizes!");
+  }
+  typedef Kokkos::Impl::DeepCopy< typename DstDevice::memory_space, typename SrcDevice::memory_space > block_copy;
+  block_copy(dst.m_blocks.ptr_on_device(), src.m_blocks.ptr_on_device(), sizeof(unsigned)*src.m_blocks.dimension_0());
+}
+
+} // namespace Kokkos
+
+#endif //KOKKOS_BITSET_HPP
diff --git a/lib/kokkos/containers/src/Kokkos_DualView.hpp b/lib/kokkos/containers/src/Kokkos_DualView.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..1230df4d97741123f2be0a011fd8fd7a40fbd35f
--- /dev/null
+++ b/lib/kokkos/containers/src/Kokkos_DualView.hpp
@@ -0,0 +1,982 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+//
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact  H. Carter Edwards (hcedwar@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+/// \file Kokkos_DualView.hpp
+/// \brief Declaration and definition of Kokkos::DualView.
+///
+/// This header file declares and defines Kokkos::DualView and its
+/// related nonmember functions.
+
+#ifndef KOKKOS_DUALVIEW_HPP
+#define KOKKOS_DUALVIEW_HPP
+
+#include <Kokkos_Core.hpp>
+#include <impl/Kokkos_Error.hpp>
+
+namespace Kokkos {
+
+/* \class DualView
+ * \brief Container to manage mirroring a Kokkos::View that lives
+ *   in device memory with a Kokkos::View that lives in host memory.
+ *
+ * This class provides capabilities to manage data which exists in two
+ * memory spaces at the same time.  It keeps views of the same layout
+ * on two memory spaces as well as modified flags for both
+ * allocations.  Users are responsible for setting the modified flags
+ * manually if they change the data in either memory space, by calling
+ * the modify() method templated on the device where they modified the
+ * data.  Users may synchronize data by calling the sync() method,
+ * templated on the device towards which they want to synchronize
+ * (i.e., the target of the one-way copy operation).
+ *
+ * The DualView class also provides convenience methods such as
+ * realloc, resize and capacity which call the appropriate methods of
+ * the underlying Kokkos::View objects.
+ *
+ * The four template arguments are the same as those of Kokkos::View.
+ * (Please refer to that class' documentation for a detailed
+ * description.)
+ *
+ *   \tparam DataType The type of the entries stored in the container.
+ *
+ *   \tparam Layout The array's layout in memory.
+ *
+ *   \tparam Device The Kokkos Device type.  If its memory space is
+ *     not the same as the host's memory space, then DualView will
+ *     contain two separate Views: one in device memory, and one in
+ *     host memory.  Otherwise, DualView will only store one View.
+ *
+ *   \tparam MemoryTraits (optional) The user's intended memory access
+ *     behavior.  Please see the documentation of Kokkos::View for
+ *     examples.  The default suffices for most users.
+ */
+template< class DataType ,
+          class Arg1Type = void ,
+          class Arg2Type = void ,
+          class Arg3Type = void>
+class DualView : public ViewTraits< DataType , Arg1Type , Arg2Type, Arg3Type >
+{
+public:
+  //! \name Typedefs for device types and various Kokkos::View specializations.
+  //@{
+  typedef ViewTraits< DataType , Arg1Type , Arg2Type, Arg3Type > traits ;
+
+  //! The host mirror space taken from this DualView's ViewTraits.
+  typedef typename traits::host_mirror_space host_mirror_space ;
+
+  //! The type of a Kokkos::View on the device.
+  typedef View< typename traits::data_type ,
+                Arg1Type ,
+                Arg2Type ,
+                Arg3Type > t_dev ;
+
+  /// \typedef t_host
+  /// \brief The type of a Kokkos::View host mirror of \c t_dev.
+  typedef typename t_dev::HostMirror t_host ;
+
+  //! The type of a const View on the device.
+  //! (Same template arguments as \c t_dev, but with a const data type.)
+  typedef View< typename traits::const_data_type ,
+                Arg1Type ,
+                Arg2Type ,
+                Arg3Type > t_dev_const ;
+
+  /// \typedef t_host_const
+  /// \brief The type of a const View host mirror of \c t_dev_const.
+  typedef typename t_dev_const::HostMirror t_host_const;
+
+  //! The type of a const, random-access View on the device.
+  typedef View< typename traits::const_data_type ,
+                typename traits::array_layout ,
+                typename traits::device_type ,
+                Kokkos::MemoryTraits<Kokkos::RandomAccess> > t_dev_const_randomread ;
+
+  /// \typedef t_host_const_randomread
+  /// \brief The type of a const, random-access View host mirror of
+  ///   \c t_dev_const_randomread.
+  typedef typename t_dev_const_randomread::HostMirror t_host_const_randomread;
+
+  //! The type of an unmanaged View on the device.
+  typedef View< typename traits::data_type ,
+                typename traits::array_layout ,
+                typename traits::device_type ,
+                MemoryUnmanaged> t_dev_um;
+
+  //! The type of an unmanaged View host mirror of \c t_dev_um.
+  typedef View< typename t_host::data_type ,
+                typename t_host::array_layout ,
+                typename t_host::device_type ,
+                MemoryUnmanaged> t_host_um;
+
+  //! The type of a const unmanaged View on the device.
+  typedef View< typename traits::const_data_type ,
+                typename traits::array_layout ,
+                typename traits::device_type ,
+                MemoryUnmanaged> t_dev_const_um;
+
+  //! The type of a const unmanaged View host mirror of \c t_dev_const_um.
+  typedef View<typename t_host::const_data_type,
+               typename t_host::array_layout,
+               typename t_host::device_type,
+               MemoryUnmanaged> t_host_const_um;
+
+  //! Const, random-access, unmanaged View. NOTE(review): built from \c t_host traits despite the t_dev name -- confirm.
+  typedef View< typename t_host::const_data_type ,
+                typename t_host::array_layout ,
+                typename t_host::device_type ,
+                Kokkos::MemoryTraits<Kokkos::Unmanaged|Kokkos::RandomAccess> > t_dev_const_randomread_um ;
+
+  /// \typedef t_host_const_randomread_um
+  /// \brief Host mirror of \c t_dev_const_randomread.  NOTE(review): not
+  ///   derived from \c t_dev_const_randomread_um -- confirm this is intended.
+  typedef typename t_dev_const_randomread::HostMirror t_host_const_randomread_um;
+
+  //@}
+  //! \name The two View instances.
+  //@{
+
+  t_dev d_view;
+  t_host h_view;
+
+  //@}
+  //! \name Counters to keep track of changes ("modified" flags)
+  //@{
+
+  View<unsigned int,LayoutLeft,typename t_host::execution_space> modified_device;
+  View<unsigned int,LayoutLeft,typename t_host::execution_space> modified_host;
+
+  //@}
+  //! \name Constructors
+  //@{
+
+  /// \brief Empty constructor.
+  ///
+  /// Both device and host View objects are constructed using their
+  /// default constructors.  The "modified" flags are both initialized
+  /// to "unmodified."
+  DualView () :
+    modified_device (View<unsigned int,LayoutLeft,typename t_host::execution_space> ("DualView::modified_device")),
+    modified_host (View<unsigned int,LayoutLeft,typename t_host::execution_space> ("DualView::modified_host"))
+  {}
+
+  /// \brief Constructor that allocates View objects on both host and device.
+  ///
+  /// This constructor works like the analogous constructor of View.
+  /// The first argument is a string label, which is entirely for your
+  /// benefit.  (Different DualView objects may have the same label if
+  /// you like.)  The arguments that follow are the dimensions of the
+  /// View objects.  For example, if the View has three dimensions,
+  /// the first three integer arguments will be nonzero, and you may
+  /// omit the integer arguments that follow.
+  DualView (const std::string& label,
+            const size_t n0 = 0,
+            const size_t n1 = 0,
+            const size_t n2 = 0,
+            const size_t n3 = 0,
+            const size_t n4 = 0,
+            const size_t n5 = 0,
+            const size_t n6 = 0,
+            const size_t n7 = 0)
+    : d_view (label, n0, n1, n2, n3, n4, n5, n6, n7)
+    , h_view (create_mirror_view (d_view)) // without UVM, host View mirrors
+    , modified_device (View<unsigned int,LayoutLeft,typename t_host::execution_space> ("DualView::modified_device"))
+    , modified_host (View<unsigned int,LayoutLeft,typename t_host::execution_space> ("DualView::modified_host"))
+  {}
+
+  //! Converting copy constructor (shallow copy): shares the views and modified counters of \c src.
+  template<class SS, class LS, class DS, class MS>
+  DualView (const DualView<SS,LS,DS,MS>& src) :
+    d_view (src.d_view),
+    h_view (src.h_view),
+    modified_device (src.modified_device),
+    modified_host (src.modified_host)
+  {}
+
+  //! Subview constructor: slices both views of \c src with the same arguments and shares its modified counters.
+  template< class SD, class S1 , class S2 , class S3
+          , class Arg0 , class ... Args >
+  DualView( const DualView<SD,S1,S2,S3> & src
+          , const Arg0 & arg0
+          , Args ... args
+          )
+    : d_view( Kokkos::subview( src.d_view , arg0 , args ... ) )
+    , h_view( Kokkos::subview( src.h_view , arg0 , args ... ) )
+    , modified_device (src.modified_device)
+    , modified_host (src.modified_host)
+    {}
+
+  /// \brief Create DualView from existing device and host View objects.
+  ///
+  /// This constructor assumes that the device and host View objects
+  /// are synchronized.  You, the caller, are responsible for making
+  /// sure this is the case before calling this constructor.  After
+  /// this constructor returns, you may use DualView's sync() and
+  /// modify() methods to ensure synchronization of the View objects.
+  ///
+  /// \param d_view_ Device View
+  /// \param h_view_ Host View (must have type t_host = t_dev::HostMirror)
+  DualView (const t_dev& d_view_, const t_host& h_view_) :
+    d_view (d_view_),
+    h_view (h_view_),
+    modified_device (View<unsigned int,LayoutLeft,typename t_host::execution_space> ("DualView::modified_device")),
+    modified_host (View<unsigned int,LayoutLeft,typename t_host::execution_space> ("DualView::modified_host"))
+  {
+#if ! KOKKOS_USING_EXP_VIEW
+    Impl::assert_shapes_are_equal (d_view.shape (), h_view.shape ());
+#else
+    if ( int(d_view.rank)     != int(h_view.rank) ||
+         d_view.dimension_0() != h_view.dimension_0() ||
+         d_view.dimension_1() != h_view.dimension_1() ||
+         d_view.dimension_2() != h_view.dimension_2() ||
+         d_view.dimension_3() != h_view.dimension_3() ||
+         d_view.dimension_4() != h_view.dimension_4() ||
+         d_view.dimension_5() != h_view.dimension_5() ||
+         d_view.dimension_6() != h_view.dimension_6() ||
+         d_view.dimension_7() != h_view.dimension_7() ||
+         d_view.stride_0()    != h_view.stride_0() ||
+         d_view.stride_1()    != h_view.stride_1() ||
+         d_view.stride_2()    != h_view.stride_2() ||
+         d_view.stride_3()    != h_view.stride_3() ||
+         d_view.stride_4()    != h_view.stride_4() ||
+         d_view.stride_5()    != h_view.stride_5() ||
+         d_view.stride_6()    != h_view.stride_6() ||
+         d_view.stride_7()    != h_view.stride_7() ||
+         d_view.span()        != h_view.span() ) {
+      Kokkos::Impl::throw_runtime_exception("DualView constructed with incompatible views");
+    }
+#endif
+  }
+
+  //@}
+  //! \name Methods for synchronizing, marking as modified, and getting Views.
+  //@{
+
+  /// \brief Return a View on a specific device \c Device.
+  ///
+  /// Please don't be afraid of the if_c expression in the return
+  /// value's type.  That just tells the method what the return type
+  /// should be: t_dev if the \c Device template parameter matches
+  /// this DualView's device type, else t_host.
+  ///
+  /// For example, suppose you create a DualView on Cuda, like this:
+  /// \code
+  /// typedef Kokkos::DualView<float, Kokkos::LayoutRight, Kokkos::Cuda> dual_view_type;
+  /// dual_view_type DV ("my dual view", 100);
+  /// \endcode
+  /// If you want to get the CUDA device View, do this:
+  /// \code
+  /// typename dual_view_type::t_dev cudaView = DV.view<Kokkos::Cuda> ();
+  /// \endcode
+  /// and if you want to get the host mirror of that View, do this:
+  /// \code
+  /// typedef typename Kokkos::HostSpace::execution_space host_device_type;
+  /// typename dual_view_type::t_host hostView = DV.view<host_device_type> ();
+  /// \endcode
+  template< class Device >
+  KOKKOS_INLINE_FUNCTION
+  const typename Impl::if_c<
+    Impl::is_same<typename t_dev::memory_space,
+                          typename Device::memory_space>::value,
+    t_dev,
+    t_host>::type& view () const
+  {
+    return Impl::if_c<
+      Impl::is_same<
+        typename t_dev::memory_space,
+        typename Device::memory_space>::value,
+      t_dev,
+      t_host >::select (d_view , h_view);
+  }
+
+  /// \brief Update data on device or host only if data in the other
+  ///   space has been marked as modified.
+  ///
+  /// If \c Device is the same as this DualView's device type, then
+  /// copy data from host to device.  Otherwise, copy data from device
+  /// to host.  In either case, only copy if the source of the copy
+  /// has been modified.
+  ///
+  /// This is a one-way synchronization only.  If the target of the
+  /// copy has been modified, those modifications are discarded.  A copy
+  /// also resets both modified flags.  When host and device share one
+  /// memory space, both execution spaces are fenced afterwards.
+  ///
+  /// \note This method doesn't know on its own whether you modified
+  ///   the data in either View.  You must manually mark modified data
+  ///   as modified, by calling the modify() method with the
+  ///   appropriate template parameter.
+  template<class Device>
+  void sync( const typename Impl::enable_if<
+        ( Impl::is_same< typename traits::data_type , typename traits::non_const_data_type>::value) ||
+        ( Impl::is_same< Device , int>::value)
+        , int >::type& = 0)
+  {
+    const unsigned int dev =
+      Impl::if_c<
+        Impl::is_same<
+          typename t_dev::memory_space,
+          typename Device::memory_space>::value ,
+        unsigned int,
+        unsigned int>::select (1, 0);
+
+    if (dev) { // if Device is the same as DualView's device type
+      if ((modified_host () > 0) && (modified_host () >= modified_device ())) {
+        deep_copy (d_view, h_view);
+        modified_host() = modified_device() = 0;
+      }
+    } else { // hopefully Device is the same as DualView's host type
+      if ((modified_device () > 0) && (modified_device () >= modified_host ())) {
+        deep_copy (h_view, d_view);
+        modified_host() = modified_device() = 0;
+      }
+    }
+    if(Impl::is_same<typename t_host::memory_space,typename t_dev::memory_space>::value) {
+      t_dev::execution_space::fence();
+      t_host::execution_space::fence();
+    }
+  }
+  /// \brief sync() overload chosen when the data type is const: never copies, throws if a copy would be needed.
+  template<class Device>
+  void sync ( const typename Impl::enable_if<
+      ( ! Impl::is_same< typename traits::data_type , typename traits::non_const_data_type>::value ) ||
+      ( Impl::is_same< Device , int>::value)
+      , int >::type& = 0 )
+  {
+    const unsigned int dev =
+      Impl::if_c<
+        Impl::is_same<
+          typename t_dev::memory_space,
+          typename Device::memory_space>::value,
+        unsigned int,
+        unsigned int>::select (1, 0);
+    if (dev) { // if Device is the same as DualView's device type
+      if ((modified_host () > 0) && (modified_host () >= modified_device ())) {
+        Impl::throw_runtime_exception("Calling sync on a DualView with a const datatype.");
+      }
+    } else { // hopefully Device is the same as DualView's host type
+      if ((modified_device () > 0) && (modified_device () >= modified_host ())) {
+        Impl::throw_runtime_exception("Calling sync on a DualView with a const datatype.");
+      }
+    }
+  }
+  /// \brief Return true if sync<Device>() would copy, i.e. the other side's counter is nonzero and not older.
+  template<class Device>
+  bool need_sync() const
+  {
+    const unsigned int dev =
+      Impl::if_c<
+        Impl::is_same<
+          typename t_dev::memory_space,
+          typename Device::memory_space>::value ,
+        unsigned int,
+        unsigned int>::select (1, 0);
+
+    if (dev) { // if Device is the same as DualView's device type
+      if ((modified_host () > 0) && (modified_host () >= modified_device ())) {
+        return true;
+      }
+    } else { // hopefully Device is the same as DualView's host type
+      if ((modified_device () > 0) && (modified_device () >= modified_host ())) {
+        return true;
+      }
+    }
+    return false;
+  }
+  /// \brief Mark data as modified on the given device \c Device.
+  ///
+  /// If \c Device is the same as this DualView's device type, then
+  /// mark the device's data as modified.  Otherwise, mark the host's
+  /// data as modified.
+  template<class Device>
+  void modify () {
+    const unsigned int dev =
+      Impl::if_c<
+        Impl::is_same<
+          typename t_dev::memory_space,
+          typename Device::memory_space>::value,
+        unsigned int,
+        unsigned int>::select (1, 0);
+
+    if (dev) { // if Device is the same as DualView's device type
+      // Set the device counter to one more than the larger of the two counters.
+      modified_device () = (modified_device () > modified_host () ?
+                            modified_device () : modified_host ()) + 1;
+    } else { // hopefully Device is the same as DualView's host type
+      // Set the host counter to one more than the larger of the two counters.
+      modified_host () = (modified_device () > modified_host () ?
+                          modified_device () : modified_host ())  + 1;
+    }
+  }
+
+  //@}
+  //! \name Methods for reallocating or resizing the View objects.
+  //@{
+
+  /// \brief Reallocate both View objects.
+  ///
+  /// This discards any existing contents of the objects, and resets
+  /// their modified flags.  It does <i>not</i> copy the old contents
+  /// of either View into the new View objects.
+  void realloc( const size_t n0 = 0 ,
+           const size_t n1 = 0 ,
+           const size_t n2 = 0 ,
+           const size_t n3 = 0 ,
+           const size_t n4 = 0 ,
+           const size_t n5 = 0 ,
+           const size_t n6 = 0 ,
+           const size_t n7 = 0 ) {
+    ::Kokkos::realloc(d_view,n0,n1,n2,n3,n4,n5,n6,n7);
+     h_view = create_mirror_view( d_view );
+
+     /* Reset dirty flags */
+     modified_device() = modified_host() = 0;
+  }
+
+  /// \brief Resize both views, copying old contents into new if necessary.
+  ///
+  /// This method only copies the old contents into the new View
+  /// objects for the device which was last marked as modified.
+  void resize( const size_t n0 = 0 ,
+           const size_t n1 = 0 ,
+           const size_t n2 = 0 ,
+           const size_t n3 = 0 ,
+           const size_t n4 = 0 ,
+           const size_t n5 = 0 ,
+           const size_t n6 = 0 ,
+           const size_t n7 = 0 ) {
+   if(modified_device() >= modified_host()) {
+     /* Resize on Device */
+     ::Kokkos::resize(d_view,n0,n1,n2,n3,n4,n5,n6,n7);
+     h_view = create_mirror_view( d_view );
+
+     /* Mark Device copy as modified */
+     modified_device() = modified_device()+1;
+
+   } else {
+     /* Realloc on Device */
+
+     ::Kokkos::realloc(d_view,n0,n1,n2,n3,n4,n5,n6,n7);
+     t_host temp_view = create_mirror_view( d_view );
+
+     /* Remap on Host. NOTE(review): temp_view has the new extents, h_view still the old ones -- confirm deep_copy accepts that. */
+     Kokkos::deep_copy( temp_view , h_view );
+
+     h_view = temp_view;
+
+     /* Mark Host copy as modified */
+     modified_host() = modified_host()+1;
+   }
+  }
+
+  //@}
+  //! \name Methods for getting capacity, stride, or dimension(s).
+  //@{
+
+  //! The allocation size (same as Kokkos::View::capacity).
+  size_t capacity() const {
+#if KOKKOS_USING_EXP_VIEW
+    return d_view.span();
+#else
+    return d_view.capacity();
+#endif
+  }
+
+  //! Get stride(s) for each dimension.
+  template< typename iType>
+  void stride(iType* stride_) const {
+    d_view.stride(stride_);
+  }
+
+  //! Return the size of dimension 0 of the device View.
+  size_t dimension_0() const {return d_view.dimension_0();}
+  //! Return the size of dimension 1 of the device View.
+  size_t dimension_1() const {return d_view.dimension_1();}
+  //! Return the size of dimension 2 of the device View.
+  size_t dimension_2() const {return d_view.dimension_2();}
+  //! Return the size of dimension 3 of the device View.
+  size_t dimension_3() const {return d_view.dimension_3();}
+  //! Return the size of dimension 4 of the device View.
+  size_t dimension_4() const {return d_view.dimension_4();}
+  //! Return the size of dimension 5 of the device View.
+  size_t dimension_5() const {return d_view.dimension_5();}
+  //! Return the size of dimension 6 of the device View.
+  size_t dimension_6() const {return d_view.dimension_6();}
+  //! Return the size of dimension 7 of the device View.
+  size_t dimension_7() const {return d_view.dimension_7();}
+
+  //@}
+};
+
+} // namespace Kokkos
+
+//----------------------------------------------------------------------------
+//----------------------------------------------------------------------------
+//
+// Partial specializations of Kokkos::subview() for DualView objects.
+//
+
+#if KOKKOS_USING_EXP_VIEW
+
+namespace Kokkos {
+namespace Impl {
+
+template< class D, class A1, class A2, class A3, class ... Args >
+struct DualViewSubview {
+  // Metafunction: maps a source DualView<D,A1,A2,A3> plus subview argument types Args... to a DualView type.
+  typedef typename Kokkos::Experimental::Impl::ViewMapping
+    < void
+    , Kokkos::ViewTraits< D, A1, A2, A3 >
+    , Args ...
+    >::traits_type dst_traits ;
+  // Rebuild a DualView from the data type, layout, device and memory traits that ViewMapping reported.
+  typedef Kokkos::DualView
+    < typename dst_traits::data_type
+    , typename dst_traits::array_layout
+    , typename dst_traits::device_type
+    , typename dst_traits::memory_traits
+    > type ;
+};
+
+} /* namespace Impl */
+
+
+template< class D , class A1 , class A2 , class A3 , class ... Args >
+typename Impl::DualViewSubview<D,A1,A2,A3,Args...>::type
+subview( const DualView<D,A1,A2,A3> & src , Args ... args )
+{ // Build the result by handing src and the slice arguments to a DualView constructor.
+  typedef typename Impl::DualViewSubview<D,A1,A2,A3,Args...>::type dual_subview_type ;
+  return dual_subview_type( src , args ... );
+}
+
+} /* namespace Kokkos */
+
+#else
+
+//----------------------------------------------------------------------------
+//----------------------------------------------------------------------------
+//
+// Partial specializations of Kokkos::subview() for DualView objects.
+//
+
+namespace Kokkos {
+namespace Impl {
+
+template< class SrcDataType , class SrcArg1Type , class SrcArg2Type , class SrcArg3Type
+        , class SubArg0_type , class SubArg1_type , class SubArg2_type , class SubArg3_type
+        , class SubArg4_type , class SubArg5_type , class SubArg6_type , class SubArg7_type
+        >
+struct ViewSubview< DualView< SrcDataType , SrcArg1Type , SrcArg2Type , SrcArg3Type  >
+                  , SubArg0_type , SubArg1_type , SubArg2_type , SubArg3_type
+                  , SubArg4_type , SubArg5_type , SubArg6_type , SubArg7_type >
+{
+private:
+  // Computes, at compile time, the DualView type that subview() returns for a source DualView and its arguments.
+  typedef DualView< SrcDataType , SrcArg1Type , SrcArg2Type , SrcArg3Type >  SrcViewType ;
+  // Vn is 1 when the n-th subview argument type is void, else 0.
+  enum { V0 = Impl::is_same< SubArg0_type , void >::value ? 1 : 0 };
+  enum { V1 = Impl::is_same< SubArg1_type , void >::value ? 1 : 0 };
+  enum { V2 = Impl::is_same< SubArg2_type , void >::value ? 1 : 0 };
+  enum { V3 = Impl::is_same< SubArg3_type , void >::value ? 1 : 0 };
+  enum { V4 = Impl::is_same< SubArg4_type , void >::value ? 1 : 0 };
+  enum { V5 = Impl::is_same< SubArg5_type , void >::value ? 1 : 0 };
+  enum { V6 = Impl::is_same< SubArg6_type , void >::value ? 1 : 0 };
+  enum { V7 = Impl::is_same< SubArg7_type , void >::value ? 1 : 0 };
+
+  // The source view rank must be equal to the input argument rank
+  // Once a void argument is encountered all subsequent arguments must be void.
+  enum { InputRank =
+    Impl::StaticAssert<( SrcViewType::rank ==
+                         ( V0 ? 0 : (
+                           V1 ? 1 : (
+                           V2 ? 2 : (
+                           V3 ? 3 : (
+                           V4 ? 4 : (
+                           V5 ? 5 : (
+                           V6 ? 6 : (
+                           V7 ? 7 : 8 ))))))) ))
+                       &&
+                       ( SrcViewType::rank ==
+                         ( 8 - ( V0 + V1 + V2 + V3 + V4 + V5 + V6 + V7 ) ) )
+    >::value ? SrcViewType::rank : 0 };
+  // Rn is 1 when ViewOffsetRange reports the n-th argument as a range; the R flags are summed into OutputRank.
+  enum { R0 = Impl::ViewOffsetRange< SubArg0_type >::is_range ? 1 : 0 };
+  enum { R1 = Impl::ViewOffsetRange< SubArg1_type >::is_range ? 1 : 0 };
+  enum { R2 = Impl::ViewOffsetRange< SubArg2_type >::is_range ? 1 : 0 };
+  enum { R3 = Impl::ViewOffsetRange< SubArg3_type >::is_range ? 1 : 0 };
+  enum { R4 = Impl::ViewOffsetRange< SubArg4_type >::is_range ? 1 : 0 };
+  enum { R5 = Impl::ViewOffsetRange< SubArg5_type >::is_range ? 1 : 0 };
+  enum { R6 = Impl::ViewOffsetRange< SubArg6_type >::is_range ? 1 : 0 };
+  enum { R7 = Impl::ViewOffsetRange< SubArg7_type >::is_range ? 1 : 0 };
+
+  enum { OutputRank = unsigned(R0) + unsigned(R1) + unsigned(R2) + unsigned(R3)
+                    + unsigned(R4) + unsigned(R5) + unsigned(R6) + unsigned(R7) };
+
+  // Range flag of the last input dimension (index InputRank-1); 0 when InputRank is 0.
+  enum { R0_rev = 0 == InputRank ? 0u : (
+                  1 == InputRank ? unsigned(R0) : (
+                  2 == InputRank ? unsigned(R1) : (
+                  3 == InputRank ? unsigned(R2) : (
+                  4 == InputRank ? unsigned(R3) : (
+                  5 == InputRank ? unsigned(R4) : (
+                  6 == InputRank ? unsigned(R5) : (
+                  7 == InputRank ? unsigned(R6) : unsigned(R7) ))))))) };
+
+  typedef typename SrcViewType::array_layout  SrcViewLayout ;
+
+  // Choose array layout, attempting to preserve original layout if at all possible.
+  typedef typename Impl::if_c<
+     ( // Same Layout IF
+       // OutputRank 0
+       ( OutputRank == 0 )
+       ||
+       // OutputRank 1 or 2, InputLayout Left, Interval 0
+       // because single stride one or second index has a stride.
+       ( OutputRank <= 2 && R0 && Impl::is_same<SrcViewLayout,LayoutLeft>::value )
+       ||
+       // OutputRank 1 or 2, InputLayout Right, Interval [InputRank-1]
+       // because single stride one or second index has a stride.
+       ( OutputRank <= 2 && R0_rev && Impl::is_same<SrcViewLayout,LayoutRight>::value )
+     ), SrcViewLayout , Kokkos::LayoutStride >::type OutputViewLayout ;
+
+  // Choose data type as a purely dynamic rank array to accommodate a runtime range.
+  typedef typename Impl::if_c< OutputRank == 0 , typename SrcViewType::value_type ,
+          typename Impl::if_c< OutputRank == 1 , typename SrcViewType::value_type *,
+          typename Impl::if_c< OutputRank == 2 , typename SrcViewType::value_type **,
+          typename Impl::if_c< OutputRank == 3 , typename SrcViewType::value_type ***,
+          typename Impl::if_c< OutputRank == 4 , typename SrcViewType::value_type ****,
+          typename Impl::if_c< OutputRank == 5 , typename SrcViewType::value_type *****,
+          typename Impl::if_c< OutputRank == 6 , typename SrcViewType::value_type ******,
+          typename Impl::if_c< OutputRank == 7 , typename SrcViewType::value_type *******,
+                                                 typename SrcViewType::value_type ********
+  >::type >::type >::type >::type >::type >::type >::type >::type  OutputData ;
+
+  // Choose space.
+  // If the source view's template arg1 or arg2 is a space then use it,
+  // otherwise use the source view's execution space.
+
+  typedef typename Impl::if_c< Impl::is_space< SrcArg1Type >::value , SrcArg1Type ,
+          typename Impl::if_c< Impl::is_space< SrcArg2Type >::value , SrcArg2Type , typename SrcViewType::execution_space
+  >::type >::type OutputSpace ;
+
+public:
+
+  // If keeping the layout then match non-data type arguments
+  // else keep execution space and memory traits.
+  typedef typename
+    Impl::if_c< Impl::is_same< SrcViewLayout , OutputViewLayout >::value
+              , Kokkos::DualView< OutputData , SrcArg1Type , SrcArg2Type , SrcArg3Type >
+              , Kokkos::DualView< OutputData , OutputViewLayout , OutputSpace
+                            , typename SrcViewType::memory_traits >
+              >::type  type ;
+};
+
+} /* namespace Impl */
+} /* namespace Kokkos */
+
+namespace Kokkos {
+
+template< class D , class A1 , class A2 , class A3 ,
+          class ArgType0 >
+typename Impl::ViewSubview< DualView<D,A1,A2,A3>
+                          , ArgType0 , void , void , void
+                          , void , void , void , void
+                          >::type
+subview( const DualView<D,A1,A2,A3> & src ,
+         const ArgType0 & arg0 )
+{
+  // One-argument DualView subview: slice the device and host views
+  // identically and share the source's modified counters.
+  typedef typename Impl::ViewSubview< DualView<D,A1,A2,A3>
+                                    , ArgType0 , void , void , void
+                                    , void , void , void , void
+                                    >::type result_type ;
+  result_type result;
+  result.modified_host   = src.modified_host;
+  result.modified_device = src.modified_device;
+  result.h_view = subview(src.h_view,arg0);
+  result.d_view = subview(src.d_view,arg0);
+  return result;
+}
+
+
+template< class D , class A1 , class A2 , class A3 ,
+          class ArgType0 , class ArgType1 >
+typename Impl::ViewSubview< DualView<D,A1,A2,A3>
+                          , ArgType0 , ArgType1 , void , void
+                          , void , void , void , void
+                          >::type
+subview( const DualView<D,A1,A2,A3> & src ,
+         const ArgType0 & arg0 ,
+         const ArgType1 & arg1 )
+{
+  // Two-argument DualView subview: slice the device and host views
+  // identically and share the source's modified counters.
+  typedef typename Impl::ViewSubview< DualView<D,A1,A2,A3>
+                                    , ArgType0 , ArgType1 , void , void
+                                    , void , void , void , void
+                                    >::type result_type ;
+  result_type result;
+  result.modified_host   = src.modified_host;
+  result.modified_device = src.modified_device;
+  result.h_view = subview(src.h_view,arg0,arg1);
+  result.d_view = subview(src.d_view,arg0,arg1);
+  return result;
+}
+
+template< class D , class A1 , class A2 , class A3 ,
+          class ArgType0 , class ArgType1 , class ArgType2 >
+typename Impl::ViewSubview< DualView<D,A1,A2,A3>
+                          , ArgType0 , ArgType1 , ArgType2 , void
+                          , void , void , void , void
+                          >::type
+subview( const DualView<D,A1,A2,A3> & src ,
+         const ArgType0 & arg0 ,
+         const ArgType1 & arg1 ,
+         const ArgType2 & arg2 )
+{
+  // Three-argument DualView subview: slice the device and host views
+  // identically and share the source's modified counters.
+  typedef typename Impl::ViewSubview< DualView<D,A1,A2,A3>
+                                    , ArgType0 , ArgType1 , ArgType2 , void
+                                    , void , void , void , void
+                                    >::type result_type ;
+  result_type result;
+  result.modified_host   = src.modified_host;
+  result.modified_device = src.modified_device;
+  result.h_view = subview(src.h_view,arg0,arg1,arg2);
+  result.d_view = subview(src.d_view,arg0,arg1,arg2);
+  return result;
+}
+
+template< class D , class A1 , class A2 , class A3 ,
+          class ArgType0 , class ArgType1 , class ArgType2 , class ArgType3 >
+typename Impl::ViewSubview< DualView<D,A1,A2,A3>
+                          , ArgType0 , ArgType1 , ArgType2 , ArgType3
+                          , void , void , void , void
+                          >::type
+subview( const DualView<D,A1,A2,A3> & src ,
+         const ArgType0 & arg0 ,
+         const ArgType1 & arg1 ,
+         const ArgType2 & arg2 ,
+         const ArgType3 & arg3 )
+{
+  // Four-argument DualView subview: slice the device and host views
+  // identically and share the source's modified counters.
+  typedef typename Impl::ViewSubview< DualView<D,A1,A2,A3>
+                                    , ArgType0 , ArgType1 , ArgType2 , ArgType3
+                                    , void , void , void , void
+                                    >::type result_type ;
+  result_type result;
+  result.modified_host   = src.modified_host;
+  result.modified_device = src.modified_device;
+  result.h_view = subview(src.h_view,arg0,arg1,arg2,arg3);
+  result.d_view = subview(src.d_view,arg0,arg1,arg2,arg3);
+  return result;
+}
+
+/// Applies the five-argument subview() to src.d_view and to src.h_view and
+/// returns both in one object, with src's modification members assigned to it.
+template< class D , class A1 , class A2 , class A3
+        , class ArgType0 , class ArgType1 , class ArgType2 , class ArgType3
+        , class ArgType4 >
+typename Impl::ViewSubview< DualView<D,A1,A2,A3> ,
+                            ArgType0 , ArgType1 , ArgType2 , ArgType3 ,
+                            ArgType4 , void , void , void >::type
+subview( const DualView<D,A1,A2,A3> & src
+       , const ArgType0 & arg0
+       , const ArgType1 & arg1
+       , const ArgType2 & arg2
+       , const ArgType3 & arg3
+       , const ArgType4 & arg4 )
+{
+  typedef typename Impl::ViewSubview< DualView<D,A1,A2,A3> ,
+                                      ArgType0 , ArgType1 , ArgType2 , ArgType3 ,
+                                      ArgType4 , void , void , void >::type
+    result_type ;
+  result_type result ;
+  // Both halves are sliced with the same arguments.
+  result.d_view = subview( src.d_view , arg0 , arg1 , arg2 , arg3 , arg4 );
+  result.h_view = subview( src.h_view , arg0 , arg1 , arg2 , arg3 , arg4 );
+  result.modified_device = src.modified_device ;
+  result.modified_host   = src.modified_host ;
+  return result ;
+}
+
+/// Applies the six-argument subview() to src.d_view and to src.h_view and
+/// returns both in one object, with src's modification members assigned to it.
+template< class D , class A1 , class A2 , class A3
+        , class ArgType0 , class ArgType1 , class ArgType2 , class ArgType3
+        , class ArgType4 , class ArgType5 >
+typename Impl::ViewSubview< DualView<D,A1,A2,A3> ,
+                            ArgType0 , ArgType1 , ArgType2 , ArgType3 ,
+                            ArgType4 , ArgType5 , void , void >::type
+subview( const DualView<D,A1,A2,A3> & src
+       , const ArgType0 & arg0
+       , const ArgType1 & arg1
+       , const ArgType2 & arg2
+       , const ArgType3 & arg3
+       , const ArgType4 & arg4
+       , const ArgType5 & arg5 )
+{
+  typedef typename Impl::ViewSubview< DualView<D,A1,A2,A3> ,
+                                      ArgType0 , ArgType1 , ArgType2 , ArgType3 ,
+                                      ArgType4 , ArgType5 , void , void >::type
+    result_type ;
+  result_type result ;
+  // Both halves are sliced with the same arguments.
+  result.d_view = subview( src.d_view , arg0 , arg1 , arg2 , arg3 , arg4 , arg5 );
+  result.h_view = subview( src.h_view , arg0 , arg1 , arg2 , arg3 , arg4 , arg5 );
+  result.modified_device = src.modified_device ;
+  result.modified_host   = src.modified_host ;
+  return result ;
+}
+
+/// Applies the seven-argument subview() to src.d_view and to src.h_view and
+/// returns both in one object, with src's modification members assigned to it.
+template< class D , class A1 , class A2 , class A3
+        , class ArgType0 , class ArgType1 , class ArgType2 , class ArgType3
+        , class ArgType4 , class ArgType5 , class ArgType6 >
+typename Impl::ViewSubview< DualView<D,A1,A2,A3> ,
+                            ArgType0 , ArgType1 , ArgType2 , ArgType3 ,
+                            ArgType4 , ArgType5 , ArgType6 , void >::type
+subview( const DualView<D,A1,A2,A3> & src
+       , const ArgType0 & arg0
+       , const ArgType1 & arg1
+       , const ArgType2 & arg2
+       , const ArgType3 & arg3
+       , const ArgType4 & arg4
+       , const ArgType5 & arg5
+       , const ArgType6 & arg6 )
+{
+  typedef typename Impl::ViewSubview< DualView<D,A1,A2,A3> ,
+                                      ArgType0 , ArgType1 , ArgType2 , ArgType3 ,
+                                      ArgType4 , ArgType5 , ArgType6 , void >::type
+    result_type ;
+  result_type result ;
+  // Both halves are sliced with the same arguments.
+  result.d_view = subview( src.d_view , arg0 , arg1 , arg2 , arg3 , arg4 , arg5 , arg6 );
+  result.h_view = subview( src.h_view , arg0 , arg1 , arg2 , arg3 , arg4 , arg5 , arg6 );
+  result.modified_device = src.modified_device ;
+  result.modified_host   = src.modified_host ;
+  return result ;
+}
+
+/// Applies the eight-argument subview() to src.d_view and to src.h_view and
+/// returns both in one object, with src's modification members assigned to it.
+template< class D , class A1 , class A2 , class A3
+        , class ArgType0 , class ArgType1 , class ArgType2 , class ArgType3
+        , class ArgType4 , class ArgType5 , class ArgType6 , class ArgType7 >
+typename Impl::ViewSubview< DualView<D,A1,A2,A3> ,
+                            ArgType0 , ArgType1 , ArgType2 , ArgType3 ,
+                            ArgType4 , ArgType5 , ArgType6 , ArgType7 >::type
+subview( const DualView<D,A1,A2,A3> & src
+       , const ArgType0 & arg0
+       , const ArgType1 & arg1
+       , const ArgType2 & arg2
+       , const ArgType3 & arg3
+       , const ArgType4 & arg4
+       , const ArgType5 & arg5
+       , const ArgType6 & arg6
+       , const ArgType7 & arg7 )
+{
+  typedef typename Impl::ViewSubview< DualView<D,A1,A2,A3> ,
+                                      ArgType0 , ArgType1 , ArgType2 , ArgType3 ,
+                                      ArgType4 , ArgType5 , ArgType6 , ArgType7 >::type
+    result_type ;
+  result_type result ;
+  // Both halves are sliced with the same arguments.
+  result.d_view = subview( src.d_view , arg0 , arg1 , arg2 , arg3 , arg4 , arg5 , arg6 , arg7 );
+  result.h_view = subview( src.h_view , arg0 , arg1 , arg2 , arg3 , arg4 , arg5 , arg6 , arg7 );
+  result.modified_device = src.modified_device ;
+  result.modified_host   = src.modified_host ;
+  return result ;
+}
+
+} // namespace Kokkos
+
+#endif /* KOKKOS_USING_EXP_VIEW */
+
+//----------------------------------------------------------------------------
+//----------------------------------------------------------------------------
+
+namespace Kokkos {
+
+//
+// Overloads of Kokkos::deep_copy() for DualView objects.
+//
+
+template< class DT , class DL , class DD , class DM
+        , class ST , class SL , class SD , class SM >
+void
+deep_copy (DualView<DT,DL,DD,DM> dst, // trust me, this must not be a reference
+           const DualView<ST,SL,SD,SM>& src )
+{
+  typedef DualView<DT,DL,DD,DM> dst_type ;
+  // Use the device side when src.modified_device() >= src.modified_host(), else the host side.
+  const bool use_device = src.modified_device () >= src.modified_host ();
+  if (use_device) { deep_copy (dst.d_view, src.d_view); }
+  else            { deep_copy (dst.h_view, src.h_view); }
+  if (use_device) { dst.template modify<typename dst_type::device_type> (); }
+  else            { dst.template modify<typename dst_type::host_mirror_space> (); }
+}
+
+template< class ExecutionSpace
+        , class DT , class DL , class DD , class DM
+        , class ST , class SL , class SD , class SM >
+void
+deep_copy (const ExecutionSpace& exec ,
+           DualView<DT,DL,DD,DM> dst, // trust me, this must not be a reference
+           const DualView<ST,SL,SD,SM>& src )
+{
+  typedef DualView<DT,DL,DD,DM> dst_type ;
+  // Use the device side when src.modified_device() >= src.modified_host(), else the host side.
+  const bool use_device = src.modified_device () >= src.modified_host ();
+  if (use_device) { deep_copy (exec, dst.d_view, src.d_view); }
+  else            { deep_copy (exec, dst.h_view, src.h_view); }
+  if (use_device) { dst.template modify<typename dst_type::device_type> (); }
+  else            { dst.template modify<typename dst_type::host_mirror_space> (); }
+}
+
+} // namespace Kokkos
+
+#endif
diff --git a/lib/kokkos/containers/src/Kokkos_DynRankView.hpp b/lib/kokkos/containers/src/Kokkos_DynRankView.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..f72277700ad87cd0fe9cb1cdee4c2d34ff69ab80
--- /dev/null
+++ b/lib/kokkos/containers/src/Kokkos_DynRankView.hpp
@@ -0,0 +1,1834 @@
+/*
+//@HEADER
+// ************************************************************************
+// 
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+// 
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+// 
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact  H. Carter Edwards (hcedwar@sandia.gov)
+// 
+// ************************************************************************
+//@HEADER
+*/
+
+/// \file Kokkos_DynRankView.hpp
+/// \brief Declaration and definition of Kokkos::Experimental::DynRankView.
+///
+/// This header file declares and defines Kokkos::Experimental::DynRankView and its
+/// related nonmember functions.
+/*
+ *   Changes from View
+ *   1. The rank of the DynRankView is returned by the method rank()
+ *   2. Max rank of a DynRankView is 7
+ *   3. subview name is subdynrankview
+ *   4. Every subdynrankview is returned with LayoutStride
+ *
+ *   NEW: Redesigned DynRankView
+ *   5. subview function name now available
+ *   6. Copy and Copy-Assign View to DynRankView
+ *   7. deep_copy between Views and DynRankViews
+ *   8. rank( view ); returns the rank of View or DynRankView
+ */
+
+#ifndef KOKKOS_DYNRANKVIEW_HPP
+#define KOKKOS_DYNRANKVIEW_HPP
+
+#include <Kokkos_Core.hpp>
+#include <impl/Kokkos_Error.hpp>
+#include <type_traits>
+
+namespace Kokkos {
+namespace Experimental {
+
+template< typename DataType , class ... Properties >
+class DynRankView;  //forward declare
+
+namespace Impl {
+
+template <typename Specialize>
+struct DynRankDimTraits {
+
+  // Marker value for a dimension argument that was not supplied.
+  enum : size_t{unspecified = ~size_t(0)};
+
+  // Compute the rank of the view: one past the highest index in N0..N6 that is
+  // not 'unspecified', or 0 if there is none. N7 is accepted but not examined.
+  KOKKOS_INLINE_FUNCTION
+  static size_t computeRank( const size_t N0
+                           , const size_t N1
+                           , const size_t N2
+                           , const size_t N3
+                           , const size_t N4
+                           , const size_t N5
+                           , const size_t N6
+                           , const size_t N7 )
+  {
+    if ( N6 != unspecified ) { return 7 ; }
+    if ( N5 != unspecified ) { return 6 ; }
+    if ( N4 != unspecified ) { return 5 ; }
+    if ( N3 != unspecified ) { return 4 ; }
+    if ( N2 != unspecified ) { return 3 ; }
+    if ( N1 != unspecified ) { return 2 ; }
+    if ( N0 != unspecified ) { return 1 ; }
+    return 0 ;
+  }
+
+  // Compute the rank of the view from the eight entries of layout.dimension[].
+  template <typename Layout>
+  KOKKOS_INLINE_FUNCTION
+  static size_t computeRank( const Layout& layout )
+  {
+    return computeRank( layout.dimension[0] , layout.dimension[1]
+                      , layout.dimension[2] , layout.dimension[3]
+                      , layout.dimension[4] , layout.dimension[5]
+                      , layout.dimension[6] , layout.dimension[7] );
+  }
+
+  // Create a layout in which every 'unspecified' extent is replaced by 1.
+  // Non-strided Layout (LayoutLeft or LayoutRight)
+  template <typename Layout>
+  KOKKOS_INLINE_FUNCTION
+  static typename std::enable_if<
+    ( std::is_same<Layout , Kokkos::LayoutRight>::value ||
+      std::is_same<Layout , Kokkos::LayoutLeft>::value ) , Layout >::type
+  createLayout( const Layout& layout )
+  {
+    return Layout( layout.dimension[0] != unspecified ? layout.dimension[0] : 1 ,
+                   layout.dimension[1] != unspecified ? layout.dimension[1] : 1 ,
+                   layout.dimension[2] != unspecified ? layout.dimension[2] : 1 ,
+                   layout.dimension[3] != unspecified ? layout.dimension[3] : 1 ,
+                   layout.dimension[4] != unspecified ? layout.dimension[4] : 1 ,
+                   layout.dimension[5] != unspecified ? layout.dimension[5] : 1 ,
+                   layout.dimension[6] != unspecified ? layout.dimension[6] : 1 ,
+                   layout.dimension[7] != unspecified ? layout.dimension[7] : 1 );
+  }
+
+  // LayoutStride: extents are substituted the same way; strides pass through.
+  template <typename Layout>
+  KOKKOS_INLINE_FUNCTION
+  static typename std::enable_if<
+    std::is_same<Layout , Kokkos::LayoutStride>::value , Layout >::type
+  createLayout( const Layout& layout )
+  {
+    return Layout( layout.dimension[0] != unspecified ? layout.dimension[0] : 1 ,
+                   layout.stride[0] ,
+                   layout.dimension[1] != unspecified ? layout.dimension[1] : 1 ,
+                   layout.stride[1] ,
+                   layout.dimension[2] != unspecified ? layout.dimension[2] : 1 ,
+                   layout.stride[2] ,
+                   layout.dimension[3] != unspecified ? layout.dimension[3] : 1 ,
+                   layout.stride[3] ,
+                   layout.dimension[4] != unspecified ? layout.dimension[4] : 1 ,
+                   layout.stride[4] ,
+                   layout.dimension[5] != unspecified ? layout.dimension[5] : 1 ,
+                   layout.stride[5] ,
+                   layout.dimension[6] != unspecified ? layout.dimension[6] : 1 ,
+                   layout.stride[6] ,
+                   layout.dimension[7] != unspecified ? layout.dimension[7] : 1 ,
+                   layout.stride[7] );
+  }
+
+  // Create a view from 'arg' and N0..N7, with each 'unspecified' N replaced by 1.
+  // This is only necessary because the shmem constructor doesn't take a layout.
+  template <typename ViewType, typename ViewArg>
+  static ViewType createView( const ViewArg& arg
+                            , const size_t N0
+                            , const size_t N1
+                            , const size_t N2
+                            , const size_t N3
+                            , const size_t N4
+                            , const size_t N5
+                            , const size_t N6
+                            , const size_t N7 )
+  {
+    return ViewType( arg ,
+                     N0 != unspecified ? N0 : 1 ,
+                     N1 != unspecified ? N1 : 1 ,
+                     N2 != unspecified ? N2 : 1 ,
+                     N3 != unspecified ? N3 : 1 ,
+                     N4 != unspecified ? N4 : 1 ,
+                     N5 != unspecified ? N5 : 1 ,
+                     N6 != unspecified ? N6 : 1 ,
+                     N7 != unspecified ? N7 : 1 );
+  }
+};
+
+  // Non-strided Layout: extents at index >= dynrank become ~size_t(0).
+  template <typename Layout , typename iType>
+  KOKKOS_INLINE_FUNCTION
+  static typename std::enable_if<
+    ( std::is_same<Layout , Kokkos::LayoutRight>::value ||
+      std::is_same<Layout , Kokkos::LayoutLeft>::value ) &&
+    std::is_integral<iType>::value , Layout >::type
+  reconstructLayout( const Layout& layout , iType dynrank )
+  {
+    const size_t unset = ~size_t(0) ;
+    return Layout( dynrank > 0 ? layout.dimension[0] : unset , dynrank > 1 ? layout.dimension[1] : unset ,
+                   dynrank > 2 ? layout.dimension[2] : unset , dynrank > 3 ? layout.dimension[3] : unset ,
+                   dynrank > 4 ? layout.dimension[4] : unset , dynrank > 5 ? layout.dimension[5] : unset ,
+                   dynrank > 6 ? layout.dimension[6] : unset , dynrank > 7 ? layout.dimension[7] : unset );
+  }
+
+  // LayoutStride: for each index below dynrank the extent and stride of
+  // 'layout' are kept; for every other index the extent becomes ~size_t(0)
+  // and the stride becomes 0.
+  template <typename Layout , typename iType>
+  KOKKOS_INLINE_FUNCTION
+  static typename std::enable_if<
+    std::is_same<Layout , Kokkos::LayoutStride>::value &&
+    std::is_integral<iType>::value , Layout >::type
+  reconstructLayout( const Layout& layout , iType dynrank )
+  {
+    const size_t unset = ~size_t(0) ;
+    // Constructor arguments alternate extent , stride for indices 0..7.
+    return Layout(
+      dynrank > 0 ? layout.dimension[0] : unset , dynrank > 0 ? layout.stride[0] : (0) ,
+      dynrank > 1 ? layout.dimension[1] : unset , dynrank > 1 ? layout.stride[1] : (0) ,
+      dynrank > 2 ? layout.dimension[2] : unset , dynrank > 2 ? layout.stride[2] : (0) ,
+      dynrank > 3 ? layout.dimension[3] : unset , dynrank > 3 ? layout.stride[3] : (0) ,
+      dynrank > 4 ? layout.dimension[4] : unset , dynrank > 4 ? layout.stride[4] : (0) ,
+      dynrank > 5 ? layout.dimension[5] : unset , dynrank > 5 ? layout.stride[5] : (0) ,
+      dynrank > 6 ? layout.dimension[6] : unset , dynrank > 6 ? layout.stride[6] : (0) ,
+      dynrank > 7 ? layout.dimension[7] : unset , dynrank > 7 ? layout.stride[7] : (0)
+    );
+  }
+
+  // Aborts when drv.rank() exceeds N, the number of indices given to operator().
+  // Marked KOKKOS_INLINE_FUNCTION because KOKKOS_VIEW_OPERATOR_VERIFY invokes it
+  // from inside KOKKOS_INLINE_FUNCTION operators when bounds checking is enabled.
+  template < typename DynRankViewType , typename iType >
+  KOKKOS_INLINE_FUNCTION
+  void verify_dynrankview_rank ( iType N , const DynRankViewType &drv )
+  { if ( static_cast<iType>(drv.rank()) > N ) { Kokkos::abort( "Need at least rank arguments to the operator()" ); } }
+
+
+
+/** \brief  Assign compatible default mappings */
+struct ViewToDynRankViewTag {};
+
+template< class DstTraits , class SrcTraits >
+class ViewMapping< DstTraits , SrcTraits ,
+  typename std::enable_if<(
+    std::is_same< typename DstTraits::memory_space , typename SrcTraits::memory_space >::value
+    &&
+    std::is_same< typename DstTraits::specialize , void >::value
+    &&
+    std::is_same< typename SrcTraits::specialize , void >::value
+    &&
+    (
+      std::is_same< typename DstTraits::array_layout , typename SrcTraits::array_layout >::value
+      ||
+      (
+        (
+          std::is_same< typename DstTraits::array_layout , Kokkos::LayoutLeft >::value ||
+          std::is_same< typename DstTraits::array_layout , Kokkos::LayoutRight >::value ||
+          std::is_same< typename DstTraits::array_layout , Kokkos::LayoutStride >::value
+        )
+        &&
+        (
+          std::is_same< typename SrcTraits::array_layout , Kokkos::LayoutLeft >::value ||
+          std::is_same< typename SrcTraits::array_layout , Kokkos::LayoutRight >::value ||
+          std::is_same< typename SrcTraits::array_layout , Kokkos::LayoutStride >::value
+        )
+      )
+    )
+  ) , ViewToDynRankViewTag >::type >
+{
+private:
+  // Destination value type must be the source's value_type or const_value_type.
+  enum { is_assignable_value_type =
+    std::is_same< typename DstTraits::value_type
+                , typename SrcTraits::value_type >::value ||
+    std::is_same< typename DstTraits::value_type
+                , typename SrcTraits::const_value_type >::value };
+  // Destination layout must be the source's layout or Kokkos::LayoutStride.
+  enum { is_assignable_layout =
+    std::is_same< typename DstTraits::array_layout
+                , typename SrcTraits::array_layout >::value ||
+    std::is_same< typename DstTraits::array_layout
+                , Kokkos::LayoutStride >::value 
+    };
+
+public:
+
+  enum { is_assignable = is_assignable_value_type &&
+                         is_assignable_layout };
+
+  typedef ViewMapping< DstTraits , void >  DstType ;
+  typedef ViewMapping< SrcTraits , void >  SrcType ;
+  // Assigns a View to a DynRankView: sets dst.m_map.m_offset, dst.m_map.m_handle, dst.m_track and dst.m_rank from src.
+  template < typename DT , typename ... DP , typename ST , typename ... SP >
+  KOKKOS_INLINE_FUNCTION
+  static void assign( Kokkos::Experimental::DynRankView< DT , DP...> & dst ,  const Kokkos::View< ST , SP... > & src )
+    {
+      static_assert( is_assignable_value_type
+                   , "View assignment must have same value type or const = non-const" );
+
+      static_assert( is_assignable_layout
+                   , "View assignment must have compatible layout or have rank <= 1" );
+
+    // Removed dimension checks...
+
+      typedef typename DstType::offset_type  dst_offset_type ;
+      dst.m_map.m_offset = dst_offset_type(std::integral_constant<unsigned,0>() , src.layout() ); //Check this for integer input1 for padding, etc
+      dst.m_map.m_handle = Kokkos::Experimental::Impl::ViewDataHandle< DstTraits >::assign( src.m_map.m_handle , src.m_track );
+      dst.m_track.assign( src.m_track , DstTraits::is_managed );
+      dst.m_rank = src.Rank ;
+    }
+};
+
+} //end Impl
+
+/* \class DynRankView
+ * \brief Container that creates a Kokkos view with rank determined at runtime. 
+ *   Essentially this is a rank 7 view that wraps the access operators
+ *   to yield the functionality of a view 
+ *
+ *   Changes from View
+ *   1. The rank of the DynRankView is returned by the method rank()
+ *   2. Max rank of a DynRankView is 7
+ *   3. subview name is subdynrankview
+ *   4. Every subdynrankview is returned with LayoutStride
+ *
+ *   NEW: Redesigned DynRankView
+ *   5. subview function name now available
+ *   6. Copy and Copy-Assign View to DynRankView
+ *   7. deep_copy between Views and DynRankViews
+ *   8. rank( view ); returns the rank of View or DynRankView
+ *
+ */
+
+/// Trait: derives from std::true_type for a DynRankView<D,P...>, else from std::false_type.
+template< typename > struct is_dyn_rank_view : std::false_type {};
+template< typename D , typename ... P >
+struct is_dyn_rank_view< Kokkos::Experimental::DynRankView<D,P...> > : std::true_type {};
+
+
+template< typename DataType , class ... Properties >
+class DynRankView : public ViewTraits< DataType , Properties ... >
+{
+  static_assert( !std::is_array<DataType>::value && !std::is_pointer<DataType>::value , "Cannot template DynRankView with array or pointer datatype - must be pod" );
+
+private: 
+  template < class , class ... > friend class DynRankView ;
+//  template < class , class ... > friend class Kokkos::Experimental::View ; //unnecessary now...
+  template < class , class ... > friend class Impl::ViewMapping ;
+
+public: 
+  typedef ViewTraits< DataType , Properties ... > drvtraits ;
+
+  typedef View< DataType******* , Properties...> view_type ; 
+
+  typedef ViewTraits< DataType******* , Properties ... > traits ;
+
+
+private:
+  typedef Kokkos::Experimental::Impl::ViewMapping< traits , void > map_type ;
+  typedef Kokkos::Experimental::Impl::SharedAllocationTracker      track_type ;
+
+  track_type  m_track ;
+  map_type    m_map ;
+  unsigned m_rank;
+
+public: 
+  KOKKOS_INLINE_FUNCTION
+  view_type & DownCast() const { return ( view_type & ) (*this); }
+  KOKKOS_INLINE_FUNCTION
+  const view_type & ConstDownCast() const { return (const view_type & ) (*this); }
+
+  //Types below - at least the HostMirror requires the value_type, NOT the rank 7 data_type of the traits
+
+  /** \brief  Compatible view of array of scalar types */
+  typedef DynRankView< typename drvtraits::scalar_array_type ,
+                typename drvtraits::array_layout ,
+                typename drvtraits::device_type ,
+                typename drvtraits::memory_traits >
+    array_type ;
+
+  /** \brief  Compatible view of const data type */
+  typedef DynRankView< typename drvtraits::const_data_type ,
+                typename drvtraits::array_layout ,
+                typename drvtraits::device_type ,
+                typename drvtraits::memory_traits >
+    const_type ;
+
+  /** \brief  Compatible view of non-const data type */
+  typedef DynRankView< typename drvtraits::non_const_data_type ,
+                typename drvtraits::array_layout ,
+                typename drvtraits::device_type ,
+                typename drvtraits::memory_traits >
+    non_const_type ;
+
+  /** \brief  Compatible HostMirror view */
+  typedef DynRankView< typename drvtraits::non_const_data_type ,
+                typename drvtraits::array_layout ,
+                typename drvtraits::host_mirror_space >
+    HostMirror ;
+
+
+  //----------------------------------------
+  // Domain rank and extents
+
+//  enum { Rank = map_type::Rank }; //Will be dyn rank of 7 always, keep the enum?
+  // Returns m_map.extent(r); enabled only for integral index types.
+  template< typename iType >
+  KOKKOS_INLINE_FUNCTION constexpr
+  typename std::enable_if< std::is_integral<iType>::value , size_t >::type
+  extent( const iType & r ) const
+    { return m_map.extent(r); }
+  // Returns m_map.extent(r) converted to int with static_cast.
+  template< typename iType >
+  KOKKOS_INLINE_FUNCTION constexpr
+  typename std::enable_if< std::is_integral<iType>::value , int >::type
+  extent_int( const iType & r ) const
+    { return static_cast<int>(m_map.extent(r)); }
+  // Returns the layout object reported by the map (m_map.layout()).
+  KOKKOS_INLINE_FUNCTION constexpr
+  typename traits::array_layout layout() const
+    { return m_map.layout(); }
+
+  //----------------------------------------
+  /*  Deprecate all 'dimension' functions in favor of
+   *  ISO/C++ vocabulary 'extent'.
+   */
+  // Forwards to extent(r).
+  template< typename iType >
+  KOKKOS_INLINE_FUNCTION constexpr
+  typename std::enable_if< std::is_integral<iType>::value , size_t >::type
+  dimension( const iType & r ) const { return extent( r ); }
+
+  KOKKOS_INLINE_FUNCTION constexpr size_t dimension_0() const { return m_map.dimension_0(); }
+  KOKKOS_INLINE_FUNCTION constexpr size_t dimension_1() const { return m_map.dimension_1(); }
+  KOKKOS_INLINE_FUNCTION constexpr size_t dimension_2() const { return m_map.dimension_2(); }
+  KOKKOS_INLINE_FUNCTION constexpr size_t dimension_3() const { return m_map.dimension_3(); }
+  KOKKOS_INLINE_FUNCTION constexpr size_t dimension_4() const { return m_map.dimension_4(); }
+  KOKKOS_INLINE_FUNCTION constexpr size_t dimension_5() const { return m_map.dimension_5(); }
+  KOKKOS_INLINE_FUNCTION constexpr size_t dimension_6() const { return m_map.dimension_6(); }
+  KOKKOS_INLINE_FUNCTION constexpr size_t dimension_7() const { return m_map.dimension_7(); }
+
+  //----------------------------------------
+  // Product of the map's eight extents, dimension_0() through dimension_7().
+  KOKKOS_INLINE_FUNCTION constexpr
+  size_t size() const
+  {
+    return m_map.dimension_0() * m_map.dimension_1() *
+           m_map.dimension_2() * m_map.dimension_3() *
+           m_map.dimension_4() * m_map.dimension_5() *
+           m_map.dimension_6() * m_map.dimension_7();
+  }
+
+  KOKKOS_INLINE_FUNCTION constexpr size_t stride_0() const { return m_map.stride_0(); }
+  KOKKOS_INLINE_FUNCTION constexpr size_t stride_1() const { return m_map.stride_1(); }
+  KOKKOS_INLINE_FUNCTION constexpr size_t stride_2() const { return m_map.stride_2(); }
+  KOKKOS_INLINE_FUNCTION constexpr size_t stride_3() const { return m_map.stride_3(); }
+  KOKKOS_INLINE_FUNCTION constexpr size_t stride_4() const { return m_map.stride_4(); }
+  KOKKOS_INLINE_FUNCTION constexpr size_t stride_5() const { return m_map.stride_5(); }
+  KOKKOS_INLINE_FUNCTION constexpr size_t stride_6() const { return m_map.stride_6(); }
+  KOKKOS_INLINE_FUNCTION constexpr size_t stride_7() const { return m_map.stride_7(); }
+  // Forwards the caller's array pointer s to m_map.stride(s).
+  template< typename iType >
+  KOKKOS_INLINE_FUNCTION void stride( iType * const s ) const { m_map.stride(s); }
+
+  //----------------------------------------
+  // Range span is the span which contains all members.
+
+  typedef typename map_type::reference_type  reference_type ;
+  typedef typename map_type::pointer_type    pointer_type ;
+
+  enum { reference_type_is_lvalue_reference = std::is_lvalue_reference< reference_type >::value };
+
+  KOKKOS_INLINE_FUNCTION constexpr size_t span() const { return m_map.span(); }
+  // Deprecated, use 'span()' instead
+  KOKKOS_INLINE_FUNCTION constexpr size_t capacity() const { return m_map.span(); }
+  KOKKOS_INLINE_FUNCTION constexpr bool   span_is_contiguous() const { return m_map.span_is_contiguous(); }
+  KOKKOS_INLINE_FUNCTION constexpr pointer_type data() const { return m_map.data(); }
+
+  // Deprecated, use 'span_is_contiguous()' instead
+  KOKKOS_INLINE_FUNCTION constexpr bool   is_contiguous() const { return m_map.span_is_contiguous(); }
+  // Deprecated, use 'data()' instead
+  KOKKOS_INLINE_FUNCTION constexpr pointer_type ptr_on_device() const { return m_map.data(); }
+
+  //----------------------------------------
+  // Allow specializations to query their specialized map
+  // Returns a const reference to the member map m_map.
+  KOKKOS_INLINE_FUNCTION
+  const Kokkos::Experimental::Impl::ViewMapping< traits , void > &
+  implementation_map() const { return m_map ; }
+
+  //----------------------------------------
+
+private:
+
+  enum {
+    is_layout_left = std::is_same< typename traits::array_layout
+                                  , Kokkos::LayoutLeft >::value ,
+
+    is_layout_right = std::is_same< typename traits::array_layout
+                                  , Kokkos::LayoutRight >::value ,
+
+    is_layout_stride = std::is_same< typename traits::array_layout
+                                   , Kokkos::LayoutStride >::value ,
+
+    is_default_map =
+      std::is_same< typename traits::specialize , void >::value &&
+      ( is_layout_left || is_layout_right || is_layout_stride )
+  };
+
+// Bounds checking macros
+#if defined( KOKKOS_ENABLE_DEBUG_BOUNDS_CHECK )
+
+#define KOKKOS_VIEW_OPERATOR_VERIFY( N , ARG ) \
+  Kokkos::Impl::VerifyExecutionCanAccessMemorySpace \
+    < Kokkos::Impl::ActiveExecutionMemorySpace , typename traits::memory_space >::verify(); \
+  Kokkos::Experimental::Impl::verify_dynrankview_rank ( N , *this ) ; \
+  Kokkos::Experimental::Impl::view_verify_operator_bounds ARG ; 
+
+#else
+
+#define KOKKOS_VIEW_OPERATOR_VERIFY( N , ARG ) \
+  Kokkos::Impl::VerifyExecutionCanAccessMemorySpace \
+    < Kokkos::Impl::ActiveExecutionMemorySpace , typename traits::memory_space >::verify();
+
+#endif
+
+public:
+  // Returns the stored rank m_rank.
+  KOKKOS_INLINE_FUNCTION
+  constexpr unsigned rank() const { return m_rank; }
+
+
+  //operators ()
+  // Rank 0
+  KOKKOS_INLINE_FUNCTION
+  reference_type operator()() const
+  {
+    // No indices: run the operator checks with the map alone, then use reference().
+    KOKKOS_VIEW_OPERATOR_VERIFY( 0 , ( m_map ) )
+    return m_map.reference();
+  }
+
+  // Rank 1: indexes the raw data pointer directly.
+  // This assumes a contiguous underlying memory (i.e. no padding, no striding...)
+  template< typename iType >
+  KOKKOS_INLINE_FUNCTION
+  typename std::enable_if<
+    std::is_same<typename drvtraits::value_type, typename drvtraits::scalar_array_type>::value &&
+    std::is_integral<iType>::value , reference_type >::type
+  operator[]( const iType & i0 ) const
+  { return m_map.data()[i0]; }
+
+  // This assumes a contiguous underlying memory (i.e. no padding, no striding...
+  // AND a Trilinos/Sacado scalar type ): indexes through a temporary unmanaged rank-one View.
+  template< typename iType >
+  KOKKOS_INLINE_FUNCTION
+  typename std::enable_if<
+    !std::is_same<typename drvtraits::value_type, typename drvtraits::scalar_array_type>::value &&
+    std::is_integral<iType>::value , reference_type >::type
+  operator[]( const iType & i0 ) const
+  {
+    typedef Kokkos::MemoryTraits< Kokkos::Unmanaged | traits::memory_traits::RandomAccess | traits::memory_traits::Atomic > flat_memory_traits ;
+    typedef Kokkos::View< DataType* , typename traits::array_layout , typename traits::device_type , flat_memory_traits > flat_view_type ;
+    const size_t scalar_extent = m_map.dimension_scalar();
+    flat_view_type flat_view( this->data() , this->span() / scalar_extent , scalar_extent );
+    return flat_view( i0 );
+  }
+
+  // Single-index access, enabled when traits::specialize is void and the
+  // index type is integral.
+  template< typename iType >
+  KOKKOS_INLINE_FUNCTION
+  typename std::enable_if
+    < (  std::is_same< typename traits::specialize , void >::value
+      && std::is_integral< iType >::value )
+    , reference_type
+    >::type
+  operator()( const iType & i0 ) const
+    {
+      KOKKOS_VIEW_OPERATOR_VERIFY( 1 , ( m_map , i0 ) )
+      return m_map.reference( i0 );
+    }
+
+  // Single-index access, enabled when traits::specialize is not void or the
+  // index type is not integral.  The six trailing mapping arguments are
+  // passed as 0.
+  // NOTE(review): unlike the corresponding rank 2-6 overloads, this one does
+  // not invoke KOKKOS_VIEW_OPERATOR_VERIFY -- confirm whether intentional.
+  template< typename iType >
+  KOKKOS_INLINE_FUNCTION
+  typename std::enable_if
+    < ! (  std::is_same< typename traits::specialize , void >::value
+        && std::is_integral< iType >::value )
+    , reference_type
+    >::type
+  operator()( const iType & i0 ) const
+    {
+      return m_map.reference( i0 , 0 , 0 , 0 , 0 , 0 , 0 );
+    }
+
+  // Rank 2
+  // Two-index access, enabled when traits::specialize is void and both
+  // index types are integral.
+  template< typename iType0 , typename iType1 >
+  KOKKOS_INLINE_FUNCTION
+  typename std::enable_if
+    < (  std::is_same< typename traits::specialize , void >::value
+      && std::is_integral< iType0 >::value
+      && std::is_integral< iType1 >::value )
+    , reference_type
+    >::type
+  operator()( const iType0 & i0 , const iType1 & i1 ) const
+    {
+      KOKKOS_VIEW_OPERATOR_VERIFY( 2 , ( m_map , i0 , i1 ) )
+      return m_map.reference( i0 , i1 );
+    }
+
+  // Two-index access, enabled when drvtraits::specialize is not void or
+  // iType0 is not integral.  The remaining mapping arguments are passed as 0.
+  template< typename iType0 , typename iType1 >
+  KOKKOS_INLINE_FUNCTION
+  typename std::enable_if
+    < ! (  std::is_same< typename drvtraits::specialize , void >::value
+        && std::is_integral< iType0 >::value )
+    , reference_type
+    >::type
+  operator()( const iType0 & i0 , const iType1 & i1 ) const
+    {
+      KOKKOS_VIEW_OPERATOR_VERIFY( 2 , ( m_map , i0 , i1 ) )
+      return m_map.reference( i0 , i1 , 0 , 0 , 0 , 0 , 0 );
+    }
+
+  // Rank 3
+  // Three-index access, enabled when traits::specialize is void and all
+  // index types are integral.
+  template< typename iType0 , typename iType1 , typename iType2 >
+  KOKKOS_INLINE_FUNCTION
+  typename std::enable_if
+    < (  std::is_same< typename traits::specialize , void >::value
+      && std::is_integral< iType0 >::value
+      && std::is_integral< iType1 >::value
+      && std::is_integral< iType2 >::value )
+    , reference_type
+    >::type
+  operator()( const iType0 & i0 , const iType1 & i1 , const iType2 & i2 ) const
+    {
+      KOKKOS_VIEW_OPERATOR_VERIFY( 3 , ( m_map , i0 , i1 , i2 ) )
+      return m_map.reference( i0 , i1 , i2 );
+    }
+
+  // Three-index access, enabled when drvtraits::specialize is not void or
+  // iType0 is not integral.  The remaining mapping arguments are passed as 0.
+  template< typename iType0 , typename iType1 , typename iType2 >
+  KOKKOS_INLINE_FUNCTION
+  typename std::enable_if
+    < ! (  std::is_same< typename drvtraits::specialize , void >::value
+        && std::is_integral< iType0 >::value )
+    , reference_type
+    >::type
+  operator()( const iType0 & i0 , const iType1 & i1 , const iType2 & i2 ) const
+    {
+      KOKKOS_VIEW_OPERATOR_VERIFY( 3 , ( m_map , i0 , i1 , i2 ) )
+      return m_map.reference( i0 , i1 , i2 , 0 , 0 , 0 , 0 );
+    }
+
+  // Rank 4
+  // Four-index access, enabled when traits::specialize is void and all
+  // index types are integral.
+  template< typename iType0 , typename iType1 , typename iType2 , typename iType3 >
+  KOKKOS_INLINE_FUNCTION
+  typename std::enable_if
+    < (  std::is_same< typename traits::specialize , void >::value
+      && std::is_integral< iType0 >::value
+      && std::is_integral< iType1 >::value
+      && std::is_integral< iType2 >::value
+      && std::is_integral< iType3 >::value )
+    , reference_type
+    >::type
+  operator()( const iType0 & i0 , const iType1 & i1 , const iType2 & i2 , const iType3 & i3 ) const
+    {
+      KOKKOS_VIEW_OPERATOR_VERIFY( 4 , ( m_map , i0 , i1 , i2 , i3 ) )
+      return m_map.reference( i0 , i1 , i2 , i3 );
+    }
+
+  // Four-index access, enabled when drvtraits::specialize is not void or
+  // iType0 is not integral.  The remaining mapping arguments are passed as 0.
+  template< typename iType0 , typename iType1 , typename iType2 , typename iType3 >
+  KOKKOS_INLINE_FUNCTION
+  typename std::enable_if
+    < ! (  std::is_same< typename drvtraits::specialize , void >::value
+        && std::is_integral< iType0 >::value )
+    , reference_type
+    >::type
+  operator()( const iType0 & i0 , const iType1 & i1 , const iType2 & i2 , const iType3 & i3 ) const
+    {
+      KOKKOS_VIEW_OPERATOR_VERIFY( 4 , ( m_map , i0 , i1 , i2 , i3 ) )
+      return m_map.reference( i0 , i1 , i2 , i3 , 0 , 0 , 0 );
+    }
+
+  // Rank 5
+  // Five-index access, enabled when traits::specialize is void and all
+  // index types are integral.
+  template< typename iType0 , typename iType1 , typename iType2 , typename iType3, typename iType4 >
+  KOKKOS_INLINE_FUNCTION
+  typename std::enable_if
+    < (  std::is_same< typename traits::specialize , void >::value
+      && std::is_integral< iType0 >::value
+      && std::is_integral< iType1 >::value
+      && std::is_integral< iType2 >::value
+      && std::is_integral< iType3 >::value
+      && std::is_integral< iType4 >::value )
+    , reference_type
+    >::type
+  operator()( const iType0 & i0 , const iType1 & i1 , const iType2 & i2 , const iType3 & i3 , const iType4 & i4 ) const
+    {
+      KOKKOS_VIEW_OPERATOR_VERIFY( 5 , ( m_map , i0 , i1 , i2 , i3 , i4 ) )
+      return m_map.reference( i0 , i1 , i2 , i3 , i4 );
+    }
+
+  // Five-index access, enabled when drvtraits::specialize is not void or
+  // iType0 is not integral.  The remaining mapping arguments are passed as 0.
+  template< typename iType0 , typename iType1 , typename iType2 , typename iType3, typename iType4 >
+  KOKKOS_INLINE_FUNCTION
+  typename std::enable_if
+    < ! (  std::is_same< typename drvtraits::specialize , void >::value
+        && std::is_integral< iType0 >::value )
+    , reference_type
+    >::type
+  operator()( const iType0 & i0 , const iType1 & i1 , const iType2 & i2 , const iType3 & i3 , const iType4 & i4 ) const
+    {
+      KOKKOS_VIEW_OPERATOR_VERIFY( 5 , ( m_map , i0 , i1 , i2 , i3 , i4 ) )
+      return m_map.reference( i0 , i1 , i2 , i3 , i4 , 0 , 0 );
+    }
+
+  // Rank 6
+  // Six-index access, enabled when traits::specialize is void and all
+  // index types are integral.
+  template< typename iType0 , typename iType1 , typename iType2 , typename iType3, typename iType4 , typename iType5 >
+  KOKKOS_INLINE_FUNCTION
+  typename std::enable_if
+    < (  std::is_same< typename traits::specialize , void >::value
+      && std::is_integral< iType0 >::value
+      && std::is_integral< iType1 >::value
+      && std::is_integral< iType2 >::value
+      && std::is_integral< iType3 >::value
+      && std::is_integral< iType4 >::value
+      && std::is_integral< iType5 >::value )
+    , reference_type
+    >::type
+  operator()( const iType0 & i0 , const iType1 & i1 , const iType2 & i2 , const iType3 & i3 , const iType4 & i4 , const iType5 & i5 ) const
+    {
+      KOKKOS_VIEW_OPERATOR_VERIFY( 6 , ( m_map , i0 , i1 , i2 , i3 , i4 , i5 ) )
+      return m_map.reference( i0 , i1 , i2 , i3 , i4 , i5 );
+    }
+
+  // Six-index access, enabled when drvtraits::specialize is not void or
+  // iType0 is not integral.  The remaining mapping argument is passed as 0.
+  template< typename iType0 , typename iType1 , typename iType2 , typename iType3, typename iType4 , typename iType5 >
+  KOKKOS_INLINE_FUNCTION
+  typename std::enable_if
+    < ! (  std::is_same< typename drvtraits::specialize , void >::value
+        && std::is_integral< iType0 >::value )
+    , reference_type
+    >::type
+  operator()( const iType0 & i0 , const iType1 & i1 , const iType2 & i2 , const iType3 & i3 , const iType4 & i4 , const iType5 & i5 ) const
+    {
+      KOKKOS_VIEW_OPERATOR_VERIFY( 6 , ( m_map , i0 , i1 , i2 , i3 , i4 , i5 ) )
+      return m_map.reference( i0 , i1 , i2 , i3 , i4 , i5 , 0 );
+    }
+
+  // Rank 7
+  // Seven-index access, enabled when every index type is integral.
+  // This overload places no condition on traits::specialize.
+  template< typename iType0 , typename iType1 , typename iType2 , typename iType3, typename iType4 , typename iType5 , typename iType6 >
+  KOKKOS_INLINE_FUNCTION
+  typename std::enable_if
+    < (  std::is_integral< iType0 >::value
+      && std::is_integral< iType1 >::value
+      && std::is_integral< iType2 >::value
+      && std::is_integral< iType3 >::value
+      && std::is_integral< iType4 >::value
+      && std::is_integral< iType5 >::value
+      && std::is_integral< iType6 >::value )
+    , reference_type
+    >::type
+  operator()( const iType0 & i0 , const iType1 & i1 , const iType2 & i2 , const iType3 & i3 , const iType4 & i4 , const iType5 & i5 , const iType6 & i6 ) const
+    {
+      KOKKOS_VIEW_OPERATOR_VERIFY( 7 , ( m_map , i0 , i1 , i2 , i3 , i4 , i5 , i6 ) )
+      return m_map.reference( i0 , i1 , i2 , i3 , i4 , i5 , i6 );
+    }
+
+#undef KOKKOS_VIEW_OPERATOR_VERIFY
+
+  //----------------------------------------
+  // Standard constructor, destructor, and assignment operators... 
+
+  // Destructor: nothing to do beyond destroying the members.
+  KOKKOS_INLINE_FUNCTION
+  ~DynRankView() {}
+
+  // Default constructor: value-initializes the tracker, the map and the rank.
+  KOKKOS_INLINE_FUNCTION
+  DynRankView()
+    : m_track()
+    , m_map()
+    , m_rank()
+    {}
+
+  // Copy constructor: member-wise copy of tracker, map and rank.
+  KOKKOS_INLINE_FUNCTION
+  DynRankView( const DynRankView & rhs )
+    : m_track( rhs.m_track )
+    , m_map( rhs.m_map )
+    , m_rank( rhs.m_rank )
+    {}
+
+  // Move constructor: the members are initialized from rhs's members as
+  // lvalues (no std::move is applied), i.e. the same way as in the copy
+  // constructor.
+  KOKKOS_INLINE_FUNCTION
+  DynRankView( DynRankView && rhs )
+    : m_track( rhs.m_track )
+    , m_map( rhs.m_map )
+    , m_rank( rhs.m_rank )
+    {}
+
+  // Copy assignment: member-wise assignment of tracker, map and rank.
+  KOKKOS_INLINE_FUNCTION
+  DynRankView & operator = ( const DynRankView & rhs )
+    {
+      m_track = rhs.m_track;
+      m_map   = rhs.m_map;
+      m_rank  = rhs.m_rank;
+      return *this;
+    }
+
+  // Move assignment: assigns from rhs's members as lvalues, the same way
+  // as copy assignment.
+  KOKKOS_INLINE_FUNCTION
+  DynRankView & operator = ( DynRankView && rhs )
+    {
+      m_track = rhs.m_track;
+      m_map   = rhs.m_map;
+      m_rank  = rhs.m_rank;
+      return *this;
+    }
+
+  //----------------------------------------
+  // Compatible view copy constructor and assignment
+  // may assign unmanaged from managed.
+  // Construct from a DynRankView with different template arguments.
+  // The tracker is built from rhs.m_track together with traits::is_managed,
+  // the rank is copied, and the map is filled in by
+  // ViewMapping< traits , SrcTraits , void >::assign.  A static_assert
+  // rejects combinations the mapping reports as not assignable.
+  template< class RT , class ... RP >
+  KOKKOS_INLINE_FUNCTION
+  DynRankView( const DynRankView<RT,RP...> & rhs )
+    : m_track( rhs.m_track , traits::is_managed )
+    , m_map()
+    , m_rank( rhs.m_rank )
+    {
+      using src_traits   = typename DynRankView<RT,RP...>::traits ;
+      using mapping_type = Kokkos::Experimental::Impl::ViewMapping< traits , src_traits , void > ;
+
+      static_assert( mapping_type::is_assignable , "Incompatible DynRankView copy construction" );
+
+      mapping_type::assign( m_map , rhs.m_map , rhs.m_track );
+    }
+
+  // Assign from a DynRankView with different template arguments.
+  // The map is assigned through ViewMapping< traits , SrcTraits , void >,
+  // then the tracker is assigned from rhs.m_track together with
+  // traits::is_managed, and finally the rank is taken from rhs.rank().
+  // A static_assert rejects combinations the mapping reports as not
+  // assignable.  Returns *this.
+  template< class RT , class ... RP >
+  KOKKOS_INLINE_FUNCTION
+  DynRankView & operator = (const DynRankView<RT,RP...> & rhs )
+    {
+      typedef typename DynRankView<RT,RP...> ::traits SrcTraits ;
+      typedef Kokkos::Experimental::Impl::ViewMapping< traits , SrcTraits , void > Mapping ;
+      // The diagnostic names the operation actually being compiled.
+      static_assert( Mapping::is_assignable , "Incompatible DynRankView copy assignment" );
+      Mapping::assign( m_map , rhs.m_map , rhs.m_track );
+      m_track.assign( rhs.m_track , traits::is_managed );
+      m_rank = rhs.rank();
+      return *this;
+    }
+
+// Experimental
+// Copy/Assign View to DynRankView
+  // Construct a DynRankView from a View.
+  // Tracker and map start value-initialized and the rank is taken from
+  // rhs.Rank; the actual transfer is done by
+  // ViewMapping< traits , SrcTraits , ViewToDynRankViewTag >::assign, which
+  // receives *this and rhs.  A static_assert rejects combinations the
+  // mapping reports as not assignable.
+  template< class RT , class ... RP >
+  KOKKOS_INLINE_FUNCTION
+  DynRankView( const View<RT,RP...> & rhs )
+    : m_track()
+    , m_map()
+    , m_rank( rhs.Rank )
+    {
+      typedef typename View<RT,RP...>::traits  SrcTraits ;
+      typedef Kokkos::Experimental::Impl::ViewMapping< traits , SrcTraits , Kokkos::Experimental::Impl::ViewToDynRankViewTag >  Mapping ;
+      // Message worded like the View-to-DynRankView assignment diagnostic so
+      // the failing conversion is identifiable.
+      static_assert( Mapping::is_assignable , "Incompatible View to DynRankView copy construction" );
+      Mapping::assign( *this , rhs );
+    }
+
+  // Assign a View to this DynRankView.  The transfer is done by
+  // ViewMapping< traits , SrcTraits , ViewToDynRankViewTag >::assign, which
+  // receives *this and rhs; a static_assert rejects combinations the mapping
+  // reports as not assignable.  Returns *this.
+  template< class RT , class ... RP >
+  KOKKOS_INLINE_FUNCTION
+  DynRankView & operator = ( const View<RT,RP...> & rhs )
+    {
+      using src_traits   = typename View<RT,RP...>::traits ;
+      using mapping_type =
+        Kokkos::Experimental::Impl::ViewMapping< traits
+                                               , src_traits
+                                               , Kokkos::Experimental::Impl::ViewToDynRankViewTag > ;
+
+      static_assert( mapping_type::is_assignable , "Incompatible View to DynRankView copy assignment" );
+
+      mapping_type::assign( *this , rhs );
+      return *this ;
+    }
+
+  //----------------------------------------
+  // Allocation tracking properties
+
+  // Use count reported by the allocation tracker.
+  KOKKOS_INLINE_FUNCTION
+  int use_count() const
+    {
+      return m_track.use_count();
+    }
+
+  // Label obtained from the allocation tracker for this view's memory space.
+  inline
+  const std::string label() const
+    {
+      return m_track.template get_label< typename traits::memory_space >();
+    }
+
+  //----------------------------------------
+  // Allocation according to allocation properties and array layout.
+  // Unused arg_layout dimensions must be set to ~size_t(0) so that rank
+  // deduction can properly take place.
+  //
+  // Enabled only when the supplied properties do not carry a pointer.
+  // The rank is computed from arg_layout by DynRankDimTraits::computeRank and
+  // the memory is allocated with the layout returned by
+  // DynRankDimTraits::createLayout.  Label, memory space and execution space
+  // are appended to the property pack when the caller did not supply them.
+  // Raises a runtime exception (Kokkos::Impl::throw_runtime_exception) when
+  // data initialization is requested while the execution space is not
+  // initialized.
+  template< class ... P >
+  explicit inline
+  DynRankView( const Impl::ViewCtorProp< P ... > & arg_prop
+      , typename std::enable_if< ! Impl::ViewCtorProp< P... >::has_pointer
+                               , typename traits::array_layout
+                               >::type const & arg_layout
+      )
+      : m_track()
+      , m_map()
+      , m_rank( Impl::DynRankDimTraits<typename traits::specialize>::computeRank(arg_layout) )
+    {
+      // Append layout and spaces if not input
+      typedef Impl::ViewCtorProp< P ... > alloc_prop_input ;
+
+      // use 'std::integral_constant<unsigned,I>' for non-types
+      // to avoid duplicate class error.
+      typedef Impl::ViewCtorProp
+        < P ...
+        , typename std::conditional
+            < alloc_prop_input::has_label
+            , std::integral_constant<unsigned,0>
+            , typename std::string
+            >::type
+        , typename std::conditional
+            < alloc_prop_input::has_memory_space
+            , std::integral_constant<unsigned,1>
+            , typename traits::device_type::memory_space
+            >::type
+        , typename std::conditional
+            < alloc_prop_input::has_execution_space
+            , std::integral_constant<unsigned,2>
+            , typename traits::device_type::execution_space
+            >::type
+        > alloc_prop ;
+
+      // Diagnostic names this class, matching the runtime message below.
+      static_assert( traits::is_managed
+                   , "DynRankView allocation constructor requires managed memory" );
+
+      if ( alloc_prop::initialize &&
+           ! alloc_prop::execution_space::is_initialized() ) {
+        // If initializing view data then
+        // the execution space must be initialized.
+        Kokkos::Impl::throw_runtime_exception("Constructing DynRankView and initializing data with uninitialized execution space");
+      }
+
+      // Copy the input allocation properties with possibly defaulted properties
+      alloc_prop prop( arg_prop );
+
+//------------------------------------------------------------
+#if defined( KOKKOS_HAVE_CUDA )
+      // If allocating in CudaUVMSpace must fence before and after
+      // the allocation to protect against possible concurrent access
+      // on the CPU and the GPU.
+      // Fence using the trait's execution space (which will be Kokkos::Cuda)
+      // to avoid incomplete type errors from using Kokkos::Cuda directly.
+      if ( std::is_same< Kokkos::CudaUVMSpace , typename traits::device_type::memory_space >::value ) {
+        traits::device_type::memory_space::execution_space::fence();
+      }
+#endif
+//------------------------------------------------------------
+
+      Kokkos::Experimental::Impl::SharedAllocationRecord<> *
+        record = m_map.allocate_shared( prop , Impl::DynRankDimTraits<typename traits::specialize>::createLayout(arg_layout) );
+
+//------------------------------------------------------------
+#if defined( KOKKOS_HAVE_CUDA )
+      // Second fence, after the allocation (see the comment above).
+      if ( std::is_same< Kokkos::CudaUVMSpace , typename traits::device_type::memory_space >::value ) {
+        traits::device_type::memory_space::execution_space::fence();
+      }
+#endif
+//------------------------------------------------------------
+
+      // Setup and initialization complete, start tracking
+      m_track.assign_allocated_record_to_uninitialized( record );
+    }
+
+
+  // Wrappers
+  // Wrap caller-supplied memory: enabled only when the properties carry a
+  // pointer.  The tracker is left value-initialized (no memory tracking),
+  // the map is built from the properties and
+  // DynRankDimTraits::createLayout(arg_layout), and the rank comes from
+  // DynRankDimTraits::computeRank(arg_layout).  The pointer type carried by
+  // the properties must be exactly pointer_type.
+  template< class ... P >
+  explicit KOKKOS_INLINE_FUNCTION
+  DynRankView( const Impl::ViewCtorProp< P ... > & arg_prop
+             , typename std::enable_if< Impl::ViewCtorProp< P... >::has_pointer
+                                      , typename traits::array_layout
+                                      >::type const & arg_layout
+             )
+    : m_track() // No memory tracking
+    , m_map( arg_prop
+           , Impl::DynRankDimTraits<typename traits::specialize>::createLayout(arg_layout) )
+    , m_rank( Impl::DynRankDimTraits<typename traits::specialize>::computeRank(arg_layout) )
+    {
+      using supplied_pointer_type = typename Impl::ViewCtorProp< P... >::pointer_type ;
+
+      static_assert( std::is_same< pointer_type , supplied_pointer_type >::value ,
+        "Constructing DynRankView to wrap user memory must supply matching pointer type" );
+    }
+
+  //----------------------------------------
+  //Constructor(s)
+
+  // Simple dimension-only layout
+  // Allocating constructor taking a list of extents (enabled when the
+  // properties carry no pointer).  Up to eight extents, each defaulting to
+  // ~size_t(0), are packed into a traits::array_layout and forwarded to the
+  // (properties, layout) constructor.
+  template< class ... P >
+  explicit inline
+  DynRankView( const Impl::ViewCtorProp< P ... > & arg_prop
+             , typename std::enable_if< ! Impl::ViewCtorProp< P... >::has_pointer
+                                      , size_t
+                                      >::type const arg_N0 = ~size_t(0)
+             , const size_t arg_N1 = ~size_t(0)
+             , const size_t arg_N2 = ~size_t(0)
+             , const size_t arg_N3 = ~size_t(0)
+             , const size_t arg_N4 = ~size_t(0)
+             , const size_t arg_N5 = ~size_t(0)
+             , const size_t arg_N6 = ~size_t(0)
+             , const size_t arg_N7 = ~size_t(0)
+             )
+    : DynRankView( arg_prop
+                 , typename traits::array_layout( arg_N0 , arg_N1 , arg_N2 , arg_N3
+                                                , arg_N4 , arg_N5 , arg_N6 , arg_N7 )
+                 )
+    {}
+
+  // Wrapping constructor taking a list of extents (enabled when the
+  // properties carry a pointer).  Up to eight extents, each defaulting to
+  // ~size_t(0), are packed into a traits::array_layout and forwarded to the
+  // (properties, layout) constructor.
+  template< class ... P >
+  explicit KOKKOS_INLINE_FUNCTION
+  DynRankView( const Impl::ViewCtorProp< P ... > & arg_prop
+             , typename std::enable_if< Impl::ViewCtorProp< P... >::has_pointer
+                                      , size_t
+                                      >::type const arg_N0 = ~size_t(0)
+             , const size_t arg_N1 = ~size_t(0)
+             , const size_t arg_N2 = ~size_t(0)
+             , const size_t arg_N3 = ~size_t(0)
+             , const size_t arg_N4 = ~size_t(0)
+             , const size_t arg_N5 = ~size_t(0)
+             , const size_t arg_N6 = ~size_t(0)
+             , const size_t arg_N7 = ~size_t(0)
+             )
+    : DynRankView( arg_prop
+                 , typename traits::array_layout( arg_N0 , arg_N1 , arg_N2 , arg_N3
+                                                , arg_N4 , arg_N5 , arg_N6 , arg_N7 )
+                 )
+    {}
+
+  // Allocate with label and layout
+  // Allocate from a label and a layout.  Enabled only for types accepted by
+  // is_view_label; the label is wrapped in ViewCtorProp< std::string > and
+  // forwarded with the layout to the (properties, layout) constructor.
+  template< typename Label >
+  explicit inline
+  DynRankView( const Label & arg_label
+             , typename std::enable_if
+                 < Kokkos::Experimental::Impl::is_view_label<Label>::value
+                 , typename traits::array_layout
+                 >::type const & arg_layout
+             )
+    : DynRankView( Impl::ViewCtorProp< std::string >( arg_label )
+                 , arg_layout
+                 )
+    {}
+
+  // Allocate label and layout, must disambiguate from subview constructor
+  // Allocate from a label and a list of extents.  Enabled only for types
+  // accepted by is_view_label (which disambiguates it from the subview
+  // constructor).  Up to eight extents, each defaulting to ~size_t(0), are
+  // packed into a traits::array_layout and forwarded together with
+  // ViewCtorProp< std::string >( arg_label ).
+  template< typename Label >
+  explicit inline
+  DynRankView( const Label & arg_label
+             , typename std::enable_if
+                 < Kokkos::Experimental::Impl::is_view_label<Label>::value
+                 , const size_t
+                 >::type arg_N0 = ~size_t(0)
+             , const size_t arg_N1 = ~size_t(0)
+             , const size_t arg_N2 = ~size_t(0)
+             , const size_t arg_N3 = ~size_t(0)
+             , const size_t arg_N4 = ~size_t(0)
+             , const size_t arg_N5 = ~size_t(0)
+             , const size_t arg_N6 = ~size_t(0)
+             , const size_t arg_N7 = ~size_t(0)
+             )
+    : DynRankView( Impl::ViewCtorProp< std::string >( arg_label )
+                 , typename traits::array_layout( arg_N0 , arg_N1 , arg_N2 , arg_N3
+                                                , arg_N4 , arg_N5 , arg_N6 , arg_N7 )
+                 )
+    {}
+
+  // For backward compatibility
+  // Allocates without initializing the data, using the label carried by
+  // arg_prop.  arg_layout is forwarded unchanged: the delegated-to
+  // allocation constructor computes the rank from the layout it receives
+  // (computeRank) and applies createLayout itself before allocating, so the
+  // layout must reach it with its unused extents still set to ~size_t(0).
+  // Applying createLayout here as well would hand rank deduction an
+  // already-converted layout.
+  explicit inline
+  DynRankView( const ViewAllocateWithoutInitializing & arg_prop
+      , const typename traits::array_layout & arg_layout
+      )
+    : DynRankView( Impl::ViewCtorProp< std::string , Kokkos::Experimental::Impl::WithoutInitializing_t >( arg_prop.label , Kokkos::Experimental::WithoutInitializing )
+          , arg_layout
+      )
+    {}
+
+  // Backward-compatible allocation without initialization from a list of
+  // extents.  The label carried by arg_prop and the WithoutInitializing
+  // marker are wrapped in a ViewCtorProp and forwarded, with the extents,
+  // to the (properties, extents...) constructor.
+  explicit inline
+  DynRankView( const ViewAllocateWithoutInitializing & arg_prop
+             , const size_t arg_N0 = ~size_t(0)
+             , const size_t arg_N1 = ~size_t(0)
+             , const size_t arg_N2 = ~size_t(0)
+             , const size_t arg_N3 = ~size_t(0)
+             , const size_t arg_N4 = ~size_t(0)
+             , const size_t arg_N5 = ~size_t(0)
+             , const size_t arg_N6 = ~size_t(0)
+             , const size_t arg_N7 = ~size_t(0)
+             )
+    : DynRankView( Impl::ViewCtorProp< std::string
+                                     , Kokkos::Experimental::Impl::WithoutInitializing_t
+                                     >( arg_prop.label , Kokkos::Experimental::WithoutInitializing )
+                 , arg_N0 , arg_N1 , arg_N2 , arg_N3
+                 , arg_N4 , arg_N5 , arg_N6 , arg_N7
+                 )
+    {}
+
+  //----------------------------------------
+  // Memory span required to wrap these dimensions.
+  // Returns map_type::memory_span for a traits::array_layout built from the
+  // given extents.  Every extent defaults to 0 here.
+  static constexpr size_t required_allocation_size( const size_t arg_N0 = 0
+                                                  , const size_t arg_N1 = 0
+                                                  , const size_t arg_N2 = 0
+                                                  , const size_t arg_N3 = 0
+                                                  , const size_t arg_N4 = 0
+                                                  , const size_t arg_N5 = 0
+                                                  , const size_t arg_N6 = 0
+                                                  , const size_t arg_N7 = 0
+                                                  )
+    {
+      return map_type::memory_span( typename traits::array_layout( arg_N0 , arg_N1
+                                                                 , arg_N2 , arg_N3
+                                                                 , arg_N4 , arg_N5
+                                                                 , arg_N6 , arg_N7 ) );
+    }
+
+  // Wrap a raw pointer with a list of extents.  The pointer is wrapped in
+  // ViewCtorProp<pointer_type> and forwarded, with the extents (each
+  // defaulting to ~size_t(0)), to the (properties, extents...) constructor.
+  explicit KOKKOS_INLINE_FUNCTION
+  DynRankView( pointer_type arg_ptr
+             , const size_t arg_N0 = ~size_t(0)
+             , const size_t arg_N1 = ~size_t(0)
+             , const size_t arg_N2 = ~size_t(0)
+             , const size_t arg_N3 = ~size_t(0)
+             , const size_t arg_N4 = ~size_t(0)
+             , const size_t arg_N5 = ~size_t(0)
+             , const size_t arg_N6 = ~size_t(0)
+             , const size_t arg_N7 = ~size_t(0)
+             )
+    : DynRankView( Impl::ViewCtorProp<pointer_type>( arg_ptr )
+                 , arg_N0 , arg_N1 , arg_N2 , arg_N3
+                 , arg_N4 , arg_N5 , arg_N6 , arg_N7
+                 )
+    {}
+
+  // Wrap a raw pointer with a layout.  The pointer is wrapped in
+  // ViewCtorProp<pointer_type> and forwarded with the layout to the
+  // (properties, layout) constructor.
+  // The layout is only read, so it is taken by const reference; this also
+  // lets callers pass const or temporary layout objects, while existing
+  // calls with a non-const lvalue keep working.
+  explicit KOKKOS_INLINE_FUNCTION
+  DynRankView( pointer_type arg_ptr
+      , const typename traits::array_layout & arg_layout
+      )
+    : DynRankView( Impl::ViewCtorProp<pointer_type>(arg_ptr) , arg_layout )
+    {}
+
+
+  //----------------------------------------
+  // Shared scratch memory constructor
+
+  // Returns map_type::memory_span for a traits::array_layout built from the
+  // given extents.  An extent left at its default ~size_t(0) counts as
+  // "not passed".  Aborts (Kokkos::abort) when traits::specialize is void
+  // and the number of passed extents differs from traits::rank_dynamic.
+  // NOTE(review): the extents are handed to the layout as given, including
+  // any ~size_t(0) placeholders, whereas the scratch-memory constructors
+  // size their allocation through DynRankDimTraits::createLayout -- confirm
+  // the two agree for the intended use.
+  static inline
+  size_t shmem_size( const size_t arg_N0 = ~size_t(0) ,
+                     const size_t arg_N1 = ~size_t(0) ,
+                     const size_t arg_N2 = ~size_t(0) ,
+                     const size_t arg_N3 = ~size_t(0) ,
+                     const size_t arg_N4 = ~size_t(0) ,
+                     const size_t arg_N5 = ~size_t(0) ,
+                     const size_t arg_N6 = ~size_t(0) ,
+                     const size_t arg_N7 = ~size_t(0) )
+  {
+    // Each comparison yields 0 or 1, so the sum counts the supplied extents.
+    const size_t num_passed_args =
+      ( arg_N0 != ~size_t(0) ) + ( arg_N1 != ~size_t(0) ) + ( arg_N2 != ~size_t(0) ) +
+      ( arg_N3 != ~size_t(0) ) + ( arg_N4 != ~size_t(0) ) + ( arg_N5 != ~size_t(0) ) +
+      ( arg_N6 != ~size_t(0) ) + ( arg_N7 != ~size_t(0) );
+
+    if ( std::is_same<typename traits::specialize , void>::value && num_passed_args != traits::rank_dynamic ) {
+      Kokkos::abort( "Kokkos::DynRankView::shmem_size() rank_dynamic != number of arguments.\n" );
+    }
+
+    return map_type::memory_span(
+           typename traits::array_layout
+            ( arg_N0 , arg_N1 , arg_N2 , arg_N3
+            , arg_N4 , arg_N5 , arg_N6 , arg_N7 ) );
+  }
+
+  // Construct in scratch memory from a layout.  The pointer comes from
+  // arg_space.get_shmem(), sized with map_type::memory_span of
+  // DynRankDimTraits::createLayout( arg_layout ); it is reinterpreted as
+  // pointer_type, wrapped in ViewCtorProp<pointer_type> and forwarded with
+  // the original arg_layout to the (properties, layout) constructor.
+  // NOTE(review): the original author questioned whether sizing with the
+  // createLayout result is correct ("is this correct?") -- still open.
+  explicit KOKKOS_INLINE_FUNCTION
+  DynRankView( const typename traits::execution_space::scratch_memory_space & arg_space
+             , const typename traits::array_layout & arg_layout
+             )
+    : DynRankView
+        ( Impl::ViewCtorProp<pointer_type>
+            ( reinterpret_cast<pointer_type>
+                ( arg_space.get_shmem
+                    ( map_type::memory_span
+                        ( Impl::DynRankDimTraits<typename traits::specialize>::createLayout( arg_layout ) )
+                    )
+                )
+            )
+        , arg_layout
+        )
+    {}
+
+  // Construct in scratch memory from a list of extents (each defaulting to
+  // ~size_t(0)).  The pointer comes from arg_space.get_shmem(), sized with
+  // map_type::memory_span of DynRankDimTraits::createLayout applied to a
+  // layout built from the extents; it is reinterpreted as pointer_type,
+  // wrapped in ViewCtorProp<pointer_type> and forwarded, together with a
+  // layout built from the same extents, to the (properties, layout)
+  // constructor.
+  explicit KOKKOS_INLINE_FUNCTION
+  DynRankView( const typename traits::execution_space::scratch_memory_space & arg_space
+             , const size_t arg_N0 = ~size_t(0)
+             , const size_t arg_N1 = ~size_t(0)
+             , const size_t arg_N2 = ~size_t(0)
+             , const size_t arg_N3 = ~size_t(0)
+             , const size_t arg_N4 = ~size_t(0)
+             , const size_t arg_N5 = ~size_t(0)
+             , const size_t arg_N6 = ~size_t(0)
+             , const size_t arg_N7 = ~size_t(0)
+             )
+    : DynRankView
+        ( Impl::ViewCtorProp<pointer_type>
+            ( reinterpret_cast<pointer_type>
+                ( arg_space.get_shmem
+                    ( map_type::memory_span
+                        ( Impl::DynRankDimTraits<typename traits::specialize>::createLayout
+                            ( typename traits::array_layout( arg_N0 , arg_N1 , arg_N2 , arg_N3
+                                                           , arg_N4 , arg_N5 , arg_N6 , arg_N7 ) )
+                        )
+                    )
+                )
+            )
+        , typename traits::array_layout( arg_N0 , arg_N1 , arg_N2 , arg_N3
+                                       , arg_N4 , arg_N5 , arg_N6 , arg_N7 )
+        )
+    {}
+
+};
+
+
+  // Free-function form of DynRankView::rank(); needed for the transition to
+  // a common constexpr method in View and DynRankView to return the rank.
+  template < typename D , class ... P >
+  KOKKOS_INLINE_FUNCTION
+  constexpr unsigned rank( const DynRankView<D , P...> & DRV )
+    {
+      return DRV.rank();
+    }
+
+//----------------------------------------------------------------------------
+// Subview mapping.
+// Deduce destination view type from source view traits and subview arguments
+
+namespace Impl {
+
+// Tag type used as the first template argument to select the DynRankView
+// subview specialization of ViewMapping below.
+struct DynRankSubviewTag {};
+
+// ViewMapping specialization that builds a DynRankView subview.
+// It is selected (through the enable_if yielding DynRankSubviewTag) when the
+// source traits are unspecialized (specialize is void) and the source layout
+// is LayoutLeft, LayoutRight or LayoutStride.  The result always uses
+// LayoutStride.
+template< class SrcTraits , class ... Args >
+struct ViewMapping
+  < typename std::enable_if<(
+      std::is_same< typename SrcTraits::specialize , void >::value
+      &&
+      (
+        std::is_same< typename SrcTraits::array_layout
+                    , Kokkos::LayoutLeft >::value ||
+        std::is_same< typename SrcTraits::array_layout
+                    , Kokkos::LayoutRight >::value ||
+        std::is_same< typename SrcTraits::array_layout
+                    , Kokkos::LayoutStride >::value
+      ) 
+    ), DynRankSubviewTag >::type
+  , SrcTraits
+  , Args ... >
+{
+private:
+
+  // Rk holds is_integral_extent<k,Args...>::value for subview argument k.
+  enum
+    { RZ = false
+    , R0 = bool(is_integral_extent<0,Args...>::value)
+    , R1 = bool(is_integral_extent<1,Args...>::value)
+    , R2 = bool(is_integral_extent<2,Args...>::value)
+    , R3 = bool(is_integral_extent<3,Args...>::value)
+    , R4 = bool(is_integral_extent<4,Args...>::value)
+    , R5 = bool(is_integral_extent<5,Args...>::value)
+    , R6 = bool(is_integral_extent<6,Args...>::value)
+    };
+
+  // Compile-time count of arguments flagged by R0..R6.
+  enum { rank = unsigned(R0) + unsigned(R1) + unsigned(R2) + unsigned(R3)
+              + unsigned(R4) + unsigned(R5) + unsigned(R6) };
+
+  typedef Kokkos::LayoutStride array_layout ;
+
+  typedef typename SrcTraits::value_type  value_type ;
+
+  // Seven pointer levels on top of value_type.
+  typedef value_type******* data_type ; 
+
+public:
+
+  // Traits and View type built from data_type, LayoutStride and the source's
+  // device type and memory traits.
+  typedef Kokkos::Experimental::ViewTraits
+    < data_type
+    , array_layout 
+    , typename SrcTraits::device_type
+    , typename SrcTraits::memory_traits > traits_type ;
+
+  typedef Kokkos::Experimental::View
+    < data_type
+    , array_layout 
+    , typename SrcTraits::device_type
+    , typename SrcTraits::memory_traits > type ;
+
+
+  // Same two typedefs as above, but with caller-chosen memory traits.
+  template< class MemoryTraits >
+  struct apply {
+
+    static_assert( Kokkos::Impl::is_memory_traits< MemoryTraits >::value , "" );
+
+    typedef Kokkos::Experimental::ViewTraits
+      < data_type 
+      , array_layout
+      , typename SrcTraits::device_type
+      , MemoryTraits > traits_type ;
+
+    typedef Kokkos::Experimental::View
+      < data_type 
+      , array_layout
+      , typename SrcTraits::device_type
+      , MemoryTraits > type ;
+  }; 
+
+
+  typedef typename SrcTraits::dimension dimension ;
+
+  // Builds a SubviewExtents< 7 , rank > from the source dimensions and up to
+  // seven subview arguments.  Argument types not supplied default to int and
+  // the corresponding values are value-initialized.
+  template < class Arg0 = int, class Arg1 = int, class Arg2 = int, class Arg3 = int, class Arg4 = int, class Arg5 = int, class Arg6 = int >
+  struct ExtentGenerator {
+    KOKKOS_INLINE_FUNCTION
+    static SubviewExtents< 7 , rank > generator ( const dimension & dim , Arg0 arg0 = Arg0(), Arg1 arg1 = Arg1(), Arg2 arg2 = Arg2(), Arg3 arg3 = Arg3(), Arg4 arg4 = Arg4(), Arg5 arg5 = Arg5(), Arg6 arg6 = Arg6() )
+    {
+       return SubviewExtents< 7 , rank>( dim , arg0 , arg1 , arg2 , arg3 , arg4 , arg5 , arg6 );
+    }
+  };
+
+
+  // Type returned by subview(): a LayoutStride DynRankView over value_type
+  // with the source's device type and memory traits.
+  typedef DynRankView< value_type , array_layout , typename SrcTraits::device_type , typename SrcTraits::memory_traits >  ret_type;
+
+  // Creates the subview of src described by args.
+  //   src_rank : run-time rank of src; only R-flags at positions below
+  //              src_rank contribute to the rank of the result.
+  //   src      : source DynRankView.
+  //   args     : subview arguments, one per source dimension.
+  // Returns a ret_type that shares src's tracker.
+  template < typename T , class ... P >
+  KOKKOS_INLINE_FUNCTION
+  static ret_type subview( const unsigned src_rank , Kokkos::Experimental::DynRankView< T , P...> const & src 
+                    , Args ... args )
+    {
+
+       typedef ViewMapping< traits_type, void >  DstType ;
+
+       // ViewDimension with `rank` zero entries (seven when rank > 6).
+       typedef typename std::conditional< (rank==0) , ViewDimension<>
+                                                    , typename std::conditional< (rank==1) , ViewDimension<0>
+                                                    , typename std::conditional< (rank==2) , ViewDimension<0,0>
+                                                    , typename std::conditional< (rank==3) , ViewDimension<0,0,0>
+                                                    , typename std::conditional< (rank==4) , ViewDimension<0,0,0,0>
+                                                    , typename std::conditional< (rank==5) , ViewDimension<0,0,0,0,0>
+                                                    , typename std::conditional< (rank==6) , ViewDimension<0,0,0,0,0,0>
+                                                                                           , ViewDimension<0,0,0,0,0,0,0>
+                                                    >::type >::type >::type >::type >::type >::type >::type  DstDimType ;
+
+      typedef ViewOffset< DstDimType , Kokkos::LayoutStride > dst_offset_type ;
+      typedef typename DstType::handle_type  dst_handle_type ;
+
+      ret_type dst ;
+
+      const SubviewExtents< 7 , rank > extents = 
+        ExtentGenerator< Args ... >::generator( src.m_map.m_offset.m_dim , args... ) ; 
+
+      // Strided offset of the subview, computed from the source offset and
+      // the extents; its dimensions and strides are copied into dst below.
+      dst_offset_type tempdst( src.m_map.m_offset , extents ) ;
+
+      dst.m_track = src.m_track ;
+
+      dst.m_map.m_offset.m_dim.N0 = tempdst.m_dim.N0 ;
+      dst.m_map.m_offset.m_dim.N1 = tempdst.m_dim.N1 ;
+      dst.m_map.m_offset.m_dim.N2 = tempdst.m_dim.N2 ;
+      dst.m_map.m_offset.m_dim.N3 = tempdst.m_dim.N3 ;
+      dst.m_map.m_offset.m_dim.N4 = tempdst.m_dim.N4 ;
+      dst.m_map.m_offset.m_dim.N5 = tempdst.m_dim.N5 ;
+      dst.m_map.m_offset.m_dim.N6 = tempdst.m_dim.N6 ;
+
+      dst.m_map.m_offset.m_stride.S0 = tempdst.m_stride.S0 ;
+      dst.m_map.m_offset.m_stride.S1 = tempdst.m_stride.S1 ;
+      dst.m_map.m_offset.m_stride.S2 = tempdst.m_stride.S2 ;
+      dst.m_map.m_offset.m_stride.S3 = tempdst.m_stride.S3 ;
+      dst.m_map.m_offset.m_stride.S4 = tempdst.m_stride.S4 ;
+      dst.m_map.m_offset.m_stride.S5 = tempdst.m_stride.S5 ;
+      dst.m_map.m_offset.m_stride.S6 = tempdst.m_stride.S6 ;
+
+      // Handle of the subview: the source handle advanced by the source
+      // offset evaluated at the extents' domain offsets.
+      dst.m_map.m_handle = dst_handle_type( src.m_map.m_handle +
+                                      src.m_map.m_offset( extents.domain_offset(0)
+                                                  , extents.domain_offset(1)
+                                                  , extents.domain_offset(2)
+                                                  , extents.domain_offset(3)
+                                                  , extents.domain_offset(4)
+                                                  , extents.domain_offset(5)
+                                                  , extents.domain_offset(6)
+                                                  ) );
+
+      // Run-time rank: sum of the R-flags for positions below src_rank, so
+      // arguments beyond the source's rank are ignored.
+      dst.m_rank = ( src_rank > 0 ? unsigned(R0) : 0 )
+                 + ( src_rank > 1 ? unsigned(R1) : 0 )
+                 + ( src_rank > 2 ? unsigned(R2) : 0 )
+                 + ( src_rank > 3 ? unsigned(R3) : 0 )
+                 + ( src_rank > 4 ? unsigned(R4) : 0 )
+                 + ( src_rank > 5 ? unsigned(R5) : 0 )
+                 + ( src_rank > 6 ? unsigned(R6) : 0 ) ;
+
+      return dst ;
+    }
+};
+
+} // end Impl
+
+
+// Result type of a DynRankView subview for source traits V and subview
+// arguments Args: the ret_type of the DynRankSubviewTag ViewMapping.
+template< class V , class ... Args >
+using Subdynrankview =
+  typename Kokkos::Experimental::Impl::ViewMapping
+    < Kokkos::Experimental::Impl::DynRankSubviewTag
+    , V
+    , Args...
+    >::ret_type ;
+
+// Creates a subview of a DynRankView.
+// At least as many arguments as the source's run-time rank must be given;
+// otherwise the call aborts (Kokkos::abort).  Supplying more arguments than
+// the rank is allowed and the remaining ones are ignored.
+template< class D , class ... P , class ...Args >
+KOKKOS_INLINE_FUNCTION
+Subdynrankview< ViewTraits<D******* , P...> , Args... > 
+subdynrankview( const Kokkos::Experimental::DynRankView< D , P... > &src , Args...args)
+  {
+    typedef Kokkos::Experimental::Impl::ViewMapping
+      < Kokkos::Experimental::Impl::DynRankSubviewTag
+      , Kokkos::Experimental::ViewTraits< D*******, P... >
+      , Args...
+      > subview_mapping ;
+
+    if ( sizeof...(Args) < src.rank() ) {
+      Kokkos::abort("subdynrankview: num of args must be >= rank of the source DynRankView");
+    }
+
+    return subview_mapping::subview( src.rank() , src , args... );
+  }
+
+//Wrapper to allow subview function name
+// Overload of the name `subview` for DynRankView sources; forwards all
+// arguments to subdynrankview.
+template< class D , class ... P , class ...Args >
+KOKKOS_INLINE_FUNCTION
+Subdynrankview< ViewTraits<D******* , P...> , Args... > 
+subview( const Kokkos::Experimental::DynRankView< D , P... > &src , Args...args)
+  {
+    return subdynrankview( src , args... );
+  }
+
+} // namespace Experimental
+} // namespace Kokkos
+
+namespace Kokkos {
+namespace Experimental {
+
+// overload == and !=
+// Two DynRankViews compare equal when their const value type, array layout
+// and memory space are the same types and their rank, data pointer, span
+// and dimensions 0 through 7 all match.  Checks are made in that order and
+// stop at the first mismatch.
+template< class LT , class ... LP , class RT , class ... RP >
+KOKKOS_INLINE_FUNCTION
+bool operator == ( const DynRankView<LT,LP...> & lhs ,
+                   const DynRankView<RT,RP...> & rhs )
+{
+  typedef ViewTraits<LT,LP...>  left_traits ;
+  typedef ViewTraits<RT,RP...>  right_traits ;
+
+  // Type-level requirements.
+  const bool same_types =
+    std::is_same< typename left_traits::const_value_type ,
+                  typename right_traits::const_value_type >::value &&
+    std::is_same< typename left_traits::array_layout ,
+                  typename right_traits::array_layout >::value &&
+    std::is_same< typename left_traits::memory_space ,
+                  typename right_traits::memory_space >::value ;
+
+  if ( ! same_types )               { return false ; }
+  if ( lhs.rank() != rhs.rank() )   { return false ; }
+  if ( lhs.data() != rhs.data() )   { return false ; }
+  if ( lhs.span() != rhs.span() )   { return false ; }
+
+  // Dimensions 0..7, in increasing order.
+  for ( int r = 0 ; r < 8 ; ++r ) {
+    if ( lhs.dimension(r) != rhs.dimension(r) ) { return false ; }
+  }
+
+  return true ;
+}
+// Inequality of two DynRankViews, defined as the negation of operator== applied to the same operands.
+template< class LT , class ... LP , class RT , class ... RP >
+KOKKOS_INLINE_FUNCTION
+bool operator != ( const DynRankView<LT,LP...> & lhs , const DynRankView<RT,RP...> & rhs )
+{
+  const bool views_are_equal = operator==(lhs,rhs);
+  return ! views_are_equal ;
+}
+
+} //end Experimental
+} //end Kokkos
+
+//----------------------------------------------------------------------------
+//----------------------------------------------------------------------------
+namespace Kokkos {
+namespace Experimental {
+namespace Impl {
+
+template< class OutputView , typename Enable = void >
+struct DynRankViewFill {
+  // Functor that assigns one value to every entry of 'output'; constructing it executes the fill and then fences.
+  using const_value_type = typename OutputView::traits::const_value_type ;
+
+  const OutputView output ;
+  const_value_type input ;
+  // Work item for leading index i0: sweep the six remaining indices and store 'input' at each entry.
+  KOKKOS_INLINE_FUNCTION
+  void operator()( const size_t i0 ) const
+  {
+    const size_t ext1 = output.dimension_1();
+    const size_t ext2 = output.dimension_2();
+    const size_t ext3 = output.dimension_3();
+    const size_t ext4 = output.dimension_4();
+    const size_t ext5 = output.dimension_5();
+    const size_t ext6 = output.dimension_6();
+
+    for ( size_t j1 = 0 ; j1 < ext1 ; ++j1 ) {
+    for ( size_t j2 = 0 ; j2 < ext2 ; ++j2 ) {
+    for ( size_t j3 = 0 ; j3 < ext3 ; ++j3 ) {
+    for ( size_t j4 = 0 ; j4 < ext4 ; ++j4 ) {
+    for ( size_t j5 = 0 ; j5 < ext5 ; ++j5 ) {
+    for ( size_t j6 = 0 ; j6 < ext6 ; ++j6 ) {
+      output(i0,j1,j2,j3,j4,j5,j6) = input ;
+    }}}}}}
+  }
+  // Copy the view handle and the value, then run one work item per index in [0, output.dimension_0()).
+  DynRankViewFill( const OutputView & arg_out , const_value_type & arg_in )
+    : output( arg_out ), input( arg_in )
+    {
+      using exec_space   = typename OutputView::execution_space ;
+      using range_policy = Kokkos::RangePolicy< exec_space > ;
+
+      const Kokkos::Impl::ParallelFor< DynRankViewFill , range_policy > fill_kernel( *this , range_policy( 0 , output.dimension_0() ) );
+      fill_kernel.execute();
+
+      // Static fence on the execution space before the constructor returns.
+      exec_space::fence();
+    }
+};
+// Specialization selected when OutputView::Rank == 0: the constructor hands dst.data(), the address of the host value and sizeof(const_value_type) to Kokkos::Impl::DeepCopy< memory_space , HostSpace >.
+template< class OutputView >
+struct DynRankViewFill< OutputView , typename std::enable_if< OutputView::Rank == 0 >::type > { 
+  DynRankViewFill( const OutputView & dst , const typename OutputView::const_value_type & src )
+    {
+      Kokkos::Impl::DeepCopy< typename OutputView::memory_space , Kokkos::HostSpace >
+        ( dst.data() , & src , sizeof(typename OutputView::const_value_type) );
+    }
+};
+
+template< class OutputView , class InputView , class ExecSpace = typename OutputView::execution_space >
+struct DynRankViewRemap {
+  // Functor copying input(i0..i6) into output(i0..i6) over the index ranges common to both views; the constructor executes it.
+  const OutputView output ;
+  const InputView  input ;
+  const size_t n0 ;
+  const size_t n1 ;
+  const size_t n2 ;
+  const size_t n3 ;
+  const size_t n4 ;
+  const size_t n5 ;
+  const size_t n6 ;
+  const size_t n7 ;
+  // Each nK is the smaller of the two views' dimension_K(); the copy is then executed over [0,n0) with a RangePolicy on ExecSpace.
+  DynRankViewRemap( const OutputView & arg_out , const InputView & arg_in )
+    : output( arg_out ), input( arg_in )
+    , n0( std::min( size_t( arg_out.dimension_0() ) , size_t( arg_in.dimension_0() ) ) )
+    , n1( std::min( size_t( arg_out.dimension_1() ) , size_t( arg_in.dimension_1() ) ) )
+    , n2( std::min( size_t( arg_out.dimension_2() ) , size_t( arg_in.dimension_2() ) ) )
+    , n3( std::min( size_t( arg_out.dimension_3() ) , size_t( arg_in.dimension_3() ) ) )
+    , n4( std::min( size_t( arg_out.dimension_4() ) , size_t( arg_in.dimension_4() ) ) )
+    , n5( std::min( size_t( arg_out.dimension_5() ) , size_t( arg_in.dimension_5() ) ) )
+    , n6( std::min( size_t( arg_out.dimension_6() ) , size_t( arg_in.dimension_6() ) ) )
+    , n7( std::min( size_t( arg_out.dimension_7() ) , size_t( arg_in.dimension_7() ) ) )
+    {
+      using range_policy = Kokkos::RangePolicy< ExecSpace > ;
+      const Kokkos::Impl::ParallelFor< DynRankViewRemap , range_policy > remap_kernel( *this , range_policy( 0 , n0 ) );
+      remap_kernel.execute();
+    }
+  // Work item for leading index i0: copy every entry whose trailing indices lie below n1..n6 (n7 is stored but not used here).
+  KOKKOS_INLINE_FUNCTION
+  void operator()( const size_t i0 ) const
+  {
+    for ( size_t j1 = 0 ; j1 < n1 ; ++j1 ) {
+    for ( size_t j2 = 0 ; j2 < n2 ; ++j2 ) {
+    for ( size_t j3 = 0 ; j3 < n3 ; ++j3 ) {
+    for ( size_t j4 = 0 ; j4 < n4 ; ++j4 ) {
+    for ( size_t j5 = 0 ; j5 < n5 ; ++j5 ) {
+    for ( size_t j6 = 0 ; j6 < n6 ; ++j6 ) {
+      output(i0,j1,j2,j3,j4,j5,j6) = input(i0,j1,j2,j3,j4,j5,j6);
+    }}}}}}
+  }
+};
+
+} /* namespace Impl */
+} /* namespace Experimental */
+} /* namespace Kokkos */
+
+
+namespace Kokkos {
+namespace Experimental {
+
+/** \brief  Deep copy a value from Host memory into a view: dst and value are handed to Impl::DynRankViewFill.  */
+template< class DT , class ... DP >
+inline
+void deep_copy
+  ( const DynRankView<DT,DP...> & dst
+  , typename ViewTraits<DT,DP...>::const_value_type & value
+  , typename std::enable_if<
+    std::is_same< typename ViewTraits<DT,DP...>::specialize , void >::value
+    >::type * = 0 )
+{
+  static_assert(
+    std::is_same< typename ViewTraits<DT,DP...>::non_const_value_type ,
+                  typename ViewTraits<DT,DP...>::value_type >::value
+    , "deep_copy requires non-const type" );
+  // Constructing the fill functor performs the assignment; the temporary is discarded immediately.
+  Kokkos::Experimental::Impl::DynRankViewFill< DynRankView<DT,DP...> >( dst , value );
+}
+
+/** \brief  Deep copy into a value in Host memory from a rank-0 view; aborts with a message when src.rank() != 0.  */
+template< class ST , class ... SP >
+inline
+void deep_copy
+  ( typename ViewTraits<ST,SP...>::non_const_value_type & dst
+  , const DynRankView<ST,SP...> & src
+  , typename std::enable_if<
+    std::is_same< typename ViewTraits<ST,SP...>::specialize , void >::value
+    >::type * = 0 )
+{
+  if ( src.rank() != 0 )
+  {
+    Kokkos::abort("deep_copy: source DynRankView must have rank 0 to be copied into a single value");
+  }
+  // DeepCopy< HostSpace , src_memory_space > receives the address of dst, src.data() and sizeof(ST).
+  typedef ViewTraits<ST,SP...>               src_traits ;
+  typedef typename src_traits::memory_space  src_memory_space ;
+  Kokkos::Impl::DeepCopy< HostSpace , src_memory_space >( & dst , src.data() , sizeof(ST) );
+}
+
+//----------------------------------------------------------------------------
+/** \brief  A deep copy between views of the default specialization where at least one side is a DynRankView.
+ *          Uses a byte-wise DeepCopy when type, layout, contiguity and extents allow it, otherwise an element-wise remap.
+ */
+template< class DstType , class SrcType >
+inline
+void deep_copy
+  ( const DstType & dst
+  , const SrcType & src
+  , typename std::enable_if<(
+    std::is_same< typename DstType::traits::specialize , void >::value &&
+    std::is_same< typename SrcType::traits::specialize , void >::value
+    &&
+    ( Kokkos::Experimental::is_dyn_rank_view<DstType>::value || Kokkos::Experimental::is_dyn_rank_view<SrcType>::value)
+  )>::type * = 0 )
+{
+  static_assert(
+    std::is_same< typename DstType::traits::value_type ,
+                  typename DstType::traits::non_const_value_type >::value
+    , "deep_copy requires non-const destination type" );
+
+  typedef DstType  dst_type ;
+  typedef SrcType  src_type ;
+
+  typedef typename dst_type::execution_space  dst_execution_space ;
+  typedef typename src_type::execution_space  src_execution_space ;
+  typedef typename dst_type::memory_space     dst_memory_space ;
+  typedef typename src_type::memory_space     src_memory_space ;
+  // Whether each side's execution space can access the other side's memory space; these select the element-wise fallbacks below.
+  enum { DstExecCanAccessSrc =
+   Kokkos::Impl::VerifyExecutionCanAccessMemorySpace< typename dst_execution_space::memory_space , src_memory_space >::value };
+
+  enum { SrcExecCanAccessDst =
+   Kokkos::Impl::VerifyExecutionCanAccessMemorySpace< typename src_execution_space::memory_space , dst_memory_space >::value };
+  // Nothing is copied when both views report the same data pointer.
+  if ( (void *) dst.data() != (void*) src.data() ) {
+
+    // Concern: If overlapping views then a parallel copy will be erroneous.
+    // ...
+    // Rank-0 to rank-0: copy exactly one value_type. Otherwise:
+    // If same type, equal layout, equal dimensions, equal span, and contiguous memory then can byte-wise copy
+    if ( rank(src) == 0 && rank(dst) == 0 )
+    { 
+      typedef typename dst_type::value_type    value_type ;
+      Kokkos::Impl::DeepCopy< dst_memory_space , src_memory_space >( dst.data() , src.data() , sizeof(value_type) ); 
+    }
+    else if ( std::is_same< typename DstType::traits::value_type ,
+                       typename SrcType::traits::non_const_value_type >::value &&
+         (
+           ( std::is_same< typename DstType::traits::array_layout ,
+                           typename SrcType::traits::array_layout >::value
+             &&
+             ( std::is_same< typename DstType::traits::array_layout ,
+                             typename Kokkos::LayoutLeft>::value
+             ||
+               std::is_same< typename DstType::traits::array_layout ,
+                             typename Kokkos::LayoutRight>::value
+             )
+           )
+           ||
+           (
+             rank(dst) == 1
+             &&
+             rank(src) == 1
+           )
+         ) &&
+         dst.span_is_contiguous() &&
+         src.span_is_contiguous() &&
+         dst.span() == src.span() &&
+         dst.dimension_0() == src.dimension_0() &&
+         dst.dimension_1() == src.dimension_1() &&
+         dst.dimension_2() == src.dimension_2() &&
+         dst.dimension_3() == src.dimension_3() &&
+         dst.dimension_4() == src.dimension_4() &&
+         dst.dimension_5() == src.dimension_5() &&
+         dst.dimension_6() == src.dimension_6() &&
+         dst.dimension_7() == src.dimension_7() ) {
+
+      const size_t nbytes = sizeof(typename dst_type::value_type) * dst.span();
+
+      Kokkos::Impl::DeepCopy< dst_memory_space , src_memory_space >( dst.data() , src.data() , nbytes );
+    }
+    else if ( std::is_same< typename DstType::traits::value_type ,
+                            typename SrcType::traits::non_const_value_type >::value &&
+         (
+           ( std::is_same< typename DstType::traits::array_layout ,
+                           typename SrcType::traits::array_layout >::value
+             &&
+             std::is_same< typename DstType::traits::array_layout ,
+                          typename Kokkos::LayoutStride>::value
+           )
+           ||
+           (
+             rank(dst) == 1
+             &&
+             rank(src) == 1
+           )
+         ) &&
+         dst.span_is_contiguous() &&
+         src.span_is_contiguous() &&
+         dst.span() == src.span() &&
+         dst.dimension_0() == src.dimension_0() &&
+         dst.dimension_1() == src.dimension_1() &&
+         dst.dimension_2() == src.dimension_2() &&
+         dst.dimension_3() == src.dimension_3() &&
+         dst.dimension_4() == src.dimension_4() &&
+         dst.dimension_5() == src.dimension_5() &&
+         dst.dimension_6() == src.dimension_6() &&
+         dst.dimension_7() == src.dimension_7() &&
+         dst.stride_0() == src.stride_0() &&
+         dst.stride_1() == src.stride_1() &&
+         dst.stride_2() == src.stride_2() &&
+         dst.stride_3() == src.stride_3() &&
+         dst.stride_4() == src.stride_4() &&
+         dst.stride_5() == src.stride_5() &&
+         dst.stride_6() == src.stride_6() &&
+         dst.stride_7() == src.stride_7()
+         ) {
+
+      const size_t nbytes = sizeof(typename dst_type::value_type) * dst.span();
+
+      Kokkos::Impl::DeepCopy< dst_memory_space , src_memory_space >( dst.data() , src.data() , nbytes );
+    }
+    else if ( DstExecCanAccessSrc ) {
+      // Element-wise remap for non-contiguous or differently shaped views, using DynRankViewRemap's default execution space (the destination's).
+      Kokkos::Experimental::Impl::DynRankViewRemap< dst_type , src_type >( dst , src );
+    }
+    else if ( SrcExecCanAccessDst ) {
+      // Same element-wise remap, but instantiated with the source view's execution space.
+      Kokkos::Experimental::Impl::DynRankViewRemap< dst_type , src_type , src_execution_space >( dst , src );
+    }
+    else {
+      Kokkos::Impl::throw_runtime_exception("deep_copy given views that would require a temporary allocation");
+    }
+  }
+}
+
+} //end Experimental
+} //end Kokkos
+
+
+//----------------------------------------------------------------------------
+//----------------------------------------------------------------------------
+
+namespace Kokkos {
+namespace Experimental {
+
+namespace Impl {
+
+
+// Deduce the mirror view type of a DynRankView<T,P...> in Space, reusing the source type when the memory spaces already match.
+template<class Space, class T, class ... P>
+struct MirrorDRViewType {
+  // The incoming view_type
+  typedef typename Kokkos::Experimental::DynRankView<T,P...> src_view_type;
+  // The memory space for the mirror view
+  typedef typename Space::memory_space memory_space;
+  // Check whether it is the same memory space
+  enum { is_same_memspace = std::is_same<memory_space,typename src_view_type::memory_space>::value };
+  // The array_layout
+  typedef typename src_view_type::array_layout array_layout;
+  // The data type (we probably want it non-const since otherwise we can't even deep_copy to it).
+  typedef typename src_view_type::non_const_data_type data_type;
+  // The destination view type if it is not the same memory space
+  typedef Kokkos::Experimental::DynRankView<data_type,array_layout,Space> dest_view_type;
+  // If it is the same memory_space return the existing view_type
+  // This will also keep the unmanaged trait if necessary
+  typedef typename std::conditional<is_same_memspace,src_view_type,dest_view_type>::type view_type;
+};
+// Deduce the type of a newly allocated mirror of a DynRankView<T,P...> in Space; unlike MirrorDRViewType, view_type never aliases the source type.
+template<class Space, class T, class ... P>
+struct MirrorDRVType {
+  // The incoming view_type
+  typedef typename Kokkos::Experimental::DynRankView<T,P...> src_view_type;
+  // The memory space for the mirror view
+  typedef typename Space::memory_space memory_space;
+  // Check whether it is the same memory space
+  enum { is_same_memspace = std::is_same<memory_space,typename src_view_type::memory_space>::value };
+  // The array_layout
+  typedef typename src_view_type::array_layout array_layout;
+  // The data type (we probably want it non-const since otherwise we can't even deep_copy to it).
+  typedef typename src_view_type::non_const_data_type data_type;
+  // The mirror view type: always a DynRankView in Space, whether or not the memory spaces match
+  typedef Kokkos::Experimental::DynRankView<data_type,array_layout,Space> view_type;
+};
+
+}
+
+// Create a new HostMirror view for a source whose array layout is not LayoutStride: labeled src.label() + "_mirror", layout rebuilt from src.layout() and src.rank(); no data is copied here.
+template< class T , class ... P >
+inline
+typename DynRankView<T,P...>::HostMirror
+create_mirror( const DynRankView<T,P...> & src
+             , typename std::enable_if<
+                 ! std::is_same< typename Kokkos::Experimental::ViewTraits<T,P...>::array_layout
+                               , Kokkos::LayoutStride >::value
+               >::type * = 0
+             )
+{
+  typedef DynRankView<T,P...>                   src_type ;
+  typedef typename src_type::HostMirror  dst_type ;
+
+  return dst_type( std::string( src.label() ).append("_mirror")
+                 , Impl::reconstructLayout(src.layout(), src.rank()) );
+}
+
+// Create a new HostMirror view for a LayoutStride source; the body is the same as the non-LayoutStride overload (label + "_mirror", layout rebuilt from src.layout() and src.rank()).
+template< class T , class ... P >
+inline
+typename DynRankView<T,P...>::HostMirror
+create_mirror( const DynRankView<T,P...> & src
+             , typename std::enable_if<
+                 std::is_same< typename Kokkos::Experimental::ViewTraits<T,P...>::array_layout
+                             , Kokkos::LayoutStride >::value
+               >::type * = 0
+             )
+{
+  typedef DynRankView<T,P...>                   src_type ;
+  typedef typename src_type::HostMirror  dst_type ;
+
+  return dst_type( std::string( src.label() ).append("_mirror") 
+                 , Impl::reconstructLayout(src.layout(), src.rank()) );
+}
+
+
+// Create a mirror in the given space: always constructs a new MirrorDRVType view with src's label (no "_mirror" suffix) and a layout rebuilt from src.layout() and src.rank().
+template<class Space, class T, class ... P>
+typename Impl::MirrorDRVType<Space,T,P ...>::view_type create_mirror(const Space& , const Kokkos::Experimental::DynRankView<T,P...> & src) {
+  return typename Impl::MirrorDRVType<Space,T,P ...>::view_type(src.label(), Impl::reconstructLayout(src.layout(), src.rank()) );
+}
+// Host mirror view when src's memory space and data type already equal those of its HostMirror: src itself is returned.
+template< class T , class ... P >
+inline
+typename DynRankView<T,P...>::HostMirror
+create_mirror_view( const DynRankView<T,P...> & src
+                  , typename std::enable_if<(
+                      std::is_same< typename DynRankView<T,P...>::memory_space
+                                  , typename DynRankView<T,P...>::HostMirror::memory_space
+                                  >::value
+                      &&
+                      std::is_same< typename DynRankView<T,P...>::data_type
+                                  , typename DynRankView<T,P...>::HostMirror::data_type
+                                  >::value
+                    )>::type * = 0
+                  )
+{
+  return src ;
+}
+// Host mirror view when src's memory space or data type differs from its HostMirror's: a new view is obtained from create_mirror(src).
+template< class T , class ... P >
+inline
+typename DynRankView<T,P...>::HostMirror
+create_mirror_view( const DynRankView<T,P...> & src
+                  , typename std::enable_if< ! (
+                      std::is_same< typename DynRankView<T,P...>::memory_space
+                                  , typename DynRankView<T,P...>::HostMirror::memory_space
+                                  >::value
+                      &&
+                      std::is_same< typename DynRankView<T,P...>::data_type
+                                  , typename DynRankView<T,P...>::HostMirror::data_type
+                                  >::value
+                    )>::type * = 0
+                  )
+{
+  return Kokkos::Experimental::create_mirror( src ); 
+}
+
+// Create a mirror view in a given space, overload enabled when Space's memory space equals src's: returns src itself.
+template<class Space, class T, class ... P>
+typename Impl::MirrorDRViewType<Space,T,P ...>::view_type
+create_mirror_view(const Space& , const Kokkos::Experimental::DynRankView<T,P...> & src
+  , typename std::enable_if<Impl::MirrorDRViewType<Space,T,P ...>::is_same_memspace>::type* = 0 ) {
+  return src;
+}
+
+// Create a mirror view in a given space, overload enabled when the memory spaces differ: constructs a new view with src's label and rebuilt layout; no data is copied here.
+template<class Space, class T, class ... P>
+typename Impl::MirrorDRViewType<Space,T,P ...>::view_type
+create_mirror_view(const Space& , const Kokkos::Experimental::DynRankView<T,P...> & src
+  , typename std::enable_if<!Impl::MirrorDRViewType<Space,T,P ...>::is_same_memspace>::type* = 0 ) {
+  return typename Impl::MirrorDRViewType<Space,T,P ...>::view_type(src.label(), Impl::reconstructLayout(src.layout(), src.rank()) );
+}
+
+} //end Experimental
+} //end Kokkos
+
+
+//----------------------------------------------------------------------------
+//----------------------------------------------------------------------------
+
+namespace Kokkos {
+namespace Experimental {
+/** \brief  Resize a managed view: build a view with the new dimensions, remap the old entries into it, then rebind v to it. */
+template< class T , class ... P >
+inline
+void resize( DynRankView<T,P...> & v ,
+             const size_t n0 = ~size_t(0) ,
+             const size_t n1 = ~size_t(0) ,
+             const size_t n2 = ~size_t(0) ,
+             const size_t n3 = ~size_t(0) ,
+             const size_t n4 = ~size_t(0) ,
+             const size_t n5 = ~size_t(0) ,
+             const size_t n6 = ~size_t(0) ,
+             const size_t n7 = ~size_t(0) )
+{
+  static_assert( Kokkos::Experimental::ViewTraits<T,P...>::is_managed , "Can only resize managed views" );
+
+  typedef DynRankView<T,P...>  view_type ;
+  typedef Kokkos::Experimental::Impl::DynRankViewRemap< view_type , view_type >  remap_type ;
+
+  // n7 is accepted but not forwarded: the new view is constructed from n0..n6 only.
+  view_type resized_view( v.label(), n0, n1, n2, n3, n4, n5, n6 );
+  remap_type( resized_view, v );
+  v = resized_view ;
+}
+
+/** \brief  Reallocate a managed view with new dimensions, keeping its label; old data is NOT copied into the new allocation. */
+template< class T , class ... P >
+inline
+void realloc( DynRankView<T,P...> & v ,
+              const size_t n0 = ~size_t(0) ,
+              const size_t n1 = ~size_t(0) ,
+              const size_t n2 = ~size_t(0) ,
+              const size_t n3 = ~size_t(0) ,
+              const size_t n4 = ~size_t(0) ,
+              const size_t n5 = ~size_t(0) ,
+              const size_t n6 = ~size_t(0) ,
+              const size_t n7 = ~size_t(0) )
+{
+  typedef DynRankView<T,P...>  drview_type ;
+
+  static_assert( Kokkos::Experimental::ViewTraits<T,P...>::is_managed , "Can only realloc managed views" );
+
+  const std::string label = v.label();
+  // The label is saved before v is reset; n7 is accepted but not passed to the new view's constructor.
+  v = drview_type(); // Deallocate first, if the only view to allocation
+  v = drview_type( label, n0, n1, n2, n3, n4, n5, n6 );
+}
+
+} //end Experimental
+
+} //end Kokkos
+// Using-declaration at global scope: makes is_dyn_rank_view nameable without namespace qualification.
+using Kokkos::Experimental::is_dyn_rank_view ;
+
+namespace Kokkos {
+// Expose the Experimental DynRankView template and its free functions directly in namespace Kokkos.
+template< typename D , class ... P >
+using DynRankView = Kokkos::Experimental::DynRankView< D , P... > ;
+
+using Kokkos::Experimental::deep_copy ;
+using Kokkos::Experimental::create_mirror ;
+using Kokkos::Experimental::create_mirror_view ;
+using Kokkos::Experimental::subdynrankview ;
+using Kokkos::Experimental::subview ;
+using Kokkos::Experimental::resize ;
+using Kokkos::Experimental::realloc ;
+
+} //end Kokkos
+#endif
diff --git a/lib/kokkos/containers/src/Kokkos_DynamicView.hpp b/lib/kokkos/containers/src/Kokkos_DynamicView.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..fb364f0bf252e2ccae8aa04544487bc8f3f1a74f
--- /dev/null
+++ b/lib/kokkos/containers/src/Kokkos_DynamicView.hpp
@@ -0,0 +1,494 @@
+/*
+//@HEADER
+// ************************************************************************
+// 
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+// 
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+// 
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact  H. Carter Edwards (hcedwar@sandia.gov)
+// 
+// ************************************************************************
+//@HEADER
+*/
+
+#ifndef KOKKOS_DYNAMIC_VIEW_HPP
+#define KOKKOS_DYNAMIC_VIEW_HPP
+
+#include <cstdio>
+
+#include <Kokkos_Core.hpp>
+#include <impl/Kokkos_Error.hpp>
+
+namespace Kokkos {
+namespace Experimental {
+
+/** \brief Dynamic views are restricted to rank-one and no layout.
+ *         Subviews are not allowed.
+ */
+template< typename DataType , typename ... P >
+class DynamicView : public Kokkos::Experimental::ViewTraits< DataType , P ... >
+{ 
+public:
+
+  typedef ViewTraits< DataType , P ... >  traits ;
+
+private:
+
+  template< class , class ... > friend class DynamicView ;
+
+  typedef Kokkos::Experimental::Impl::SharedAllocationTracker   track_type ;
+
+  static_assert( traits::rank == 1 && traits::rank_dynamic == 1 
+               , "DynamicView must be rank-one" );
+
+  static_assert( std::is_trivial< typename traits::value_type >::value &&
+                 std::is_same< typename traits::specialize , void >::value
+               , "DynamicView must have trivial data type" );
+
+public:
+
+  typedef Kokkos::Experimental::MemoryPool< typename traits::device_type > memory_pool ;
+
+private:
+
+  memory_pool                    m_pool ;
+  track_type                     m_track ;
+  typename traits::value_type ** m_chunks ;
+  unsigned                       m_chunk_shift ;
+  unsigned                       m_chunk_mask ;
+  unsigned                       m_chunk_max ;
+
+public:
+
+  //----------------------------------------------------------------------
+
+  /** \brief  Compatible view of array of scalar types */
+  typedef DynamicView< typename traits::data_type ,
+                       typename traits::device_type >
+    array_type ;
+
+  /** \brief  Compatible view of const data type */
+  typedef DynamicView< typename traits::const_data_type ,
+                       typename traits::device_type >
+    const_type ;
+
+  /** \brief  Compatible view of non-const data type */
+  typedef DynamicView< typename traits::non_const_data_type ,
+                       typename traits::device_type >
+    non_const_type ;
+
+  /** \brief  Must be accessible everywhere */
+  typedef DynamicView  HostMirror ;
+
+  //----------------------------------------------------------------------
+
+  enum { Rank = 1 };
+  // Current size: the counter stored at m_chunks[ m_chunk_max ] shifted left by m_chunk_shift, or 0 when the active execution space cannot access the view's memory space.
+  KOKKOS_INLINE_FUNCTION constexpr size_t size() const
+    {
+      return
+        Kokkos::Impl::VerifyExecutionCanAccessMemorySpace
+          < Kokkos::Impl::ActiveExecutionMemorySpace
+          , typename traits::memory_space
+          >::value 
+        ? // Runtime size is at the end of the chunk pointer array
+          (*reinterpret_cast<const uintptr_t*>( m_chunks + m_chunk_max ))
+          << m_chunk_shift
+        : 0 ;
+    }
+  // extent(r): size() for r == 0, otherwise 1.
+  template< typename iType >
+  KOKKOS_INLINE_FUNCTION constexpr
+  size_t extent( const iType & r ) const
+    { return r == 0 ? size() : 1 ; }
+  // extent_int(r): same value as extent(r); note that it is declared to return size_t.
+  template< typename iType >
+  KOKKOS_INLINE_FUNCTION constexpr
+  size_t extent_int( const iType & r ) const
+    { return r == 0 ? size() : 1 ; }
+  // Per-dimension sizes: dimension_0() is size(); dimension_1() .. dimension_7() are 1.
+  KOKKOS_INLINE_FUNCTION constexpr size_t dimension_0() const { return size(); }
+  KOKKOS_INLINE_FUNCTION constexpr size_t dimension_1() const { return 1 ; }
+  KOKKOS_INLINE_FUNCTION constexpr size_t dimension_2() const { return 1 ; }
+  KOKKOS_INLINE_FUNCTION constexpr size_t dimension_3() const { return 1 ; }
+  KOKKOS_INLINE_FUNCTION constexpr size_t dimension_4() const { return 1 ; }
+  KOKKOS_INLINE_FUNCTION constexpr size_t dimension_5() const { return 1 ; }
+  KOKKOS_INLINE_FUNCTION constexpr size_t dimension_6() const { return 1 ; }
+  KOKKOS_INLINE_FUNCTION constexpr size_t dimension_7() const { return 1 ; }
+  // All per-dimension stride queries return 0.
+  KOKKOS_INLINE_FUNCTION constexpr size_t stride_0() const { return 0 ; }
+  KOKKOS_INLINE_FUNCTION constexpr size_t stride_1() const { return 0 ; }
+  KOKKOS_INLINE_FUNCTION constexpr size_t stride_2() const { return 0 ; }
+  KOKKOS_INLINE_FUNCTION constexpr size_t stride_3() const { return 0 ; }
+  KOKKOS_INLINE_FUNCTION constexpr size_t stride_4() const { return 0 ; }
+  KOKKOS_INLINE_FUNCTION constexpr size_t stride_5() const { return 0 ; }
+  KOKKOS_INLINE_FUNCTION constexpr size_t stride_6() const { return 0 ; }
+  KOKKOS_INLINE_FUNCTION constexpr size_t stride_7() const { return 0 ; }
+  // stride(s): writes a single 0 to *s.
+  template< typename iType >
+  KOKKOS_INLINE_FUNCTION void stride( iType * const s ) const { *s = 0 ; }
+
+  //----------------------------------------------------------------------
+  // Range span is the span which contains all members.
+  // Element access yields a plain reference to value_type; pointer_type is the matching raw pointer.
+  typedef typename traits::value_type &  reference_type ;
+  typedef typename traits::value_type *  pointer_type ;
+
+  enum { reference_type_is_lvalue_reference = std::is_lvalue_reference< reference_type >::value };
+  // span_is_contiguous() always returns false; span() and data() always return 0.
+  KOKKOS_INLINE_FUNCTION constexpr bool   span_is_contiguous() const { return false ; }
+  KOKKOS_INLINE_FUNCTION constexpr size_t span() const { return 0 ; }
+  KOKKOS_INLINE_FUNCTION constexpr pointer_type data() const { return 0 ; }
+
+  //----------------------------------------
+  // Element access: i0 selects chunk (i0 >> m_chunk_shift) and offset (i0 & m_chunk_mask); extra index arguments are only type-checked and otherwise unused.
+  template< typename I0 , class ... Args >
+  KOKKOS_INLINE_FUNCTION
+  reference_type operator()( const I0 & i0 , const Args & ... args ) const
+    {
+      static_assert( Kokkos::Impl::are_integral<I0,Args...>::value
+                   , "Indices must be integral type" );
+
+      Kokkos::Impl::VerifyExecutionCanAccessMemorySpace
+        < Kokkos::Impl::ActiveExecutionMemorySpace
+        , typename traits::memory_space
+        >::verify();
+
+      // Which chunk is being indexed.
+      const uintptr_t ic = uintptr_t( i0 >> m_chunk_shift );
+
+      typename traits::value_type * volatile * const ch = m_chunks + ic ;
+
+      // Do bounds checking if enabled or if the chunk pointer is zero.
+      // If not bounds checking then we assume a non-zero pointer is valid.
+
+#if ! defined( KOKKOS_ENABLE_DEBUG_BOUNDS_CHECK )
+      if ( 0 == *ch )
+#endif
+      {
+        // Verify that allocation of the requested chunk is in progress.
+
+        // The allocated chunk counter is m_chunks[ m_chunk_max ]
+        const uintptr_t n = 
+          *reinterpret_cast<uintptr_t volatile *>( m_chunks + m_chunk_max );
+
+        if ( n <= ic ) {
+          Kokkos::abort("Kokkos::DynamicView array bounds error");
+        }
+
+        // Allocation of this chunk is in progress 
+        // so wait for allocation to complete.
+        while ( 0 == *ch );
+      }
+      // The chunk pointer is non-zero here; the offset within the chunk is i0 & m_chunk_mask.
+      return (*ch)[ i0 & m_chunk_mask ];
+    }
+
+  //----------------------------------------
+  /** \brief  Resizing in parallel only increases the array size, never decreases it.
+   *          Aborts when n needs more than m_chunk_max chunks; callers may race, each chunk is claimed by one of them.
+   */
+  KOKKOS_INLINE_FUNCTION
+  void resize_parallel( size_t n ) const
+    {
+      typedef typename traits::value_type value_type ;
+
+      Kokkos::Impl::VerifyExecutionCanAccessMemorySpace
+        < Kokkos::Impl::ActiveExecutionMemorySpace
+        , typename traits::memory_space >::verify();
+      // Number of chunks requested for n entries (presumably rounded up, assuming m_chunk_mask == (1 << m_chunk_shift) - 1).
+      const uintptr_t NC = ( n + m_chunk_mask ) >> m_chunk_shift ;
+      // The debug printf casts every argument to unsigned long so that it matches %lu (m_chunk_max is an unsigned).
+      if ( m_chunk_max < NC ) {
+#if defined( KOKKOS_ENABLE_DEBUG_BOUNDS_CHECK )
+        printf("DynamicView::resize_parallel(%lu) m_chunk_max(%lu) NC(%lu)\n"
+              , (unsigned long) n , (unsigned long) m_chunk_max , (unsigned long) NC );
+#endif
+        Kokkos::abort("DynamicView::resize_parallel exceeded maximum size");
+      }
+
+      typename traits::value_type * volatile * const ch = m_chunks ;
+
+      // The allocated chunk counter is m_chunks[ m_chunk_max ]
+      uintptr_t volatile * const pc =
+        reinterpret_cast<uintptr_t volatile*>( m_chunks + m_chunk_max );
+
+      // Potentially concurrent iteration of allocation to the required size.
+
+      for ( uintptr_t jc = *pc ; jc < NC ; ) {
+
+        // Claim the 'jc' chunk to-be-allocated index
+
+        const uintptr_t jc_try = jc ;
+
+        // Jump iteration to the chunk counter.
+        
+        jc = atomic_compare_exchange( pc , jc_try , jc_try + 1 );
+
+        if ( jc_try == jc ) {
+          // The exchange returned the expected counter value: this caller allocates chunk jc_try and stores its pointer.
+          ch[jc_try] = reinterpret_cast<value_type*>(
+            m_pool.allocate( sizeof(value_type) << m_chunk_shift ));
+
+          Kokkos::memory_fence();
+        }
+      }
+    }
+
+  /** \brief  Resizing in serial can grow or shrink the array size; aborts when n needs more than m_chunk_max chunks. */
+  inline
+  void resize_serial( size_t n )
+    {
+      Kokkos::Impl::VerifyExecutionCanAccessMemorySpace
+        < Kokkos::Impl::ActiveExecutionMemorySpace
+        , typename traits::memory_space >::verify();
+      typedef typename traits::value_type value_type ; // dependent type name: needs 'typename' (matches resize_parallel)
+      const uintptr_t NC = ( n + m_chunk_mask ) >> m_chunk_shift ;
+      // NC is the number of chunks requested for n entries.
+      if ( m_chunk_max < NC ) {
+        Kokkos::abort("DynamicView::resize_serial exceeded maximum size");
+      }
+      // The allocated chunk counter is stored at m_chunks[ m_chunk_max ].
+      uintptr_t * const pc =
+        reinterpret_cast<uintptr_t*>( m_chunks + m_chunk_max );
+      // Grow: allocate chunks until the counter reaches NC. Shrink: release chunks from the end and zero their pointers.
+      if ( *pc < NC ) {
+        while ( *pc < NC ) {
+          m_chunks[*pc] = reinterpret_cast<value_type*>(
+            m_pool.allocate( sizeof(value_type) << m_chunk_shift ));
+          ++*pc ;
+        }
+      }
+      else {
+        while ( NC + 1 <= *pc ) {
+          --*pc ;        
+          m_pool.deallocate( m_chunks[*pc]
+                           , sizeof(value_type) << m_chunk_shift );
+          m_chunks[*pc] = 0 ;
+        }
+      }
+    }
+
+  //----------------------------------------------------------------------
+
+  ~DynamicView() = default ;
+  DynamicView() = default ;
+  DynamicView( DynamicView && ) = default ;
+  DynamicView( const DynamicView & ) = default ;
+  DynamicView & operator = ( DynamicView && ) = default ;
+  DynamicView & operator = ( const DynamicView & ) = default ;
+
+  template< class RT , class ... RP >
+  KOKKOS_INLINE_FUNCTION
+  DynamicView( const DynamicView<RT,RP...> & rhs )
+    : m_pool( rhs.m_pool )
+    , m_track( rhs.m_track )
+    , m_chunks( rhs.m_chunks )
+    , m_chunk_shift( rhs.m_chunk_shift )
+    , m_chunk_mask( rhs.m_chunk_mask )
+    , m_chunk_max( rhs.m_chunk_max )
+    {
+    }
+
+  //----------------------------------------------------------------------
+
+  struct Destroy {
+    memory_pool                    m_pool ;
+    typename traits::value_type ** m_chunks ;
+    unsigned                       m_chunk_max ;
+    bool                           m_destroy ;
+
+    // Initialize or destroy array of chunk pointers.
+    // The one entry beyond the max chunks is the allocated-chunk counter.
+
+    KOKKOS_INLINE_FUNCTION
+    void operator()( unsigned i ) const
+      {
+        if ( m_destroy && i < m_chunk_max && 0 != m_chunks[i] ) {
+          m_pool.deallocate( m_chunks[i] , m_pool.get_min_block_size() );
+        }
+        m_chunks[i] = 0 ;
+      }
+
+    void execute( bool arg_destroy )
+      {
+        typedef Kokkos::RangePolicy< typename traits::execution_space > Range ;
+
+        m_destroy = arg_destroy ;
+
+        Kokkos::Impl::ParallelFor<Destroy,Range>
+          closure( *this , Range(0, m_chunk_max + 1) );
+
+        closure.execute();
+
+        traits::execution_space::fence();
+      }
+
+    void construct_shared_allocation()
+      { execute( false ); }
+
+    void destroy_shared_allocation()
+      { execute( true ); }
+
+    Destroy() = default ;
+    Destroy( Destroy && ) = default ;
+    Destroy( const Destroy & ) = default ;
+    Destroy & operator = ( Destroy && ) = default ;
+    Destroy & operator = ( const Destroy & ) = default ;
+
+    Destroy( const memory_pool & arg_pool
+           , typename traits::value_type ** arg_chunk
+           , const unsigned arg_chunk_max )
+     : m_pool( arg_pool )
+     , m_chunks( arg_chunk )
+     , m_chunk_max( arg_chunk_max )
+     , m_destroy( false )
+     {}
+  };
+
+
+  /**\brief  Allocation constructor 
+   *
+   *  Memory is allocated in chunks from the memory pool.
+   *  The chunk size conforms to the memory pool's chunk size.
+   *  A maximum size is required in order to allocate a
+   *  chunk-pointer array.
+   */
+  explicit inline
+  DynamicView( const std::string & arg_label
+             , const memory_pool & arg_pool
+             , const size_t        arg_size_max )
+    : m_pool( arg_pool )
+    , m_track()
+    , m_chunks(0)
+    // The memory pool chunk is guaranteed to be a power of two
+    , m_chunk_shift(
+        Kokkos::Impl::integral_power_of_two(
+          m_pool.get_min_block_size()/sizeof(typename traits::value_type)) )
+    , m_chunk_mask( ( 1 << m_chunk_shift ) - 1 )
+    , m_chunk_max( ( arg_size_max + m_chunk_mask ) >> m_chunk_shift )
+    {
+      Kokkos::Impl::VerifyExecutionCanAccessMemorySpace
+        < Kokkos::Impl::ActiveExecutionMemorySpace
+        , typename traits::memory_space >::verify();
+
+      // A functor to deallocate all of the chunks upon final destruction
+
+      typedef typename traits::memory_space  memory_space ;
+      typedef Kokkos::Experimental::Impl::SharedAllocationRecord< memory_space , Destroy > record_type ;
+
+      // Allocate chunk pointers and allocation counter
+      record_type * const record =
+        record_type::allocate( memory_space()
+                             , arg_label
+                             , ( sizeof(pointer_type) * ( m_chunk_max + 1 ) ) );
+
+      m_chunks = reinterpret_cast<pointer_type*>( record->data() );
+
+      record->m_destroy = Destroy( m_pool , m_chunks , m_chunk_max );
+
+      // Initialize to zero
+
+      record->m_destroy.construct_shared_allocation();
+
+      m_track.assign_allocated_record_to_uninitialized( record );
+    }
+};
+
+} // namespace Experimental
+} // namespace Kokkos
+
+namespace Kokkos {
+namespace Experimental {
+
+template< class T , class ... P >
+inline
+typename Kokkos::Experimental::DynamicView<T,P...>::HostMirror
+create_mirror_view( const Kokkos::Experimental::DynamicView<T,P...> & src )
+{
+  return src ;
+}
+
+template< class T , class ... DP , class ... SP >
+inline
+void deep_copy( const View<T,DP...> & dst
+              , const DynamicView<T,SP...> & src
+              )
+{
+  typedef View<T,DP...>        dst_type ;
+  typedef DynamicView<T,SP...> src_type ;
+
+  typedef typename ViewTraits<T,DP...>::execution_space  dst_execution_space ;
+  typedef typename ViewTraits<T,SP...>::memory_space     src_memory_space ;
+
+  enum { DstExecCanAccessSrc =
+   Kokkos::Impl::VerifyExecutionCanAccessMemorySpace< typename dst_execution_space::memory_space , src_memory_space >::value };
+
+  if ( DstExecCanAccessSrc ) {
+    // Copying data between views in accessible memory spaces and either non-contiguous or incompatible shape.
+    Kokkos::Experimental::Impl::ViewRemap< dst_type , src_type >( dst , src );
+  }
+  else {
+    Kokkos::Impl::throw_runtime_exception("deep_copy given views that would require a temporary allocation");
+  }
+}
+
+/** \brief  Copy a View into a DynamicView via ViewRemap; reports a runtime
+ *          error when the destination cannot access the source memory. */
+template< class T , class ... DP , class ... SP >
+inline void deep_copy( const DynamicView<T,DP...> & dst
+                     , const View<T,SP...> & src )
+{
+  typedef DynamicView<T,DP...> dst_type ;
+  typedef View<T,SP...>        src_type ;
+
+  typedef typename ViewTraits<T,DP...>::execution_space  dst_execution_space ;
+  typedef typename ViewTraits<T,SP...>::memory_space     src_memory_space ;
+
+  enum { DstExecCanAccessSrc =
+   Kokkos::Impl::VerifyExecutionCanAccessMemorySpace< typename dst_execution_space::memory_space , src_memory_space >::value };
+
+  if ( DstExecCanAccessSrc ) {
+    // Copying data between views in accessible memory spaces and either non-contiguous or incompatible shape.
+    Kokkos::Experimental::Impl::ViewRemap< dst_type , src_type >( dst , src );
+  }
+  else {
+    Kokkos::Impl::throw_runtime_exception("deep_copy given views that would require a temporary allocation");
+  }
+}
+
+} // namespace Experimental
+} // namespace Kokkos
+
+#endif /* #ifndef KOKKOS_DYNAMIC_VIEW_HPP */
+
diff --git a/lib/kokkos/containers/src/Kokkos_Functional.hpp b/lib/kokkos/containers/src/Kokkos_Functional.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..5c7350ef1cd3bb1ed68deff0c823ce3f7a5a3619
--- /dev/null
+++ b/lib/kokkos/containers/src/Kokkos_Functional.hpp
@@ -0,0 +1,173 @@
+//@HEADER
+// ************************************************************************
+// 
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+// 
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+// 
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact  H. Carter Edwards (hcedwar@sandia.gov)
+// 
+// ************************************************************************
+//@HEADER
+
+#ifndef KOKKOS_FUNCTIONAL_HPP
+#define KOKKOS_FUNCTIONAL_HPP
+
+#include <Kokkos_Macros.hpp>
+#include <impl/Kokkos_Functional_impl.hpp>
+
+namespace Kokkos {
+
+// These should work for most types
+
+template <typename T>
+struct pod_hash
+{
+  typedef T argument_type;
+  typedef T first_argument_type;
+  typedef uint32_t second_argument_type;
+  typedef uint32_t result_type;
+
+  KOKKOS_FORCEINLINE_FUNCTION
+  uint32_t operator()(T const & t) const
+  { return Impl::MurmurHash3_x86_32( &t, sizeof(T), 0); }
+
+  KOKKOS_FORCEINLINE_FUNCTION
+  uint32_t operator()(T const & t, uint32_t seed) const
+  { return Impl::MurmurHash3_x86_32( &t, sizeof(T), seed); }
+};
+
+template <typename T>
+struct pod_equal_to
+{
+  typedef T first_argument_type;
+  typedef T second_argument_type;
+  typedef bool result_type;
+
+  KOKKOS_FORCEINLINE_FUNCTION
+  bool operator()(T const & a, T const & b) const
+  { return Impl::bitwise_equal(&a,&b); }
+};
+
+template <typename T>
+struct pod_not_equal_to
+{
+  typedef T first_argument_type;
+  typedef T second_argument_type;
+  typedef bool result_type;
+
+  KOKKOS_FORCEINLINE_FUNCTION
+  bool operator()(T const & a, T const & b) const
+  { return !Impl::bitwise_equal(&a,&b); }
+};
+
+template <typename T>
+struct equal_to
+{
+  typedef T first_argument_type;
+  typedef T second_argument_type;
+  typedef bool result_type;
+
+  KOKKOS_FORCEINLINE_FUNCTION
+  bool operator()(T const & a, T const & b) const
+  { return a == b; }
+};
+
+template <typename T>
+struct not_equal_to
+{
+  typedef T first_argument_type;
+  typedef T second_argument_type;
+  typedef bool result_type;
+
+  KOKKOS_FORCEINLINE_FUNCTION
+  bool operator()(T const & a, T const & b) const
+  { return a != b; }
+};
+
+
+template <typename T>
+struct greater
+{
+  typedef T first_argument_type;
+  typedef T second_argument_type;
+  typedef bool result_type;
+
+  KOKKOS_FORCEINLINE_FUNCTION
+  bool operator()(T const & a, T const & b) const
+  { return a > b; }
+};
+
+
+template <typename T>
+struct less
+{
+  typedef T first_argument_type;
+  typedef T second_argument_type;
+  typedef bool result_type;
+
+  KOKKOS_FORCEINLINE_FUNCTION
+  bool operator()(T const & a, T const & b) const
+  { return a < b; }
+};
+
+template <typename T>
+struct greater_equal
+{
+  typedef T first_argument_type;
+  typedef T second_argument_type;
+  typedef bool result_type;
+
+  KOKKOS_FORCEINLINE_FUNCTION
+  bool operator()(T const & a, T const & b) const
+  { return a >= b; }
+};
+
+
+template <typename T>
+struct less_equal
+{
+  typedef T first_argument_type;
+  typedef T second_argument_type;
+  typedef bool result_type;
+
+  KOKKOS_FORCEINLINE_FUNCTION
+  bool operator()(T const & a, T const & b) const
+  { return a <= b; }
+};
+
+} // namespace Kokkos
+
+
+#endif //KOKKOS_FUNCTIONAL_HPP
+
+
diff --git a/lib/kokkos/containers/src/Kokkos_SegmentedView.hpp b/lib/kokkos/containers/src/Kokkos_SegmentedView.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..5dd7a98b893f0418fb31c7ae6026ac30c886f84b
--- /dev/null
+++ b/lib/kokkos/containers/src/Kokkos_SegmentedView.hpp
@@ -0,0 +1,531 @@
+/*
+//@HEADER
+// ************************************************************************
+// 
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+// 
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+// 
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact  H. Carter Edwards (hcedwar@sandia.gov)
+// 
+// ************************************************************************
+//@HEADER
+*/
+
+#ifndef KOKKOS_SEGMENTED_VIEW_HPP_
+#define KOKKOS_SEGMENTED_VIEW_HPP_
+
+#include <Kokkos_Core.hpp>
+#include <impl/Kokkos_Error.hpp>
+#include <cstdio>
+
+#if ! KOKKOS_USING_EXP_VIEW
+
+namespace Kokkos {
+namespace Experimental {
+
+namespace Impl {
+
+template<class DataType, class Arg1Type, class Arg2Type, class Arg3Type>
+struct delete_segmented_view;
+
+template<class MemorySpace>
+inline
+void DeviceSetAllocatableMemorySize(size_t) {}
+
+#if defined( KOKKOS_HAVE_CUDA )
+
+template<>
+inline
+void DeviceSetAllocatableMemorySize<Kokkos::CudaSpace>(size_t size) {
+#ifdef __CUDACC__
+  size_t size_limit;
+  cudaDeviceGetLimit(&size_limit,cudaLimitMallocHeapSize);
+  if(size_limit<size)
+    cudaDeviceSetLimit(cudaLimitMallocHeapSize,2*size);
+  cudaDeviceGetLimit(&size_limit,cudaLimitMallocHeapSize);
+#endif
+}
+
+template<>
+inline
+void DeviceSetAllocatableMemorySize<Kokkos::CudaUVMSpace>(size_t size) {
+#ifdef __CUDACC__
+  size_t size_limit;
+  cudaDeviceGetLimit(&size_limit,cudaLimitMallocHeapSize);
+  if(size_limit<size)
+    cudaDeviceSetLimit(cudaLimitMallocHeapSize,2*size);
+  cudaDeviceGetLimit(&size_limit,cudaLimitMallocHeapSize);
+#endif
+}
+
+#endif /* #if defined( KOKKOS_HAVE_CUDA ) */
+
+}
+
+template< class DataType ,
+          class Arg1Type = void ,
+          class Arg2Type = void ,
+          class Arg3Type = void>
+class SegmentedView : public Kokkos::ViewTraits< DataType , Arg1Type , Arg2Type, Arg3Type >
+{
+public:
+  //! \name Typedefs for device types and various Kokkos::View specializations.
+  //@{
+  typedef Kokkos::ViewTraits< DataType , Arg1Type , Arg2Type, Arg3Type > traits ;
+
+  //! The type of a Kokkos::View on the device.
+  typedef Kokkos::View< typename traits::data_type ,
+                typename traits::array_layout ,
+                typename traits::memory_space ,
+                Kokkos::MemoryUnmanaged > t_dev ;
+
+
+private:
+  Kokkos::View<t_dev*,typename traits::memory_space> segments_;
+
+  Kokkos::View<int,typename traits::memory_space> realloc_lock;
+  Kokkos::View<int,typename traits::memory_space> nsegments_;
+
+  size_t segment_length_;
+  size_t segment_length_m1_;
+  int max_segments_;
+
+  int segment_length_log2;
+
+  // Dimensions, cardinality, capacity, and offset computation for
+  // multidimensional array view of contiguous memory.
+  // Inherits from Impl::Shape
+  typedef Kokkos::Impl::ViewOffset< typename traits::shape_type
+                          , typename traits::array_layout
+                          > offset_map_type ;
+
+  offset_map_type               m_offset_map ;
+
+  typedef Kokkos::View< typename traits::array_intrinsic_type ,
+                typename traits::array_layout ,
+                typename traits::memory_space ,
+                typename traits::memory_traits > array_type ;
+
+  typedef Kokkos::View< typename traits::const_data_type ,
+                typename traits::array_layout ,
+                typename traits::memory_space ,
+                typename traits::memory_traits > const_type ;
+
+  typedef Kokkos::View< typename traits::non_const_data_type ,
+                typename traits::array_layout ,
+                typename traits::memory_space ,
+                typename traits::memory_traits > non_const_type ;
+
+  typedef Kokkos::View< typename traits::non_const_data_type ,
+                typename traits::array_layout ,
+                HostSpace ,
+                void > HostMirror ;
+
+  template< bool Accessible >
+  KOKKOS_INLINE_FUNCTION
+  typename Kokkos::Impl::enable_if< Accessible , typename traits::size_type >::type
+  dimension_0_intern() const { return nsegments_() * segment_length_ ; }
+
+  template< bool Accessible >
+  KOKKOS_INLINE_FUNCTION
+  typename Kokkos::Impl::enable_if< ! Accessible , typename traits::size_type >::type
+  dimension_0_intern() const
+  {
+    // In Host space
+    int n = 0 ;
+#if ! defined( __CUDA_ARCH__ )
+    Kokkos::Impl::DeepCopy< HostSpace , typename traits::memory_space >( & n , nsegments_.ptr_on_device() , sizeof(int) );
+#endif
+
+    return n * segment_length_ ;
+  }
+
+public:
+
+  enum { Rank = traits::rank };
+
+  KOKKOS_INLINE_FUNCTION offset_map_type shape() const { return m_offset_map ; }
+
+  /* \brief return (current) size of dimension 0 */
+  KOKKOS_INLINE_FUNCTION typename traits::size_type dimension_0() const {
+    enum { Accessible = Kokkos::Impl::VerifyExecutionCanAccessMemorySpace<
+      Kokkos::Impl::ActiveExecutionMemorySpace, typename traits::memory_space >::value };
+    // Return the size_type result directly; an 'int' temporary could narrow it.
+    return SegmentedView::dimension_0_intern< Accessible >();
+  }
+
+  /* \brief return size of dimension 1 */
+  KOKKOS_INLINE_FUNCTION typename traits::size_type dimension_1() const { return m_offset_map.N1 ; }
+  /* \brief return size of dimension 2 */
+  KOKKOS_INLINE_FUNCTION typename traits::size_type dimension_2() const { return m_offset_map.N2 ; }
+  /* \brief return size of dimension 3 */
+  KOKKOS_INLINE_FUNCTION typename traits::size_type dimension_3() const { return m_offset_map.N3 ; }
+  /* \brief return size of dimension 4 */
+  KOKKOS_INLINE_FUNCTION typename traits::size_type dimension_4() const { return m_offset_map.N4 ; }
+  /* \brief return size of dimension 5 */
+  KOKKOS_INLINE_FUNCTION typename traits::size_type dimension_5() const { return m_offset_map.N5 ; }
+  /* \brief return size of dimension 6 */
+  KOKKOS_INLINE_FUNCTION typename traits::size_type dimension_6() const { return m_offset_map.N6 ; }
+  /* \brief return size of dimension 7 */
+  KOKKOS_INLINE_FUNCTION typename traits::size_type dimension_7() const { return m_offset_map.N7 ; }
+
+  /* \brief return product of the (current) dimension 0 and dimensions 1-7 */
+  KOKKOS_INLINE_FUNCTION typename traits::size_type size() const {
+    return dimension_0() *
+        m_offset_map.N1 * m_offset_map.N2 * m_offset_map.N3 * m_offset_map.N4 *
+        m_offset_map.N5 * m_offset_map.N6 * m_offset_map.N7 ;
+  }
+
+  template< typename iType >
+  KOKKOS_INLINE_FUNCTION
+  typename traits::size_type dimension( const iType & i ) const {
+    if(i==0)
+      return dimension_0();
+    else
+      return Kokkos::Impl::dimension( m_offset_map , i );
+  }
+
+  KOKKOS_INLINE_FUNCTION
+  typename traits::size_type capacity() {
+    return segments_.dimension_0() *
+        m_offset_map.N1 * m_offset_map.N2 * m_offset_map.N3 * m_offset_map.N4 *
+        m_offset_map.N5 * m_offset_map.N6 * m_offset_map.N7;
+  }
+
+  /* \brief return number of segments currently counted by nsegments_ */
+  KOKKOS_INLINE_FUNCTION typename traits::size_type get_num_segments() {
+    enum { Accessible = Kokkos::Impl::VerifyExecutionCanAccessMemorySpace<
+      Kokkos::Impl::ActiveExecutionMemorySpace, typename traits::memory_space >::value };
+    const typename traits::size_type n = SegmentedView::dimension_0_intern< Accessible >();
+    return n/segment_length_ ;
+  }
+
+  KOKKOS_INLINE_FUNCTION
+  typename traits::size_type get_max_segments() {
+    return max_segments_;
+  }
+
+  /// \brief Constructor that allocates View objects with an initial length of 0.
+  ///
+  /// This constructor works mostly like the analogous constructor of View.
+  /// The first argument is a string label, which is entirely for your
+  /// benefit.  (Different SegmentedView objects may have the same label if
+  /// you like.)  The second argument 'view_length' is the size of the segments.
+  /// This number must be a power of two. The third argument n0 is the maximum
+  /// value for the first dimension of the segmented view. The maximal allocatable
+  /// number of Segments is thus: (n0+view_length-1)/view_length.
+  /// The arguments that follow are the other dimensions of the (1-7) of the
+  /// View objects.  For example, for a View with 3 runtime dimensions,
+  /// the first 4 integer arguments will be nonzero:
+  /// SegmentedView("Name",32768,10000000,8,4). This allocates a SegmentedView
+  /// with a maximum of 306 segments of dimension (32768,8,4). The logical size of
+  /// the segmented view is (n,8,4) with n between 0 and 10000000.
+  /// You may omit the integer arguments that follow.
+  template< class LabelType >
+  SegmentedView(const LabelType & label ,
+      const size_t view_length ,
+      const size_t n0 ,
+      const size_t n1 = 0 ,
+      const size_t n2 = 0 ,
+      const size_t n3 = 0 ,
+      const size_t n4 = 0 ,
+      const size_t n5 = 0 ,
+      const size_t n6 = 0 ,
+      const size_t n7 = 0
+      ): segment_length_(view_length),segment_length_m1_(view_length-1)
+  {
+    segment_length_log2 = -1;
+    size_t l = segment_length_;
+    while(l>0) {
+      l>>=1;
+      segment_length_log2++;
+    }
+    l = 1<<segment_length_log2;
+    if(l!=segment_length_)
+      Kokkos::Impl::throw_runtime_exception("Kokkos::SegmentedView requires a 'power of 2' segment length");
+
+    max_segments_ = (n0+segment_length_m1_)/segment_length_;
+
+    Impl::DeviceSetAllocatableMemorySize<typename traits::memory_space>(segment_length_*max_segments_*sizeof(typename traits::value_type));
+
+    segments_ = Kokkos::View<t_dev*,typename traits::execution_space>(label , max_segments_);
+    realloc_lock = Kokkos::View<int,typename traits::execution_space>("Lock");
+    nsegments_ = Kokkos::View<int,typename traits::execution_space>("nviews");
+    m_offset_map.assign( n0, n1, n2, n3, n4, n5, n6, n7, n0*n1*n2*n3*n4*n5*n6*n7 );
+
+  }
+
+  KOKKOS_INLINE_FUNCTION
+  SegmentedView(const SegmentedView& src):
+    segments_(src.segments_),
+    realloc_lock (src.realloc_lock),
+    nsegments_ (src.nsegments_),
+    segment_length_(src.segment_length_),
+    segment_length_m1_(src.segment_length_m1_),
+    max_segments_ (src.max_segments_),
+    segment_length_log2(src.segment_length_log2),
+    m_offset_map (src.m_offset_map)
+  {}
+
+  KOKKOS_INLINE_FUNCTION
+  SegmentedView& operator= (const SegmentedView& src) {
+    segments_ = src.segments_;
+    realloc_lock = src.realloc_lock;
+    nsegments_ = src.nsegments_;
+    segment_length_= src.segment_length_;
+    segment_length_m1_= src.segment_length_m1_;
+    max_segments_ = src.max_segments_;
+    segment_length_log2= src.segment_length_log2;
+    m_offset_map = src.m_offset_map;
+    return *this;
+  }
+
+  ~SegmentedView() {
+    if ( !segments_.tracker().ref_counting()) { return; }
+    size_t ref_count = segments_.tracker().ref_count();
+    if(ref_count == 1u) {
+      Kokkos::fence();
+      typename Kokkos::View<int,typename traits::execution_space>::HostMirror h_nviews("h_nviews");
+      Kokkos::deep_copy(h_nviews,nsegments_);
+      Kokkos::parallel_for(h_nviews(),Impl::delete_segmented_view<DataType , Arg1Type , Arg2Type, Arg3Type>(*this));
+    }
+  }
+
+  KOKKOS_INLINE_FUNCTION
+  t_dev get_segment(const int& i) const {
+    return segments_[i];
+  }
+
+  template< class MemberType>
+  KOKKOS_INLINE_FUNCTION
+  void grow (MemberType& team_member, const size_t& growSize) const {
+    if (growSize>max_segments_*segment_length_) {
+      printf ("Exceeding maxSize: %lu %lu\n", growSize, max_segments_*segment_length_);
+      return;
+    }
+
+    if(team_member.team_rank()==0) {
+      bool too_small = growSize > segment_length_ * nsegments_();
+      if (too_small) {
+        while(Kokkos::atomic_compare_exchange(&realloc_lock(),0,1) )
+          ; // get the lock
+        too_small = growSize > segment_length_ * nsegments_(); // Recheck once we have the lock
+        if(too_small) {
+          while(too_small) {
+            const size_t alloc_size = segment_length_*m_offset_map.N1*m_offset_map.N2*m_offset_map.N3*
+                m_offset_map.N4*m_offset_map.N5*m_offset_map.N6*m_offset_map.N7;
+            typename traits::non_const_value_type* const ptr = new typename traits::non_const_value_type[alloc_size];
+
+            segments_(nsegments_()) =
+                t_dev(ptr,segment_length_,m_offset_map.N1,m_offset_map.N2,m_offset_map.N3,m_offset_map.N4,m_offset_map.N5,m_offset_map.N6,m_offset_map.N7);
+            nsegments_()++;
+            too_small = growSize > segment_length_ * nsegments_();
+          }
+        }
+        realloc_lock() = 0; //release the lock
+      }
+    }
+    team_member.team_barrier();
+  }
+
+  KOKKOS_INLINE_FUNCTION
+  void grow_non_thread_safe (const size_t& growSize) const {
+    if (growSize>max_segments_*segment_length_) {
+      printf ("Exceeding maxSize: %lu %lu\n", growSize, max_segments_*segment_length_);
+      return;
+    }
+    bool too_small = growSize > segment_length_ * nsegments_();
+    if(too_small) {
+      while(too_small) {
+        const size_t alloc_size = segment_length_*m_offset_map.N1*m_offset_map.N2*m_offset_map.N3*
+                            m_offset_map.N4*m_offset_map.N5*m_offset_map.N6*m_offset_map.N7;
+        typename traits::non_const_value_type* const ptr =
+          new typename traits::non_const_value_type[alloc_size];
+
+        segments_(nsegments_()) =
+          t_dev (ptr, segment_length_, m_offset_map.N1, m_offset_map.N2,
+                 m_offset_map.N3, m_offset_map.N4, m_offset_map.N5,
+                 m_offset_map.N6, m_offset_map.N7);
+        nsegments_()++;
+        too_small = growSize > segment_length_ * nsegments_();
+      }
+    }
+  }
+
+  template< typename iType0 >
+  KOKKOS_FORCEINLINE_FUNCTION
+  typename std::enable_if<( std::is_integral<iType0>::value && traits::rank == 1 )
+                         , typename traits::value_type &
+                         >::type
+    operator() ( const iType0 & i0 ) const
+    {
+      return segments_[i0>>segment_length_log2](i0&(segment_length_m1_));
+    }
+
+  template< typename iType0 , typename iType1 >
+  KOKKOS_FORCEINLINE_FUNCTION
+  typename std::enable_if<( std::is_integral<iType0>::value &&
+                            std::is_integral<iType1>::value &&
+                            traits::rank == 2 )
+                         , typename traits::value_type &
+                         >::type
+    operator() ( const iType0 & i0 , const iType1 & i1 ) const
+    {
+      return segments_[i0>>segment_length_log2](i0&(segment_length_m1_),i1);
+    }
+
+  template< typename iType0 , typename iType1 , typename iType2 >
+  KOKKOS_FORCEINLINE_FUNCTION
+  typename std::enable_if<( std::is_integral<iType0>::value &&
+                            std::is_integral<iType1>::value &&
+                            std::is_integral<iType2>::value &&
+                            traits::rank == 3 )
+                         , typename traits::value_type &
+                         >::type
+    operator() ( const iType0 & i0 , const iType1 & i1 , const iType2 & i2 ) const
+    {
+      return segments_[i0>>segment_length_log2](i0&(segment_length_m1_),i1,i2);
+    }
+
+  template< typename iType0 , typename iType1 , typename iType2 , typename iType3 >
+  KOKKOS_FORCEINLINE_FUNCTION
+  typename std::enable_if<( std::is_integral<iType0>::value &&
+                            std::is_integral<iType1>::value &&
+                            std::is_integral<iType2>::value &&
+                            std::is_integral<iType3>::value &&
+                            traits::rank == 4 )
+                         , typename traits::value_type &
+                         >::type
+    operator() ( const iType0 & i0 , const iType1 & i1 , const iType2 & i2 , const iType3 & i3 ) const
+    {
+      return segments_[i0>>segment_length_log2](i0&(segment_length_m1_),i1,i2,i3);
+    }
+
+  template< typename iType0 , typename iType1 , typename iType2 , typename iType3 ,
+            typename iType4 >
+  KOKKOS_FORCEINLINE_FUNCTION
+  typename std::enable_if<( std::is_integral<iType0>::value &&
+                            std::is_integral<iType1>::value &&
+                            std::is_integral<iType2>::value &&
+                            std::is_integral<iType3>::value &&
+                            std::is_integral<iType4>::value &&
+                            traits::rank == 5 )
+                         , typename traits::value_type &
+                         >::type
+    operator() ( const iType0 & i0 , const iType1 & i1 , const iType2 & i2 , const iType3 & i3 ,
+                 const iType4 & i4 ) const
+    {
+      return segments_[i0>>segment_length_log2](i0&(segment_length_m1_),i1,i2,i3,i4);
+    }
+
+  template< typename iType0 , typename iType1 , typename iType2 , typename iType3 ,
+            typename iType4 , typename iType5 >
+  KOKKOS_FORCEINLINE_FUNCTION
+  typename std::enable_if<( std::is_integral<iType0>::value &&
+                            std::is_integral<iType1>::value &&
+                            std::is_integral<iType2>::value &&
+                            std::is_integral<iType3>::value &&
+                            std::is_integral<iType4>::value &&
+                            std::is_integral<iType5>::value &&
+                            traits::rank == 6 )
+                         , typename traits::value_type &
+                         >::type
+    operator() ( const iType0 & i0 , const iType1 & i1 , const iType2 & i2 , const iType3 & i3 ,
+                 const iType4 & i4 , const iType5 & i5 ) const
+    {
+      return segments_[i0>>segment_length_log2](i0&(segment_length_m1_),i1,i2,i3,i4,i5);
+    }
+
+  template< typename iType0 , typename iType1 , typename iType2 , typename iType3 ,
+            typename iType4 , typename iType5 , typename iType6 >
+  KOKKOS_FORCEINLINE_FUNCTION
+  typename std::enable_if<( std::is_integral<iType0>::value &&
+                            std::is_integral<iType1>::value &&
+                            std::is_integral<iType2>::value &&
+                            std::is_integral<iType3>::value &&
+                            std::is_integral<iType4>::value &&
+                            std::is_integral<iType5>::value &&
+                            std::is_integral<iType6>::value &&
+                            traits::rank == 7 )
+                         , typename traits::value_type &
+                         >::type
+    operator() ( const iType0 & i0 , const iType1 & i1 , const iType2 & i2 , const iType3 & i3 ,
+                 const iType4 & i4 , const iType5 & i5 , const iType6 & i6 ) const
+    {
+      return segments_[i0>>segment_length_log2](i0&(segment_length_m1_),i1,i2,i3,i4,i5,i6);
+    }
+
+  template< typename iType0 , typename iType1 , typename iType2 , typename iType3 ,
+            typename iType4 , typename iType5 , typename iType6 , typename iType7 >
+  KOKKOS_FORCEINLINE_FUNCTION
+  typename std::enable_if<( std::is_integral<iType0>::value &&
+                            std::is_integral<iType1>::value &&
+                            std::is_integral<iType2>::value &&
+                            std::is_integral<iType3>::value &&
+                            std::is_integral<iType4>::value &&
+                            std::is_integral<iType5>::value &&
+                            std::is_integral<iType6>::value &&
+                            std::is_integral<iType7>::value &&
+                            traits::rank == 8 )
+                         , typename traits::value_type &
+                         >::type
+    operator() ( const iType0 & i0 , const iType1 & i1 , const iType2 & i2 , const iType3 & i3 ,
+                 const iType4 & i4 , const iType5 & i5 , const iType6 & i6 , const iType7 & i7 ) const
+    {
+      return segments_[i0>>segment_length_log2](i0&(segment_length_m1_),i1,i2,i3,i4,i5,i6,i7);
+    }
+};
+
+namespace Impl {
+template<class DataType, class Arg1Type, class Arg2Type, class Arg3Type>
+struct delete_segmented_view {
+  typedef SegmentedView<DataType , Arg1Type , Arg2Type, Arg3Type> view_type;
+  typedef typename view_type::execution_space execution_space;
+
+  view_type view_;
+  delete_segmented_view(view_type view):view_(view) {
+  }
+
+  KOKKOS_INLINE_FUNCTION
+  void operator() (int i) const {
+    delete [] view_.get_segment(i).ptr_on_device();
+  }
+};
+
+}
+}
+}
+
+#endif
+
+#endif
diff --git a/lib/kokkos/containers/src/Kokkos_StaticCrsGraph.hpp b/lib/kokkos/containers/src/Kokkos_StaticCrsGraph.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..1ce38638a2b6a107d1439f7feebb0c90c4a8068f
--- /dev/null
+++ b/lib/kokkos/containers/src/Kokkos_StaticCrsGraph.hpp
@@ -0,0 +1,226 @@
+/*
+//@HEADER
+// ************************************************************************
+// 
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+// 
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+// 
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact  H. Carter Edwards (hcedwar@sandia.gov)
+// 
+// ************************************************************************
+//@HEADER
+*/
+
+#ifndef KOKKOS_STATICCRSGRAPH_HPP
+#define KOKKOS_STATICCRSGRAPH_HPP
+
+#include <string>
+#include <vector>
+
+#include <Kokkos_Core.hpp>
+
+namespace Kokkos {
+
+/// \class StaticCrsGraph
+/// \brief Compressed row storage array.
+///
+/// \tparam DataType The type of stored entries.  If a StaticCrsGraph is
+///   used as the graph of a sparse matrix, then this is usually an
+///   integer type, the type of the column indices in the sparse
+///   matrix.
+///
+/// \tparam Arg1Type The second template parameter, corresponding
+///   either to the Device type (if there are no more template
+///   parameters) or to the Layout type (if there is at least one more
+///   template parameter).
+///
+/// \tparam Arg2Type The third template parameter, which if provided
+///   corresponds to the Device type.
+///
+/// \tparam SizeType The type of row offsets.  Usually the default
+///   parameter suffices.  However, setting a nondefault value is
+///   necessary in some cases, for example, if you want to have
+///   sparse matrices with dimensions (and therefore column indices)
+///   that fit in \c int, but want to store more than <tt>INT_MAX</tt>
+///   entries in the sparse matrix.
+///
+/// A row has a range of entries:
+/// <ul>
+/// <li> <tt> row_map[i0] <= entry < row_map[i0+1] </tt> </li>
+/// <li> <tt> 0 <= i1 < row_map[i0+1] - row_map[i0] </tt> </li>
+/// <li> <tt> entries( entry ,            i2 , i3 , ... ); </tt> </li>
+/// <li> <tt> entries( row_map[i0] + i1 , i2 , i3 , ... ); </tt> </li>
+/// </ul>
+template< class DataType,
+          class Arg1Type,
+          class Arg2Type = void,
+          typename SizeType = typename ViewTraits<DataType*, Arg1Type, Arg2Type, void >::size_type>
+class StaticCrsGraph {
+private:
+  typedef ViewTraits<DataType*, Arg1Type, Arg2Type, void> traits;
+
+public:
+  typedef DataType                                            data_type;
+  typedef typename traits::array_layout                       array_layout;
+  typedef typename traits::execution_space                    execution_space;
+  typedef typename traits::device_type                        device_type;
+  typedef SizeType                                            size_type;
+
+  typedef StaticCrsGraph< DataType , Arg1Type , Arg2Type , SizeType > staticcrsgraph_type;
+  typedef StaticCrsGraph< DataType , array_layout , typename traits::host_mirror_space , SizeType > HostMirror;
+  typedef View< const size_type* , array_layout, device_type >  row_map_type;
+  typedef View<       DataType*  , array_layout, device_type >  entries_type;
+
+  entries_type entries;  //!< Stored entries of all rows, back to back.
+  row_map_type row_map;  //!< Row offsets into \c entries.
+
+  //! Default constructor: both views are default-constructed.
+  StaticCrsGraph () : entries(), row_map() {}
+
+  //! Copy constructor: copies both view handles from \c rhs.
+  StaticCrsGraph (const StaticCrsGraph& rhs)
+    : entries (rhs.entries)
+    , row_map (rhs.row_map)
+  {}
+
+  //! Construct from anything the two view members can be constructed from.
+  template<class EntriesType, class RowMapType>
+  StaticCrsGraph (const EntriesType& entries_,const RowMapType& row_map_)
+    : entries (entries_)
+    , row_map (row_map_)
+  {}
+
+  //! Copy assignment: assigns both view handles from \c rhs.
+  StaticCrsGraph& operator= (const StaticCrsGraph& rhs) {
+    entries = rhs.entries;
+    row_map = rhs.row_map;
+    return *this;
+  }
+
+  //! Destructor; the view members clean up after themselves.
+  ~StaticCrsGraph() {}
+
+  //! Number of rows: one less than the length of \c row_map, or 0 if it is empty.
+  KOKKOS_INLINE_FUNCTION
+  size_type numRows() const {
+    if (row_map.dimension_0 () == 0) { return static_cast<size_type> (0); }
+    return row_map.dimension_0 () - static_cast<size_type> (1);
+  }
+};
+
+//----------------------------------------------------------------------------
+/// Declaration only; the body is expected in impl/Kokkos_StaticCrsGraph_factory.hpp (TODO confirm).
+template< class StaticCrsGraphType , class InputSizeType >
+typename StaticCrsGraphType::staticcrsgraph_type
+create_staticcrsgraph( const std::string & label ,  // name for the new graph (presumably used as its view label)
+                 const std::vector< InputSizeType > & input );  // NOTE(review): presumably one entry count per row - confirm
+/// Nested-vector overload; declaration only (body expected in the factory header, TODO confirm).
+template< class StaticCrsGraphType , class InputSizeType >
+typename StaticCrsGraphType::staticcrsgraph_type
+create_staticcrsgraph( const std::string & label ,  // name for the new graph (presumably used as its view label)
+                 const std::vector< std::vector< InputSizeType > > & input );  // NOTE(review): presumably input[row] lists that row's entries - confirm
+
+//----------------------------------------------------------------------------
+/// Declaration only: returns an object of the input graph's HostMirror type.
+template< class DataType ,
+          class Arg1Type ,
+          class Arg2Type ,
+          typename SizeType >
+typename StaticCrsGraph< DataType , Arg1Type , Arg2Type , SizeType >::HostMirror
+create_mirror_view( const StaticCrsGraph<DataType,Arg1Type,Arg2Type,SizeType > & input );
+/// Declaration only: returns an object of the input graph's HostMirror type.
+template< class DataType ,
+          class Arg1Type ,
+          class Arg2Type ,
+          typename SizeType >
+typename StaticCrsGraph< DataType , Arg1Type , Arg2Type , SizeType >::HostMirror
+create_mirror( const StaticCrsGraph<DataType,Arg1Type,Arg2Type,SizeType > & input );
+
+} // namespace Kokkos
+
+//----------------------------------------------------------------------------
+//----------------------------------------------------------------------------
+
+#include <impl/Kokkos_StaticCrsGraph_factory.hpp>
+
+//----------------------------------------------------------------------------
+//----------------------------------------------------------------------------
+
+namespace Kokkos {
+namespace Impl {
+
+template< class GraphType >
+struct StaticCrsGraphMaximumEntry {
+  // Reduction functor: accumulates the largest value found in graph.entries.
+  typedef typename GraphType::execution_space  execution_space ;
+  typedef typename GraphType::data_type        value_type ;
+
+  const typename GraphType::entries_type entries ;
+  StaticCrsGraphMaximumEntry( const GraphType & graph ) : entries( graph.entries ) {}
+
+  // Raise 'update' to entries(i) when that entry is larger.
+  KOKKOS_INLINE_FUNCTION
+  void operator()( const unsigned i , value_type & update ) const
+    { if ( ! ( update < entries(i) ) ) return ; update = entries(i) ; }
+
+  // Start every partial result at zero.
+  KOKKOS_INLINE_FUNCTION
+  void init( value_type & update ) const { update = 0 ; }
+
+  // Fold 'input' into 'update', keeping the larger of the two.
+  KOKKOS_INLINE_FUNCTION
+  void join( volatile value_type & update , volatile const value_type & input ) const
+    { if ( ! ( update < input ) ) return ; update = input ; }
+};
+
+}
+
+template< class DataType, class Arg1Type, class Arg2Type, typename SizeType >
+DataType maximum_entry( const StaticCrsGraph< DataType , Arg1Type , Arg2Type , SizeType > & graph )
+{
+  // Reduce over every slot of graph.entries with the maximum-entry functor;
+  // the accumulator is initialised to 0 before the reduction is launched.
+  typedef Impl::StaticCrsGraphMaximumEntry<
+    StaticCrsGraph< DataType , Arg1Type , Arg2Type , SizeType > > functor_type ;
+  DataType max_value = 0 ;
+  Kokkos::parallel_reduce( graph.entries.dimension_0() , functor_type( graph ) , max_value );
+  return max_value ;
+}
+
+} // namespace Kokkos
+
+//----------------------------------------------------------------------------
+//----------------------------------------------------------------------------
+
+#endif /* #ifndef KOKKOS_STATICCRSGRAPH_HPP */
+
diff --git a/lib/kokkos/containers/src/Kokkos_UnorderedMap.hpp b/lib/kokkos/containers/src/Kokkos_UnorderedMap.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..7a916c6ef7c449a041d6d2014033e34c3342f185
--- /dev/null
+++ b/lib/kokkos/containers/src/Kokkos_UnorderedMap.hpp
@@ -0,0 +1,848 @@
+/*
+//@HEADER
+// ************************************************************************
+// 
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+// 
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+// 
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact  H. Carter Edwards (hcedwar@sandia.gov)
+// 
+// ************************************************************************
+//@HEADER
+*/
+
+/// \file Kokkos_UnorderedMap.hpp
+/// \brief Declaration and definition of Kokkos::UnorderedMap.
+///
+/// This header file declares and defines Kokkos::UnorderedMap and its
+/// related nonmember functions.
+
+#ifndef KOKKOS_UNORDERED_MAP_HPP
+#define KOKKOS_UNORDERED_MAP_HPP
+
+#include <Kokkos_Core.hpp>
+#include <Kokkos_Functional.hpp>
+
+#include <Kokkos_Bitset.hpp>
+
+#include <impl/Kokkos_Traits.hpp>
+#include <impl/Kokkos_UnorderedMap_impl.hpp>
+
+
+#include <iostream>
+
+#include <stdint.h>
+#include <stdexcept>
+
+
+namespace Kokkos {
+
+enum { UnorderedMapInvalidIndex = ~0u };  // all-ones sentinel: the index value that marks "no valid entry"
+
+/// \brief Result type returned by UnorderedMap::insert().
+///
+/// Inserting an element into an UnorderedMap is not guaranteed to
+/// succeed.  There are three possible conditions:
+/// <ol>
+/// <li> <tt>failed()</tt>: The insert failed.  This usually
+///      means that the UnorderedMap ran out of space. </li>
+/// <li> <tt>success()</tt>: The insert succeeded, and the key
+///      did <i>not</i> exist in the table before. </li>
+/// <li> <tt>existing()</tt>: The key <i>did</i> exist in the
+///      table before.  The new value was
+///      ignored and the old value was left in place. </li>
+/// </ol>
+
+class UnorderedMapInsertResult
+{
+private:
+  enum Status{
+     SUCCESS = 1u << 31                // bit 31: key/value pair was newly inserted
+   , EXISTING = 1u << 30               // bit 30: key was already present
+   , FREED_EXISTING = 1u << 29         // bit 29: a claimed index had to be released again
+   , LIST_LENGTH_MASK = ~(SUCCESS | EXISTING | FREED_EXISTING)  // low 29 bits: list position counter
+  };
+
+public:
+  /// Did the map successfully insert the key/value pair
+  KOKKOS_FORCEINLINE_FUNCTION
+  bool success() const { return (m_status & SUCCESS); }
+
+  /// Was the key already present in the map
+  KOKKOS_FORCEINLINE_FUNCTION
+  bool existing() const { return (m_status & EXISTING); }
+
+  /// Did the map fail to insert the key (no index recorded), e.g. due to insufficient capacity
+  KOKKOS_FORCEINLINE_FUNCTION
+  bool failed() const { return m_index == UnorderedMapInvalidIndex; }
+
+  /// Did the map lose a race condition to insert a duplicate key/value pair
+  /// where an index was claimed that needed to be released
+  KOKKOS_FORCEINLINE_FUNCTION
+  bool freed_existing() const { return (m_status & FREED_EXISTING); }
+
+  /// Value of the list-position counter kept in the low bits of the status
+  /// word; insert() bumps it for every list node it steps past
+  KOKKOS_FORCEINLINE_FUNCTION
+  uint32_t list_position() const { return (m_status & LIST_LENGTH_MASK); }
+
+  /// Index where the key can be found as long as the insert did not fail
+  KOKKOS_FORCEINLINE_FUNCTION
+  uint32_t index() const { return m_index; }
+
+  KOKKOS_FORCEINLINE_FUNCTION
+  UnorderedMapInsertResult()  // starts in the failed() state with all status bits clear
+    : m_index(UnorderedMapInvalidIndex)
+    , m_status(0)
+  {}
+
+  KOKKOS_FORCEINLINE_FUNCTION
+  void increment_list_position()  // saturating: stops counting at LIST_LENGTH_MASK
+  {
+    m_status += (list_position() < LIST_LENGTH_MASK) ? 1u : 0u;
+  }
+
+  KOKKOS_FORCEINLINE_FUNCTION
+  void set_existing(uint32_t i, bool arg_freed_existing)  // record index i of a key that was already present
+  {
+    m_index = i;
+    m_status = EXISTING | (arg_freed_existing ? FREED_EXISTING : 0u) | list_position();  // list position is preserved
+  }
+
+  KOKKOS_FORCEINLINE_FUNCTION
+  void set_success(uint32_t i)  // record index i of a newly inserted key
+  {
+    m_index = i;
+    m_status = SUCCESS | list_position();  // list position is preserved
+  }
+
+private:
+  uint32_t m_index;   // index of the key, or UnorderedMapInvalidIndex
+  uint32_t m_status;  // Status flag bits combined with the list-position counter
+};
+
+/// \class UnorderedMap
+/// \brief Thread-safe, performance-portable lookup table.
+///
+/// This class provides a lookup table.  In terms of functionality,
+/// this class compares to std::unordered_map (new in C++11).
+/// "Unordered" means that keys are not stored in any particular
+/// order, unlike (for example) std::map.  "Thread-safe" means that
+/// lookups, insertion, and deletion are safe to call by multiple
+/// threads in parallel.  "Performance-portable" means that parallel
+/// performance of these operations is reasonable, on multiple
+/// hardware platforms.  Platforms on which performance has been
+/// tested include conventional Intel x86 multicore processors, Intel
+/// Xeon Phi ("MIC"), and NVIDIA GPUs.
+///
+/// Parallel performance portability entails design decisions that
+/// might differ from one's expectation for a sequential interface.
+/// This particularly affects insertion of single elements.  In an
+/// interface intended for sequential use, insertion might reallocate
+/// memory if the original allocation did not suffice to hold the new
+/// element.  In this class, insertion does <i>not</i> reallocate
+/// memory.  This means that it might fail.  insert() returns an
+/// UnorderedMapInsertResult which indicates whether the insert
+/// failed.  There are three possible conditions:
+/// <ol>
+/// <li> <tt>failed()</tt>: The insert failed.  This usually
+///      means that the UnorderedMap ran out of space. </li>
+/// <li> <tt>success()</tt>: The insert succeeded, and the key
+///      did <i>not</i> exist in the table before. </li>
+/// <li> <tt>existing()</tt>: The key <i>did</i> exist in the
+///      table before.  The new value was
+///      ignored and the old value was left in place. </li>
+/// </ol>
+///
+/// \tparam Key Type of keys of the lookup table.  If \c const, users
+///   are not allowed to add or remove keys, though they are allowed
+///   to change values.  In that case, the implementation may make
+///   optimizations specific to the <tt>Device</tt>.  For example, if
+///   <tt>Device</tt> is \c Cuda, it may use texture fetches to access
+///   keys.
+///
+/// \tparam Value Type of values stored in the lookup table.  You may use
+///   \c void here, in which case the table will be a set of keys.  If
+///   \c const, users are not allowed to change entries.
+///   In that case, the implementation may make
+///   optimizations specific to the \c Device, such as using texture
+///   fetches to access values.
+///
+/// \tparam Device The Kokkos Device type.
+///
+/// \tparam Hasher Definition of the hash function for instances of
+///   <tt>Key</tt>.  The default will calculate a bitwise hash.
+///
+/// \tparam EqualTo Definition of the equality function for instances of
+///   <tt>Key</tt>.  The default will do a bitwise equality comparison.
+///
+template <   typename Key
+           , typename Value
+           , typename Device = Kokkos::DefaultExecutionSpace
+           , typename Hasher = pod_hash<typename Impl::remove_const<Key>::type>
+           , typename EqualTo = pod_equal_to<typename Impl::remove_const<Key>::type>
+        >
+class UnorderedMap
+{
+private:
+  typedef typename ViewTraits<Key,Device,void,void>::host_mirror_space host_mirror_space ;
+public:
+  //! \name Public types and constants
+  //@{
+
+  //key_types
+  typedef Key declared_key_type;
+  typedef typename Impl::remove_const<declared_key_type>::type key_type;
+  typedef typename Impl::add_const<key_type>::type const_key_type;
+
+  //value_types
+  typedef Value declared_value_type;
+  typedef typename Impl::remove_const<declared_value_type>::type value_type;
+  typedef typename Impl::add_const<value_type>::type const_value_type;
+
+  typedef Device execution_space;
+  typedef Hasher hasher_type;
+  typedef EqualTo  equal_to_type;
+  typedef uint32_t size_type;
+
+  //map_types
+  typedef UnorderedMap<declared_key_type,declared_value_type,execution_space,hasher_type,equal_to_type> declared_map_type;
+  typedef UnorderedMap<key_type,value_type,execution_space,hasher_type,equal_to_type>                   insertable_map_type;
+  typedef UnorderedMap<const_key_type,value_type,execution_space,hasher_type,equal_to_type>             modifiable_map_type;
+  typedef UnorderedMap<const_key_type,const_value_type,execution_space,hasher_type,equal_to_type>       const_map_type;
+
+  static const bool is_set = Impl::is_same<void,value_type>::value;
+  static const bool has_const_key = Impl::is_same<const_key_type,declared_key_type>::value;
+  static const bool has_const_value = is_set || Impl::is_same<const_value_type,declared_value_type>::value;
+
+  static const bool is_insertable_map = !has_const_key && (is_set || !has_const_value);
+  static const bool is_modifiable_map = has_const_key && !has_const_value;
+  static const bool is_const_map = has_const_key && has_const_value;
+
+
+  typedef UnorderedMapInsertResult insert_result;
+
+  typedef UnorderedMap<Key,Value,host_mirror_space,Hasher,EqualTo> HostMirror;
+
+  typedef Impl::UnorderedMapHistogram<const_map_type> histogram_type;
+
+  //@}
+
+private:
+  enum { invalid_index = ~static_cast<size_type>(0) };
+
+  typedef typename Impl::if_c< is_set, int, declared_value_type>::type impl_value_type;
+
+  typedef typename Impl::if_c<   is_insertable_map
+                               , View< key_type *, execution_space>
+                               , View< const key_type *, execution_space, MemoryTraits<RandomAccess> >
+                             >::type key_type_view;
+
+  typedef typename Impl::if_c<   is_insertable_map || is_modifiable_map
+                               , View< impl_value_type *, execution_space>
+                               , View< const impl_value_type *, execution_space, MemoryTraits<RandomAccess> >
+                             >::type value_type_view;
+
+  typedef typename Impl::if_c<   is_insertable_map
+                               , View< size_type *, execution_space>
+                               , View< const size_type *, execution_space, MemoryTraits<RandomAccess> >
+                             >::type size_type_view;
+
+  typedef typename Impl::if_c<   is_insertable_map
+                               , Bitset< execution_space >
+                               , ConstBitset< execution_space>
+                             >::type bitset_type;
+
+  enum { modified_idx = 0, erasable_idx = 1, failed_insert_idx = 2 };
+  enum { num_scalars = 3 };
+  typedef View< int[num_scalars], LayoutLeft, execution_space> scalars_view;
+
+public:
+  //! \name Public member functions
+  //@{
+
+  UnorderedMap()  // default map: every member is value-initialized
+    : m_bounded_insert()
+    , m_hasher()
+    , m_equal_to()
+    , m_size()
+    , m_available_indexes()
+    , m_hash_lists()
+    , m_next_index()
+    , m_keys()
+    , m_values()
+    , m_scalars()
+  {}
+
+  /// \brief Constructor
+  ///
+  /// \param capacity_hint [in] Initial guess of how many unique keys will be inserted into the map
+  /// \param hasher [in] Hash function for \c Key instances.  The default value usually suffices.
+  /// \param equal_to [in] Equality predicate for \c Key instances.  The default value usually suffices.
+  UnorderedMap(  size_type capacity_hint, hasher_type hasher = hasher_type(), equal_to_type equal_to = equal_to_type() )
+    : m_bounded_insert(true)
+    , m_hasher(hasher)
+    , m_equal_to(equal_to)
+    , m_size()
+    , m_available_indexes(calculate_capacity(capacity_hint))
+    , m_hash_lists(ViewAllocateWithoutInitializing("UnorderedMap hash list"), Impl::find_hash_size(capacity()))
+    , m_next_index(ViewAllocateWithoutInitializing("UnorderedMap next index"), capacity()+1) // +1 so that the *_at functions can always return a valid reference
+    , m_keys("UnorderedMap keys",capacity()+1)
+    , m_values("UnorderedMap values",(is_set? 1 : capacity()+1))  // a set keeps a single value slot
+    , m_scalars("UnorderedMap scalars")
+  {
+    if (!is_insertable_map) {  // throws std::runtime_error for maps declared with a const key type
+      throw std::runtime_error("Cannot construct a non-insertable (i.e. const key_type) unordered_map");
+    }
+    // Both index views were allocated without initialization; fill every slot with invalid_index.
+    Kokkos::deep_copy(m_hash_lists, invalid_index);
+    Kokkos::deep_copy(m_next_index, invalid_index);
+  }
+
+  /// Clear the failed-insert flag by delegating to reset_flag(failed_insert_idx).
+  /// Host-side helper (not marked as a device function).
+  void reset_failed_insert_flag()
+  { reset_flag(failed_insert_idx); }
+
+  /// Build and return a histogram_type constructed from this map.
+  /// Host-side helper (not marked as a device function).
+  histogram_type get_histogram()
+  { return histogram_type(*this); }
+
+  //! Clear all entries in the table.
+  void clear()
+  {
+    m_bounded_insert = true;
+
+    if (0 == capacity()) { return; }  // early out for a zero-capacity map
+
+    m_available_indexes.clear();
+
+    Kokkos::deep_copy(m_hash_lists, invalid_index);
+    Kokkos::deep_copy(m_next_index, invalid_index);
+    {
+      const key_type blank_key = key_type();
+      Kokkos::deep_copy(m_keys, blank_key);
+    }
+    // NOTE(review): values are overwritten only when the map is a set - confirm this is intended.
+    if (is_set) {
+      const impl_value_type blank_value = impl_value_type();
+      Kokkos::deep_copy(m_values, blank_value);
+    }
+    // Zero all scalar flags (modified, erasable, failed insert).
+    Kokkos::deep_copy(m_scalars, 0);
+  }
+
+  /// \brief Rebuild the map with a (possibly) different capacity.
+  ///
+  /// Forwards to rehash(requested_capacity, bounded_insert), where
+  /// bounded_insert is true only when the map currently has zero
+  /// capacity or holds no entries.
+  ///
+  /// This is <i>not</i> a device function; it may <i>not</i> be
+  /// called in a parallel kernel.
+  /// \return The result of the two-argument rehash().
+  bool rehash(size_type requested_capacity = 0)
+  {
+    // "In use" means the map has capacity and at least one entry.
+    const bool map_in_use = (capacity() != 0) && (size() != 0u);
+    return rehash(requested_capacity, !map_in_use);
+  }
+
+  bool rehash(size_type requested_capacity, bool bounded_insert)
+  {
+    // Only insertable maps can be rebuilt.
+    if (!is_insertable_map) { return false; }
+
+    // Never go below the number of entries currently stored.
+    const size_type num_entries = size();
+    if (requested_capacity < num_entries) { requested_capacity = num_entries; }
+
+    insertable_map_type rebuilt(requested_capacity, m_hasher, m_equal_to);
+    if (0u != num_entries) {
+      // Bounded insertion is switched off while the rehash functor runs.
+      rebuilt.m_bounded_insert = false;
+      Impl::UnorderedMapRehash<insertable_map_type> transfer(rebuilt,*this);
+      transfer.apply();
+    }
+    rebuilt.m_bounded_insert = bounded_insert;
+    *this = rebuilt;
+    return true;
+  }
+
+  /// \brief The number of entries in the table.
+  ///
+  /// This method has undefined behavior when erasable() is true.
+  ///
+  /// Not a device function.  The count is cached in m_size and is
+  /// recomputed from the index bitset only when modified() is true.
+  size_type size() const
+  {
+    if (0u == capacity()) { return 0u; }
+    if (!modified()) { return m_size; }  // cached count is still current
+
+    // Entries changed since the last count: recount, cache, clear the flag.
+    m_size = m_available_indexes.count();
+    reset_flag(modified_idx);
+    return m_size;
+  }
+
+  /// \brief Whether the failed-insert flag is currently set.
+  ///
+  /// This is <i>not</i> a device function; it may <i>not</i> be
+  /// called in a parallel kernel.  insert() raises the flag when it
+  /// gives up; the value is read back through get_flag().
+  bool failed_insert() const
+  {
+    return get_flag(failed_insert_idx);
+  }
+
+  /// True while the erasable flag is set, i.e. between begin_erase() and
+  /// end_erase(); always false for maps that are not insertable.
+  bool erasable() const
+  { return is_insertable_map && get_flag(erasable_idx); }
+
+  /// Set the erasable flag (fenced on both sides); returns !erasable() as seen on entry.
+  bool begin_erase()
+  {
+    const bool entering = !erasable();
+    if (!(is_insertable_map && entering)) { return entering; }
+    execution_space::fence();
+    set_flag(erasable_idx);
+    execution_space::fence();
+    return entering;
+  }
+
+  /// Run the erase functor and clear the erasable flag; returns erasable() as seen on entry.
+  bool end_erase()
+  {
+    const bool leaving = erasable();
+    if (!(is_insertable_map && leaving)) { return leaving; }
+    execution_space::fence();
+    Impl::UnorderedMapErase<declared_map_type> eraser(*this);
+    eraser.apply();
+    execution_space::fence();
+    reset_flag(erasable_idx);
+    return leaving;
+  }
+
+  /// \brief The maximum number of entries that the table can hold.
+  ///
+  /// This <i>is</i> a device function; it may be called in a parallel
+  /// kernel.  The value is the size of the bitset of available indexes.
+  KOKKOS_FORCEINLINE_FUNCTION
+  size_type capacity() const
+  { return m_available_indexes.size(); }
+
+  /// \brief The number of hash table "buckets."
+  ///
+  /// This is different than the number of entries that the table can
+  /// hold.  Each key hashes to an index in [0, hash_capacity() - 1].
+  /// That index can hold zero or more entries.  This class decides
+  /// what hash_capacity() should be, given the user's upper bound on
+  /// the number of entries the table must be able to hold.
+  ///
+  /// This <i>is</i> a device function; it may be called in a parallel
+  /// kernel.
+  KOKKOS_INLINE_FUNCTION
+  size_type hash_capacity() const
+  { return m_hash_lists.dimension_0(); }  // one slot of m_hash_lists per bucket
+
+  //---------------------------------------------------------------------------
+  //---------------------------------------------------------------------------
+
+  /// \brief Attempt to insert key \c k with value \c v.
+  ///
+  /// This <i>is</i> a device function; it may be called in a parallel
+  /// kernel.  It need not succeed; the returned insert_result tells you.
+  ///
+  /// \param k [in] The key to attempt to insert.
+  /// \param v [in] The corresponding value to attempt to insert.  If
+  ///   using this class as a set (with Value = void), then you need not
+  ///   provide this value.
+  KOKKOS_INLINE_FUNCTION
+  insert_result insert(key_type const& k, impl_value_type const&v = impl_value_type()) const
+  {
+    insert_result result;  // default state reports failed()
+
+    if ( !is_insertable_map || capacity() == 0u || m_scalars((int)erasable_idx) ) {
+      return result;
+    }
+
+    if ( !m_scalars((int)modified_idx) ) {
+      m_scalars((int)modified_idx) = true;
+    }
+
+    int volatile & failed_insert_ref = m_scalars((int)failed_insert_idx) ;
+
+    const size_type hash_value = m_hasher(k);
+    const size_type hash_list = hash_value % m_hash_lists.dimension_0();
+
+    size_type * curr_ptr   = & m_hash_lists[ hash_list ];
+    size_type new_index    = invalid_index ;
+
+    // Scale hash_list into the index range [0, capacity()); the product is formed in double
+    size_type index_hint = static_cast<size_type>( (static_cast<double>(hash_list) * capacity()) / m_hash_lists.dimension_0());
+
+    size_type find_attempts = 0;
+
+    enum { bounded_find_attempts = 32u };
+    const size_type max_attempts = (m_bounded_insert && (bounded_find_attempts < m_available_indexes.max_hint()) ) ?
+                                    bounded_find_attempts :
+                                    m_available_indexes.max_hint();
+
+    bool not_done = true ;
+
+#if defined( __MIC__ )
+      #pragma noprefetch
+#endif
+    while ( not_done ) {
+
+      // Continue searching the unordered list for this key,
+      // list will only be appended during insert phase.
+      // Need volatile_load as other threads may be appending.
+      size_type curr = volatile_load(curr_ptr);
+
+      KOKKOS_NONTEMPORAL_PREFETCH_LOAD(&m_keys[curr != invalid_index ? curr : 0]);
+#if defined( __MIC__ )
+      #pragma noprefetch
+#endif
+      while ( curr != invalid_index && ! m_equal_to( volatile_load(&m_keys[curr]), k) ) {
+        result.increment_list_position();
+        index_hint = curr;
+        curr_ptr = &m_next_index[curr];
+        curr = volatile_load(curr_ptr);
+        KOKKOS_NONTEMPORAL_PREFETCH_LOAD(&m_keys[curr != invalid_index ? curr : 0]);
+      }
+
+      //------------------------------------------------------------
+      // If key already present then return that index.
+      if ( curr != invalid_index ) {
+
+        const bool free_existing = new_index != invalid_index;
+        if ( free_existing ) {
+          // Previously claimed an unused entry that was not inserted.
+          // Release this unused entry immediately.
+          if (!m_available_indexes.reset(new_index) ) {
+            printf("Unable to free existing\n");
+          }
+
+        }
+
+        result.set_existing(curr, free_existing);
+        not_done = false ;
+      }
+      //------------------------------------------------------------
+      // Key is not currently in the map.
+      // If the thread has claimed an entry try to insert now.
+      else {
+
+        //------------------------------------------------------------
+        // If have not already claimed an unused entry then do so now.
+        if (new_index == invalid_index) {
+
+          bool found = false;
+          // use the hash_list as the flag for the search direction
+          Kokkos::tie(found, index_hint) = m_available_indexes.find_any_unset_near( index_hint, hash_list );
+
+          // No unset index was found and the attempt budget is exhausted: record the failure and stop
+          if ( !found && ++find_attempts >= max_attempts ) {
+            failed_insert_ref = true;
+            not_done = false ;
+          }
+          else if (m_available_indexes.set(index_hint) ) {  // set() succeeded: index_hint is now claimed by this thread
+            new_index = index_hint;
+            // Set key and value
+            KOKKOS_NONTEMPORAL_PREFETCH_STORE(&m_keys[new_index]);
+            m_keys[new_index] = k ;
+
+            if (!is_set) {
+              KOKKOS_NONTEMPORAL_PREFETCH_STORE(&m_values[new_index]);
+              m_values[new_index] = v ;
+            }
+
+            // Do not proceed until key and value are updated in global memory
+            memory_fence();
+          }
+        }
+        else if (failed_insert_ref) {  // a failed insert has been flagged: make this the last attempt
+          not_done = false;
+        }
+
+        // Attempt to append claimed entry into the list.
+        // Another thread may also be trying to append the same list so protect with atomic.
+        if ( new_index != invalid_index &&
+             curr ==  atomic_compare_exchange(curr_ptr, static_cast<size_type>(invalid_index), new_index) ) {
+          // Succeeded in appending
+          result.set_success(new_index);
+          not_done = false ;
+        }
+      }
+    } // while ( not_done )
+
+    return result ;
+  }
+
+  /// Erase key \c k; only honoured while the erasable flag is set.
+  KOKKOS_INLINE_FUNCTION
+  bool erase(key_type const& k) const
+  {
+    // Requires an insertable, non-empty map whose erasable flag is set.
+    if ( !(is_insertable_map && 0u < capacity() && m_scalars((int)erasable_idx)) ) {
+      return false;
+    }
+
+    if ( ! m_scalars((int)modified_idx) ) {
+      m_scalars((int)modified_idx) = true;
+    }
+
+    const size_type slot = find(k);
+    if ( !valid_at(slot) ) { return false; }
+
+    // Mark the slot as unused in the index bitset.
+    m_available_indexes.reset(slot);
+    return true;
+  }
+
+  /// \brief Find the given key \c k, if it exists in the table.
+  ///
+  /// \return If the key exists in the table, the index of the
+  ///   value corresponding to that key; otherwise, an invalid index.
+  ///
+  /// This <i>is</i> a device function; it may be called in a parallel
+  /// kernel.
+  KOKKOS_INLINE_FUNCTION
+  size_type find( const key_type & k) const
+  {
+    size_type curr = 0u < capacity() ? m_hash_lists( m_hasher(k) % m_hash_lists.dimension_0() ) : invalid_index ;
+    // curr starts at the head of the hash list chosen by hash(k) modulo the list count (invalid_index when capacity is 0).
+    KOKKOS_NONTEMPORAL_PREFETCH_LOAD(&m_keys[curr != invalid_index ? curr : 0]);
+    while (curr != invalid_index && !m_equal_to( m_keys[curr], k) ) {
+      KOKKOS_NONTEMPORAL_PREFETCH_LOAD(&m_keys[curr != invalid_index ? curr : 0]);
+      curr = m_next_index[curr];
+    }
+    // Either the index whose key compares equal to k, or invalid_index once the chain is exhausted.
+    return curr;
+  }
+
+  /// \brief Does the key exist in the map
+  ///
+  /// This <i>is</i> a device function; it may be called in a parallel
+  /// kernel.
+  KOKKOS_INLINE_FUNCTION
+  bool exists( const key_type & k) const
+  {
+    return valid_at(find(k)); // find() gives invalid_index on a miss; valid_at() then tests that index in the bitset
+  }
+
+
+  /// \brief Fetch the value stored at direct entry index \c i.
+  ///
+  /// \param i [in] Direct index into the entry arrays.  For a set (\c is_set) slot 0
+  ///   is always read; any \c i >= capacity() is redirected to slot capacity().
+  ///
+  /// Device-callable.  Returns by value for sets and const value types ('const
+  /// value_type' via Cuda texture fetch must return by value), else by reference.
+  KOKKOS_FORCEINLINE_FUNCTION
+  typename Impl::if_c< (is_set || has_const_value), impl_value_type, impl_value_type &>::type
+  value_at(size_type i) const
+  {
+    const size_type slot = is_set ? 0 : ( capacity() <= i ? capacity() : i );
+    return m_values[ slot ];
+  }
+
+  /// \brief Fetch, by value, the key stored at direct entry index \c i.
+  ///
+  /// \param i [in] Direct index into the entry arrays; any \c i >= capacity()
+  ///   is redirected to slot capacity().
+  ///
+  /// Device-callable: may be invoked from inside a parallel kernel.
+  KOKKOS_FORCEINLINE_FUNCTION
+  key_type key_at(size_type i) const {
+    const size_type slot = ( capacity() <= i ) ? capacity() : i;
+    return m_keys[ slot ];
+  }
+
+  KOKKOS_FORCEINLINE_FUNCTION
+  bool valid_at(size_type i) const
+  {
+    return m_available_indexes.test(i); // entry i counts as valid exactly when its bit in m_available_indexes is set
+  }
+
+  template <typename SKey, typename SValue>
+  UnorderedMap( UnorderedMap<SKey,SValue,Device,Hasher,EqualTo> const& src, // source map; must share Device, Hasher and EqualTo
+                typename Impl::enable_if< Impl::UnorderedMapCanAssign<declared_key_type,declared_value_type,SKey,SValue>::value,int>::type = 0 // overload is enabled only when UnorderedMapCanAssign permits the key/value conversion
+              )
+    : m_bounded_insert(src.m_bounded_insert) // every member below is copy-constructed from the matching member of src
+    , m_hasher(src.m_hasher)
+    , m_equal_to(src.m_equal_to)
+    , m_size(src.m_size)
+    , m_available_indexes(src.m_available_indexes)
+    , m_hash_lists(src.m_hash_lists)
+    , m_next_index(src.m_next_index)
+    , m_keys(src.m_keys)
+    , m_values(src.m_values)
+    , m_scalars(src.m_scalars)
+  {} // NOTE(review): presumably the view members end up sharing src's allocations rather than duplicating them - confirm
+
+
+  template <typename SKey, typename SValue>
+  typename Impl::enable_if< Impl::UnorderedMapCanAssign<declared_key_type,declared_value_type,SKey,SValue>::value
+                           ,declared_map_type & >::type // enabled only when UnorderedMapCanAssign permits the key/value conversion
+  operator=( UnorderedMap<SKey,SValue,Device,Hasher,EqualTo> const& src)
+  {
+    m_bounded_insert = src.m_bounded_insert; // memberwise assignment: each member takes the value of src's matching member
+    m_hasher = src.m_hasher;
+    m_equal_to = src.m_equal_to;
+    m_size = src.m_size;
+    m_available_indexes = src.m_available_indexes;
+    m_hash_lists = src.m_hash_lists;
+    m_next_index = src.m_next_index;
+    m_keys = src.m_keys;
+    m_values = src.m_values;
+    m_scalars = src.m_scalars;
+    return *this; // allows chained assignment
+  }
+
+  template <typename SKey, typename SValue, typename SDevice>
+  typename Impl::enable_if< Impl::is_same< typename Impl::remove_const<SKey>::type, key_type>::value &&
+                            Impl::is_same< typename Impl::remove_const<SValue>::type, value_type>::value
+                          >::type
+  create_copy_view( UnorderedMap<SKey, SValue, SDevice, Hasher,EqualTo> const& src)
+  {
+    if (m_hash_lists.ptr_on_device() != src.m_hash_lists.ptr_on_device()) {
+      // Only copy when this map does not already refer to src's hash-list storage.
+      insertable_map_type tmp;
+      // Build the copy in a temporary map: plain members first, then freshly allocated views sized like src's.
+      tmp.m_bounded_insert = src.m_bounded_insert;
+      tmp.m_hasher = src.m_hasher;
+      tmp.m_equal_to = src.m_equal_to;
+      tmp.m_size = src.size();
+      tmp.m_available_indexes = bitset_type( src.capacity() );
+      tmp.m_hash_lists        = size_type_view( ViewAllocateWithoutInitializing("UnorderedMap hash list"), src.m_hash_lists.dimension_0() );
+      tmp.m_next_index        = size_type_view( ViewAllocateWithoutInitializing("UnorderedMap next index"), src.m_next_index.dimension_0() );
+      tmp.m_keys              = key_type_view( ViewAllocateWithoutInitializing("UnorderedMap keys"), src.m_keys.dimension_0() );
+      tmp.m_values            = value_type_view( ViewAllocateWithoutInitializing("UnorderedMap values"), src.m_values.dimension_0() );
+      tmp.m_scalars           = scalars_view("UnorderedMap scalars");
+      // The bitset is copied through Kokkos::deep_copy; the remaining arrays are copied as raw bytes below.
+      Kokkos::deep_copy(tmp.m_available_indexes, src.m_available_indexes);
+      // Byte-wise copy from the source device's memory space into this map's memory space.
+      typedef Kokkos::Impl::DeepCopy< typename execution_space::memory_space, typename SDevice::memory_space > raw_deep_copy;
+      // Each call below constructs a temporary DeepCopy object from (destination, source, byte count).
+      raw_deep_copy(tmp.m_hash_lists.ptr_on_device(), src.m_hash_lists.ptr_on_device(), sizeof(size_type)*src.m_hash_lists.dimension_0());
+      raw_deep_copy(tmp.m_next_index.ptr_on_device(), src.m_next_index.ptr_on_device(), sizeof(size_type)*src.m_next_index.dimension_0());
+      raw_deep_copy(tmp.m_keys.ptr_on_device(), src.m_keys.ptr_on_device(), sizeof(key_type)*src.m_keys.dimension_0());
+      if (!is_set) {
+        raw_deep_copy(tmp.m_values.ptr_on_device(), src.m_values.ptr_on_device(), sizeof(impl_value_type)*src.m_values.dimension_0());
+      }
+      raw_deep_copy(tmp.m_scalars.ptr_on_device(), src.m_scalars.ptr_on_device(), sizeof(int)*num_scalars );
+      // Adopt the fully populated temporary as this map's new contents.
+      *this = tmp;
+    }
+  }
+
+  //@}
+private: // private member functions
+
+  bool modified() const
+  {
+    return get_flag(modified_idx); // reads the modified_idx scalar back through get_flag()
+  }
+
+  void set_flag(int flag) const
+  { // Copy the int value 1 from a host variable into m_scalars[flag] in the map's memory space.
+    typedef Kokkos::Impl::DeepCopy< typename execution_space::memory_space, Kokkos::HostSpace > host_to_map_copy;
+    const int on_value = 1;
+    host_to_map_copy(m_scalars.ptr_on_device() + flag, &on_value, sizeof(int));
+  }
+
+  void reset_flag(int flag) const
+  { // Copy the int value 0 from a host variable into m_scalars[flag] in the map's memory space.
+    typedef Kokkos::Impl::DeepCopy< typename execution_space::memory_space, Kokkos::HostSpace > host_to_map_copy;
+    const int off_value = 0;
+    host_to_map_copy(m_scalars.ptr_on_device() + flag, &off_value, sizeof(int));
+  }
+
+  bool get_flag(int flag) const
+  { // Copy m_scalars[flag] from the map's memory space into a host int and report whether it is non-zero.
+    typedef Kokkos::Impl::DeepCopy< Kokkos::HostSpace, typename execution_space::memory_space > map_to_host_copy;
+    int host_value = 0;
+    map_to_host_copy(&host_value, m_scalars.ptr_on_device() + flag, sizeof(int));
+    return host_value != 0;
+  }
+
+  static uint32_t calculate_capacity(uint32_t capacity_hint)
+  { // A zero hint yields 128; otherwise grow the hint by one sixth (~16.7%) and round up to a multiple of 128.
+    const uint32_t grown = static_cast<uint32_t>(7ull*capacity_hint/6u);
+    return capacity_hint ? ((grown + 127u)/128u)*128u : 128u;
+  }
+
+private: // private members
+  bool              m_bounded_insert;
+  hasher_type       m_hasher;
+  equal_to_type     m_equal_to;
+  mutable size_type m_size;
+  bitset_type       m_available_indexes;
+  size_type_view    m_hash_lists;
+  size_type_view    m_next_index;
+  key_type_view     m_keys;
+  value_type_view   m_values;
+  scalars_view      m_scalars;
+
+  template <typename KKey, typename VValue, typename DDevice, typename HHash, typename EEqualTo>
+  friend class UnorderedMap;
+
+  template <typename UMap>
+  friend struct Impl::UnorderedMapErase;
+
+  template <typename UMap>
+  friend struct Impl::UnorderedMapHistogram;
+
+  template <typename UMap>
+  friend struct Impl::UnorderedMapPrint;
+};
+
+// Overload of deep_copy for two UnorderedMap objects with the same Hasher and EqualTo; forwards to dst.create_copy_view(src).
+template <  typename DKey, typename DT, typename DDevice
+          , typename SKey, typename ST, typename SDevice
+          , typename Hasher, typename EqualTo >
+inline void deep_copy(         UnorderedMap<DKey, DT, DDevice, Hasher, EqualTo> & dst
+                       , const UnorderedMap<SKey, ST, SDevice, Hasher, EqualTo> & src )
+{
+  dst.create_copy_view(src);
+}
+
+
+} // namespace Kokkos
+
+#endif //KOKKOS_UNORDERED_MAP_HPP
diff --git a/lib/kokkos/containers/src/Kokkos_Vector.hpp b/lib/kokkos/containers/src/Kokkos_Vector.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..6a360e8d19df8ef1330b43222d602bed58e28a9f
--- /dev/null
+++ b/lib/kokkos/containers/src/Kokkos_Vector.hpp
@@ -0,0 +1,283 @@
+/*
+//@HEADER
+// ************************************************************************
+// 
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+// 
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+// 
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact  H. Carter Edwards (hcedwar@sandia.gov)
+// 
+// ************************************************************************
+//@HEADER
+*/
+
+#ifndef KOKKOS_VECTOR_HPP
+#define KOKKOS_VECTOR_HPP
+
+#include <Kokkos_Core_fwd.hpp>
+#include <Kokkos_DualView.hpp>
+
+/* Drop in replacement for std::vector based on Kokkos::DualView
+ * Most functions only work on the host (it will not compile if called from device kernel)
+ *
+ */
+  namespace Kokkos {
+
+template< class Scalar, class Arg1Type = void>
+class vector : public DualView<Scalar*,LayoutLeft,Arg1Type> {
+public: // std::vector-style member typedefs (previously private by default, so callers could not name them)
+  typedef Scalar value_type;
+  typedef Scalar* pointer;
+  typedef const Scalar* const_pointer;
+  typedef Scalar& reference;             // was Scalar*: a reference typedef must name a reference type
+  typedef const Scalar& const_reference; // was const Scalar*
+  typedef Scalar* iterator;
+  typedef const Scalar* const_iterator;
+
+private:
+  size_t _size;          // number of elements in use (the allocation may be larger)
+  typedef size_t size_type;
+  float _extra_storage;  // factor applied to the requested size whenever storage is (re)allocated
+  typedef DualView<Scalar*,LayoutLeft,Arg1Type> DV;
+
+  // Element access always goes through the host view; with KOKKOS_CUDA_USE_UVM the accessors carry KOKKOS_INLINE_FUNCTION.
+public:
+#ifdef KOKKOS_CUDA_USE_UVM
+  KOKKOS_INLINE_FUNCTION Scalar& operator() (int i) const {return DV::h_view(i);};
+  KOKKOS_INLINE_FUNCTION Scalar& operator[] (int i) const {return DV::h_view(i);};
+#else
+  inline Scalar& operator() (int i) const {return DV::h_view(i);};
+  inline Scalar& operator[] (int i) const {return DV::h_view(i);};
+#endif
+
+  /* Member functions which behave like std::vector functions */
+  // Empty vector: size 0, growth factor 1.1, host side flagged as modified.
+  vector():DV() {
+    _size = 0;
+    _extra_storage = 1.1;
+    DV::modified_host() = 1;
+  };
+
+  // n elements, each set to val (via assign); storage is allocated for n*1.1 elements.
+  vector(int n, Scalar val=Scalar()):DualView<Scalar*,LayoutLeft,Arg1Type>("Vector",size_t(n*(1.1))) {
+    _size = n;
+    _extra_storage = 1.1;
+    DV::modified_host() = 1;
+
+    assign(n,val);
+  }
+
+  // Set the logical size to n, reallocating to n*_extra_storage when n >= capacity().
+  void resize(size_t n) {
+    if(n>=capacity())
+      DV::resize(size_t (n*_extra_storage));
+    _size = n;
+  }
+  // NOTE(review): unlike std::vector::resize(n,val), this forwards to assign() and so overwrites all n elements.
+  void resize(size_t n, const Scalar& val) {
+    assign(n,val);
+  }
+  // Set the size to n and write val into elements [0,n): on the host when modified_host() >= modified_device(), else on the device.
+  void assign (size_t n, const Scalar& val) {
+
+    /* Resize if necessary (behaviour of std::vector) */
+
+    if(n>capacity())
+      DV::resize(size_t (n*_extra_storage));
+    _size = n;
+
+          /* Assign value either on host or on device */
+
+    if( DV::modified_host() >= DV::modified_device() ) {
+      set_functor_host f(DV::h_view,val);
+      parallel_for(n,f);
+      DV::t_host::execution_space::fence();
+      DV::modified_host()++;
+    } else {
+      set_functor f(DV::d_view,val);
+      parallel_for(n,f);
+      DV::t_dev::execution_space::fence();
+      DV::modified_device()++;
+    }
+  }
+  // NOTE(review): unlike std::vector::reserve, this always calls DV::resize, even when n is below the current capacity.
+  void reserve(size_t n) {
+    DV::resize(size_t (n*_extra_storage));
+  }
+  // Append val through the host view; when full, grow the allocation by _extra_storage (by at least one element).
+  void push_back(Scalar val) {
+    DV::modified_host()++;
+    if(_size == capacity()) {
+      size_t new_size = _size*_extra_storage;
+      if(new_size == _size) new_size++;
+      DV::resize(new_size);
+    }
+
+    DV::h_view(_size) = val;
+    _size++;
+
+  };
+  // Drop the last element by decrementing the size; storage is untouched. On an empty vector the unsigned size wraps.
+  void pop_back() {
+    _size--;
+  };
+  // Forget all elements; only the size is reset, nothing is deallocated here.
+  void clear() {
+    _size = 0;
+  }
+
+  size_type size() const {return _size;};
+  size_type max_size() const {return 2000000000;}
+  size_type capacity() const {return DV::capacity();};
+  bool empty() const {return _size==0;};
+  // Iterators are raw host pointers built from the view's base pointer, so no element is indexed for an empty or completely full view.
+  iterator begin() const {return DV::h_view.ptr_on_device();};
+
+  iterator end() const {return DV::h_view.ptr_on_device() + _size;};
+
+
+  /* std::algorithms which work originally with iterators, here they are implemented as member functions */
+  // Binary search on the host view between start and min(theEnd,_size-1); returns theEnd when upper <= lower. Assumes ascending data.
+  size_t
+  lower_bound (const size_t& start,
+               const size_t& theEnd,
+               const Scalar& comp_val) const
+  {
+    int lower = start; // FIXME (mfh 24 Apr 2014) narrowing conversion
+    int upper = _size > theEnd? theEnd : _size-1; // FIXME (mfh 24 Apr 2014) narrowing conversion
+    if (upper <= lower) {
+      return theEnd;
+    }
+
+    Scalar lower_val = DV::h_view(lower);
+    Scalar upper_val = DV::h_view(upper);
+    size_t idx = (upper+lower)/2;
+    Scalar val = DV::h_view(idx);
+    if(val>upper_val) return upper;
+    if(val<lower_val) return start;
+
+    while(upper>lower) {
+      if(comp_val>val) {
+        lower = ++idx;
+      } else {
+        upper = idx;
+      }
+      idx = (upper+lower)/2;
+      val = DV::h_view(idx);
+    }
+    return idx;
+  }
+  // True when the first _size host elements are in non-decreasing order (vacuously true for sizes 0 and 1).
+  bool is_sorted() {
+    for(size_t i=1;i<_size;i++) { // starts at 1: the old bound _size-1 wrapped around for an empty vector
+      if(DV::h_view(i-1)>DV::h_view(i)) return false;
+    }
+    return true;
+  }
+  // Binary search for val on the host view (assumes ascending order); returns end() when val is absent.
+  iterator find(Scalar val) const {
+    if(_size == 0) return end();
+
+    int upper,lower,current;
+    current = _size/2;
+    upper = _size-1;
+    lower = 0;
+
+    if((val<DV::h_view(0)) || (val>DV::h_view(_size-1)) ) return end();
+
+    while(upper>lower)
+    {
+      if(val>DV::h_view(current)) lower = current+1;
+      else upper = current;
+      current = (upper+lower)/2;
+    }
+
+    if(val==DV::h_view(current)) return &DV::h_view(current);
+    else return end();
+  }
+
+  /* Additional functions for data management */
+  // Unconditional deep copies between the two views; the modified counters are not touched here.
+  void device_to_host(){
+    deep_copy(DV::h_view,DV::d_view);
+  }
+  void host_to_device() const {
+    deep_copy(DV::d_view,DV::h_view);
+  }
+  // Mark one side as the more recently modified one by setting its counter to the other side's counter plus one.
+  void on_host() {
+    DV::modified_host() = DV::modified_device() + 1;
+  }
+  void on_device() {
+    DV::modified_device() = DV::modified_host() + 1;
+  }
+  // Set the growth factor to 1+extra (for example extra=0.1 requests 10% over-allocation).
+  void set_overallocation(float extra) {
+    _extra_storage = 1.0 + extra;
+  }
+
+  // Fill functors used by assign(): each call writes _val into element i of the captured view.
+public:
+  struct set_functor {
+    typedef typename DV::t_dev::execution_space execution_space;
+    typename DV::t_dev _data;
+    Scalar _val;
+
+    set_functor(typename DV::t_dev data, Scalar val) :
+      _data(data),_val(val) {}
+
+    KOKKOS_INLINE_FUNCTION
+    void operator() (const int &i) const {
+      _data(i) = _val;
+    }
+  };
+
+  struct set_functor_host {
+    typedef typename DV::t_host::execution_space execution_space;
+    typename DV::t_host _data;
+    Scalar _val;
+
+    set_functor_host(typename DV::t_host data, Scalar val) :
+      _data(data),_val(val) {}
+
+    KOKKOS_INLINE_FUNCTION
+    void operator() (const int &i) const {
+      _data(i) = _val;
+    }
+  };
+
+};
+
+
+}
+#endif
diff --git a/lib/kokkos/containers/src/impl/Kokkos_Bitset_impl.hpp b/lib/kokkos/containers/src/impl/Kokkos_Bitset_impl.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..df2fbed5a6709ca74edc0628fb45d39238da0ade
--- /dev/null
+++ b/lib/kokkos/containers/src/impl/Kokkos_Bitset_impl.hpp
@@ -0,0 +1,109 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+//
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact  H. Carter Edwards (hcedwar@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#ifndef KOKKOS_BITSET_IMPL_HPP
+#define KOKKOS_BITSET_IMPL_HPP
+
+#include <Kokkos_Macros.hpp>
+#include <impl/Kokkos_BitOps.hpp>
+#include <stdint.h>
+
+#include <cstdio>
+#include <climits>
+#include <iostream>
+#include <iomanip>
+
+namespace Kokkos {
+namespace Impl {
+
+KOKKOS_FORCEINLINE_FUNCTION
+unsigned rotate_right( unsigned i, int r )
+{ // Rotate i right by r bits; r == 0 returns i unchanged, which avoids a left shift by the full width.
+  const int bit_width = static_cast<int>( sizeof(unsigned) * CHAR_BIT );
+  return ( r == 0 ) ? i : ( ( i >> r ) | ( i << ( bit_width - r ) ) );
+}
+
+template < typename Bitset >
+struct BitsetCount
+{
+  typedef Bitset                                                  bitset_type;
+  typedef typename bitset_type::execution_space::execution_space  execution_space;
+  typedef typename bitset_type::size_type                         size_type;
+  typedef size_type                                               value_type;
+  // Reduction functor that totals bit_count() over every block of the bitset.
+  bitset_type m_bitset;
+  // Keeps its own copy of the bitset handle.
+  BitsetCount( bitset_type const& bitset )
+    : m_bitset(bitset)
+  {}
+  // Runs the reduction with one work item per block and returns the total.
+  size_type apply() const
+  {
+    size_type count = 0u;
+    parallel_reduce( m_bitset.m_blocks.dimension_0(), *this, count );
+    return count;
+  }
+  // Reduction identity: a partial count starts at zero.
+  KOKKOS_INLINE_FUNCTION
+  void init( value_type & count ) const
+  {
+    count = 0u;
+  }
+  // Combines two partial counts by addition.
+  KOKKOS_INLINE_FUNCTION
+  void join( volatile value_type & count, const volatile size_type & incr ) const
+  {
+    count += incr;
+  }
+  // Work item i adds the bit count of block i to the running partial count.
+  KOKKOS_INLINE_FUNCTION
+  void operator()( size_type i, value_type & count ) const
+  {
+    count += bit_count( m_bitset.m_blocks[i] );
+  }
+};
+
+} // namespace Impl
+} // namespace Kokkos
+
+#endif // KOKKOS_BITSET_IMPL_HPP
diff --git a/lib/kokkos/containers/src/impl/Kokkos_Functional_impl.hpp b/lib/kokkos/containers/src/impl/Kokkos_Functional_impl.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..c87bb8a3a37cb6820d31bdd691cf447b20bbd185
--- /dev/null
+++ b/lib/kokkos/containers/src/impl/Kokkos_Functional_impl.hpp
@@ -0,0 +1,195 @@
+//@HEADER
+// ************************************************************************
+// 
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+// 
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+// 
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact  H. Carter Edwards (hcedwar@sandia.gov)
+// 
+// ************************************************************************
+//@HEADER
+
+#ifndef KOKKOS_FUNCTIONAL_IMPL_HPP
+#define KOKKOS_FUNCTIONAL_IMPL_HPP
+
+#include <Kokkos_Macros.hpp>
+#include <stdint.h>
+
+namespace Kokkos { namespace Impl {
+
+// MurmurHash3 was written by Austin Appleby, and is placed in the public
+// domain. The author hereby disclaims copyright to this source code.
+KOKKOS_FORCEINLINE_FUNCTION
+uint32_t getblock32 ( const uint8_t * p, int i )
+{
+  // Assemble the i-th 32-bit block from four individual bytes, lowest byte first.
+  // Byte-wise loads avoid the aliasing errors that could otherwise show up with forced inlining.
+  const uint8_t * const block = p + i*4;
+  const uint32_t low_half  = (uint32_t)block[0] | ((uint32_t)block[1] << 8);
+  const uint32_t high_half = ((uint32_t)block[2] << 16) | ((uint32_t)block[3] << 24);
+  return low_half | high_half;
+}
+
+KOKKOS_FORCEINLINE_FUNCTION
+uint32_t rotl32 ( uint32_t x, int8_t r )
+{ return (x << r) | (x >> (32 - r)); } // rotate x left by r bits; r == 0 would shift right by 32, and this file only passes 15 and 13
+
+KOKKOS_FORCEINLINE_FUNCTION
+uint32_t fmix32 ( uint32_t h )
+{
+  // Final mixing step: alternate xor-shifts with 32-bit multiplications.
+  const uint32_t s1 = h  ^ (h  >> 16);
+  const uint32_t m1 = s1 * 0x85ebca6b;
+  const uint32_t s2 = m1 ^ (m1 >> 13);
+  const uint32_t m2 = s2 * 0xc2b2ae35;
+  const uint32_t s3 = m2 ^ (m2 >> 16);
+  return s3;
+}
+
+KOKKOS_INLINE_FUNCTION
+uint32_t MurmurHash3_x86_32 ( const void * key, int len, uint32_t seed )
+{
+  const uint8_t * data = (const uint8_t*)key;
+  const int nblocks = len / 4;
+  // Hashes len bytes starting at key into 32 bits; the running hash state starts from seed.
+  uint32_t h1 = seed;
+  // Multiplicative constants applied to every block and to the tail.
+  const uint32_t c1 = 0xcc9e2d51;
+  const uint32_t c2 = 0x1b873593;
+
+  //----------
+  // body
+  // Consume the input four bytes at a time; each block is mixed and then folded into h1.
+  for(int i=0; i<nblocks; ++i)
+  {
+    uint32_t k1 = getblock32(data,i);
+
+    k1 *= c1;
+    k1 = rotl32(k1,15);
+    k1 *= c2;
+
+    h1 ^= k1;
+    h1 = rotl32(h1,13);
+    h1 = h1*5+0xe6546b64;
+  }
+
+  //----------
+  // tail
+  // The 0-3 bytes left over after the last full block.
+  const uint8_t * tail = (const uint8_t*)(data + nblocks*4);
+
+  uint32_t k1 = 0;
+  // No case ends in a break: case 3 and case 2 fall through, so every remaining byte is gathered before mixing.
+  switch(len & 3)
+  {
+  case 3: k1 ^= tail[2] << 16; // fall through
+  case 2: k1 ^= tail[1] << 8;  // fall through
+  case 1: k1 ^= tail[0];
+          k1 *= c1; k1 = rotl32(k1,15); k1 *= c2; h1 ^= k1;
+  };
+
+  //----------
+  // finalization
+  // Fold in the length, then apply the final mix.
+  h1 ^= len;
+
+  h1 = fmix32(h1);
+
+  return h1;
+}
+
+
+#if defined( __GNUC__ ) /* GNU C   */ || \
+    defined( __GNUG__ ) /* GNU C++ */ || \
+    defined( __clang__ )
+
+#define KOKKOS_MAY_ALIAS __attribute__((__may_alias__))
+
+#else
+
+#define KOKKOS_MAY_ALIAS
+
+#endif
+
+template <typename T>
+KOKKOS_FORCEINLINE_FUNCTION
+bool bitwise_equal(T const * const a_ptr, T const * const b_ptr)
+{
+  typedef uint64_t KOKKOS_MAY_ALIAS T64;
+  typedef uint32_t KOKKOS_MAY_ALIAS T32;
+  typedef uint16_t KOKKOS_MAY_ALIAS T16;
+  typedef uint8_t  KOKKOS_MAY_ALIAS T8;
+  // How many whole 8/16/32/64-bit units fit in one T (integer division rounds down).
+  enum {
+    NUM_8  = sizeof(T),
+    NUM_16 = NUM_8 / 2,
+    NUM_32 = NUM_8 / 4,
+    NUM_64 = NUM_8 / 8
+  };
+  // Each union lets the same object address be read as any of the unit widths above.
+  union {
+    T   const * const ptr;
+    T64 const * const ptr64;
+    T32 const * const ptr32;
+    T16 const * const ptr16;
+    T8  const * const ptr8;
+  } a = {a_ptr}, b = {b_ptr};
+  // Compares the two objects' bytes: 64-bit words first, then at most one 32-, 16- and 8-bit remainder.
+  bool result = true;
+  // NOTE(review): the wide reads presumably rely on T being suitably aligned - confirm for small-alignment types.
+  for (int i=0; i < NUM_64; ++i) {
+    result = result && a.ptr64[i] == b.ptr64[i];
+  }
+  // A 32-bit unit remains after the 64-bit words when sizeof(T) % 8 >= 4.
+  if ( NUM_64*2 < NUM_32 ) {
+    result = result && a.ptr32[NUM_64*2] == b.ptr32[NUM_64*2];
+  }
+  // A 16-bit unit remains after the 32-bit units when sizeof(T) % 4 >= 2.
+  if ( NUM_32*2 < NUM_16 ) {
+    result = result && a.ptr16[NUM_32*2] == b.ptr16[NUM_32*2];
+  }
+  // A final single byte remains when sizeof(T) is odd.
+  if ( NUM_16*2 < NUM_8 ) {
+    result = result && a.ptr8[NUM_16*2] == b.ptr8[NUM_16*2];
+  }
+
+  return result;
+}
+
+
+
+#undef KOKKOS_MAY_ALIAS
+
+}} // namespace Kokkos::Impl
+
+#endif //KOKKOS_FUNCTIONAL_IMPL_HPP
diff --git a/lib/kokkos/containers/src/impl/Kokkos_StaticCrsGraph_factory.hpp b/lib/kokkos/containers/src/impl/Kokkos_StaticCrsGraph_factory.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..c52fc24359b8f7bd34489d94914ea304f7bc3425
--- /dev/null
+++ b/lib/kokkos/containers/src/impl/Kokkos_StaticCrsGraph_factory.hpp
@@ -0,0 +1,208 @@
+/*
+//@HEADER
+// ************************************************************************
+// 
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+// 
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+// 
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact  H. Carter Edwards (hcedwar@sandia.gov)
+// 
+// ************************************************************************
+//@HEADER
+*/
+
+#ifndef KOKKOS_IMPL_STATICCRSGRAPH_FACTORY_HPP
+#define KOKKOS_IMPL_STATICCRSGRAPH_FACTORY_HPP
+
+//----------------------------------------------------------------------------
+//----------------------------------------------------------------------------
+
+namespace Kokkos {
+
+template< class DataType , class Arg1Type , class Arg2Type , typename SizeType >
+inline
+typename StaticCrsGraph< DataType , Arg1Type , Arg2Type , SizeType >::HostMirror
+create_mirror_view( const StaticCrsGraph<DataType,Arg1Type,Arg2Type,SizeType > & view ,
+                    typename Impl::enable_if< ViewTraits<DataType,Arg1Type,Arg2Type,void>::is_hostspace >::type * = 0 ) // overload selected when ViewTraits reports host space
+{
+  return view ; // no create_mirror or deep_copy call here: the argument itself is returned as the HostMirror type
+}
+
+template< class DataType , class Arg1Type , class Arg2Type , typename SizeType >
+inline
+typename StaticCrsGraph< DataType , Arg1Type , Arg2Type , SizeType >::HostMirror
+create_mirror( const StaticCrsGraph<DataType,Arg1Type,Arg2Type,SizeType > & view )
+{
+  // Always creates fresh mirrors of both views and deep-copies the graph into them.
+  typedef StaticCrsGraph< DataType , Arg1Type , Arg2Type , SizeType >  graph_type ;
+  typedef typename graph_type::HostMirror                              host_graph_type ;
+  typedef typename graph_type::row_map_type::HostMirror                host_row_map_type ;
+
+  host_graph_type host_graph ;
+
+  // Row map: filled through a separately typed mirror handle, because the
+  // graph's own row_map member is an assignment of 'const' from 'non-const'.
+  host_row_map_type host_row_map = create_mirror( view.row_map);
+  host_graph.row_map = host_row_map ;
+  deep_copy( host_row_map , view.row_map );
+
+  // Entries: create the mirror, then copy the data across.
+  host_graph.entries = create_mirror( view.entries );
+  deep_copy( host_graph.entries , view.entries );
+  return host_graph ;
+}
+
+template< class DataType , class Arg1Type , class Arg2Type , typename SizeType >
+inline
+typename StaticCrsGraph< DataType , Arg1Type , Arg2Type , SizeType >::HostMirror
+create_mirror_view( const StaticCrsGraph<DataType,Arg1Type,Arg2Type,SizeType > & view ,
+                    typename Impl::enable_if< ! ViewTraits<DataType,Arg1Type,Arg2Type,void>::is_hostspace >::type * = 0 ) // overload selected when ViewTraits does not report host space
+{
+  return create_mirror( view ); // delegates to create_mirror(), which builds and fills a separate host copy
+}
+} // namespace Kokkos
+
+//----------------------------------------------------------------------------
+//----------------------------------------------------------------------------
+
+namespace Kokkos {
+
+template< class StaticCrsGraphType , class InputSizeType >
+inline
+typename StaticCrsGraphType::staticcrsgraph_type
+create_staticcrsgraph( const std::string & label ,
+                 const std::vector< InputSizeType > & input )
+{
+  // Builds a graph whose row i has room for input[i] entries.  Only the row map is
+  // filled in; the entries view is allocated with the total count and the given label.
+  typedef StaticCrsGraphType                 graph_type ;
+  typedef typename graph_type::entries_type  entries_type ;
+
+  typedef View< typename graph_type::size_type [] ,
+                typename graph_type::array_layout ,
+                typename graph_type::execution_space > offsets_type ;
+
+  const size_t num_rows = input.size();
+
+  // Row offsets are accumulated in a host mirror and then copied into the
+  // work view with a single deep_copy.
+  offsets_type offsets( "tmp" , num_rows + 1 );
+
+  typename offsets_type::HostMirror host_offsets =
+    create_mirror_view( offsets );
+
+  size_t running_total = 0 ;
+  host_offsets[0] = 0 ;
+  for ( size_t row = 0 ; row < num_rows ; ++row ) {
+    running_total += input[row];
+    host_offsets[row+1] = running_total ;
+  }
+
+  deep_copy( offsets , host_offsets );
+
+  // Assemble the result: the entries view is sized by the grand total and
+  // the row map is taken from the offsets view.
+  graph_type graph ;
+  graph.entries = entries_type( label , running_total );
+  graph.row_map = offsets ;
+
+  return graph ;
+}
+
+//----------------------------------------------------------------------------
+
+template< class StaticCrsGraphType , class InputSizeType >
+inline
+typename StaticCrsGraphType::staticcrsgraph_type
+create_staticcrsgraph( const std::string & label ,
+                 const std::vector< std::vector< InputSizeType > > & input )
+{
+  // Builds a graph from nested vectors: input[i] lists the entries of row i.
+  typedef StaticCrsGraphType                 graph_type ;
+  typedef typename graph_type::entries_type  entries_type ;
+
+  static_assert( entries_type::rank == 1
+               , "Graph entries view must be rank one" );
+
+  typedef View< typename graph_type::size_type [] ,
+                typename graph_type::array_layout ,
+                typename graph_type::execution_space > offsets_type ;
+
+  graph_type graph ;
+  const size_t num_rows = input.size();
+
+  // Step 1 - row map: prefix-sum the row lengths in a host mirror, copy the
+  // offsets across, and size the entries view by the grand total.
+  {
+    offsets_type offsets( "tmp" , num_rows + 1 );
+
+    typename offsets_type::HostMirror host_offsets =
+      create_mirror_view( offsets );
+
+    size_t running_total = 0 ;
+    host_offsets[0] = 0 ;
+    for ( size_t row = 0 ; row < num_rows ; ++row ) {
+      running_total += input[row].size();
+      host_offsets[row+1] = running_total ;
+    }
+
+    deep_copy( offsets , host_offsets );
+    graph.entries = entries_type( label , running_total );
+    graph.row_map = offsets ;
+  }
+
+  // Step 2 - entries: flatten the rows, in order, into a host mirror and copy it across.
+  {
+    typename entries_type::HostMirror host_entries =
+      create_mirror_view( graph.entries );
+
+    size_t flat = 0 ;
+    for ( size_t row = 0 ; row < num_rows ; ++row ) {
+      for ( size_t col = 0 ; col < input[row].size() ; ++col , ++flat ) {
+        host_entries( flat ) = input[row][col] ;
+      }
+    }
+
+    deep_copy( graph.entries , host_entries );
+  }
+
+  return graph ;
+}
+
+} // namespace Kokkos
+
+//----------------------------------------------------------------------------
+//----------------------------------------------------------------------------
+
+#endif /* #ifndef KOKKOS_IMPL_STATICCRSGRAPH_FACTORY_HPP */
+
diff --git a/lib/kokkos/containers/src/impl/Kokkos_UnorderedMap_impl.cpp b/lib/kokkos/containers/src/impl/Kokkos_UnorderedMap_impl.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..843fd3a8089999ab80b23506c2206e7a5de325e9
--- /dev/null
+++ b/lib/kokkos/containers/src/impl/Kokkos_UnorderedMap_impl.cpp
@@ -0,0 +1,101 @@
+/*
+//@HEADER
+// ************************************************************************
+// 
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+// 
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+// 
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact  H. Carter Edwards (hcedwar@sandia.gov)
+// 
+// ************************************************************************
+//@HEADER
+*/
+
+#include <Kokkos_UnorderedMap.hpp>
+
+namespace Kokkos { namespace Impl {
+
+// Picks a hash-table size for `size` entries: the smallest tabulated prime that
+// is >= size, 0 for size == 0, and the largest tabulated prime when none fits.
+uint32_t find_hash_size(uint32_t size)
+{
+  if (0u == size) { return 0u; }
+
+  // these primes try to preserve randomness of hash
+  static const uint32_t primes [] = {
+        3, 7, 13, 23, 53, 97, 193, 389, 769, 1543
+      , 2237, 2423, 2617, 2797, 2999, 3167, 3359, 3539
+      , 3727, 3911, 4441 , 4787 , 5119 , 5471 , 5801 , 6143 , 6521 , 6827
+      , 7177 , 7517 , 7853 , 8887 , 9587 , 10243 , 10937 , 11617 , 12289
+      , 12967 , 13649 , 14341 , 15013 , 15727
+      , 17749 , 19121 , 20479 , 21859 , 23209 , 24593 , 25939 , 27329
+      , 28669 , 30047 , 31469 , 35507 , 38231 , 40961 , 43711 , 46439
+      , 49157 , 51893 , 54617 , 57347 , 60077 , 62801 , 70583 , 75619
+      , 80669 , 85703 , 90749 , 95783 , 100823 , 105871 , 110909 , 115963
+      , 120997 , 126031 , 141157 , 151237 , 161323 , 171401 , 181499 , 191579
+      , 201653 , 211741 , 221813 , 231893 , 241979 , 252079
+      , 282311 , 302483 , 322649 , 342803 , 362969 , 383143 , 403301 , 423457
+      , 443629 , 463787 , 483953 , 504121 , 564617 , 604949 , 645313 , 685609
+      , 725939 , 766273 , 806609 , 846931 , 887261 , 927587 , 967919 , 1008239
+      , 1123477 , 1198397 , 1273289 , 1348177 , 1423067 , 1497983 , 1572869
+      , 1647761 , 1722667 , 1797581 , 1872461 , 1947359 , 2022253
+      , 2246953 , 2396759 , 2546543 , 2696363 , 2846161 , 2995973 , 3145739
+      , 3295541 , 3445357 , 3595117 , 3744941 , 3894707 , 4044503
+      , 4493921 , 4793501 , 5093089 , 5392679 , 5692279 , 5991883 , 6291469
+      , 6591059 , 6890641 , 7190243 , 7489829 , 7789447 , 8089033
+      , 8987807 , 9586981 , 10186177 , 10785371 , 11384539 , 11983729
+      , 12582917 , 13182109 , 13781291 , 14380469 , 14979667 , 15578861
+      , 16178053 , 17895707 , 19014187 , 20132683 , 21251141 , 22369661
+      , 23488103 , 24606583 , 25725083 , 26843549 , 27962027 , 29080529
+      , 30198989 , 31317469 , 32435981 , 35791397 , 38028379 , 40265327
+      , 42502283 , 44739259 , 46976221 , 49213237 , 51450131 , 53687099
+      , 55924061 , 58161041 , 60397993 , 62634959 , 64871921
+      , 71582857 , 76056727 , 80530643 , 85004567 , 89478503 , 93952427
+      , 98426347 , 102900263 , 107374217 , 111848111 , 116322053 , 120795971
+      , 125269877 , 129743807 , 143165587 , 152113427 , 161061283 , 170009141
+      , 178956983 , 187904819 , 196852693 , 205800547 , 214748383 , 223696237
+      , 232644089 , 241591943 , 250539763 , 259487603 , 268435399
+  };
+
+  const uint32_t num_primes = sizeof(primes)/sizeof(primes[0]);
+
+  // The table is ascending, so the first entry that fits is the smallest one.
+  for (uint32_t idx = 0u; idx < num_primes; ++idx) {
+    if (primes[idx] >= size) { return primes[idx]; }
+  }
+  // Request exceeds every entry: cap at the largest tabulated prime.
+  return primes[num_primes-1];
+}
+
+}} // namespace Kokkos::Impl
+
diff --git a/lib/kokkos/containers/src/impl/Kokkos_UnorderedMap_impl.hpp b/lib/kokkos/containers/src/impl/Kokkos_UnorderedMap_impl.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..b788c966e9c5a04d0ce4ca626190d241ec273008
--- /dev/null
+++ b/lib/kokkos/containers/src/impl/Kokkos_UnorderedMap_impl.hpp
@@ -0,0 +1,297 @@
+/*
+//@HEADER
+// ************************************************************************
+// 
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+// 
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+// 
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact  H. Carter Edwards (hcedwar@sandia.gov)
+// 
+// ************************************************************************
+//@HEADER
+*/
+
+#ifndef KOKKOS_UNORDERED_MAP_IMPL_HPP
+#define KOKKOS_UNORDERED_MAP_IMPL_HPP
+
+#include <Kokkos_Core_fwd.hpp>
+#include <stdint.h>
+
+#include <cstdio>
+#include <climits>
+#include <iostream>
+#include <iomanip>
+
+namespace Kokkos { namespace Impl {
+
+uint32_t find_hash_size( uint32_t size );
+
+// Functor that re-inserts every valid entry of a source map into a destination map.
+template <typename Map>
+struct UnorderedMapRehash
+{
+  typedef Map                                 map_type;
+  typedef typename map_type::const_map_type   const_map_type;
+  typedef typename map_type::execution_space  execution_space;
+  typedef typename map_type::size_type        size_type;
+
+  map_type       m_dst;
+  const_map_type m_src;
+
+  UnorderedMapRehash( map_type const& dst, const_map_type const& src)
+    : m_dst(dst), m_src(src)
+  {}
+
+  // Launches one work item per slot of the source map's capacity.
+  void apply() const { parallel_for(m_src.capacity(), *this); }
+
+  // Copies slot `slot` into the destination when the source reports it valid.
+  KOKKOS_INLINE_FUNCTION
+  void operator()(size_type slot) const
+  {
+    if ( !m_src.valid_at(slot) ) { return; }
+    m_dst.insert(m_src.key_at(slot), m_src.value_at(slot));
+  }
+
+};
+
+template <typename UMap>
+struct UnorderedMapErase // functor: unlinks entries that fail valid_at() from every hash list
+{
+  typedef UMap map_type;
+  typedef typename map_type::execution_space execution_space;
+  typedef typename map_type::size_type size_type;
+  typedef typename map_type::key_type key_type;
+  typedef typename map_type::impl_value_type value_type;
+  // Map whose hash lists, link array, keys and values are modified in place.
+  map_type m_map;
+
+  UnorderedMapErase( map_type const& map)
+    : m_map(map)
+  {}
+  // Runs operator() once per hash list (extent 0 of m_hash_lists).
+  void apply() const
+  {
+    parallel_for(m_map.m_hash_lists.dimension_0(), *this);
+  }
+  // Walks hash list i and unlinks every node for which valid_at() is false.
+  KOKKOS_INLINE_FUNCTION
+  void operator()( size_type i ) const
+  {
+    const size_type invalid_index = map_type::invalid_index;
+
+    size_type curr = m_map.m_hash_lists(i);
+    size_type next = invalid_index;
+
+    // Pop invalid nodes off the head, resetting their link and key slots.
+    while (curr != invalid_index && !m_map.valid_at(curr)) {
+      next = m_map.m_next_index[curr];
+      m_map.m_next_index[curr] = invalid_index;
+      m_map.m_keys[curr] = key_type();
+      if (m_map.is_set) m_map.m_values[curr] = value_type(); // NOTE(review): value reset is gated on is_set being true -- confirm intent
+      curr = next;
+      m_map.m_hash_lists(i) = next;
+    }
+
+    // Head (if any) is now valid: unlink invalid nodes from the rest of the list.
+    if (curr != invalid_index && m_map.valid_at(curr) ) {
+      size_type prev = curr;
+      curr = m_map.m_next_index[prev];
+
+      while (curr != invalid_index) {
+        next = m_map.m_next_index[curr];
+        if (m_map.valid_at(curr)) {
+          prev = curr;
+        }
+        else {
+          // splice curr out of the list (prev now links to next) and reset its slots
+          m_map.m_next_index[prev] = next;
+          m_map.m_next_index[curr] = invalid_index;
+          m_map.m_keys[curr] = key_type();
+          if (map_type::is_set) m_map.m_values[curr] = value_type();
+        }
+        curr = next;
+      }
+    }
+  }
+};
+
+// Functor that histograms, per hash list, its length and the spread of its entry indices.
+template <typename UMap>
+struct UnorderedMapHistogram
+{
+  typedef UMap map_type;
+  typedef typename map_type::execution_space execution_space;
+  typedef typename map_type::size_type size_type;
+
+  typedef View<int[100], execution_space> histogram_view;
+  typedef typename histogram_view::HostMirror host_histogram_view;
+
+  map_type m_map;
+  histogram_view m_length;          // indexed by list length
+  histogram_view m_distance;        // indexed by (max index - min index) within a list
+  histogram_view m_block_distance;  // indexed by (max index/32 - min index/32)
+
+  UnorderedMapHistogram( map_type const& map)
+    : m_map(map)
+    , m_length("UnorderedMap Histogram")
+    , m_distance("UnorderedMap Histogram")
+    , m_block_distance("UnorderedMap Histogram")
+  {}
+
+  // Accumulates into the three histograms, one work item per hash list.
+  void calculate()
+  {
+    parallel_for(m_map.m_hash_lists.dimension_0(), *this);
+  }
+
+  // Zeroes all three histograms.
+  void clear()
+  {
+    Kokkos::deep_copy(m_length, 0);
+    Kokkos::deep_copy(m_distance, 0);
+    Kokkos::deep_copy(m_block_distance, 0);
+  }
+
+  // The print_* members copy one histogram to the host and write it to `out`.
+  void print_length(std::ostream &out)
+  {
+    write_histogram(out, m_length);
+  }
+
+  void print_distance(std::ostream &out)
+  {
+    write_histogram(out, m_distance);
+  }
+
+  void print_block_distance(std::ostream &out)
+  {
+    write_histogram(out, m_block_distance);
+  }
+
+  // Measures hash list i and bumps the matching bucket of each histogram.
+  KOKKOS_INLINE_FUNCTION
+  void operator()( size_type i ) const
+  {
+    const size_type invalid_index = map_type::invalid_index;
+
+    uint32_t  length    = 0;
+    size_type min_index = ~0u;
+    size_type max_index = 0;
+    size_type curr      = m_map.m_hash_lists(i);
+    while (curr != invalid_index) {
+      ++length;
+      if (curr < min_index) { min_index = curr; }
+      if (max_index < curr) { max_index = curr; }
+      curr = m_map.m_next_index[curr];
+    }
+
+    // Empty lists are not recorded in any histogram.
+    if (0u == length) { return; }
+
+    size_type distance = max_index - min_index;
+    size_type blocks   = max_index/32u - min_index/32u;
+
+    // Clamp to the last bucket (index 99) of the 100-entry views.
+    if (99u < length)   { length   = 99u; }
+    if (99u < distance) { distance = 99u; }
+    if (99u < blocks)   { blocks   = 99u; }
+
+    atomic_fetch_add( &m_length(length), 1);
+    atomic_fetch_add( &m_distance(distance), 1);
+    atomic_fetch_add( &m_block_distance(blocks), 1);
+  }
+
+private:
+  // Shared body of the print_* members: entries are written with a " , "
+  // separator, then three backspaces, three spaces and a newline follow.
+  static void write_histogram(std::ostream &out, histogram_view const& device_hist)
+  {
+    host_histogram_view host_hist = create_mirror_view(device_hist);
+    Kokkos::deep_copy(host_hist, device_hist);
+    for (int k = 0, n = host_hist.dimension_0(); k < n; ++k) {
+      out << host_hist[k] << " , ";
+    }
+    out << "\b\b\b   " << std::endl;
+  }
+};
+
+// Debug functor: prints each node of every hash list with printf.
+template <typename UMap>
+struct UnorderedMapPrint
+{
+  typedef UMap map_type;
+  typedef typename map_type::execution_space execution_space;
+  typedef typename map_type::size_type size_type;
+
+  map_type m_map;
+
+  UnorderedMapPrint( map_type const& map) : m_map(map) {}
+
+  void apply() { parallel_for(m_map.m_hash_lists.dimension_0(), *this); }
+
+  // Prints "head[position]: key->value" for each node of hash list i.
+  KOKKOS_INLINE_FUNCTION
+  void operator()( size_type i ) const
+  {
+    const size_type invalid_index = map_type::invalid_index;
+    uint32_t list = m_map.m_hash_lists(i);
+    size_type ii = 0;
+    size_type curr = list;
+    while (curr != invalid_index) {
+      printf("%d[%d]: %d->%d\n", list, ii, m_map.key_at(curr), m_map.value_at(curr));
+      curr = m_map.m_next_index[curr];
+      ++ii;
+    }
+  }
+};
+
+template <typename DKey, typename DValue, typename SKey, typename SValue>
+struct UnorderedMapCanAssign : public false_ {}; // primary template: false_ unless a specialization below matches
+// <Key, Value> paired with identical <Key, Value>
+template <typename Key, typename Value>
+struct UnorderedMapCanAssign<Key,Value,Key,Value> : public true_ {};
+// <const Key, Value> paired with <Key, Value>
+template <typename Key, typename Value>
+struct UnorderedMapCanAssign<const Key,Value,Key,Value> : public true_ {};
+// <const Key, const Value> paired with <Key, Value>
+template <typename Key, typename Value>
+struct UnorderedMapCanAssign<const Key,const Value,Key,Value> : public true_ {};
+// <const Key, const Value> paired with <const Key, Value>
+template <typename Key, typename Value>
+struct UnorderedMapCanAssign<const Key,const Value,const Key,Value> : public true_ {};
+
+
+}} //Kokkos::Impl
+
+#endif // KOKKOS_UNORDERED_MAP_IMPL_HPP
diff --git a/lib/kokkos/containers/unit_tests/CMakeLists.txt b/lib/kokkos/containers/unit_tests/CMakeLists.txt
new file mode 100644
index 0000000000000000000000000000000000000000..7fff0f835bb2e704914fe5df16556d6c4199a916
--- /dev/null
+++ b/lib/kokkos/containers/unit_tests/CMakeLists.txt
@@ -0,0 +1,40 @@
+
+# Unit-test executable for the containers unit_tests directory.
+include_directories(
+  ${CMAKE_CURRENT_BINARY_DIR}
+  ${CMAKE_CURRENT_SOURCE_DIR}
+  ${CMAKE_CURRENT_SOURCE_DIR}/../src
+)
+
+# Always-compiled sources.  NOTE(review): TestCuda.cpp is not gated on a CUDA
+# option here; presumably the file guards its own contents -- confirm.
+set(SOURCES
+  UnitTestMain.cpp
+  TestCuda.cpp
+)
+
+set(LIBRARIES kokkoscore)
+
+# Back-end specific test drivers, appended in a fixed order.
+if(Kokkos_ENABLE_Pthread)
+  list(APPEND SOURCES TestThreads.cpp)
+endif()
+
+if(Kokkos_ENABLE_Serial)
+  list(APPEND SOURCES TestSerial.cpp)
+endif()
+
+if(Kokkos_ENABLE_OpenMP)
+  list(APPEND SOURCES TestOpenMP.cpp)
+endif()
+
+# Build the test binary and register it as a test.
+tribits_add_executable_and_test(
+  UnitTest
+  SOURCES ${SOURCES}
+  COMM serial mpi
+  NUM_MPI_PROCS 1
+  FAIL_REGULAR_EXPRESSION "  FAILED  "
+  TESTONLYLIBS kokkos_gtest
+)
+  
diff --git a/lib/kokkos/containers/unit_tests/Makefile b/lib/kokkos/containers/unit_tests/Makefile
new file mode 100644
index 0000000000000000000000000000000000000000..48e3ff61d04b9de210a7f1976217f4d1aca9e8e8
--- /dev/null
+++ b/lib/kokkos/containers/unit_tests/Makefile
@@ -0,0 +1,92 @@
+KOKKOS_PATH = ../..
+
+GTEST_PATH = ../../TPL/gtest
+
+vpath %.cpp ${KOKKOS_PATH}/containers/unit_tests
+# Entry target: builds every enabled test binary, then prints a marker.
+default: build_all
+	echo "End Build"
+
+# NOTE(review): presumably supplies the KOKKOS_* variables used below -- confirm.
+include $(KOKKOS_PATH)/Makefile.kokkos
+# Tool defaults; a CUDA build forces CXX to the nvcc wrapper and links with it.
+ifeq ($(KOKKOS_INTERNAL_USE_CUDA), 1)
+	CXX = $(NVCC_WRAPPER)
+	CXXFLAGS ?= -O3
+	LINK = $(CXX)
+	LDFLAGS ?= -lpthread
+else
+	CXX ?= g++
+	CXXFLAGS ?= -O3
+	LINK ?= $(CXX)
+	LDFLAGS ?= -lpthread
+endif
+
+KOKKOS_CXXFLAGS += -I$(GTEST_PATH) -I${KOKKOS_PATH}/containers/unit_tests
+
+TEST_TARGETS = 
+TARGETS = 
+# One test binary and one test-* runner per enabled back end.
+ifeq ($(KOKKOS_INTERNAL_USE_CUDA), 1)
+	OBJ_CUDA = TestCuda.o UnitTestMain.o gtest-all.o
+	TARGETS += KokkosContainers_UnitTest_Cuda
+	TEST_TARGETS += test-cuda
+endif
+
+ifeq ($(KOKKOS_INTERNAL_USE_PTHREADS), 1)
+	OBJ_THREADS = TestThreads.o UnitTestMain.o gtest-all.o
+	TARGETS += KokkosContainers_UnitTest_Threads
+	TEST_TARGETS += test-threads
+endif
+
+ifeq ($(KOKKOS_INTERNAL_USE_OPENMP), 1)
+	OBJ_OPENMP = TestOpenMP.o UnitTestMain.o gtest-all.o
+	TARGETS += KokkosContainers_UnitTest_OpenMP
+	TEST_TARGETS += test-openmp
+endif
+
+ifeq ($(KOKKOS_INTERNAL_USE_SERIAL), 1)
+	OBJ_SERIAL = TestSerial.o UnitTestMain.o gtest-all.o
+	TARGETS += KokkosContainers_UnitTest_Serial
+	TEST_TARGETS += test-serial
+endif
+# Link rules, one per back-end binary.
+KokkosContainers_UnitTest_Cuda: $(OBJ_CUDA) $(KOKKOS_LINK_DEPENDS)
+	$(LINK) $(KOKKOS_LDFLAGS) $(LDFLAGS) $(EXTRA_PATH) $(OBJ_CUDA) $(KOKKOS_LIBS) $(LIB) -o KokkosContainers_UnitTest_Cuda
+
+KokkosContainers_UnitTest_Threads: $(OBJ_THREADS) $(KOKKOS_LINK_DEPENDS)
+	$(LINK) $(KOKKOS_LDFLAGS) $(LDFLAGS) $(EXTRA_PATH) $(OBJ_THREADS) $(KOKKOS_LIBS) $(LIB) -o KokkosContainers_UnitTest_Threads
+
+KokkosContainers_UnitTest_OpenMP: $(OBJ_OPENMP) $(KOKKOS_LINK_DEPENDS)
+	$(LINK) $(KOKKOS_LDFLAGS) $(LDFLAGS) $(EXTRA_PATH) $(OBJ_OPENMP) $(KOKKOS_LIBS) $(LIB) -o KokkosContainers_UnitTest_OpenMP
+
+KokkosContainers_UnitTest_Serial: $(OBJ_SERIAL) $(KOKKOS_LINK_DEPENDS)
+	$(LINK) $(KOKKOS_LDFLAGS) $(LDFLAGS) $(EXTRA_PATH) $(OBJ_SERIAL) $(KOKKOS_LIBS) $(LIB) -o KokkosContainers_UnitTest_Serial
+# Runners: each one executes the matching binary from the current directory.
+test-cuda: KokkosContainers_UnitTest_Cuda
+	./KokkosContainers_UnitTest_Cuda
+
+test-threads: KokkosContainers_UnitTest_Threads
+	./KokkosContainers_UnitTest_Threads
+
+test-openmp: KokkosContainers_UnitTest_OpenMP
+	./KokkosContainers_UnitTest_OpenMP
+
+test-serial: KokkosContainers_UnitTest_Serial
+	./KokkosContainers_UnitTest_Serial
+
+# These targets name actions, not files; keep a stray file from masking them.
+.PHONY: default build_all test clean test-cuda test-threads test-openmp test-serial
+build_all: $(TARGETS)
+test: $(TEST_TARGETS)
+clean: kokkos-clean 
+	rm -f *.o $(TARGETS)
+
+# Compilation rules
+
+%.o:%.cpp $(KOKKOS_CPP_DEPENDS)
+	$(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) $(EXTRA_INC) -c $<
+
+gtest-all.o:$(GTEST_PATH)/gtest/gtest-all.cc 
+	$(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) $(EXTRA_INC) -c $(GTEST_PATH)/gtest/gtest-all.cc
+
diff --git a/lib/kokkos/containers/unit_tests/TestBitset.hpp b/lib/kokkos/containers/unit_tests/TestBitset.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..76fb30edcb68aa37f7beb55352212211bcf586c3
--- /dev/null
+++ b/lib/kokkos/containers/unit_tests/TestBitset.hpp
@@ -0,0 +1,285 @@
+//@HEADER
+// ************************************************************************
+// 
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+// 
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+// 
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact  H. Carter Edwards (hcedwar@sandia.gov)
+// 
+// ************************************************************************
+//@HEADER
+
+#ifndef KOKKOS_TEST_BITSET_HPP
+#define KOKKOS_TEST_BITSET_HPP
+
+#include <gtest/gtest.h>
+#include <iostream>
+
+
+namespace Test {
+
+namespace Impl {
+
+// Reduction functor: counts successful set(i) (Set == true) or reset(i) (Set == false) calls.
+template <typename Bitset, bool Set>
+struct TestBitset
+{
+  typedef Bitset bitset_type;
+  typedef typename bitset_type::execution_space execution_space;
+  typedef uint32_t value_type;
+
+  bitset_type m_bitset;
+
+  TestBitset( bitset_type const& bitset)
+    : m_bitset(bitset) {}
+
+  // Fences, then reduces over size()*collisions work items so that every bit
+  // index is visited `collisions` times; returns the reduced count.
+  unsigned testit(unsigned collisions)
+  {
+    execution_space::fence();
+
+    unsigned successes = 0;
+    Kokkos::parallel_reduce( m_bitset.size()*collisions, *this, successes);
+    return successes;
+  }
+
+
+  KOKKOS_INLINE_FUNCTION
+  void init( value_type & v ) const { v = 0; }  // reduction identity
+
+  KOKKOS_INLINE_FUNCTION
+  void join( volatile value_type & dst, const volatile value_type & src ) const
+  { dst += src; }
+
+  // Maps work item i onto bit (i % size()); counts the call only when
+  // set()/reset() reports success and test() then agrees with the new state.
+  KOKKOS_INLINE_FUNCTION
+  void operator()(uint32_t i, value_type & v) const
+  {
+    const uint32_t bit = i % m_bitset.size();
+    if (Set) {
+      if (m_bitset.set(bit) && m_bitset.test(bit)) { ++v; }
+    }
+    else {
+      if (m_bitset.reset(bit) && !m_bitset.test(bit)) { ++v; }
+    }
+  }
+
+};
+
+// Reduction functor: counts the bit indices for which test(i) is true.
+template <typename Bitset>
+struct TestBitsetTest
+{
+  typedef Bitset bitset_type;
+  typedef typename bitset_type::execution_space execution_space;
+  typedef uint32_t value_type;
+
+  bitset_type m_bitset;
+
+  TestBitsetTest( bitset_type const& bitset) : m_bitset(bitset) {}
+
+  // Fences the execution space, then reduces over every bit index.
+  unsigned testit()
+  {
+    execution_space::fence();
+    unsigned num_set = 0;
+    Kokkos::parallel_reduce( m_bitset.size(), *this, num_set);
+    return num_set;
+  }
+
+  KOKKOS_INLINE_FUNCTION
+  void init( value_type & v ) const { v = 0; }  // reduction identity
+
+  // Merges two partial counts.
+  KOKKOS_INLINE_FUNCTION
+  void join( volatile value_type & dst, const volatile value_type & src ) const
+  { dst += src; }
+
+  // Adds one when bit i is set.
+  KOKKOS_INLINE_FUNCTION
+  void operator()(uint32_t i, value_type & v) const
+  {
+    if (m_bitset.test( i )) { v += 1u; }
+  }
+};
+
+// Reduction functor: each work item tries to change one bit located by a search
+// (Set == true: find an unset bit and set it; Set == false: find a set bit and reset it).
+template <typename Bitset, bool Set>
+struct TestBitsetAny
+{
+  typedef Bitset bitset_type;
+  typedef typename bitset_type::execution_space execution_space;
+  typedef uint32_t value_type;
+
+  bitset_type m_bitset;
+
+  TestBitsetAny( bitset_type const& bitset)
+    : m_bitset(bitset)
+  {}
+
+  // Fences, then reduces over size() work items; returns how many changed a bit.
+  unsigned testit()
+  {
+    execution_space::fence();
+
+    unsigned claimed = 0;
+    Kokkos::parallel_reduce( m_bitset.size(), *this, claimed);
+    return claimed;
+  }
+
+  KOKKOS_INLINE_FUNCTION
+  void init( value_type & v ) const { v = 0; }
+
+  KOKKOS_INLINE_FUNCTION
+  void join( volatile value_type & dst, const volatile value_type & src ) const
+  { dst += src; }
+
+  // Retries until a bit is changed or the search has failed max_hint() times.
+  KOKKOS_INLINE_FUNCTION
+  void operator()(uint32_t i, value_type & v) const
+  {
+    bool found = false;
+    unsigned failed_searches = 0;
+    uint32_t hint = (i >> 4) << 4;  // i rounded down to a multiple of 16
+    while (failed_searches < m_bitset.max_hint()) {
+      bool claimed = false;
+      if (Set) {
+        Kokkos::tie(found, hint) = m_bitset.find_any_unset_near(hint, i);
+        claimed = found && m_bitset.set(hint);
+      }
+      else {
+        Kokkos::tie(found, hint) = m_bitset.find_any_set_near(hint, i);
+        claimed = found && m_bitset.reset(hint);
+      }
+
+      if (claimed) {
+        ++v;
+        break;
+      }
+      // Found but set()/reset() returned false: loop again without counting a failure.
+      if (!found) {
+        ++failed_searches;
+      }
+    }
+  }
+};
+} // namespace Impl
+
+
+
+// Drives the bitset functors on Device for several sizes: bulk set()/reset(),
+// per-bit set(i)/reset(i) with repeated hits, and the find_any_*_near searches.
+template <typename Device>
+void test_bitset()
+{
+  typedef Kokkos::Bitset< Device > bitset_type;
+  typedef Kokkos::ConstBitset< Device > const_bitset_type;
+
+  // A size of 0 is not part of the exercised list.
+  unsigned test_sizes[] = { 1000u, 1u<<14, 1u<<16, 10000001 };
+  const int num_sizes = sizeof(test_sizes)/sizeof(test_sizes[0]);
+
+  for (int s = 0; s < num_sizes; ++s) {
+
+    bitset_type bitset(test_sizes[s]);
+
+    // Freshly constructed: the counting functor and count() must both
+    // report that no bit is set.
+    {
+      Impl::TestBitsetTest< bitset_type > counter(bitset);
+      uint32_t count = counter.testit();
+      EXPECT_EQ(0u, count);
+      EXPECT_EQ(count, bitset.count());
+    }
+
+    // Bulk set(): every bit must read as set, here checked through the
+    // ConstBitset type.
+    bitset.set();
+    {
+      Impl::TestBitsetTest< const_bitset_type > counter(bitset);
+      uint32_t count = counter.testit();
+      EXPECT_EQ(bitset.size(), count);
+      EXPECT_EQ(count, bitset.count());
+    }
+
+    // Bulk reset(): nothing may remain set.
+    bitset.reset();
+    EXPECT_EQ(0u, bitset.count());
+
+    // set(i) with 10 visits per bit: the functor is expected to report
+    // exactly size() successes and leave every bit set.
+    {
+      Impl::TestBitset< bitset_type, true > setter(bitset);
+      uint32_t count = setter.testit(10u);
+      EXPECT_EQ( bitset.size(), bitset.count());
+      EXPECT_EQ( bitset.size(), count );
+    }
+
+    // reset(i) with 10 visits per bit: again size() successes, and an
+    // empty bitset afterwards.
+    {
+      Impl::TestBitset< bitset_type, false > resetter(bitset);
+      uint32_t count = resetter.testit(10u);
+      EXPECT_EQ( bitset.size(), count);
+      EXPECT_EQ( 0u, bitset.count() );
+    }
+
+    // find_any_unset_near() followed by set(): the size() work items are
+    // expected to fill the whole bitset.
+    {
+      Impl::TestBitsetAny< bitset_type, true > claimer(bitset);
+      uint32_t count = claimer.testit();
+      EXPECT_EQ( bitset.size(), bitset.count());
+      EXPECT_EQ( bitset.size(), count );
+    }
+
+    // find_any_set_near() followed by reset(): the size() work items are
+    // expected to empty it again.
+    {
+      Impl::TestBitsetAny< bitset_type, false > releaser(bitset);
+      uint32_t count = releaser.testit();
+      EXPECT_EQ( bitset.size(), count);
+      EXPECT_EQ( 0u, bitset.count() );
+    }
+
+  }
+
+}
+
+} // namespace Test
+
+#endif //KOKKOS_TEST_BITSET_HPP
+
diff --git a/lib/kokkos/containers/unit_tests/TestComplex.hpp b/lib/kokkos/containers/unit_tests/TestComplex.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..94c04b61f46759d91f0738723d487980c8cb2a83
--- /dev/null
+++ b/lib/kokkos/containers/unit_tests/TestComplex.hpp
@@ -0,0 +1,263 @@
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+//
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact  H. Carter Edwards (hcedwar@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+
+
+#ifndef KOKKOS_TEST_COMPLEX_HPP
+#define KOKKOS_TEST_COMPLEX_HPP
+
+#include <Kokkos_Complex.hpp>
+#include <gtest/gtest.h>
+#include <iostream>
+
+namespace Test {
+
+namespace Impl {
+  // Host-side checks of construction, (in)equality and assignment for Kokkos::complex.
+  template <typename RealType>
+  void testComplexConstructors () {
+    typedef Kokkos::complex<RealType> complex_type;
+
+    complex_type dflt;                    // default-constructed
+    complex_type zero (0.0, 0.0);
+    complex_type one (1.0, 0.0);
+    complex_type imag_unit (0.0, 1.0);
+    complex_type neg (-1.0, -2.0);
+
+    ASSERT_TRUE( dflt == zero );
+    ASSERT_TRUE( dflt != one );
+    ASSERT_TRUE( dflt != imag_unit );
+    ASSERT_TRUE( dflt != neg );
+
+    ASSERT_TRUE( zero != one );
+    ASSERT_TRUE( zero != imag_unit );
+    ASSERT_TRUE( zero != neg );
+
+    ASSERT_TRUE( one != imag_unit );
+    ASSERT_TRUE( one != neg );
+
+    complex_type neg_twin (-1.0, -2.0);
+    ASSERT_TRUE( neg == neg_twin );
+
+    // Value semantics: assignment replaces the stored value, equality compares values.
+    dflt = complex_type (-3.0, -4.0);
+    ASSERT_TRUE( dflt.real () == -3.0 );
+    ASSERT_TRUE( dflt.imag () == -4.0 );
+    ASSERT_TRUE( dflt != zero );
+
+    // Single-argument construction: real part given, imaginary part expected to be 0.
+    complex_type from_real (1.0);
+    ASSERT_TRUE( one == from_real );
+    ASSERT_TRUE( from_real == 1.0 );
+    ASSERT_TRUE( from_real != -1.0 );
+
+    from_real = complex_type (5.0);
+    ASSERT_TRUE( from_real.real () == 5.0 );
+    ASSERT_TRUE( from_real.imag () == 0.0 );
+  }
+
+  // Addition check: (1 - i) + (-1 + i) must compare equal to zero.
+  template <typename RealType>
+  void testPlus () {
+    typedef Kokkos::complex<RealType> complex_type;
+    complex_type lhs (1.0, -1.0);
+    complex_type rhs (-1.0, 1.0);
+    complex_type total = lhs + rhs;
+    ASSERT_TRUE( total == complex_type (0.0, 0.0) );
+  }
+
+  // Checks binary minus and unary negation of Kokkos::complex.
+  template <typename RealType>
+  void testMinus () {
+    typedef Kokkos::complex<RealType> complex_type;
+    // Binary minus: (1 - i) - (-1 + i) == 2 - 2i.
+    complex_type z1 (1.0, -1.0);
+    complex_type z2 (-1.0, 1.0);
+    complex_type z3 = z1 - z2;
+    ASSERT_TRUE( z3 == complex_type (2.0, -2.0) );
+
+    // Unary minus: negate z4 (not z1, whose negation is -1 + i, not -3 + 4i).
+    complex_type z4 (3.0, -4.0);
+    ASSERT_TRUE( -z4 == complex_type (-3.0, 4.0) );
+  }
+
+  // Multiplication checks; (1 - i)(-1 + i) == 2i in both operand combinations.
+  template <typename RealType>
+  void testTimes () {
+    typedef Kokkos::complex<RealType> complex_type;
+    const complex_type expected (0.0, 2.0);
+    complex_type a (1.0, -1.0);
+    complex_type b (-1.0, 1.0);
+    complex_type kokkos_product = a * b;
+    ASSERT_TRUE( kokkos_product == expected );
+    // The left operand may also be a std::complex.
+    std::complex<RealType> b_std (-1.0, 1.0);
+    complex_type mixed_product = b_std * a;
+    ASSERT_TRUE( mixed_product == expected );
+  }
+
+  // Division checks: complex / real, and complex / complex via the conjugate identity.
+  template <typename RealType>
+  void testDivide () {
+    typedef Kokkos::complex<RealType> complex_type;
+
+    // (1 - i) / 2 == 1/2 - i/2
+    complex_type one_minus_i (1.0, -1.0);
+    complex_type half (1.0 / 2.0, -1.0 / 2.0);
+    ASSERT_TRUE( one_minus_i / 2.0 == half );
+
+    // numer * conj(denom) == (-1 + 2i)(1 + i) == -3 + i
+    complex_type numer (-1.0, 2.0);
+    complex_type denom (1.0, -1.0);
+    complex_type numer_conj_denom (-3.0, 1.0);
+    ASSERT_TRUE(numer * Kokkos::conj (denom) == numer_conj_denom );
+
+    // numer / denom == numer * conj(denom) / |denom|^2, and |1 - i|^2 == 2.
+    // This assumes that RealType is a floating-point type.
+    complex_type expected (Kokkos::real (numer_conj_denom) / 2.0,
+                           Kokkos::imag (numer_conj_denom) / 2.0);
+
+    complex_type quotient = numer / denom;
+    ASSERT_TRUE( quotient == expected );
+  }
+
+  // Host-side checks for one RealType. NOTE(review): testMinus<RealType> is never invoked here -- confirm intent.
+  template <typename RealType>
+  void testOutsideKernel () {
+    testComplexConstructors<RealType> ();
+    testPlus<RealType> ();
+    testTimes<RealType> ();
+    testDivide<RealType> ();
+  }
+
+  // Allocates a 10-entry View of complex values and assigns it to non-const and const Views.
+  template<typename RealType, typename Device>
+  void testCreateView () {
+    typedef Kokkos::complex<RealType> complex_type;
+    typedef Kokkos::View<complex_type*, Device> view_type;
+    view_type x ("x", 10);
+    ASSERT_TRUE( x.dimension_0 () == 10 );
+    view_type x_nonconst = x;
+    Kokkos::View<const complex_type*, Device> x_const = x;
+  }
+
+  // Functor: assigns one constant complex value to the View entry it is called with.
+  template<typename RealType, typename Device>
+  class Fill {
+  public:
+    typedef typename Device::execution_space execution_space;
+    typedef Kokkos::View<Kokkos::complex<RealType>*, Device> view_type;
+    typedef typename view_type::size_type size_type;
+
+    Fill (const view_type& x, const Kokkos::complex<RealType>& val)
+      : x_ (x), val_ (val) {}
+
+    // Writes the stored value into entry i.
+    KOKKOS_INLINE_FUNCTION
+    void operator () (const size_type i) const {
+      x_(i) = val_;
+    }
+
+  private:
+    view_type x_;                          // destination entries
+    const Kokkos::complex<RealType> val_;  // value written to each entry
+  };
+
+  // Reduction functor: adds the View entry it is called with to a running complex sum.
+  template<typename RealType, typename Device>
+  class Sum {
+  public:
+    typedef typename Device::execution_space execution_space;
+    typedef Kokkos::complex<RealType> value_type;
+    typedef Kokkos::View<const value_type*, Device> view_type;
+    typedef typename view_type::size_type size_type;
+
+    Sum (const view_type& x) : x_ (x) {}
+
+    KOKKOS_INLINE_FUNCTION
+    void operator () (const size_type i, value_type& partial) const {
+      partial += x_(i);
+    }
+
+  private:
+    view_type x_;
+  };
+
+  // Fills a 1000-entry View with (1, -1) on Device and checks the reduced sum.
+  template<typename RealType, typename Device>
+  void testInsideKernel () {
+    typedef Kokkos::complex<RealType> complex_type;
+    typedef Kokkos::View<complex_type*, Device> view_type;
+    typedef typename view_type::size_type size_type;
+
+    const size_type N = 1000;
+    view_type x ("x", N);
+    ASSERT_TRUE( x.dimension_0 () == N );
+
+    // Every entry becomes 1 - i ...
+    Fill<RealType, Device> fill (x, complex_type (1.0, -1.0));
+    Kokkos::parallel_for (N, fill);
+
+    // ... so the sum over N == 1000 entries must be 1000 - 1000i.
+    complex_type sum;
+    Sum<RealType, Device> reducer (x);
+    Kokkos::parallel_reduce (N, reducer, sum);
+    ASSERT_TRUE( sum.real () == 1000.0 && sum.imag () == -1000.0 );
+  }
+} // namespace Impl
+
+
+template <typename Device>
+void testComplex () // entry point: runs every complex check for float and double
+{
+  Impl::testOutsideKernel<float> ();
+  Impl::testOutsideKernel<double> ();
+  // View creation and assignment on Device.
+  Impl::testCreateView<float, Device> ();
+  Impl::testCreateView<double, Device> ();
+  // Fill and reduction kernels on Device.
+  Impl::testInsideKernel<float, Device> ();
+  Impl::testInsideKernel<double, Device> ();
+}
+
+
+} // namespace Test
+
+#endif // KOKKOS_TEST_COMPLEX_HPP
diff --git a/lib/kokkos/containers/unit_tests/TestCuda.cpp b/lib/kokkos/containers/unit_tests/TestCuda.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..e30160b24e3a57d927924067d171ee8b49540357
--- /dev/null
+++ b/lib/kokkos/containers/unit_tests/TestCuda.cpp
@@ -0,0 +1,227 @@
+/*
+//@HEADER
+// ************************************************************************
+// 
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+// 
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+// 
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact  H. Carter Edwards (hcedwar@sandia.gov)
+// 
+// ************************************************************************
+//@HEADER
+*/
+
+#include <iostream>
+#include <iomanip>
+#include <stdint.h>
+
+#include <gtest/gtest.h>
+
+#include <Kokkos_Core.hpp>
+
+#include <Kokkos_Bitset.hpp>
+#include <Kokkos_UnorderedMap.hpp>
+#include <Kokkos_Vector.hpp>
+
+#include <TestBitset.hpp>
+#include <TestUnorderedMap.hpp>
+#include <TestStaticCrsGraph.hpp>
+#include <TestVector.hpp>
+#include <TestDualView.hpp>
+#include <TestDynamicView.hpp>
+#include <TestSegmentedView.hpp>
+
+#include <Kokkos_DynRankView.hpp>
+#include <TestDynViewAPI.hpp>
+
+//----------------------------------------------------------------------------
+
+
+#ifdef KOKKOS_HAVE_CUDA
+
+namespace Test {
+
+// Google Test fixture used by every test in this file.  The host execution
+// space and CUDA device 0 are initialized once before the first test of the
+// test case and finalized, in the opposite order, after the last one.
+class cuda : public ::testing::Test {
+protected:
+  static void SetUpTestCase() {
+    // Floating-point output: scientific notation with precision 5.
+    std::cout << std::setprecision(5);
+    std::cout << std::scientific;
+
+    Kokkos::HostSpace::execution_space::initialize();
+    Kokkos::Cuda::initialize( Kokkos::Cuda::SelectDevice(0) );
+  }
+
+  static void TearDownTestCase() {
+    Kokkos::Cuda::finalize();
+    Kokkos::HostSpace::execution_space::finalize();
+  }
+};
+
+// Run the DynRankView API tests with double entries on Kokkos::Cuda.
+TEST_F( cuda, dyn_view_api )
+{
+  TestDynViewAPI< double, Kokkos::Cuda >();
+}
+
+// Run both StaticCrsGraph test routines on Kokkos::Cuda.
+TEST_F( cuda, staticcrsgraph ) {
+  TestStaticCrsGraph::run_test_graph< Kokkos::Cuda >();
+  TestStaticCrsGraph::run_test_graph2< Kokkos::Cuda >();
+}
+
+
+// "close" insert variant: forwards to test_insert on Kokkos::Cuda with the
+// final argument set to true.
+void cuda_test_insert_close( uint32_t num_nodes,
+                             uint32_t num_inserts,
+                             uint32_t num_duplicates )
+{
+  test_insert< Kokkos::Cuda >( num_nodes, num_inserts, num_duplicates, true );
+}
+
+// "far" insert variant: forwards to test_insert on Kokkos::Cuda with the
+// final argument set to false.
+void cuda_test_insert_far( uint32_t num_nodes,
+                           uint32_t num_inserts,
+                           uint32_t num_duplicates )
+{
+  test_insert< Kokkos::Cuda >( num_nodes, num_inserts, num_duplicates, false );
+}
+
+// Forwards to test_failed_insert on Kokkos::Cuda.
+void cuda_test_failed_insert( uint32_t num_nodes ) {
+  test_failed_insert< Kokkos::Cuda >( num_nodes );
+}
+
+// Forwards to test_deep_copy on Kokkos::Cuda.
+void cuda_test_deep_copy( uint32_t num_nodes ) {
+  test_deep_copy< Kokkos::Cuda >( num_nodes );
+}
+
+// Forwards to test_vector_combinations with int entries on Kokkos::Cuda.
+void cuda_test_vector_combinations( unsigned int size ) {
+  test_vector_combinations< int, Kokkos::Cuda >( size );
+}
+
+// Forwards to test_dualview_combinations with int entries on Kokkos::Cuda.
+void cuda_test_dualview_combinations( unsigned int size ) {
+  test_dualview_combinations< int, Kokkos::Cuda >( size );
+}
+
+// Forwards to test_segmented_view with double entries on Kokkos::Cuda.
+void cuda_test_segmented_view( unsigned int size ) {
+  test_segmented_view< double, Kokkos::Cuda >( size );
+}
+
+// Forwards to test_bitset on Kokkos::Cuda.  The only caller in this file
+// is a TEST_F that is currently commented out.
+void cuda_test_bitset() {
+  test_bitset< Kokkos::Cuda >();
+}
+
+
+
+/*TEST_F( cuda, bitset )
+{
+  cuda_test_bitset();
+}*/
+
+// Defines a test of the `cuda` fixture that calls
+// cuda_test_insert_<name>(num_nodes, num_inserts, num_duplicates) `repeat`
+// times.  All five macro arguments are token-pasted into the test name.
+#define CUDA_INSERT_TEST( name, num_nodes, num_inserts, num_duplicates, repeat )                                \
+  TEST_F( cuda, UnorderedMap_insert_##name##_##num_nodes##_##num_inserts##_##num_duplicates##_##repeat##x) {   \
+    for (int i=0; i<repeat; ++i)                                                                                \
+      cuda_test_insert_##name(num_nodes,num_inserts,num_duplicates);                                            \
+  }
+
+// Defines a test that calls cuda_test_failed_insert(num_nodes) `repeat` times.
+#define CUDA_FAILED_INSERT_TEST( num_nodes, repeat )                           \
+  TEST_F( cuda, UnorderedMap_failed_insert_##num_nodes##_##repeat##x) {       \
+    for (int i=0; i<repeat; ++i)                                               \
+      cuda_test_failed_insert(num_nodes);                                      \
+  }
+
+// Defines a test that calls cuda_test_assignment_operators(num_nodes)
+// `repeat` times.
+// NOTE(review): this macro is never instantiated in this file, and no
+// cuda_test_assignment_operators is defined in this file -- confirm whether
+// it is dead code before using it.
+#define CUDA_ASSIGNEMENT_TEST( num_nodes, repeat )                               \
+  TEST_F( cuda, UnorderedMap_assignment_operators_##num_nodes##_##repeat##x) {  \
+    for (int i=0; i<repeat; ++i)                                                 \
+      cuda_test_assignment_operators(num_nodes);                                 \
+  }
+
+// Defines a test that calls cuda_test_deep_copy(num_nodes) `repeat` times.
+// Note that the generated name has no underscore between "deep_copy" and
+// num_nodes (e.g. UnorderedMap_deep_copy10000_1x).
+#define CUDA_DEEP_COPY( num_nodes, repeat )                             \
+  TEST_F( cuda, UnorderedMap_deep_copy##num_nodes##_##repeat##x) {       \
+    for (int i=0; i<repeat; ++i)                                               \
+      cuda_test_deep_copy(num_nodes);                     \
+  }
+
+// Defines a test that calls cuda_test_vector_combinations(size) once.
+#define CUDA_VECTOR_COMBINE_TEST( size )                             \
+  TEST_F( cuda, vector_combination##size##x) {       \
+      cuda_test_vector_combinations(size);                     \
+  }
+
+// Defines a test that calls cuda_test_dualview_combinations(size) once.
+#define CUDA_DUALVIEW_COMBINE_TEST( size )                             \
+  TEST_F( cuda, dualview_combination##size##x) {       \
+      cuda_test_dualview_combinations(size);                     \
+  }
+
+// Defines a test that calls cuda_test_segmented_view(size) once.
+#define CUDA_SEGMENTEDVIEW_TEST( size )                             \
+  TEST_F( cuda, segmentedview_##size##x) {       \
+      cuda_test_segmented_view(size);                     \
+  }
+
+// Instantiate the DualView and Vector tests.
+CUDA_DUALVIEW_COMBINE_TEST( 10 )
+CUDA_VECTOR_COMBINE_TEST( 10 )
+CUDA_VECTOR_COMBINE_TEST( 3057 )
+
+
+// Instantiate the UnorderedMap tests (the insert tests repeat 500 times, the
+// failed-insert test 1000 times) and the segmented-view test.
+CUDA_INSERT_TEST(close,               100000, 90000, 100, 500)
+CUDA_INSERT_TEST(far,                 100000, 90000, 100, 500)
+CUDA_DEEP_COPY( 10000, 1 )
+CUDA_FAILED_INSERT_TEST( 10000, 1000 )
+CUDA_SEGMENTEDVIEW_TEST( 200 )
+
+
+// The helper macros are only needed above; remove them again.
+#undef CUDA_INSERT_TEST
+#undef CUDA_FAILED_INSERT_TEST
+#undef CUDA_ASSIGNEMENT_TEST
+#undef CUDA_DEEP_COPY
+#undef CUDA_VECTOR_COMBINE_TEST
+#undef CUDA_DUALVIEW_COMBINE_TEST
+#undef CUDA_SEGMENTEDVIEW_TEST
+
+
+// Run the DynamicView test in Kokkos::CudaUVMSpace ten times, with the
+// argument growing from 100000 to 100900 in steps of 100.
+TEST_F( cuda, dynamic_view )
+{
+  typedef TestDynamicView< double, Kokkos::CudaUVMSpace > TestDynView;
+
+  for ( int rep = 0; rep < 10; ++rep )
+  {
+    TestDynView::run( 100000 + 100 * rep );
+  }
+}
+
+
+}
+
+#endif  /* #ifdef KOKKOS_HAVE_CUDA */
+
diff --git a/lib/kokkos/containers/unit_tests/TestDualView.hpp b/lib/kokkos/containers/unit_tests/TestDualView.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..e72c69f7d41cf7d493becfcbb863e5f1d9f6679f
--- /dev/null
+++ b/lib/kokkos/containers/unit_tests/TestDualView.hpp
@@ -0,0 +1,121 @@
+/*
+//@HEADER
+// ************************************************************************
+// 
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+// 
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+// 
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact  H. Carter Edwards (hcedwar@sandia.gov)
+// 
+// ************************************************************************
+//@HEADER
+*/
+
+#ifndef KOKKOS_TEST_DUALVIEW_HPP
+#define KOKKOS_TEST_DUALVIEW_HPP
+
+#include <gtest/gtest.h>
+#include <iostream>
+#include <cstdlib>
+#include <cstdio>
+#include <impl/Kokkos_Timer.hpp>
+
+namespace Test {
+
+namespace Impl {
+
+  // Test driver for Kokkos::DualView.  The constructor runs run_me() on a
+  // LayoutLeft DualView<Scalar**> and stores its return value in `result`;
+  // the calling test expects `result` to be 0.
+  template <typename Scalar, class Device>
+  struct test_dualview_combinations
+  {
+    typedef test_dualview_combinations<Scalar,Device> self_type;
+
+    typedef Scalar scalar_type;
+    typedef Device execution_space;
+
+    Scalar reference;  // never written by run_me(); value-initialized by the constructor
+    Scalar result;     // value returned by run_me()
+
+    // Create an n x m ViewType (at least 10 x 3), modify it alternately on
+    // the host and on the device (directly and through a subview), and
+    // return the sum of all host entries minus the expected sum.
+    template <typename ViewType>
+    Scalar run_me(unsigned int n,unsigned int m){
+      // The hard-coded indices used below require at least 9 rows and
+      // 3 columns.
+      if(n<10) n = 10;
+      if(m<3) m = 3;
+      ViewType a("A",n,m);
+
+      // Set every device entry to 1, then bring the host copy up to date.
+      Kokkos::deep_copy( a.d_view , 1 );
+
+      a.template modify<typename ViewType::execution_space>();
+      a.template sync<typename ViewType::host_mirror_space>();
+
+      // Change three entries on the host side.
+      a.h_view(5,1) = 3;
+      a.h_view(6,1) = 4;
+      a.h_view(7,2) = 5;
+      a.template modify<typename ViewType::host_mirror_space>();
+      // Subview of `a`; presumably rows 6..8 of column 0, assuming the pair
+      // ranges are half-open -- TODO confirm against Kokkos::subview.
+      ViewType b = Kokkos::subview(a,std::pair<unsigned int, unsigned int>(6,9),std::pair<unsigned int, unsigned int>(0,1));
+      a.template sync<typename ViewType::execution_space>();
+      b.template modify<typename ViewType::execution_space>();
+
+      // Overwrite the subview on the device side.
+      Kokkos::deep_copy( b.d_view , 2 );
+
+      a.template sync<typename ViewType::host_mirror_space>();
+      // Add up every entry as seen from the host.
+      Scalar count = 0;
+      for(unsigned int i = 0; i<a.d_view.dimension_0(); i++)
+        for(unsigned int j = 0; j<a.d_view.dimension_1(); j++)
+          count += a.h_view(i,j);
+      // Subtract the expected total: n*m ones plus 2 + 4 + 3*2 = 12 for the
+      // modified entries.
+      return count -  a.d_view.dimension_0()*a.d_view.dimension_1()-2-4-3*2;
+    }
+
+
+    // Run the scenario with `size` rows and 3 columns.  Both members are
+    // value-initialized first so that `reference`, which nothing else
+    // writes, never holds an indeterminate value.
+    test_dualview_combinations(unsigned int size)
+      : reference()
+      , result()
+    {
+      result = run_me< Kokkos::DualView<Scalar**,Kokkos::LayoutLeft,Device> >(size,3);
+    }
+
+   };
+
+} // namespace Impl
+
+
+
+
+// Run the DualView modify/sync scenario for the given size and require that
+// the driver's result is 0.
+template <typename Scalar, typename Device>
+void test_dualview_combinations(unsigned int size) {
+  Impl::test_dualview_combinations<Scalar,Device> driver(size);
+
+  ASSERT_EQ( driver.result,0);
+}
+
+
+} // namespace Test
+
+#endif //KOKKOS_TEST_DUALVIEW_HPP
diff --git a/lib/kokkos/containers/unit_tests/TestDynViewAPI.hpp b/lib/kokkos/containers/unit_tests/TestDynViewAPI.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..e71ccc0091f0ad8c67de46fe91b4b08e43dcc27d
--- /dev/null
+++ b/lib/kokkos/containers/unit_tests/TestDynViewAPI.hpp
@@ -0,0 +1,1559 @@
+/*
+//@HEADER
+// ************************************************************************
+// 
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+// 
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+// 
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact  H. Carter Edwards (hcedwar@sandia.gov)
+// 
+// ************************************************************************
+//@HEADER
+*/
+
+#include <gtest/gtest.h>
+
+#include <Kokkos_Core.hpp>
+#include <stdexcept>
+#include <sstream>
+#include <iostream>
+
+/*--------------------------------------------------------------------------*/
+
+
+/*--------------------------------------------------------------------------*/
+
+namespace Test {
+
+// Return view.span() when it is at least view.size(); otherwise return 0.
+template< class T , class ... P >
+size_t allocation_count( const Kokkos::Experimental::DynRankView<T,P...> & view )
+{
+  const size_t num_entries = view.size();
+  const size_t span        = view.span();
+
+  if ( span < num_entries ) { return 0 ; }
+
+  return span ;
+}
+
+/*--------------------------------------------------------------------------*/
+
+// Functor that exercises DynRankView::operator() inside a parallel_for:
+// for each of the N rows it copies the three entries of v1 into v2.
+template< typename T, class DeviceType>
+struct TestViewOperator
+{
+  typedef DeviceType  execution_space ;
+
+  typedef Kokkos::Experimental::DynRankView< T , execution_space > view_type ;
+
+  static const unsigned N = 100 ;
+  static const unsigned D = 3 ;
+
+  const view_type v1 ;
+  const view_type v2 ;
+
+  // Allocate both views with extents N x D.
+  TestViewOperator()
+    : v1( "v1" , N , D )
+    , v2( "v2" , N , D )
+    {}
+
+  // Launch one functor invocation per row.
+  static void testit()
+  {
+    Kokkos::parallel_for( N , TestViewOperator() );
+  }
+
+  // Copy row i of v1 into row i of v2, one column at a time.
+  KOKKOS_INLINE_FUNCTION
+  void operator()( const unsigned i ) const
+  {
+    const unsigned col0 = 0 ;
+    const unsigned col1 = 1 ;
+    const unsigned col2 = 2 ;
+
+    v2(i,col0) = v1(i,col0);
+    v2(i,col1) = v1(i,col1);
+    v2(i,col2) = v1(i,col2);
+  }
+};
+
+/*--------------------------------------------------------------------------*/
+
+// Primary template, declared only; it is specialized below on the Rank
+// argument.
+template< class DataType , class DeviceType , unsigned Rank >
+struct TestViewOperator_LeftAndRight ;
+
+// Rank-7 specialization.  One functor invocation walks a LayoutLeft and a
+// LayoutRight DynRankView and reports errors as bits of the reduction value:
+//   1 : LayoutLeft  element offsets did not strictly increase with i0
+//       innermost, or an offset reached the allocation count;
+//   2 : LayoutRight element offsets did not strictly increase with i6
+//       innermost, or an offset reached the allocation count.
+template< class DataType , class DeviceType >
+struct TestViewOperator_LeftAndRight< DataType , DeviceType , 7 >
+{
+  typedef DeviceType                              execution_space ;
+  typedef typename execution_space::memory_space  memory_space ;
+  typedef typename execution_space::size_type     size_type ;
+
+  typedef int value_type ;
+
+  typedef Kokkos::
+    Experimental::DynRankView< DataType, Kokkos::LayoutLeft, execution_space > left_view ;
+
+  typedef Kokkos::
+    Experimental::DynRankView< DataType, Kokkos::LayoutRight, execution_space > right_view ;
+
+  // Reduction identity: no error bit set.
+  KOKKOS_INLINE_FUNCTION
+  static void init( value_type & update )
+    { update = 0 ; }
+
+  // Combine two partial results by OR-ing their error bits.
+  KOKKOS_INLINE_FUNCTION
+  static void join( volatile value_type & update ,
+                    const volatile value_type & input )
+    { update |= input ; }
+
+  left_view    left ;
+  right_view   right ;
+  long         left_alloc ;
+  long         right_alloc ;
+
+  // Allocate both views with the same extents and record their allocation
+  // counts.
+  TestViewOperator_LeftAndRight(unsigned N0, unsigned N1, unsigned N2, unsigned N3, unsigned N4, unsigned N5, unsigned N6 )
+    : left(  "left" , N0, N1, N2, N3, N4, N5, N6 )
+    , right( "right" , N0, N1, N2, N3, N4, N5, N6 )
+    , left_alloc( allocation_count( left ) )
+    , right_alloc( allocation_count( right ) )
+    {}
+
+  // Build the functor, run it once, and require that no error bit is set.
+  static void testit(unsigned N0, unsigned N1, unsigned N2, unsigned N3, unsigned N4, unsigned N5, unsigned N6 )
+  {
+    int error_flag = 0 ;
+
+    TestViewOperator_LeftAndRight functor(N0, N1, N2, N3, N4, N5, N6 );
+
+    Kokkos::parallel_reduce( 1 , functor , error_flag );
+
+    ASSERT_EQ( error_flag , 0 );
+  }
+
+  KOKKOS_INLINE_FUNCTION
+  void operator()( const size_type , value_type & update ) const
+  {
+    // LayoutLeft view: i0 is the innermost loop.
+    long previous = -1 ;
+    for ( unsigned i6 = 0 ; i6 < unsigned(left.dimension_6()) ; ++i6 )
+    for ( unsigned i5 = 0 ; i5 < unsigned(left.dimension_5()) ; ++i5 )
+    for ( unsigned i4 = 0 ; i4 < unsigned(left.dimension_4()) ; ++i4 )
+    for ( unsigned i3 = 0 ; i3 < unsigned(left.dimension_3()) ; ++i3 )
+    for ( unsigned i2 = 0 ; i2 < unsigned(left.dimension_2()) ; ++i2 )
+    for ( unsigned i1 = 0 ; i1 < unsigned(left.dimension_1()) ; ++i1 )
+    for ( unsigned i0 = 0 ; i0 < unsigned(left.dimension_0()) ; ++i0 )
+    {
+      // Offset of this element from the first element.
+      const long current = & left( i0, i1, i2, i3, i4, i5, i6 ) -
+                           & left(  0,  0,  0,  0,  0,  0,  0 );
+      if ( ! ( previous < current && current < left_alloc ) ) { update |= 1 ; }
+      previous = current ;
+    }
+
+    // LayoutRight view: i6 is the innermost loop.
+    previous = -1 ;
+    for ( unsigned i0 = 0 ; i0 < unsigned(right.dimension_0()) ; ++i0 )
+    for ( unsigned i1 = 0 ; i1 < unsigned(right.dimension_1()) ; ++i1 )
+    for ( unsigned i2 = 0 ; i2 < unsigned(right.dimension_2()) ; ++i2 )
+    for ( unsigned i3 = 0 ; i3 < unsigned(right.dimension_3()) ; ++i3 )
+    for ( unsigned i4 = 0 ; i4 < unsigned(right.dimension_4()) ; ++i4 )
+    for ( unsigned i5 = 0 ; i5 < unsigned(right.dimension_5()) ; ++i5 )
+    for ( unsigned i6 = 0 ; i6 < unsigned(right.dimension_6()) ; ++i6 )
+    {
+      const long current = & right( i0, i1, i2, i3, i4, i5, i6 ) -
+                           & right(  0,  0,  0,  0,  0,  0,  0 );
+      if ( ! ( previous < current && current < right_alloc ) ) { update |= 2 ; }
+      previous = current ;
+    }
+  }
+};
+
+// Rank-6 specialization.  One functor invocation walks a LayoutLeft and a
+// LayoutRight DynRankView and reports errors as bits of the reduction value:
+//   1 : LayoutLeft  element offsets did not strictly increase with i0
+//       innermost, or an offset reached the allocation count;
+//   2 : LayoutRight element offsets did not strictly increase with i5
+//       innermost, or an offset reached the allocation count.
+template< class DataType , class DeviceType >
+struct TestViewOperator_LeftAndRight< DataType , DeviceType , 6 >
+{
+  typedef DeviceType                              execution_space ;
+  typedef typename execution_space::memory_space  memory_space ;
+  typedef typename execution_space::size_type     size_type ;
+
+  typedef int value_type ;
+
+  typedef Kokkos::
+    Experimental::DynRankView< DataType, Kokkos::LayoutLeft, execution_space > left_view ;
+
+  typedef Kokkos::
+    Experimental::DynRankView< DataType, Kokkos::LayoutRight, execution_space > right_view ;
+
+  // Reduction identity: no error bit set.
+  KOKKOS_INLINE_FUNCTION
+  static void init( value_type & update )
+    { update = 0 ; }
+
+  // Combine two partial results by OR-ing their error bits.
+  KOKKOS_INLINE_FUNCTION
+  static void join( volatile value_type & update ,
+                    const volatile value_type & input )
+    { update |= input ; }
+
+  left_view    left ;
+  right_view   right ;
+  long         left_alloc ;
+  long         right_alloc ;
+
+  // Allocate both views with the same extents and record their allocation
+  // counts.
+  TestViewOperator_LeftAndRight(unsigned N0, unsigned N1, unsigned N2, unsigned N3, unsigned N4, unsigned N5 )
+    : left(  "left" , N0, N1, N2, N3, N4, N5 )
+    , right( "right" , N0, N1, N2, N3, N4, N5 )
+    , left_alloc( allocation_count( left ) )
+    , right_alloc( allocation_count( right ) )
+    {}
+
+  // Build the functor, run it once, and require that no error bit is set.
+  static void testit(unsigned N0, unsigned N1, unsigned N2, unsigned N3, unsigned N4, unsigned N5)
+  {
+    int error_flag = 0 ;
+
+    TestViewOperator_LeftAndRight functor (N0, N1, N2, N3, N4, N5);
+
+    Kokkos::parallel_reduce( 1 , functor , error_flag );
+
+    ASSERT_EQ( error_flag , 0 );
+  }
+
+  KOKKOS_INLINE_FUNCTION
+  void operator()( const size_type , value_type & update ) const
+  {
+    // LayoutLeft view: i0 is the innermost loop.
+    long previous = -1 ;
+    for ( unsigned i5 = 0 ; i5 < unsigned(left.dimension_5()) ; ++i5 )
+    for ( unsigned i4 = 0 ; i4 < unsigned(left.dimension_4()) ; ++i4 )
+    for ( unsigned i3 = 0 ; i3 < unsigned(left.dimension_3()) ; ++i3 )
+    for ( unsigned i2 = 0 ; i2 < unsigned(left.dimension_2()) ; ++i2 )
+    for ( unsigned i1 = 0 ; i1 < unsigned(left.dimension_1()) ; ++i1 )
+    for ( unsigned i0 = 0 ; i0 < unsigned(left.dimension_0()) ; ++i0 )
+    {
+      // Offset of this element from the first element.
+      const long current = & left( i0, i1, i2, i3, i4, i5 ) -
+                           & left(  0,  0,  0,  0,  0,  0 );
+      if ( ! ( previous < current && current < left_alloc ) ) { update |= 1 ; }
+      previous = current ;
+    }
+
+    // LayoutRight view: i5 is the innermost loop.
+    previous = -1 ;
+    for ( unsigned i0 = 0 ; i0 < unsigned(right.dimension_0()) ; ++i0 )
+    for ( unsigned i1 = 0 ; i1 < unsigned(right.dimension_1()) ; ++i1 )
+    for ( unsigned i2 = 0 ; i2 < unsigned(right.dimension_2()) ; ++i2 )
+    for ( unsigned i3 = 0 ; i3 < unsigned(right.dimension_3()) ; ++i3 )
+    for ( unsigned i4 = 0 ; i4 < unsigned(right.dimension_4()) ; ++i4 )
+    for ( unsigned i5 = 0 ; i5 < unsigned(right.dimension_5()) ; ++i5 )
+    {
+      const long current = & right( i0, i1, i2, i3, i4, i5 ) -
+                           & right(  0,  0,  0,  0,  0,  0 );
+      if ( ! ( previous < current && current < right_alloc ) ) { update |= 2 ; }
+      previous = current ;
+    }
+  }
+};
+
+// Rank-5 specialization.  One functor invocation walks a LayoutLeft and a
+// LayoutRight DynRankView, plus a LayoutStride view constructed from each,
+// and reports errors as bits of the reduction value:
+//   1 : LayoutLeft  element offsets did not strictly increase with i0
+//       innermost, or an offset reached the allocation count;
+//   2 : LayoutRight element offsets did not strictly increase with i4
+//       innermost, or an offset reached the allocation count;
+//   4 : left_stride  addresses a different element than left;
+//   8 : right_stride addresses a different element than right.
+template< class DataType , class DeviceType >
+struct TestViewOperator_LeftAndRight< DataType , DeviceType , 5 >
+{
+  typedef DeviceType                              execution_space ;
+  typedef typename execution_space::memory_space  memory_space ;
+  typedef typename execution_space::size_type     size_type ;
+
+  typedef int value_type ;
+
+  typedef Kokkos::
+    Experimental::DynRankView< DataType, Kokkos::LayoutLeft, execution_space > left_view ;
+
+  typedef Kokkos::
+    Experimental::DynRankView< DataType, Kokkos::LayoutRight, execution_space > right_view ;
+
+  typedef Kokkos::
+    Experimental::DynRankView< DataType, Kokkos::LayoutStride, execution_space > stride_view ;
+
+  // Reduction identity: no error bit set.
+  KOKKOS_INLINE_FUNCTION
+  static void init( value_type & update )
+    { update = 0 ; }
+
+  // Combine two partial results by OR-ing their error bits.
+  KOKKOS_INLINE_FUNCTION
+  static void join( volatile value_type & update ,
+                    const volatile value_type & input )
+    { update |= input ; }
+
+  left_view    left ;
+  right_view   right ;
+  stride_view  left_stride ;
+  stride_view  right_stride ;
+  long         left_alloc ;
+  long         right_alloc ;
+
+  // Allocate both views with the same extents, construct the strided views
+  // from them, and record the allocation counts.
+  TestViewOperator_LeftAndRight(unsigned N0, unsigned N1, unsigned N2, unsigned N3, unsigned N4 )
+    : left(  "left" , N0, N1, N2, N3, N4 )
+    , right( "right" , N0, N1, N2, N3, N4 )
+    , left_stride( left )
+    , right_stride( right )
+    , left_alloc( allocation_count( left ) )
+    , right_alloc( allocation_count( right ) )
+    {}
+
+  // Build the functor, run it once, and require that no error bit is set.
+  static void testit(unsigned N0, unsigned N1, unsigned N2, unsigned N3, unsigned N4)
+  {
+    int error_flag = 0 ;
+
+    TestViewOperator_LeftAndRight functor(N0, N1, N2, N3, N4);
+
+    Kokkos::parallel_reduce( 1 , functor , error_flag );
+
+    ASSERT_EQ( error_flag , 0 );
+  }
+
+  KOKKOS_INLINE_FUNCTION
+  void operator()( const size_type , value_type & update ) const
+  {
+    // LayoutLeft view: i0 is the innermost loop.
+    long previous = -1 ;
+    for ( unsigned i4 = 0 ; i4 < unsigned(left.dimension_4()) ; ++i4 )
+    for ( unsigned i3 = 0 ; i3 < unsigned(left.dimension_3()) ; ++i3 )
+    for ( unsigned i2 = 0 ; i2 < unsigned(left.dimension_2()) ; ++i2 )
+    for ( unsigned i1 = 0 ; i1 < unsigned(left.dimension_1()) ; ++i1 )
+    for ( unsigned i0 = 0 ; i0 < unsigned(left.dimension_0()) ; ++i0 )
+    {
+      // Offset of this element from the first element.
+      const long current = & left( i0, i1, i2, i3, i4 ) -
+                           & left(  0,  0,  0,  0,  0 );
+      if ( ! ( previous < current && current < left_alloc ) ) { update |= 1 ; }
+      previous = current ;
+
+      // The strided view must address the very same element.
+      if ( & left( i0, i1, i2, i3, i4 ) !=
+           & left_stride( i0, i1, i2, i3, i4 ) ) { update |= 4 ; }
+    }
+
+    // LayoutRight view: i4 is the innermost loop.
+    previous = -1 ;
+    for ( unsigned i0 = 0 ; i0 < unsigned(right.dimension_0()) ; ++i0 )
+    for ( unsigned i1 = 0 ; i1 < unsigned(right.dimension_1()) ; ++i1 )
+    for ( unsigned i2 = 0 ; i2 < unsigned(right.dimension_2()) ; ++i2 )
+    for ( unsigned i3 = 0 ; i3 < unsigned(right.dimension_3()) ; ++i3 )
+    for ( unsigned i4 = 0 ; i4 < unsigned(right.dimension_4()) ; ++i4 )
+    {
+      const long current = & right( i0, i1, i2, i3, i4 ) -
+                           & right(  0,  0,  0,  0,  0 );
+      if ( ! ( previous < current && current < right_alloc ) ) { update |= 2 ; }
+      previous = current ;
+
+      if ( & right( i0, i1, i2, i3, i4 ) !=
+           & right_stride( i0, i1, i2, i3, i4 ) ) { update |= 8 ; }
+    }
+  }
+};
+
+// Rank-4 specialization.  One functor invocation walks a LayoutLeft and a
+// LayoutRight DynRankView and reports errors as bits of the reduction value:
+//   1 : LayoutLeft  element offsets did not strictly increase with i0
+//       innermost, or an offset reached the allocation count;
+//   2 : LayoutRight element offsets did not strictly increase with i3
+//       innermost, or an offset reached the allocation count.
+template< class DataType , class DeviceType >
+struct TestViewOperator_LeftAndRight< DataType , DeviceType , 4 >
+{
+  typedef DeviceType                              execution_space ;
+  typedef typename execution_space::memory_space  memory_space ;
+  typedef typename execution_space::size_type     size_type ;
+
+  typedef int value_type ;
+
+  typedef Kokkos::
+    Experimental::DynRankView< DataType, Kokkos::LayoutLeft, execution_space > left_view ;
+
+  typedef Kokkos::
+    Experimental::DynRankView< DataType, Kokkos::LayoutRight, execution_space > right_view ;
+
+  // Reduction identity: no error bit set.
+  KOKKOS_INLINE_FUNCTION
+  static void init( value_type & update )
+    { update = 0 ; }
+
+  // Combine two partial results by OR-ing their error bits.
+  KOKKOS_INLINE_FUNCTION
+  static void join( volatile value_type & update ,
+                    const volatile value_type & input )
+    { update |= input ; }
+
+  left_view    left ;
+  right_view   right ;
+  long         left_alloc ;
+  long         right_alloc ;
+
+  // Allocate both views with the same extents and record their allocation
+  // counts.
+  TestViewOperator_LeftAndRight(unsigned N0, unsigned N1, unsigned N2, unsigned N3)
+    : left(  "left" , N0, N1, N2, N3 )
+    , right( "right" , N0, N1, N2, N3 )
+    , left_alloc( allocation_count( left ) )
+    , right_alloc( allocation_count( right ) )
+    {}
+
+  // Build the functor, run it once, and require that no error bit is set.
+  static void testit(unsigned N0, unsigned N1, unsigned N2, unsigned N3)
+  {
+    int error_flag = 0 ;
+
+    TestViewOperator_LeftAndRight functor (N0, N1, N2, N3);
+
+    Kokkos::parallel_reduce( 1 , functor , error_flag );
+
+    ASSERT_EQ( error_flag , 0 );
+  }
+
+  KOKKOS_INLINE_FUNCTION
+  void operator()( const size_type , value_type & update ) const
+  {
+    // LayoutLeft view: i0 is the innermost loop.
+    long previous = -1 ;
+    for ( unsigned i3 = 0 ; i3 < unsigned(left.dimension_3()) ; ++i3 )
+    for ( unsigned i2 = 0 ; i2 < unsigned(left.dimension_2()) ; ++i2 )
+    for ( unsigned i1 = 0 ; i1 < unsigned(left.dimension_1()) ; ++i1 )
+    for ( unsigned i0 = 0 ; i0 < unsigned(left.dimension_0()) ; ++i0 )
+    {
+      // Offset of this element from the first element.
+      const long current = & left( i0, i1, i2, i3 ) -
+                           & left(  0,  0,  0,  0 );
+      if ( ! ( previous < current && current < left_alloc ) ) { update |= 1 ; }
+      previous = current ;
+    }
+
+    // LayoutRight view: i3 is the innermost loop.
+    previous = -1 ;
+    for ( unsigned i0 = 0 ; i0 < unsigned(right.dimension_0()) ; ++i0 )
+    for ( unsigned i1 = 0 ; i1 < unsigned(right.dimension_1()) ; ++i1 )
+    for ( unsigned i2 = 0 ; i2 < unsigned(right.dimension_2()) ; ++i2 )
+    for ( unsigned i3 = 0 ; i3 < unsigned(right.dimension_3()) ; ++i3 )
+    {
+      const long current = & right( i0, i1, i2, i3 ) -
+                           & right(  0,  0,  0,  0 );
+      if ( ! ( previous < current && current < right_alloc ) ) { update |= 2 ; }
+      previous = current ;
+    }
+  }
+};
+
+// Rank-3 specialization.  One functor invocation walks a LayoutLeft and a
+// LayoutRight DynRankView, plus a LayoutStride view constructed from each,
+// and reports errors as bits of the reduction value:
+//    1 : LayoutLeft  element offsets did not strictly increase with i0
+//        innermost, or an offset reached the allocation count;
+//    2 : LayoutRight element offsets did not strictly increase with i2
+//        innermost, or an offset reached the allocation count;
+//    4 : left_stride  addresses a different element than left;
+//    8 : right_stride addresses a different element than right;
+//   16 : left  indexed with 7 indices (trailing zeros) differs from 3 indices;
+//   32 : right indexed with 7 indices (trailing zeros) differs from 3 indices.
+// Bits 16 and 32 are distinct from the ordering bits so that a failing
+// error_flag identifies which check failed.
+template< class DataType , class DeviceType >
+struct TestViewOperator_LeftAndRight< DataType , DeviceType , 3 >
+{
+  typedef DeviceType                          execution_space ;
+  typedef typename execution_space::memory_space  memory_space ;
+  typedef typename execution_space::size_type     size_type ;
+
+  typedef int value_type ;
+
+  // Combine two partial results by OR-ing their error bits.
+  KOKKOS_INLINE_FUNCTION
+  static void join( volatile value_type & update ,
+                    const volatile value_type & input )
+    { update |= input ; }
+
+  // Reduction identity: no error bit set.
+  KOKKOS_INLINE_FUNCTION
+  static void init( value_type & update )
+    { update = 0 ; }
+
+
+  typedef Kokkos::
+    Experimental::DynRankView< DataType, Kokkos::LayoutLeft, execution_space > left_view ;
+
+  typedef Kokkos::
+    Experimental::DynRankView< DataType, Kokkos::LayoutRight, execution_space > right_view ;
+
+  typedef Kokkos::
+    Experimental::DynRankView< DataType, Kokkos::LayoutStride, execution_space > stride_view ;
+
+  left_view    left ;
+  right_view   right ;
+  stride_view  left_stride ;
+  stride_view  right_stride ;
+  long         left_alloc ;
+  long         right_alloc ;
+
+  // Allocate both views with the same extents (labels passed as
+  // std::string), construct the strided views from them, and record the
+  // allocation counts.
+  TestViewOperator_LeftAndRight(unsigned N0, unsigned N1, unsigned N2)
+    : left(  std::string("left") , N0, N1, N2 )
+    , right( std::string("right") , N0, N1, N2 )
+    , left_stride( left )
+    , right_stride( right )
+    , left_alloc( allocation_count( left ) )
+    , right_alloc( allocation_count( right ) )
+    {}
+
+  // Build the functor, run it once, and require that no error bit is set.
+  static void testit(unsigned N0, unsigned N1, unsigned N2)
+  {
+    TestViewOperator_LeftAndRight driver (N0, N1, N2);
+
+    int error_flag = 0 ;
+
+    Kokkos::parallel_reduce( 1 , driver , error_flag );
+
+    ASSERT_EQ( error_flag , 0 );
+  }
+
+  KOKKOS_INLINE_FUNCTION
+  void operator()( const size_type , value_type & update ) const
+  {
+    long offset ;
+
+    // LayoutLeft view: i0 is the innermost loop.
+    offset = -1 ;
+    for ( unsigned i2 = 0 ; i2 < unsigned(left.dimension_2()) ; ++i2 )
+    for ( unsigned i1 = 0 ; i1 < unsigned(left.dimension_1()) ; ++i1 )
+    for ( unsigned i0 = 0 ; i0 < unsigned(left.dimension_0()) ; ++i0 )
+    {
+      // Offset of this element from the first element.
+      const long j = & left( i0, i1, i2 ) -
+                     & left(  0,  0,  0 );
+      if ( j <= offset || left_alloc <= j ) { update |= 1 ; }
+      offset = j ;
+
+      // The strided view must address the very same element.
+      if ( & left(i0,i1,i2) != & left_stride(i0,i1,i2) ) { update |= 4 ; }
+    }
+
+    // LayoutRight view: i2 is the innermost loop.
+    offset = -1 ;
+    for ( unsigned i0 = 0 ; i0 < unsigned(right.dimension_0()) ; ++i0 )
+    for ( unsigned i1 = 0 ; i1 < unsigned(right.dimension_1()) ; ++i1 )
+    for ( unsigned i2 = 0 ; i2 < unsigned(right.dimension_2()) ; ++i2 )
+    {
+      const long j = & right( i0, i1, i2 ) -
+                     & right(  0,  0,  0 );
+      if ( j <= offset || right_alloc <= j ) { update |= 2 ; }
+      offset = j ;
+
+      if ( & right(i0,i1,i2) != & right_stride(i0,i1,i2) ) { update |= 8 ; }
+    }
+
+    // Indexing with seven indices whose trailing entries are zero must
+    // address the same element as indexing with three.
+    for ( unsigned i0 = 0 ; i0 < unsigned(left.dimension_0()) ; ++i0 )
+    for ( unsigned i1 = 0 ; i1 < unsigned(left.dimension_1()) ; ++i1 )
+    for ( unsigned i2 = 0 ; i2 < unsigned(left.dimension_2()) ; ++i2 )
+    {
+      if ( & left(i0,i1,i2)  != & left(i0,i1,i2,0,0,0,0) )  { update |= 16 ; }
+      if ( & right(i0,i1,i2) != & right(i0,i1,i2,0,0,0,0) ) { update |= 32 ; }
+    }
+  }
+};
+
+// Rank-2 specialization.  One functor invocation walks a LayoutLeft and a
+// LayoutRight DynRankView and reports errors as bits of the reduction value:
+//    1 : LayoutLeft  element offsets did not strictly increase with i0
+//        innermost, or an offset reached the allocation count;
+//    2 : LayoutRight element offsets did not strictly increase with i1
+//        innermost, or an offset reached the allocation count;
+//   16 : left  indexed with 7 indices (trailing zeros) differs from 2 indices;
+//   32 : right indexed with 7 indices (trailing zeros) differs from 2 indices.
+// Bits 16 and 32 are distinct from the ordering bits so that a failing
+// error_flag identifies which check failed.
+template< class DataType , class DeviceType >
+struct TestViewOperator_LeftAndRight< DataType , DeviceType , 2 >
+{
+  typedef DeviceType                          execution_space ;
+  typedef typename execution_space::memory_space  memory_space ;
+  typedef typename execution_space::size_type     size_type ;
+
+  typedef int value_type ;
+
+  // Combine two partial results by OR-ing their error bits.
+  KOKKOS_INLINE_FUNCTION
+  static void join( volatile value_type & update ,
+                    const volatile value_type & input )
+    { update |= input ; }
+
+  // Reduction identity: no error bit set.
+  KOKKOS_INLINE_FUNCTION
+  static void init( value_type & update )
+    { update = 0 ; }
+
+
+  typedef Kokkos::
+    Experimental::DynRankView< DataType, Kokkos::LayoutLeft, execution_space > left_view ;
+
+  typedef Kokkos::
+    Experimental::DynRankView< DataType, Kokkos::LayoutRight, execution_space > right_view ;
+
+  left_view    left ;
+  right_view   right ;
+  long         left_alloc ;
+  long         right_alloc ;
+
+  // Allocate both views with the same extents and record their allocation
+  // counts.
+  TestViewOperator_LeftAndRight(unsigned N0, unsigned N1)
+    : left(  "left" , N0, N1 )
+    , right( "right" , N0, N1 )
+    , left_alloc( allocation_count( left ) )
+    , right_alloc( allocation_count( right ) )
+    {}
+
+  // Build the functor, run it once, and require that no error bit is set.
+  static void testit(unsigned N0, unsigned N1)
+  {
+    TestViewOperator_LeftAndRight driver(N0, N1);
+
+    int error_flag = 0 ;
+
+    Kokkos::parallel_reduce( 1 , driver , error_flag );
+
+    ASSERT_EQ( error_flag , 0 );
+  }
+
+  KOKKOS_INLINE_FUNCTION
+  void operator()( const size_type , value_type & update ) const
+  {
+    long offset ;
+
+    // LayoutLeft view: i0 is the innermost loop.
+    offset = -1 ;
+    for ( unsigned i1 = 0 ; i1 < unsigned(left.dimension_1()) ; ++i1 )
+    for ( unsigned i0 = 0 ; i0 < unsigned(left.dimension_0()) ; ++i0 )
+    {
+      // Offset of this element from the first element.
+      const long j = & left( i0, i1 ) -
+                     & left(  0,  0 );
+      if ( j <= offset || left_alloc <= j ) { update |= 1 ; }
+      offset = j ;
+    }
+
+    // LayoutRight view: i1 is the innermost loop.
+    offset = -1 ;
+    for ( unsigned i0 = 0 ; i0 < unsigned(right.dimension_0()) ; ++i0 )
+    for ( unsigned i1 = 0 ; i1 < unsigned(right.dimension_1()) ; ++i1 )
+    {
+      const long j = & right( i0, i1 ) -
+                     & right(  0,  0 );
+      if ( j <= offset || right_alloc <= j ) { update |= 2 ; }
+      offset = j ;
+    }
+
+    // Indexing with seven indices whose trailing entries are zero must
+    // address the same element as indexing with two.
+    for ( unsigned i0 = 0 ; i0 < unsigned(left.dimension_0()) ; ++i0 )
+    for ( unsigned i1 = 0 ; i1 < unsigned(left.dimension_1()) ; ++i1 )
+    {
+      if ( & left(i0,i1)  != & left(i0,i1,0,0,0,0,0) )  { update |= 16 ; }
+      if ( & right(i0,i1) != & right(i0,i1,0,0,0,0,0) ) { update |= 32 ; }
+    }
+  }
+};
+
+template< class DataType , class DeviceType >
+struct TestViewOperator_LeftAndRight< DataType , DeviceType , 1 >
+{
+  // Rank-1 case: reduction functor whose result is a bit mask of failed checks.
+  typedef DeviceType                              execution_space ;
+  typedef typename execution_space::memory_space  memory_space ;
+  typedef typename execution_space::size_type     size_type ;
+  typedef int                                     value_type ;
+
+  // Reduction identity: no failure bits set.
+  KOKKOS_INLINE_FUNCTION
+  static void init( value_type & update )
+  { update = 0 ; }
+
+  // Merge two partial results by OR-ing their failure bits.
+  KOKKOS_INLINE_FUNCTION
+  static void join( volatile value_type & update , const volatile value_type & input )
+  { update |= input ; }
+
+  // View types under test: contiguous left/right layouts plus a strided layout.
+  typedef Kokkos::Experimental::DynRankView< DataType, Kokkos::LayoutLeft, execution_space >   left_view ;
+  typedef Kokkos::Experimental::DynRankView< DataType, Kokkos::LayoutRight, execution_space >  right_view ;
+  typedef Kokkos::Experimental::DynRankView< DataType, Kokkos::LayoutStride, execution_space > stride_view ;
+
+  // Declaration order matters: the strided views and the allocation counts
+  // are initialized from 'left' and 'right' in the constructor.
+  left_view    left ;          // LayoutLeft view under test
+  right_view   right ;         // LayoutRight view under test
+  stride_view  left_stride ;   // LayoutStride view constructed from 'left'
+  stride_view  right_stride ;  // LayoutStride view constructed from 'right'
+  long         left_alloc ;    // allocation_count( left ); not read by operator() here
+  long         right_alloc ;   // allocation_count( right ); not read by operator() here
+
+  // Allocate both views with extent N0, build the strided views from them,
+  // and record the allocation counts.
+  TestViewOperator_LeftAndRight(unsigned N0)
+    : left( "left" , N0 ) , right( "right" , N0 )
+    , left_stride( left ) , right_stride( right )
+    , left_alloc( allocation_count( left ) )
+    , right_alloc( allocation_count( right ) )
+  {}
+
+  // Run the functor once (a single work item) and require an empty failure mask.
+  static void testit(unsigned N0)
+  {
+    int error_flag = 0 ;
+    TestViewOperator_LeftAndRight functor (N0) ;
+
+    Kokkos::parallel_reduce( 1 , functor , error_flag );
+
+    ASSERT_EQ( error_flag , 0 );
+  }
+
+  KOKKOS_INLINE_FUNCTION
+  void operator()( const size_type , value_type & update ) const
+  {
+    // Bits 1|2: trailing zero indices must address the same element as rank-1 access.
+    // Bits 4 / 8: the LayoutStride views must address the same elements as their sources.
+    for ( unsigned k = 0 ; k < unsigned(left.dimension_0()) ; ++k )
+    {
+      if ( ! ( & left(k)  == & left(k,0,0,0,0,0,0) ) )  { update |= 3 ; }
+      if ( ! ( & left(k)  == & left_stride(k) ) )       { update |= 4 ; }
+      if ( ! ( & right(k) == & right(k,0,0,0,0,0,0) ) ) { update |= 3 ; }
+      if ( ! ( & right(k) == & right_stride(k) ) )      { update |= 8 ; }
+    }
+  }
+};
+
+/*--------------------------------------------------------------------------*/
+
+template< typename T, class DeviceType >
+class TestDynViewAPI
+{
+public:
+  typedef DeviceType        device ;
+
+  enum { N0 = 1000 ,
+         N1 = 3 ,
+         N2 = 5 ,
+         N3 = 7 };
+
+  typedef Kokkos::Experimental::DynRankView< T , device > dView0 ;
+  typedef Kokkos::Experimental::DynRankView< const T , device > const_dView0 ;
+
+  typedef Kokkos::Experimental::DynRankView< T, device, Kokkos::MemoryUnmanaged > dView0_unmanaged ;
+  typedef typename dView0::host_mirror_space host_drv_space ;
+
+  typedef Kokkos::Experimental::View< T , device >        View0 ;
+  typedef Kokkos::Experimental::View< T* , device >       View1 ;
+  typedef Kokkos::Experimental::View< T******* , device > View7 ;
+
+  typedef typename View0::host_mirror_space  host_view_space ;
+
+  TestDynViewAPI()
+  { // Constructing the object runs each check listed here, in this order.
+    run_test_resize_realloc();
+    run_test_mirror();
+    run_test_scalar();
+    run_test();
+    run_test_const();
+    run_test_subview();
+    run_test_subview_strided();
+    run_test_vector();
+    // Functor-based operator() checks; LeftAndRight is instantiated for ranks 7 down to 1.
+    TestViewOperator< T , device >::testit();
+    TestViewOperator_LeftAndRight< int , device , 7 >::testit(2,3,4,2,3,4,2); 
+    TestViewOperator_LeftAndRight< int , device , 6 >::testit(2,3,4,2,3,4); 
+    TestViewOperator_LeftAndRight< int , device , 5 >::testit(2,3,4,2,3);
+    TestViewOperator_LeftAndRight< int , device , 4 >::testit(2,3,4,2);
+    TestViewOperator_LeftAndRight< int , device , 3 >::testit(2,3,4);
+    TestViewOperator_LeftAndRight< int , device , 2 >::testit(2,3);
+    TestViewOperator_LeftAndRight< int , device , 1 >::testit(2);
+  }
+
+  static void run_test_resize_realloc()
+  { // Checks the rank and extents reported after resize() and realloc().
+    dView0 drv0("drv0", 10, 20, 30);
+    ASSERT_EQ( drv0.rank(), 3);
+    // Resizing with two extents: rank 2 is expected, and dimension_2() is expected to read 1.
+    Kokkos::Experimental::resize(drv0, 5, 10);
+    ASSERT_EQ( drv0.rank(), 2);
+    ASSERT_EQ( drv0.dimension_0(), 5);
+    ASSERT_EQ( drv0.dimension_1(), 10);
+    ASSERT_EQ( drv0.dimension_2(), 1);
+    // Reallocating with two new extents: the same expectations with the new sizes.
+    Kokkos::Experimental::realloc(drv0, 10, 20);
+    ASSERT_EQ( drv0.rank(), 2);
+    ASSERT_EQ( drv0.dimension_0(), 10);
+    ASSERT_EQ( drv0.dimension_1(), 20);
+    ASSERT_EQ( drv0.dimension_2(), 1);
+
+  }
+
+  static void run_test_mirror()
+  { // Checks create_mirror / create_mirror_view on DynRankViews, with and without an explicit memory space.
+    typedef Kokkos::Experimental::DynRankView< int , host_drv_space > view_type ;
+    typedef typename view_type::HostMirror mirror_type ;
+    view_type a("a");
+    mirror_type am = Kokkos::Experimental::create_mirror_view(a);
+    mirror_type ax = Kokkos::Experimental::create_mirror(a);
+    ASSERT_EQ( & a() , & am() ); // the mirror view of a view in host_drv_space is expected to alias it
+    ASSERT_EQ( a.rank() , am.rank() );
+    ASSERT_EQ( ax.rank() , am.rank() );
+    // create_mirror with an explicit space (LayoutLeft source): all three data pointers are expected to differ.
+    if (Kokkos::HostSpace::execution_space::is_initialized() )
+    {
+      Kokkos::DynRankView<double, Kokkos::LayoutLeft, Kokkos::HostSpace> a_h("A",1000);
+      auto a_h2 = Kokkos::create_mirror(Kokkos::HostSpace(),a_h);
+      auto a_d = Kokkos::create_mirror(typename device::memory_space(),a_h);
+  
+      int equal_ptr_h_h2  = (a_h.data() ==a_h2.data())?1:0;
+      int equal_ptr_h_d   = (a_h.data() ==a_d. data())?1:0;
+      int equal_ptr_h2_d  = (a_h2.data()==a_d. data())?1:0;
+  
+      ASSERT_EQ(equal_ptr_h_h2,0);
+      ASSERT_EQ(equal_ptr_h_d ,0);
+      ASSERT_EQ(equal_ptr_h2_d,0);
+  
+      ASSERT_EQ(a_h.dimension_0(),a_h2.dimension_0());
+      ASSERT_EQ(a_h.dimension_0(),a_d .dimension_0());
+
+      ASSERT_EQ(a_h.rank(),a_h2.rank());
+      ASSERT_EQ(a_h.rank(),a_d.rank());
+    }
+    if (Kokkos::HostSpace::execution_space::is_initialized() )
+    { // Same expectations for a LayoutRight source.
+      Kokkos::DynRankView<double, Kokkos::LayoutRight, Kokkos::HostSpace> a_h("A",1000);
+      auto a_h2 = Kokkos::create_mirror(Kokkos::HostSpace(),a_h);
+      auto a_d = Kokkos::create_mirror(typename device::memory_space(),a_h);
+  
+      int equal_ptr_h_h2  = (a_h.data() ==a_h2.data())?1:0;
+      int equal_ptr_h_d   = (a_h.data() ==a_d. data())?1:0;
+      int equal_ptr_h2_d  = (a_h2.data()==a_d. data())?1:0;
+  
+      ASSERT_EQ(equal_ptr_h_h2,0);
+      ASSERT_EQ(equal_ptr_h_d ,0);
+      ASSERT_EQ(equal_ptr_h2_d,0);
+  
+      ASSERT_EQ(a_h.dimension_0(),a_h2.dimension_0());
+      ASSERT_EQ(a_h.dimension_0(),a_d .dimension_0());
+
+      ASSERT_EQ(a_h.rank(),a_h2.rank());
+      ASSERT_EQ(a_h.rank(),a_d.rank());
+    }
+    // create_mirror_view with an explicit space (LayoutLeft source): the HostSpace result is expected to alias a_h; the device-space result only when device::memory_space is HostSpace.
+    if (Kokkos::HostSpace::execution_space::is_initialized() )
+    {
+      Kokkos::DynRankView<double, Kokkos::LayoutLeft, Kokkos::HostSpace> a_h("A",1000);
+      auto a_h2 = Kokkos::create_mirror_view(Kokkos::HostSpace(),a_h);
+      auto a_d = Kokkos::create_mirror_view(typename device::memory_space(),a_h);
+  
+      int equal_ptr_h_h2  = a_h.data() ==a_h2.data()?1:0;
+      int equal_ptr_h_d   = a_h.data() ==a_d. data()?1:0;
+      int equal_ptr_h2_d  = a_h2.data()==a_d. data()?1:0;
+  
+      int is_same_memspace = std::is_same<Kokkos::HostSpace,typename device::memory_space>::value?1:0; 
+      ASSERT_EQ(equal_ptr_h_h2,1);
+      ASSERT_EQ(equal_ptr_h_d ,is_same_memspace);
+      ASSERT_EQ(equal_ptr_h2_d ,is_same_memspace);
+  
+      ASSERT_EQ(a_h.dimension_0(),a_h2.dimension_0());
+      ASSERT_EQ(a_h.dimension_0(),a_d .dimension_0());
+
+      ASSERT_EQ(a_h.rank(),a_h2.rank());
+      ASSERT_EQ(a_h.rank(),a_d.rank());
+    } 
+    if (Kokkos::HostSpace::execution_space::is_initialized() )
+    { // Same expectations for a LayoutRight source.
+      Kokkos::DynRankView<double, Kokkos::LayoutRight, Kokkos::HostSpace> a_h("A",1000);
+      auto a_h2 = Kokkos::create_mirror_view(Kokkos::HostSpace(),a_h);
+      auto a_d = Kokkos::create_mirror_view(typename device::memory_space(),a_h);
+  
+      int equal_ptr_h_h2  = a_h.data() ==a_h2.data()?1:0;
+      int equal_ptr_h_d   = a_h.data() ==a_d. data()?1:0;
+      int equal_ptr_h2_d  = a_h2.data()==a_d. data()?1:0;
+  
+      int is_same_memspace = std::is_same<Kokkos::HostSpace,typename device::memory_space>::value?1:0; 
+      ASSERT_EQ(equal_ptr_h_h2,1);
+      ASSERT_EQ(equal_ptr_h_d ,is_same_memspace);
+      ASSERT_EQ(equal_ptr_h2_d ,is_same_memspace);
+  
+      ASSERT_EQ(a_h.dimension_0(),a_h2.dimension_0());
+      ASSERT_EQ(a_h.dimension_0(),a_d .dimension_0());
+
+      ASSERT_EQ(a_h.rank(),a_h2.rank());
+      ASSERT_EQ(a_h.rank(),a_d.rank());
+    } 
+    if (Kokkos::HostSpace::execution_space::is_initialized() )
+    { // Same expectations for a rank-7 LayoutStride source.
+      typedef Kokkos::DynRankView< int , Kokkos::LayoutStride , Kokkos::HostSpace > view_stride_type ;
+      unsigned order[] = { 6,5,4,3,2,1,0 }, dimen[] = { N0, N1, N2, 2, 2, 2, 2 }; //LayoutRight equivalent
+      view_stride_type a_h( "a" , Kokkos::LayoutStride::order_dimensions(7, order, dimen) );
+      auto a_h2 = Kokkos::create_mirror_view(Kokkos::HostSpace(),a_h);
+      auto a_d = Kokkos::create_mirror_view(typename device::memory_space(),a_h);
+  
+      int equal_ptr_h_h2  = a_h.data() ==a_h2.data()?1:0;
+      int equal_ptr_h_d   = a_h.data() ==a_d. data()?1:0;
+      int equal_ptr_h2_d  = a_h2.data()==a_d. data()?1:0;
+  
+      int is_same_memspace = std::is_same<Kokkos::HostSpace,typename device::memory_space>::value?1:0; 
+      ASSERT_EQ(equal_ptr_h_h2,1);
+      ASSERT_EQ(equal_ptr_h_d ,is_same_memspace);
+      ASSERT_EQ(equal_ptr_h2_d ,is_same_memspace);
+  
+      ASSERT_EQ(a_h.dimension_0(),a_h2.dimension_0());
+      ASSERT_EQ(a_h.dimension_0(),a_d .dimension_0());
+
+      ASSERT_EQ(a_h.rank(),a_h2.rank());
+      ASSERT_EQ(a_h.rank(),a_d.rank());
+    }
+  }
+
+  static void run_test_scalar()
+  { // Rank-0 (scalar) DynRankViews: mirrors, deep_copy, and interoperability with View.
+    typedef typename dView0::HostMirror  hView0 ; //HostMirror of DynRankView is a DynRankView
+
+    dView0 dx , dy ;
+    hView0 hx , hy ;
+
+    dx = dView0( "dx" );
+    dy = dView0( "dy" );
+
+    hx = Kokkos::Experimental::create_mirror( dx );
+    hy = Kokkos::Experimental::create_mirror( dy );
+    // Round-trip the value 1 through hx -> dx -> dy -> hy and compare the two host copies.
+    hx() = 1 ;
+
+    Kokkos::Experimental::deep_copy( dx , hx );
+    Kokkos::Experimental::deep_copy( dy , dx );
+    Kokkos::Experimental::deep_copy( hy , dy );
+
+    ASSERT_EQ( hx(), hy() );
+    ASSERT_EQ( dx.rank() , hx.rank() );
+    ASSERT_EQ( dy.rank() , hy.rank() );
+
+  //View - DynRankView Interoperability tests
+  // deep_copy DynRankView to View
+    View0 vx("vx");
+    Kokkos::deep_copy( vx , dx );
+    ASSERT_EQ( rank(dx) , rank(vx) );
+
+    View0 vy("vy");
+    Kokkos::deep_copy( vy , dy );
+    ASSERT_EQ( rank(dy) , rank(vy) );
+
+  // deep_copy View to DynRankView 
+    dView0 dxx("dxx");
+    Kokkos::deep_copy( dxx , vx );
+    ASSERT_EQ( rank(dxx) , rank(vx) );
+
+    // ConstDownCast() result is stored in a rank-7 View; only extents 0 through 4 are compared.
+    View7 vcast = dx.ConstDownCast();
+    ASSERT_EQ( dx.dimension_0() , vcast.dimension_0() );
+    ASSERT_EQ( dx.dimension_1() , vcast.dimension_1() );
+    ASSERT_EQ( dx.dimension_2() , vcast.dimension_2() );
+    ASSERT_EQ( dx.dimension_3() , vcast.dimension_3() );
+    ASSERT_EQ( dx.dimension_4() , vcast.dimension_4() );
+
+    View7 vcast1( dy.ConstDownCast() );
+    ASSERT_EQ( dy.dimension_0() , vcast1.dimension_0() );
+    ASSERT_EQ( dy.dimension_1() , vcast1.dimension_1() );
+    ASSERT_EQ( dy.dimension_2() , vcast1.dimension_2() );
+    ASSERT_EQ( dy.dimension_3() , vcast1.dimension_3() );
+    ASSERT_EQ( dy.dimension_4() , vcast1.dimension_4() );
+
+  //View - DynRankView Interoperability tests
+  // copy View to DynRankView
+    dView0 dfromvx( vx );
+    auto hmx = Kokkos::create_mirror_view(dfromvx) ;
+    Kokkos::deep_copy(hmx , dfromvx);
+    auto hvx = Kokkos::create_mirror_view(vx) ;
+    Kokkos::deep_copy(hvx , vx);
+    ASSERT_EQ( rank(hvx) , rank(hmx) );
+    ASSERT_EQ( hvx.dimension_0() , hmx.dimension_0() );
+    ASSERT_EQ( hvx.dimension_1() , hmx.dimension_1() );
+
+  // copy-assign View to DynRankView
+    dView0 dfromvy = vy ;
+    auto hmy = Kokkos::create_mirror_view(dfromvy) ;
+    Kokkos::deep_copy(hmy , dfromvy);
+    auto hvy = Kokkos::create_mirror_view(vy) ;
+    Kokkos::deep_copy(hvy , vy);
+    ASSERT_EQ( rank(hvy) , rank(hmy) );
+    ASSERT_EQ( hvy.dimension_0() , hmy.dimension_0() );
+    ASSERT_EQ( hvy.dimension_1() , hmy.dimension_1() );
+
+    // DynRankViews constructed from rank-7 Views: rank, leading extents and use_count are expected to agree.
+    View7 vtest1("vtest1",2,2,2,2,2,2,2);
+    dView0 dfromv1( vtest1 );
+    ASSERT_EQ( dfromv1.rank() , vtest1.Rank );
+    ASSERT_EQ( dfromv1.dimension_0() , vtest1.dimension_0() );
+    ASSERT_EQ( dfromv1.dimension_1() , vtest1.dimension_1() );
+    ASSERT_EQ( dfromv1.use_count() , vtest1.use_count() );
+
+    dView0 dfromv2( vcast );
+    ASSERT_EQ( dfromv2.rank() , vcast.Rank );
+    ASSERT_EQ( dfromv2.dimension_0() , vcast.dimension_0() );
+    ASSERT_EQ( dfromv2.dimension_1() , vcast.dimension_1() );
+    ASSERT_EQ( dfromv2.use_count() , vcast.use_count() );
+
+    dView0 dfromv3 = vcast1;
+    ASSERT_EQ( dfromv3.rank() , vcast1.Rank );
+    ASSERT_EQ( dfromv3.dimension_0() , vcast1.dimension_0() );
+    ASSERT_EQ( dfromv3.dimension_1() , vcast1.dimension_1() );
+    ASSERT_EQ( dfromv3.use_count() , vcast1.use_count() );
+  }
+
+  static void run_test()
+  { // Main DynRankView walk-through: construction, reference counts, mirrors, deep_copy, View interoperability.
+    // mfh 14 Feb 2014: This test doesn't actually create instances of
+    // these types.  In order to avoid "declared but unused typedef"
+    // warnings, we declare empty instances of these types, with the
+    // usual "(void)" marker to avoid compiler warnings for unused
+    // variables.
+
+    typedef typename dView0::HostMirror  hView0 ;
+
+    {
+      hView0 thing;
+      (void) thing;
+    }
+    // Allocation without initialization: data pointer, rank and extents are still checked.
+    dView0 d_uninitialized(Kokkos::ViewAllocateWithoutInitializing("uninit"),10,20);
+    ASSERT_TRUE( d_uninitialized.data() != nullptr );
+    ASSERT_EQ( d_uninitialized.rank() , 2 );
+    ASSERT_EQ( d_uninitialized.dimension_0() , 10 );
+    ASSERT_EQ( d_uninitialized.dimension_1() , 20 );
+    ASSERT_EQ( d_uninitialized.dimension_2() , 1  );
+
+    dView0 dx , dy , dz ;
+    hView0 hx , hy , hz ;
+    // is_dyn_rank_view is expected true for a DynRankView and false for a plain View.
+    ASSERT_TRUE( Kokkos::Experimental::is_dyn_rank_view<dView0>::value );
+    ASSERT_FALSE( Kokkos::Experimental::is_dyn_rank_view< Kokkos::View<double> >::value );
+    // Default-constructed views: null pointer, zero extent, rank 0.
+    ASSERT_TRUE( dx.ptr_on_device() == 0 ); //Okay with UVM
+    ASSERT_TRUE( dy.ptr_on_device() == 0 );  //Okay with UVM
+    ASSERT_TRUE( dz.ptr_on_device() == 0 ); //Okay with UVM
+    ASSERT_TRUE( hx.ptr_on_device() == 0 );
+    ASSERT_TRUE( hy.ptr_on_device() == 0 );
+    ASSERT_TRUE( hz.ptr_on_device() == 0 );
+    ASSERT_EQ( dx.dimension_0() , 0u ); //Okay with UVM
+    ASSERT_EQ( dy.dimension_0() , 0u ); //Okay with UVM
+    ASSERT_EQ( dz.dimension_0() , 0u ); //Okay with UVM
+    ASSERT_EQ( hx.dimension_0() , 0u );
+    ASSERT_EQ( hy.dimension_0() , 0u );
+    ASSERT_EQ( hz.dimension_0() , 0u );
+    ASSERT_EQ( dx.rank() , 0u ); //Okay with UVM
+    ASSERT_EQ( hx.rank() , 0u );
+    // Assigning freshly constructed views: the rank is expected to follow the number of extents given.
+    dx = dView0( "dx" , N1 , N2 , N3 );
+    dy = dView0( "dy" , N1 , N2 , N3 );
+
+    hx = hView0( "hx" , N1 , N2 , N3 );
+    hy = hView0( "hy" , N1 , N2 , N3 );
+
+    ASSERT_EQ( dx.dimension_0() , unsigned(N1) ); //Okay with UVM
+    ASSERT_EQ( dy.dimension_0() , unsigned(N1) ); //Okay with UVM
+    ASSERT_EQ( hx.dimension_0() , unsigned(N1) );
+    ASSERT_EQ( hy.dimension_0() , unsigned(N1) );
+    ASSERT_EQ( dx.rank() , 3 ); //Okay with UVM
+    ASSERT_EQ( hx.rank() , 3 );
+
+    dx = dView0( "dx" , N0 , N1 , N2 , N3 );
+    dy = dView0( "dy" , N0 , N1 , N2 , N3 );
+    hx = hView0( "hx" , N0 , N1 , N2 , N3 );
+    hy = hView0( "hy" , N0 , N1 , N2 , N3 );
+
+    ASSERT_EQ( dx.dimension_0() , unsigned(N0) );
+    ASSERT_EQ( dy.dimension_0() , unsigned(N0) );
+    ASSERT_EQ( hx.dimension_0() , unsigned(N0) );
+    ASSERT_EQ( hy.dimension_0() , unsigned(N0) );
+    ASSERT_EQ( dx.rank() , 4 );
+    ASSERT_EQ( dy.rank() , 4 );
+    ASSERT_EQ( hx.rank() , 4 );
+    ASSERT_EQ( hy.rank() , 4 );
+
+    ASSERT_EQ( dx.use_count() , size_t(1) );
+    // An unmanaged alias is expected to leave use_count unchanged.
+    dView0_unmanaged unmanaged_dx = dx;
+    ASSERT_EQ( dx.use_count() , size_t(1) );
+
+    // Unmanaged view wrapping dx's raw pointer and its four extents.
+    dView0_unmanaged unmanaged_from_ptr_dx = dView0_unmanaged(dx.ptr_on_device(),
+                                                              dx.dimension_0(),
+                                                              dx.dimension_1(),
+                                                              dx.dimension_2(),
+                                                              dx.dimension_3());
+
+
+    {
+      // Destruction of this view should be harmless
+      const_dView0 unmanaged_from_ptr_const_dx( dx.ptr_on_device() ,
+                                                dx.dimension_0() ,
+                                                dx.dimension_1() ,
+                                                dx.dimension_2() ,
+                                                dx.dimension_3() );
+    }
+    // Managed const aliases are counted: use_count is expected to track each alias as it is created, reassigned or destroyed.
+    const_dView0 const_dx = dx ;
+    ASSERT_EQ( dx.use_count() , size_t(2) );
+
+    {
+      const_dView0 const_dx2;
+      const_dx2 = const_dx;
+      ASSERT_EQ( dx.use_count() , size_t(3) );
+
+      const_dx2 = dy;
+      ASSERT_EQ( dx.use_count() , size_t(2) );
+
+      const_dView0 const_dx3(dx);
+      ASSERT_EQ( dx.use_count() , size_t(3) );
+      
+      dView0_unmanaged dx4_unmanaged(dx);
+      ASSERT_EQ( dx.use_count() , size_t(3) );
+    }
+
+    ASSERT_EQ( dx.use_count() , size_t(2) );
+
+
+    ASSERT_FALSE( dx.ptr_on_device() == 0 );
+    ASSERT_FALSE( const_dx.ptr_on_device() == 0 );
+    ASSERT_FALSE( unmanaged_dx.ptr_on_device() == 0 );
+    ASSERT_FALSE( unmanaged_from_ptr_dx.ptr_on_device() == 0 );
+    ASSERT_FALSE( dy.ptr_on_device() == 0 );
+    ASSERT_NE( dx , dy );
+
+    ASSERT_EQ( dx.dimension_0() , unsigned(N0) );
+    ASSERT_EQ( dx.dimension_1() , unsigned(N1) );
+    ASSERT_EQ( dx.dimension_2() , unsigned(N2) );
+    ASSERT_EQ( dx.dimension_3() , unsigned(N3) );
+
+    ASSERT_EQ( dy.dimension_0() , unsigned(N0) );
+    ASSERT_EQ( dy.dimension_1() , unsigned(N1) );
+    ASSERT_EQ( dy.dimension_2() , unsigned(N2) );
+    ASSERT_EQ( dy.dimension_3() , unsigned(N3) );
+
+    ASSERT_EQ( unmanaged_from_ptr_dx.capacity(),unsigned(N0)*unsigned(N1)*unsigned(N2)*unsigned(N3) );
+    // Host mirrors are expected to report the same rank and extents as their sources.
+    hx = Kokkos::Experimental::create_mirror( dx );
+    hy = Kokkos::Experimental::create_mirror( dy );
+
+    ASSERT_EQ( hx.rank() , dx.rank() );
+    ASSERT_EQ( hy.rank() , dy.rank() );
+
+    ASSERT_EQ( hx.dimension_0() , unsigned(N0) );
+    ASSERT_EQ( hx.dimension_1() , unsigned(N1) );
+    ASSERT_EQ( hx.dimension_2() , unsigned(N2) );
+    ASSERT_EQ( hx.dimension_3() , unsigned(N3) );
+
+    ASSERT_EQ( hy.dimension_0() , unsigned(N0) );
+    ASSERT_EQ( hy.dimension_1() , unsigned(N1) );
+    ASSERT_EQ( hy.dimension_2() , unsigned(N2) );
+    ASSERT_EQ( hy.dimension_3() , unsigned(N3) );
+
+    // T v1 = hx() ;    // Generates compile error as intended
+    // T v2 = hx(0,0) ; // Generates compile error as intended
+    // hx(0,0) = v2 ;   // Generates compile error as intended
+
+/*
+#if ! KOKKOS_USING_EXP_VIEW
+    // Testing with asynchronous deep copy with respect to device
+    {
+      size_t count = 0 ;
+      for ( size_t ip = 0 ; ip < N0 ; ++ip ) {
+      for ( size_t i1 = 0 ; i1 < hx.dimension_1() ; ++i1 ) {
+      for ( size_t i2 = 0 ; i2 < hx.dimension_2() ; ++i2 ) {
+      for ( size_t i3 = 0 ; i3 < hx.dimension_3() ; ++i3 ) {
+        hx(ip,i1,i2,i3) = ++count ;
+      }}}}
+
+
+      Kokkos::deep_copy(typename hView0::execution_space(), dx , hx );
+      Kokkos::deep_copy(typename hView0::execution_space(), dy , dx );
+      Kokkos::deep_copy(typename hView0::execution_space(), hy , dy );
+
+      for ( size_t ip = 0 ; ip < N0 ; ++ip ) {
+      for ( size_t i1 = 0 ; i1 < N1 ; ++i1 ) {
+      for ( size_t i2 = 0 ; i2 < N2 ; ++i2 ) {
+      for ( size_t i3 = 0 ; i3 < N3 ; ++i3 ) {
+        { ASSERT_EQ( hx(ip,i1,i2,i3) , hy(ip,i1,i2,i3) ); }
+      }}}}
+
+      Kokkos::deep_copy(typename hView0::execution_space(), dx , T(0) );
+      Kokkos::deep_copy(typename hView0::execution_space(), hx , dx );
+
+      for ( size_t ip = 0 ; ip < N0 ; ++ip ) {
+      for ( size_t i1 = 0 ; i1 < N1 ; ++i1 ) {
+      for ( size_t i2 = 0 ; i2 < N2 ; ++i2 ) {
+      for ( size_t i3 = 0 ; i3 < N3 ; ++i3 ) {
+        { ASSERT_EQ( hx(ip,i1,i2,i3) , T(0) ); }
+      }}}}
+    }
+
+    // Testing with asynchronous deep copy with respect to host
+    {
+      size_t count = 0 ;
+      for ( size_t ip = 0 ; ip < N0 ; ++ip ) {
+      for ( size_t i1 = 0 ; i1 < hx.dimension_1() ; ++i1 ) {
+      for ( size_t i2 = 0 ; i2 < hx.dimension_2() ; ++i2 ) {
+      for ( size_t i3 = 0 ; i3 < hx.dimension_3() ; ++i3 ) {
+        hx(ip,i1,i2,i3) = ++count ;
+      }}}}
+
+      Kokkos::deep_copy(typename dView0::execution_space(), dx , hx );
+      Kokkos::deep_copy(typename dView0::execution_space(), dy , dx );
+      Kokkos::deep_copy(typename dView0::execution_space(), hy , dy );
+
+      for ( size_t ip = 0 ; ip < N0 ; ++ip ) {
+      for ( size_t i1 = 0 ; i1 < N1 ; ++i1 ) {
+      for ( size_t i2 = 0 ; i2 < N2 ; ++i2 ) {
+      for ( size_t i3 = 0 ; i3 < N3 ; ++i3 ) {
+        { ASSERT_EQ( hx(ip,i1,i2,i3) , hy(ip,i1,i2,i3) ); }
+      }}}}
+
+      Kokkos::deep_copy(typename dView0::execution_space(), dx , T(0) );
+      Kokkos::deep_copy(typename dView0::execution_space(), hx , dx );
+
+      for ( size_t ip = 0 ; ip < N0 ; ++ip ) {
+      for ( size_t i1 = 0 ; i1 < N1 ; ++i1 ) {
+      for ( size_t i2 = 0 ; i2 < N2 ; ++i2 ) {
+      for ( size_t i3 = 0 ; i3 < N3 ; ++i3 ) {
+        { ASSERT_EQ( hx(ip,i1,i2,i3) , T(0) ); }
+      }}}}
+    }
+#endif */ // #if ! KOKKOS_USING_EXP_VIEW
+
+    // Testing with synchronous deep copy
+    {
+      size_t count = 0 ;
+      for ( size_t ip = 0 ; ip < N0 ; ++ip ) {
+      for ( size_t i1 = 0 ; i1 < hx.dimension_1() ; ++i1 ) {
+      for ( size_t i2 = 0 ; i2 < hx.dimension_2() ; ++i2 ) {
+      for ( size_t i3 = 0 ; i3 < hx.dimension_3() ; ++i3 ) {
+        hx(ip,i1,i2,i3) = ++count ;
+      }}}}
+
+      Kokkos::Experimental::deep_copy( dx , hx );
+      Kokkos::Experimental::deep_copy( dy , dx );
+      Kokkos::Experimental::deep_copy( hy , dy );
+
+      for ( size_t ip = 0 ; ip < N0 ; ++ip ) {
+      for ( size_t i1 = 0 ; i1 < N1 ; ++i1 ) {
+      for ( size_t i2 = 0 ; i2 < N2 ; ++i2 ) {
+      for ( size_t i3 = 0 ; i3 < N3 ; ++i3 ) {
+        { ASSERT_EQ( hx(ip,i1,i2,i3) , hy(ip,i1,i2,i3) ); }
+      }}}}
+
+      Kokkos::Experimental::deep_copy( dx , T(0) );
+      Kokkos::Experimental::deep_copy( hx , dx );
+
+      for ( size_t ip = 0 ; ip < N0 ; ++ip ) {
+      for ( size_t i1 = 0 ; i1 < N1 ; ++i1 ) {
+      for ( size_t i2 = 0 ; i2 < N2 ; ++i2 ) {
+      for ( size_t i3 = 0 ; i3 < N3 ; ++i3 ) {
+        { ASSERT_EQ( hx(ip,i1,i2,i3) , T(0) ); }
+      }}}}
+//    ASSERT_EQ( hx(0,0,0,0,0,0,0,0) , T(0) ); //Test rank8 op behaves properly - if implemented
+    }
+    // After assignment dz is expected to compare equal to its source and unequal to the other view.
+    dz = dx ; ASSERT_EQ( dx, dz); ASSERT_NE( dy, dz);
+    dz = dy ; ASSERT_EQ( dy, dz); ASSERT_NE( dx, dz);
+    // Resetting one handle to a default view is expected to null only that handle.
+    dx = dView0();
+    ASSERT_TRUE( dx.ptr_on_device() == 0 );
+    ASSERT_FALSE( dy.ptr_on_device() == 0 );
+    ASSERT_FALSE( dz.ptr_on_device() == 0 );
+    dy = dView0();
+    ASSERT_TRUE( dx.ptr_on_device() == 0 );
+    ASSERT_TRUE( dy.ptr_on_device() == 0 );
+    ASSERT_FALSE( dz.ptr_on_device() == 0 );
+    dz = dView0();
+    ASSERT_TRUE( dx.ptr_on_device() == 0 );
+    ASSERT_TRUE( dy.ptr_on_device() == 0 );
+    ASSERT_TRUE( dz.ptr_on_device() == 0 );
+
+  //View - DynRankView Interoperability tests
+    // deep_copy from view to dynrankview
+    const int testdim = 4;
+    dView0 dxx("dxx",testdim);
+    View1  vxx("vxx",testdim);
+    auto hvxx = Kokkos::create_mirror_view(vxx); 
+    for (int i = 0; i < testdim; ++i)
+      { hvxx(i) = i; }
+    Kokkos::deep_copy(vxx,hvxx);
+    Kokkos::deep_copy(dxx,vxx);
+    auto hdxx = Kokkos::create_mirror_view(dxx);
+    Kokkos::deep_copy(hdxx,dxx);
+    for (int i = 0; i < testdim; ++i)
+      { ASSERT_EQ( hvxx(i) , hdxx(i) ); }
+
+    ASSERT_EQ( rank(hdxx) , rank(hvxx) );
+    ASSERT_EQ( hdxx.dimension_0() , testdim );
+    ASSERT_EQ( hdxx.dimension_0() , hvxx.dimension_0() );
+
+    // deep_copy from dynrankview to view
+    View1 vdxx("vdxx",testdim);
+    auto hvdxx = Kokkos::create_mirror_view(vdxx);
+    Kokkos::deep_copy(hvdxx , hdxx);
+    ASSERT_EQ( rank(hdxx) , rank(hvdxx) );
+    ASSERT_EQ( hvdxx.dimension_0() , testdim );
+    ASSERT_EQ( hdxx.dimension_0() , hvdxx.dimension_0() );
+    for (int i = 0; i < testdim; ++i)
+      { ASSERT_EQ( hvxx(i) , hvdxx(i) ); }
+  }
+
+  typedef T DataType ;
+
+  static void
+  check_auto_conversion_to_const(
+     const Kokkos::Experimental::DynRankView< const DataType , device > & arg_const , // view of const data
+     const Kokkos::Experimental::DynRankView< DataType , device > & arg )             // view of non-const data
+  { // Asserts that the two views compare equal.
+    ASSERT_TRUE( arg_const == arg );
+  }
+
+  static void run_test_const()
+  { // Const-data and random-access aliases of a rank-1 view.
+    typedef Kokkos::Experimental::DynRankView< DataType , device > typeX ;
+    typedef Kokkos::Experimental::DynRankView< const DataType , device > const_typeX ;
+    typedef Kokkos::Experimental::DynRankView< const DataType , device , Kokkos::MemoryRandomAccess > const_typeR ;
+    typeX x( "X", 2 );
+    const_typeX xc = x ;
+    const_typeR xr = x ;
+    // Equality is checked in both argument orders between the const and non-const views.
+    ASSERT_TRUE( xc == x );
+    ASSERT_TRUE( x == xc );
+
+    // For CUDA the constant random access View does not return
+    // an lvalue reference due to retrieving through texture cache
+    // therefore not allowed to query the underlying pointer.
+#if defined(KOKKOS_HAVE_CUDA)
+    if ( ! std::is_same< typename device::execution_space , Kokkos::Cuda >::value )
+#endif
+    {
+      ASSERT_TRUE( x.ptr_on_device() == xr.ptr_on_device() );
+    }
+
+    // typeX xf = xc ; // setting non-const from const must not compile
+    // The non-const x is passed for both parameters; the first parameter has the const-data view type.
+    check_auto_conversion_to_const( x , x );
+  }
+
+
+  static void run_test_subview()
+  { // Checks the rank and extents of views returned by subdynrankview for various argument mixes.
+    typedef Kokkos::Experimental::DynRankView< const T , device > cdView ;
+    typedef Kokkos::Experimental::DynRankView< T , device > dView ;
+  // LayoutStride required for all returned DynRankView subdynrankview's
+    typedef Kokkos::Experimental::DynRankView< T , Kokkos::LayoutStride , device > sdView ; 
+    // Rank-0 view and a const-data alias of it; neither is referenced again in this function.
+    dView0 d0( "d0" );
+    cdView s0 = d0 ;
+
+  //  N0 = 1000,N1 = 3,N2 = 5,N3 = 7 
+    unsigned order[] = { 6,5,4,3,2,1,0 }, dimen[] = { N0, N1, N2, 2, 2, 2, 2 }; //LayoutRight equivalent
+    sdView d7( "d7" , Kokkos::LayoutStride::order_dimensions(7, order, dimen) );
+    ASSERT_EQ( d7.rank() , 7 );
+    // All seven arguments are integer indices; a rank-0 subview is expected.
+    sdView ds0 = Kokkos::subdynrankview( d7 , 1 , 1 , 1 , 1 , 1 , 1 , 1 ); 
+    ASSERT_EQ( ds0.rank() , 0 );
+
+//Basic test - ALL
+    sdView dsALL = Kokkos::Experimental::subdynrankview( d7 , Kokkos::ALL() , Kokkos::ALL() , Kokkos::ALL() , Kokkos::ALL() , Kokkos::ALL() , Kokkos::ALL() , Kokkos::ALL() ); 
+    ASSERT_EQ( dsALL.rank() , 7 );
+
+//  Send a value to final rank returning rank 6 subview
+    sdView dsm1 = Kokkos::Experimental::subdynrankview( d7 , Kokkos::ALL() , Kokkos::ALL() , Kokkos::ALL() , Kokkos::ALL() , Kokkos::ALL() , Kokkos::ALL() , 1 );
+    ASSERT_EQ( dsm1.rank() , 6 );
+
+//  Send a std::pair as argument to a rank
+    sdView dssp = Kokkos::Experimental::subdynrankview( d7 , Kokkos::ALL() , Kokkos::ALL() , Kokkos::ALL() , Kokkos::ALL() , Kokkos::ALL() , Kokkos::ALL() , std::pair<unsigned,unsigned>(1,2) );
+    ASSERT_EQ( dssp.rank() , 7 );
+
+//  Send a Kokkos::pair as argument to a rank; take default layout as input
+    dView0 dd0("dd0" , N0 , N1 , N2 , 2 , 2 , 2 , 2 ); //default layout
+    ASSERT_EQ( dd0.rank() , 7 );
+    sdView dtkp = Kokkos::Experimental::subdynrankview( dd0 , Kokkos::ALL() , Kokkos::ALL() , Kokkos::ALL() , Kokkos::ALL() , Kokkos::ALL() , Kokkos::ALL() , Kokkos::pair<unsigned,unsigned>(0,1) );
+    ASSERT_EQ( dtkp.rank() , 7 );
+
+// Return rank 7 subview, taking a pair as one argument, layout stride input
+    sdView ds7 = Kokkos::Experimental::subdynrankview( d7 , Kokkos::ALL() , Kokkos::ALL() , Kokkos::ALL() , Kokkos::ALL() , Kokkos::ALL() , Kokkos::ALL() , Kokkos::pair<unsigned,unsigned>(0,1) );
+    ASSERT_EQ( ds7.rank() , 7 );
+
+// Default Layout DynRankView
+    dView dv6("dv6" , N0 , N1 , N2 , N3 , 2 , 2 );
+    ASSERT_EQ( dv6.rank() , 6 );
+
+// DynRankView with LayoutRight
+    typedef Kokkos::Experimental::DynRankView< T , Kokkos::LayoutRight , device > drView ;
+    drView dr5( "dr5" , N0 , N1 , N2 , 2 , 2 );
+    ASSERT_EQ( dr5.rank() , 5 );
+
+// LayoutStride but arranged as LayoutRight
+  // NOTE: unused arg_layout dimensions must be set to ~size_t(0) so that 
+  //  rank deduction can properly take place
+    unsigned order5[] = { 4,3,2,1,0 }, dimen5[] = { N0, N1, N2, 2, 2 };
+    Kokkos::LayoutStride ls = Kokkos::LayoutStride::order_dimensions(5, order5, dimen5);
+    ls.dimension[5] = ~size_t(0);
+    ls.dimension[6] = ~size_t(0);
+    ls.dimension[7] = ~size_t(0);
+    sdView d5("d5", ls);
+    ASSERT_EQ( d5.rank() , 5 );
+
+//  LayoutStride arranged as LayoutRight - commented out as example that fails unit test
+//    unsigned order5[] = { 4,3,2,1,0 }, dimen5[] = { N0, N1, N2, 2, 2 };
+//    sdView d5( "d5" , Kokkos::LayoutStride::order_dimensions(5, order5, dimen5) );
+//
+//  Fails the following unit test:
+//    ASSERT_EQ( d5.rank() , dr5.rank() );
+//
+//  Explanation: In construction of the Kokkos::LayoutStride above, since the 
+//   remaining dimensions are not specified, they will default to values of 0 
+//   rather than ~size_t(0). 
+//  When passed to the DynRankView constructor the default dimensions (of 0) 
+//   will be counted toward the dynamic rank and returning an incorrect value 
+//   (i.e. rank 7 rather than 5).
+
+// Check LayoutRight dr5 and LayoutStride d5 dimensions agree (as they should) 
+    ASSERT_EQ( d5.dimension_0() , dr5.dimension_0() );
+    ASSERT_EQ( d5.dimension_1() , dr5.dimension_1() );
+    ASSERT_EQ( d5.dimension_2() , dr5.dimension_2() );
+    ASSERT_EQ( d5.dimension_3() , dr5.dimension_3() );
+    ASSERT_EQ( d5.dimension_4() , dr5.dimension_4() );
+    ASSERT_EQ( d5.dimension_5() , dr5.dimension_5() );
+    ASSERT_EQ( d5.rank() , dr5.rank() );
+
+// Rank 5 subview of rank 5 dynamic rank view, layout stride input
+    sdView ds5 = Kokkos::Experimental::subdynrankview( d5 , Kokkos::ALL() , Kokkos::ALL() , Kokkos::ALL() , Kokkos::ALL() , Kokkos::pair<unsigned,unsigned>(0,1) );
+    ASSERT_EQ( ds5.rank() , 5 );
+
+// Pass in extra ALL arguments beyond the rank of the DynRank View.
+// This behavior is allowed - ignore the extra ALL arguments when
+//  the src.rank() < number of arguments, but be careful!
+    sdView ds5plus = Kokkos::Experimental::subdynrankview( d5 , Kokkos::ALL() , Kokkos::ALL() , Kokkos::ALL() , Kokkos::ALL() , Kokkos::pair<unsigned,unsigned>(0,1) , Kokkos::ALL() );
+
+    ASSERT_EQ( ds5.rank() , ds5plus.rank() );
+    ASSERT_EQ( ds5.dimension_0() , ds5plus.dimension_0() );
+    ASSERT_EQ( ds5.dimension_4() , ds5plus.dimension_4() );
+    ASSERT_EQ( ds5.dimension_5() , ds5plus.dimension_5() );
+
+#if ! defined( KOKKOS_HAVE_CUDA ) || defined ( KOKKOS_USE_CUDA_UVM )
+    ASSERT_EQ( & ds5(1,1,1,1,0) - & ds5plus(1,1,1,1,0) , 0 );
+    ASSERT_EQ( & ds5(1,1,1,1,0,0) - & ds5plus(1,1,1,1,0,0) , 0 );  // passing argument to rank beyond the view's rank is allowed iff it is a 0. 
+#endif
+
+// Similar test to rank 5 above, but create rank 4 subview
+// Check that the rank contracts (ds4 and ds4plus) and that subdynrankview can accept extra args (ds4plus)
+    sdView ds4 = Kokkos::Experimental::subdynrankview( d5 , Kokkos::ALL() , Kokkos::ALL() , Kokkos::ALL() , Kokkos::ALL() , 0 );
+    sdView ds4plus = Kokkos::Experimental::subdynrankview( d5 , Kokkos::ALL() , Kokkos::ALL() , Kokkos::ALL() , Kokkos::ALL() , 0 , Kokkos::ALL() );
+
+    ASSERT_EQ( ds4.rank() , ds4plus.rank() );
+    ASSERT_EQ( ds4.rank() , 4 );
+    ASSERT_EQ( ds4.dimension_0() , ds4plus.dimension_0() );
+    ASSERT_EQ( ds4.dimension_4() , ds4plus.dimension_4() );
+    ASSERT_EQ( ds4.dimension_5() , ds4plus.dimension_5() );
+  }
+
+  // Takes LayoutStride subviews of LayoutLeft and LayoutRight DynRankViews and
+  // checks that each one keeps the parent's extents and aliases its storage.
+  static void run_test_subview_strided()
+  {
+    namespace KE = Kokkos::Experimental ;
+    typedef KE::DynRankView< int , Kokkos::LayoutLeft   , host_drv_space > left_view ;
+    typedef KE::DynRankView< int , Kokkos::LayoutRight  , host_drv_space > right_view ;
+    typedef KE::DynRankView< int , Kokkos::LayoutStride , host_drv_space > strided_view ;
+    // Rank 2 parents; fixing the first index leaves one free dimension.
+    left_view  left2( "xl2", 100 , 200 );
+    right_view right2( "xr2", 100 , 200 );
+    strided_view left_row0  = KE::subdynrankview( left2 , 0 , Kokkos::ALL() );
+    strided_view left_row1  = KE::subdynrankview( left2 , 1 , Kokkos::ALL() );
+    strided_view spare_row0 = KE::subdynrankview( right2 , 0 , Kokkos::ALL() ); // built, never checked
+    strided_view spare_row1 = KE::subdynrankview( right2 , 1 , Kokkos::ALL() ); // built, never checked
+    strided_view right_row0 = KE::subdynrankview( right2 , 0 , Kokkos::ALL() );
+    strided_view right_row1 = KE::subdynrankview( right2 , 1 , Kokkos::ALL() );
+
+    ASSERT_EQ( left_row0.dimension_0() , left2.dimension_1() );
+    ASSERT_EQ( left_row1.dimension_0() , left2.dimension_1() );
+    ASSERT_EQ( right_row0.dimension_0() , right2.dimension_1() );
+    ASSERT_EQ( right_row1.dimension_0() , right2.dimension_1() );
+
+    // Element 0 of each row must be the parent's ( row , 0 ) entry.
+    ASSERT_EQ( & left_row0(0) - & left2(0,0) , 0 );
+    ASSERT_EQ( & left_row1(0) - & left2(1,0) , 0 );
+    ASSERT_EQ( & right_row0(0) - & right2(0,0) , 0 );
+    ASSERT_EQ( & right_row1(0) - & right2(1,0) , 0 );
+
+    // Rank 4 parents, sliced with subview(); fixing indices 0 and 2 leaves rank 2.
+    left_view  left4( "xl4", 10 , 20 , 30 , 40 );
+    right_view right4( "xr4", 10 , 20 , 30 , 40 );
+    strided_view left_slab  = KE::subview( left4 , 1 , Kokkos::ALL() , 2 , Kokkos::ALL() );
+    strided_view right_slab = KE::subview( right4 , 1 , Kokkos::ALL() , 2 , Kokkos::ALL() );
+
+    ASSERT_EQ( left_slab.dimension_0() , left4.dimension_1() );
+    ASSERT_EQ( left_slab.dimension_1() , left4.dimension_3() );
+    ASSERT_EQ( right_slab.dimension_0() , right4.dimension_1() );
+    ASSERT_EQ( right_slab.dimension_1() , right4.dimension_3() );
+    ASSERT_EQ( left_slab.rank() , 2);
+    ASSERT_EQ( right_slab.rank() , 2);
+    ASSERT_EQ( & left_slab(4,4) - & left4(1,4,2,4) , 0 );
+    ASSERT_EQ( & right_slab(4,4) - & right4(1,4,2,4) , 0 );
+  }
+
+  // Builds rank 1 and rank 2 LayoutStride subviews of two Length x Count views and
+  // checks, by address, that they alias the expected entries of their parents.
+  static void run_test_vector()
+  {
+    static const unsigned Length = 1000 , Count = 8 ;
+
+    typedef typename Kokkos::Experimental::DynRankView< T , Kokkos::LayoutLeft , host_drv_space > multivector_type ; 
+    typedef typename Kokkos::Experimental::DynRankView< T , Kokkos::LayoutRight , host_drv_space > multivector_right_type ;
+
+    multivector_type mv = multivector_type( "mv" , Length , Count );
+    multivector_right_type mv_right = multivector_right_type( "mv" , Length , Count );  // NOTE(review): same label "mv" as the view above -- confirm intended
+    // Every subview alias below is a LayoutStride view; several names share one type.
+    typedef typename Kokkos::Experimental::DynRankView< T , Kokkos::LayoutStride , host_drv_space > svector_type ;
+    typedef typename Kokkos::Experimental::DynRankView< T , Kokkos::LayoutStride , host_drv_space > smultivector_type ;
+    typedef typename Kokkos::Experimental::DynRankView< const T , Kokkos::LayoutStride , host_drv_space > const_svector_right_type ; 
+    typedef typename Kokkos::Experimental::DynRankView< const T , Kokkos::LayoutStride , host_drv_space > const_svector_type ;
+    typedef typename Kokkos::Experimental::DynRankView< const T , Kokkos::LayoutStride , host_drv_space > const_smultivector_type ;
+    // Second index fixed to 0, 1 and 2 on mv.
+    svector_type v1 = Kokkos::Experimental::subdynrankview( mv , Kokkos::ALL() , 0 );
+    svector_type v2 = Kokkos::Experimental::subdynrankview( mv , Kokkos::ALL() , 1 );
+    svector_type v3 = Kokkos::Experimental::subdynrankview( mv , Kokkos::ALL() , 2 );
+    // First index fixed to 0, 1 and 2 on mv_right; these three are not referenced again below.
+    svector_type rv1 = Kokkos::Experimental::subdynrankview( mv_right , 0 , Kokkos::ALL() );
+    svector_type rv2 = Kokkos::Experimental::subdynrankview( mv_right , 1 , Kokkos::ALL() );
+    svector_type rv3 = Kokkos::Experimental::subdynrankview( mv_right , 2 , Kokkos::ALL() );
+    // Range subviews of each parent: first range starts at 1, second range starts at 2.
+    smultivector_type mv1 = Kokkos::Experimental::subdynrankview( mv , std::make_pair( 1 , 998 ) ,
+                                                 std::make_pair( 2 , 5 ) );
+
+    smultivector_type mvr1 =
+      Kokkos::Experimental::subdynrankview( mv_right ,
+                       std::make_pair( 1 , 998 ) ,
+                       std::make_pair( 2 , 5 ) );
+    // The same three slices of mv again, through const and non-const view types.
+    const_svector_type cv1 = Kokkos::Experimental::subdynrankview( mv , Kokkos::ALL(), 0 );
+    const_svector_type cv2 = Kokkos::Experimental::subdynrankview( mv , Kokkos::ALL(), 1 );
+    const_svector_type cv3 = Kokkos::Experimental::subdynrankview( mv , Kokkos::ALL(), 2 );
+
+    svector_type vr1 = Kokkos::Experimental::subdynrankview( mv , Kokkos::ALL() , 0 );
+    svector_type vr2 = Kokkos::Experimental::subdynrankview( mv , Kokkos::ALL() , 1 );
+    svector_type vr3 = Kokkos::Experimental::subdynrankview( mv , Kokkos::ALL() , 2 );
+
+    const_svector_right_type cvr1 = Kokkos::Experimental::subdynrankview( mv , Kokkos::ALL() , 0 );
+    const_svector_right_type cvr2 = Kokkos::Experimental::subdynrankview( mv , Kokkos::ALL() , 1 );
+    const_svector_right_type cvr3 = Kokkos::Experimental::subdynrankview( mv , Kokkos::ALL() , 2 );
+
+    // Each slice of mv must begin at the parent's ( 0 , fixed index ) entry.
+    ASSERT_TRUE( & v1[0] == & v1(0) );
+    ASSERT_TRUE( & v1[0] == & mv(0,0) );
+    ASSERT_TRUE( & v2[0] == & mv(0,1) );
+    ASSERT_TRUE( & v3[0] == & mv(0,2) );
+
+    ASSERT_TRUE( & cv1[0] == & mv(0,0) );
+    ASSERT_TRUE( & cv2[0] == & mv(0,1) );
+    ASSERT_TRUE( & cv3[0] == & mv(0,2) );
+
+    ASSERT_TRUE( & vr1[0] == & mv(0,0) );
+    ASSERT_TRUE( & vr2[0] == & mv(0,1) );
+    ASSERT_TRUE( & vr3[0] == & mv(0,2) );
+
+    ASSERT_TRUE( & cvr1[0] == & mv(0,0) );
+    ASSERT_TRUE( & cvr2[0] == & mv(0,1) );
+    ASSERT_TRUE( & cvr3[0] == & mv(0,2) );
+
+    // The range subviews are offset from the parent by their lower bounds ( 1 , 2 ).
+    ASSERT_TRUE( & mv1(0,0) == & mv( 1 , 2 ) );
+    ASSERT_TRUE( & mv1(1,1) == & mv( 2 , 3 ) );
+    ASSERT_TRUE( & mv1(3,2) == & mv( 4 , 4 ) );
+    ASSERT_TRUE( & mvr1(0,0) == & mv_right( 1 , 2 ) );
+    ASSERT_TRUE( & mvr1(1,1) == & mv_right( 2 , 3 ) );
+    ASSERT_TRUE( & mvr1(3,2) == & mv_right( 4 , 4 ) );
+    // The remaining constructions carry no assertions; they build const views from existing views.
+    const_svector_type c_cv1( v1 );
+    typename svector_type::const_type c_cv2( v2 );
+    typename const_svector_type::const_type c_ccv2( v2 );
+
+    const_smultivector_type cmv( mv );
+    typename smultivector_type::const_type cmvX( cmv );
+    typename const_smultivector_type::const_type ccmvX( cmv );
+  }
+};
+
+} // namespace Test
+
+/*--------------------------------------------------------------------------*/
+
diff --git a/lib/kokkos/containers/unit_tests/TestDynamicView.hpp b/lib/kokkos/containers/unit_tests/TestDynamicView.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..7e3ca005f4b6401a088208fca120c097143afc49
--- /dev/null
+++ b/lib/kokkos/containers/unit_tests/TestDynamicView.hpp
@@ -0,0 +1,168 @@
+/*
+//@HEADER
+// ************************************************************************
+// 
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+// 
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+// 
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact  H. Carter Edwards (hcedwar@sandia.gov)
+// 
+// ************************************************************************
+//@HEADER
+*/
+
+#ifndef KOKKOS_TEST_DYNAMICVIEW_HPP
+#define KOKKOS_TEST_DYNAMICVIEW_HPP
+
+#include <gtest/gtest.h>
+#include <iostream>
+#include <cstdlib>
+#include <cstdio>
+#include <Kokkos_Core.hpp>
+
+#include <Kokkos_DynamicView.hpp>
+#include <impl/Kokkos_Timer.hpp>
+
+namespace Test {
+
+// Team-parallel growth test for Kokkos::Experimental::DynamicView.  The TEST pass
+// resizes and fills the view, the VERIFY pass reads it back; run() drives both.
+template< typename Scalar , class Space >
+struct TestDynamicView {
+  typedef typename Space::execution_space  execution_space ;
+  typedef typename Space::memory_space     memory_space ;
+
+  typedef Kokkos::Experimental::MemoryPool<typename Space::device_type> memory_pool_type;
+
+  typedef Kokkos::Experimental::DynamicView<Scalar*,Space> view_type;
+
+  typedef typename Kokkos::TeamPolicy<execution_space>::member_type member_type ;
+  typedef double value_type;
+
+  struct TEST {};    // work tag selecting the resize-and-fill pass
+  struct VERIFY {};  // work tag selecting the read-back pass
+
+  view_type a;                 // view under test
+  const unsigned total_size ;  // number of entries the view must end up holding
+
+  TestDynamicView( const view_type & arg_a , const unsigned arg_total )
+    : a(arg_a), total_size( arg_total ) {}
+
+  // TEST pass: team rank 0 grows the view to cover this team's index range, then
+  // every member stores its global index and adds it to the reduction value.
+  KOKKOS_INLINE_FUNCTION
+  void operator() ( const TEST , member_type team_member, double& value) const
+  {
+    const unsigned int team_idx = team_member.league_rank() * team_member.team_size();
+
+    if ( team_member.team_rank() == 0 ) {
+      unsigned n = team_idx + team_member.team_size();
+
+      if ( total_size < n ) n = total_size ;  // the last team may be partial
+
+      a.resize_parallel( n );
+
+      if ( a.extent(0) < n ) {
+        Kokkos::abort("GrowTest TEST failed resize_parallel");
+      }
+    }
+
+    // Make sure resize is done for all team members:
+    team_member.team_barrier();
+
+    const unsigned int val = team_idx + team_member.team_rank();
+
+    if ( val < total_size ) {
+      value += val ;
+
+      a( val ) = val ;
+    }
+  }
+
+  // VERIFY pass: every member checks that its entry still holds its global index
+  // (aborting otherwise) and adds the stored value to the reduction value.
+  KOKKOS_INLINE_FUNCTION
+  void operator() ( const VERIFY , member_type team_member, double& value) const
+  {
+    const unsigned int val =
+      team_member.team_rank() + 
+      team_member.league_rank() * team_member.team_size();
+
+    if ( val < total_size ) {
+      if ( val != a(val) ) {
+        Kokkos::abort("GrowTest VERIFY failed resize_parallel");
+      }
+
+      value += a(val);
+    }
+  }
+
+  // Builds a memory pool and a DynamicView, runs the TEST and VERIFY passes over
+  // arg_total_size entries, and requires the two reductions to agree.
+  static void run( unsigned arg_total_size )
+  {
+    typedef Kokkos::TeamPolicy<execution_space,TEST> TestPolicy ;
+    typedef Kokkos::TeamPolicy<execution_space,VERIFY> VerifyPolicy ;
+
+    // The pool is sized for the payload times 1.2, i.e. 20% headroom.
+    memory_pool_type pool( memory_space() , arg_total_size * sizeof(Scalar) * 1.2 );
+
+    view_type da("A",pool,arg_total_size);
+
+    TestDynamicView functor(da,arg_total_size);
+
+    // One team member per entry; the league is rounded up to cover the tail.
+    const unsigned team_size = TestPolicy::team_size_recommended(functor);
+    const unsigned league_size = ( arg_total_size + team_size - 1 ) / team_size ;
+
+    double reference = 0;
+    double result = 0;
+
+    // Pass 1: grow and fill the view, summing the indices written.
+    Kokkos::parallel_reduce( TestPolicy(league_size,team_size) , functor , reference);
+    execution_space::fence();
+
+    // Pass 2: read every entry back, summing the values found.
+    Kokkos::parallel_reduce( VerifyPolicy(league_size,team_size) , functor , result );
+    execution_space::fence();
+
+    // Both passes sum the same integer-valued indices; for the sizes used by the
+    // visible caller (about 1e5 entries) every partial sum is exact in double.
+    ASSERT_EQ( reference , result );
+  }
+};
+
+} // namespace Test
+
+#endif /* #ifndef KOKKOS_TEST_DYNAMICVIEW_HPP */
+
diff --git a/lib/kokkos/containers/unit_tests/TestOpenMP.cpp b/lib/kokkos/containers/unit_tests/TestOpenMP.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..a4319f39ff7ce626f45a3b7cd3fe9b2a823d1132
--- /dev/null
+++ b/lib/kokkos/containers/unit_tests/TestOpenMP.cpp
@@ -0,0 +1,182 @@
+/*
+//@HEADER
+// ************************************************************************
+// 
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+// 
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+// 
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact  H. Carter Edwards (hcedwar@sandia.gov)
+// 
+// ************************************************************************
+//@HEADER
+*/
+
+#include <gtest/gtest.h>
+
+#include <Kokkos_Core.hpp>
+
+#include <Kokkos_Bitset.hpp>
+#include <Kokkos_UnorderedMap.hpp>
+#include <Kokkos_Vector.hpp>
+
+//----------------------------------------------------------------------------
+#include <TestBitset.hpp>
+#include <TestUnorderedMap.hpp>
+#include <TestStaticCrsGraph.hpp>
+#include <TestVector.hpp>
+#include <TestDualView.hpp>
+#include <TestDynamicView.hpp>
+#include <TestSegmentedView.hpp>
+#include <TestComplex.hpp>
+
+#include <Kokkos_DynRankView.hpp>
+#include <TestDynViewAPI.hpp>
+
+#include <iomanip>
+
+namespace Test {
+
+#ifdef KOKKOS_HAVE_OPENMP
+// gtest fixture: brings Kokkos::OpenMP up once for the suite and shuts it down after.
+class openmp : public ::testing::Test {
+protected:
+  static void SetUpTestCase()
+  {
+    std::cout << std::setprecision(5) << std::scientific;
+    // Default to 4 threads unless hwloc is available to report the core count.
+    unsigned threads_count = 4 ;
+    if ( Kokkos::hwloc::available() ) {
+      threads_count = Kokkos::hwloc::get_available_numa_count() *
+                      Kokkos::hwloc::get_available_cores_per_numa();
+    }
+
+    Kokkos::OpenMP::initialize( threads_count );
+  }
+
+  static void TearDownTestCase()
+  {
+    Kokkos::OpenMP::finalize();
+  }
+};
+
+TEST_F( openmp, complex ) {
+  testComplex<Kokkos::OpenMP> ();
+}
+
+TEST_F( openmp, dyn_view_api) {
+  TestDynViewAPI< double , Kokkos::OpenMP >();
+}
+
+TEST_F( openmp, bitset ) {
+  test_bitset<Kokkos::OpenMP>();
+}
+
+TEST_F( openmp , staticcrsgraph ) {
+  TestStaticCrsGraph::run_test_graph< Kokkos::OpenMP >();
+  TestStaticCrsGraph::run_test_graph2< Kokkos::OpenMP >();
+}
+
+// Generators: each macro stamps out one TEST_F whose name encodes its arguments.
+// They are #undef'd again once the instantiations below have been expanded.
+#define OPENMP_INSERT_TEST( name, num_nodes, num_inserts, num_duplicates, repeat, near )                                \
+  TEST_F( openmp, UnorderedMap_insert_##name##_##num_nodes##_##num_inserts##_##num_duplicates##_##repeat##x) {   \
+    for (int i=0; i<repeat; ++i)                                                                                \
+      test_insert<Kokkos::OpenMP>(num_nodes,num_inserts,num_duplicates, near);                                   \
+  }
+
+#define OPENMP_FAILED_INSERT_TEST( num_nodes, repeat )                         \
+  TEST_F( openmp, UnorderedMap_failed_insert_##num_nodes##_##repeat##x) {     \
+    for (int i=0; i<repeat; ++i)                                               \
+      test_failed_insert<Kokkos::OpenMP>(num_nodes);                             \
+  }
+
+#define OPENMP_ASSIGNEMENT_TEST( num_nodes, repeat )                             \
+  TEST_F( openmp, UnorderedMap_assignment_operators_##num_nodes##_##repeat##x) {       \
+    for (int i=0; i<repeat; ++i)                                               \
+      test_assignement_operators<Kokkos::OpenMP>(num_nodes);                     \
+  }
+
+#define OPENMP_DEEP_COPY( num_nodes, repeat )                             \
+  TEST_F( openmp, UnorderedMap_deep_copy##num_nodes##_##repeat##x) {       \
+    for (int i=0; i<repeat; ++i)                                               \
+      test_deep_copy<Kokkos::OpenMP>(num_nodes);                     \
+  }
+
+#define OPENMP_VECTOR_COMBINE_TEST( size )                             \
+  TEST_F( openmp, vector_combination##size##x) {       \
+      test_vector_combinations<int,Kokkos::OpenMP>(size);                     \
+  }
+
+#define OPENMP_DUALVIEW_COMBINE_TEST( size )                             \
+  TEST_F( openmp, dualview_combination##size##x) {       \
+      test_dualview_combinations<int,Kokkos::OpenMP>(size);                     \
+  }
+
+#define OPENMP_SEGMENTEDVIEW_TEST( size )                             \
+  TEST_F( openmp, segmentedview_##size##x) {       \
+      test_segmented_view<double,Kokkos::OpenMP>(size);                     \
+  }
+
+OPENMP_INSERT_TEST(close, 100000, 90000, 100, 500, true)
+OPENMP_INSERT_TEST(far, 100000, 90000, 100, 500, false)
+OPENMP_FAILED_INSERT_TEST( 10000, 1000 )
+OPENMP_DEEP_COPY( 10000, 1 )
+
+OPENMP_VECTOR_COMBINE_TEST( 10 )
+OPENMP_VECTOR_COMBINE_TEST( 3057 )
+OPENMP_DUALVIEW_COMBINE_TEST( 10 )
+OPENMP_SEGMENTEDVIEW_TEST( 10000 )
+
+#undef OPENMP_INSERT_TEST
+#undef OPENMP_FAILED_INSERT_TEST
+#undef OPENMP_ASSIGNEMENT_TEST
+#undef OPENMP_DEEP_COPY
+#undef OPENMP_VECTOR_COMBINE_TEST
+#undef OPENMP_DUALVIEW_COMBINE_TEST
+#undef OPENMP_SEGMENTEDVIEW_TEST
+
+// Kept inside the KOKKOS_HAVE_OPENMP guard: it needs the openmp fixture and Kokkos::OpenMP.
+TEST_F( openmp , dynamic_view )
+{
+  typedef TestDynamicView< double , Kokkos::OpenMP >
+    TestDynView ;
+
+  for ( int i = 0 ; i < 10 ; ++i ) {
+    TestDynView::run( 100000 + 100 * i );
+  }
+}
+
+#endif // KOKKOS_HAVE_OPENMP
+
+} // namespace Test
+
diff --git a/lib/kokkos/containers/unit_tests/TestSegmentedView.hpp b/lib/kokkos/containers/unit_tests/TestSegmentedView.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..bfd66d12a7dc658fe729ce7016b95d5d05c60202
--- /dev/null
+++ b/lib/kokkos/containers/unit_tests/TestSegmentedView.hpp
@@ -0,0 +1,708 @@
+/*
+//@HEADER
+// ************************************************************************
+// 
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+// 
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+// 
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact  H. Carter Edwards (hcedwar@sandia.gov)
+// 
+// ************************************************************************
+//@HEADER
+*/
+
+#ifndef KOKKOS_TEST_SEGMENTEDVIEW_HPP
+#define KOKKOS_TEST_SEGMENTEDVIEW_HPP
+
+#include <gtest/gtest.h>
+#include <iostream>
+#include <cstdlib>
+#include <cstdio>
+#include <Kokkos_Core.hpp>
+
+#if ! KOKKOS_USING_EXP_VIEW
+
+#include <Kokkos_SegmentedView.hpp>
+#include <impl/Kokkos_Timer.hpp>
+
+namespace Test {
+
+namespace Impl {
+
+  template<class ViewType , class ExecutionSpace, int Rank = ViewType::Rank>
+  struct GrowTest;  // declared only here; bodies come from the per-rank partial specializations
+
+  // Rank 1 specialization: every team member calls a.grow() with the end of its
+  // team's index range, adds its global index to `value`, and stores it in the view.
+  template<class ViewType , class ExecutionSpace>
+  struct GrowTest<ViewType , ExecutionSpace , 1> {
+    typedef ExecutionSpace execution_space;
+    typedef Kokkos::TeamPolicy<execution_space> Policy;
+    typedef typename Policy::member_type team_type;
+    typedef double value_type;
+
+    ViewType a;
+    GrowTest(ViewType in):a(in) {}
+
+    KOKKOS_INLINE_FUNCTION
+    void operator() (team_type team_member, double& value) const {
+      unsigned int team_idx = team_member.league_rank() * team_member.team_size();
+
+      a.grow(team_member , team_idx+team_member.team_size());
+      value += team_idx + team_member.team_rank();
+      // The store is skipped unless both dimension_0() and dimension(0) exceed the index.
+      if((a.dimension_0()>team_idx+team_member.team_rank()) &&
+         (a.dimension(0)>team_idx+team_member.team_rank()))
+        a(team_idx+team_member.team_rank()) = team_idx+team_member.team_rank();
+    }
+  };
+
+  // Rank 2 specialization: every team member calls a.grow() with the end of its
+  // team's row range, accumulates a checksum into `value`, and fills its own row.
+  template<class ViewType , class ExecutionSpace>
+  struct GrowTest<ViewType , ExecutionSpace , 2> {
+    typedef ExecutionSpace execution_space;
+    typedef Kokkos::TeamPolicy<execution_space> Policy;
+    typedef typename Policy::member_type team_type;
+    typedef double value_type;
+
+    ViewType a;
+    GrowTest(ViewType in):a(in) {}
+
+    KOKKOS_INLINE_FUNCTION
+    void operator() (team_type team_member, double& value) const {
+      unsigned int team_idx = team_member.league_rank() * team_member.team_size();
+      a.grow(team_member , team_idx+ team_member.team_size());
+      // Checksum uses a fixed trip count (7), not a.dimension_1(); presumably the caller's extent -- TODO confirm.
+      for( typename ExecutionSpace::size_type k=0;k<7;k++)
+        value += team_idx + team_member.team_rank() + 13*k;
+      // The stores are skipped unless both dimension_0() and dimension(0) exceed the row index.
+      if((a.dimension_0()>team_idx+ team_member.team_rank()) &&
+         (a.dimension(0)>team_idx+ team_member.team_rank())) {
+        for( typename ExecutionSpace::size_type k=0;k<a.dimension_1();k++) {
+          a(team_idx+ team_member.team_rank(),k) =
+              team_idx+ team_member.team_rank() + 13*k;
+        }
+      }
+    }
+  };
+
+  // Rank 3 specialization: every team member calls a.grow() with the end of its
+  // team's row range, accumulates a checksum into `value`, and fills its own row.
+  template<class ViewType , class ExecutionSpace>
+  struct GrowTest<ViewType , ExecutionSpace , 3> {
+    typedef ExecutionSpace execution_space;
+    typedef Kokkos::TeamPolicy<execution_space> Policy;
+    typedef typename Policy::member_type team_type;
+    typedef double value_type;
+
+    ViewType a;
+    GrowTest(ViewType in):a(in) {}
+
+    KOKKOS_INLINE_FUNCTION
+    void operator() (team_type team_member, double& value) const {
+      unsigned int team_idx = team_member.league_rank() * team_member.team_size();
+      a.grow(team_member , team_idx+ team_member.team_size());
+      // Checksum uses fixed trip counts (7 x 3), not the view's extents; presumably the caller's extents -- TODO confirm.
+      for( typename ExecutionSpace::size_type k=0;k<7;k++)
+        for( typename ExecutionSpace::size_type l=0;l<3;l++)
+          value += team_idx + team_member.team_rank() + 13*k + 3*l;
+      // The stores are skipped unless both dimension_0() and dimension(0) exceed the row index.
+      if((a.dimension_0()>team_idx+ team_member.team_rank()) &&
+         (a.dimension(0)>team_idx+ team_member.team_rank())) {
+        for( typename ExecutionSpace::size_type k=0;k<a.dimension_1();k++)
+          for( typename ExecutionSpace::size_type l=0;l<a.dimension_2();l++)
+            a(team_idx+ team_member.team_rank(),k,l) =
+                team_idx+ team_member.team_rank() + 13*k + 3*l;
+      }
+    }
+  };
+
+  // Rank 4 specialization: every team member calls a.grow() with the end of its
+  // team's row range, accumulates a checksum into `value`, and fills its own row.
+  template<class ViewType , class ExecutionSpace>
+  struct GrowTest<ViewType , ExecutionSpace , 4> {
+    typedef ExecutionSpace execution_space;
+    typedef Kokkos::TeamPolicy<execution_space> Policy;
+    typedef typename Policy::member_type team_type;
+    typedef double value_type;
+
+    ViewType a;
+    GrowTest(ViewType in):a(in) {}
+
+    KOKKOS_INLINE_FUNCTION
+    void operator() (team_type team_member, double& value) const {
+      unsigned int team_idx = team_member.league_rank() * team_member.team_size();
+      a.grow(team_member , team_idx+ team_member.team_size());
+      // Checksum uses fixed trip counts (7 x 3 x 2), not the view's extents; presumably the caller's extents -- TODO confirm.
+      for( typename ExecutionSpace::size_type k=0;k<7;k++)
+        for( typename ExecutionSpace::size_type l=0;l<3;l++)
+          for( typename ExecutionSpace::size_type m=0;m<2;m++)
+            value += team_idx + team_member.team_rank() + 13*k + 3*l + 7*m;
+      // The stores are skipped unless both dimension_0() and dimension(0) exceed the row index.
+      if((a.dimension_0()>team_idx+ team_member.team_rank()) &&
+         (a.dimension(0)>team_idx+ team_member.team_rank())) {
+        for( typename ExecutionSpace::size_type k=0;k<a.dimension_1();k++)
+          for( typename ExecutionSpace::size_type l=0;l<a.dimension_2();l++)
+            for( typename ExecutionSpace::size_type m=0;m<a.dimension_3();m++)
+              a(team_idx+ team_member.team_rank(),k,l,m) =
+                  team_idx+ team_member.team_rank() + 13*k + 3*l + 7*m;
+      }
+    }
+  };
+
+  // Rank 5 specialization: every team member calls a.grow() with the end of its
+  // team's row range, accumulates a checksum into `value`, and fills its own row.
+  template<class ViewType , class ExecutionSpace>
+  struct GrowTest<ViewType , ExecutionSpace , 5> {
+    typedef ExecutionSpace execution_space;
+    typedef Kokkos::TeamPolicy<execution_space> Policy;
+    typedef typename Policy::member_type team_type;
+    typedef double value_type;
+
+    ViewType a;
+    GrowTest(ViewType in):a(in) {}
+
+    KOKKOS_INLINE_FUNCTION
+    void operator() (team_type team_member, double& value) const {
+      unsigned int team_idx = team_member.league_rank() * team_member.team_size();
+      a.grow(team_member , team_idx+ team_member.team_size());
+      // Checksum uses fixed trip counts (7 x 3 x 2 x 3), not the view's extents; presumably the caller's extents -- TODO confirm.
+      for( typename ExecutionSpace::size_type k=0;k<7;k++)
+        for( typename ExecutionSpace::size_type l=0;l<3;l++)
+          for( typename ExecutionSpace::size_type m=0;m<2;m++)
+            for( typename ExecutionSpace::size_type n=0;n<3;n++)
+              value +=
+                  team_idx + team_member.team_rank() + 13*k + 3*l + 7*m + 5*n;
+      // The stores are skipped unless both dimension_0() and dimension(0) exceed the row index.
+      if((a.dimension_0()>team_idx+ team_member.team_rank()) &&
+         (a.dimension(0)>team_idx+ team_member.team_rank())) {
+        for( typename ExecutionSpace::size_type k=0;k<a.dimension_1();k++)
+          for( typename ExecutionSpace::size_type l=0;l<a.dimension_2();l++)
+            for( typename ExecutionSpace::size_type m=0;m<a.dimension_3();m++)
+              for( typename ExecutionSpace::size_type n=0;n<a.dimension_4();n++)
+                a(team_idx+ team_member.team_rank(),k,l,m,n) =
+                  team_idx+ team_member.team_rank() + 13*k + 3*l + 7*m + 5*n;
+      }
+    }
+  };
+
+  // Rank 6 specialization: every team member calls a.grow() with the end of its
+  // team's row range, accumulates a checksum into `value`, and fills its own row.
+  template<class ViewType , class ExecutionSpace>
+  struct GrowTest<ViewType , ExecutionSpace , 6> {
+    typedef ExecutionSpace execution_space;
+    typedef Kokkos::TeamPolicy<execution_space> Policy;
+    typedef typename Policy::member_type team_type;
+    typedef double value_type;
+
+    ViewType a;
+    GrowTest(ViewType in):a(in) {}
+
+    KOKKOS_INLINE_FUNCTION
+    void operator() (team_type team_member, double& value) const {
+      unsigned int team_idx = team_member.league_rank() * team_member.team_size();
+      a.grow(team_member , team_idx+ team_member.team_size());
+      // Checksum uses fixed trip counts (7 x 3 x 2 x 3 x 2), not the view's extents; presumably the caller's extents -- TODO confirm.
+      for( typename ExecutionSpace::size_type k=0;k<7;k++)
+        for( typename ExecutionSpace::size_type l=0;l<3;l++)
+          for( typename ExecutionSpace::size_type m=0;m<2;m++)
+            for( typename ExecutionSpace::size_type n=0;n<3;n++)
+              for( typename ExecutionSpace::size_type o=0;o<2;o++)
+              value +=
+                  team_idx + team_member.team_rank() + 13*k + 3*l + 7*m + 5*n + 2*o ;
+      // The stores are skipped unless both dimension_0() and dimension(0) exceed the row index.
+      if((a.dimension_0()>team_idx+ team_member.team_rank()) &&
+         (a.dimension(0)>team_idx+ team_member.team_rank())) {
+        for( typename ExecutionSpace::size_type k=0;k<a.dimension_1();k++)
+          for( typename ExecutionSpace::size_type l=0;l<a.dimension_2();l++)
+            for( typename ExecutionSpace::size_type m=0;m<a.dimension_3();m++)
+              for( typename ExecutionSpace::size_type n=0;n<a.dimension_4();n++)
+                for( typename ExecutionSpace::size_type o=0;o<a.dimension_5();o++)
+                a(team_idx+ team_member.team_rank(),k,l,m,n,o) =
+                    team_idx + team_member.team_rank() + 13*k + 3*l + 7*m + 5*n + 2*o ;
+      }
+    }
+  };
+
+  // Rank 7 specialization: every team member calls a.grow() with the end of its
+  // team's row range, accumulates a checksum into `value`, and fills its own row.
+  template<class ViewType , class ExecutionSpace>
+  struct GrowTest<ViewType , ExecutionSpace , 7> {
+    typedef ExecutionSpace execution_space;
+    typedef Kokkos::TeamPolicy<execution_space> Policy;
+    typedef typename Policy::member_type team_type;
+    typedef double value_type;
+
+    ViewType a;
+    GrowTest(ViewType in):a(in) {}
+
+    KOKKOS_INLINE_FUNCTION
+    void operator() (team_type team_member, double& value) const {
+      unsigned int team_idx = team_member.league_rank() * team_member.team_size();
+      a.grow(team_member , team_idx+ team_member.team_size());
+      // Checksum uses fixed trip counts (7 x 3 x 2 x 3 x 2 x 4), not the view's extents; presumably the caller's extents -- TODO confirm.
+      for( typename ExecutionSpace::size_type k=0;k<7;k++)
+        for( typename ExecutionSpace::size_type l=0;l<3;l++)
+          for( typename ExecutionSpace::size_type m=0;m<2;m++)
+            for( typename ExecutionSpace::size_type n=0;n<3;n++)
+              for( typename ExecutionSpace::size_type o=0;o<2;o++)
+                for( typename ExecutionSpace::size_type p=0;p<4;p++)
+              value +=
+                  team_idx + team_member.team_rank() + 13*k + 3*l + 7*m + 5*n + 2*o + 15*p ;
+      // The stores are skipped unless both dimension_0() and dimension(0) exceed the row index.
+      if((a.dimension_0()>team_idx+ team_member.team_rank()) &&
+         (a.dimension(0)>team_idx+ team_member.team_rank())) {
+        for( typename ExecutionSpace::size_type k=0;k<a.dimension_1();k++)
+          for( typename ExecutionSpace::size_type l=0;l<a.dimension_2();l++)
+            for( typename ExecutionSpace::size_type m=0;m<a.dimension_3();m++)
+              for( typename ExecutionSpace::size_type n=0;n<a.dimension_4();n++)
+                for( typename ExecutionSpace::size_type o=0;o<a.dimension_5();o++)
+                  for( typename ExecutionSpace::size_type p=0;p<a.dimension_6();p++)
+                a(team_idx+ team_member.team_rank(),k,l,m,n,o,p) =
+                    team_idx + team_member.team_rank() + 13*k + 3*l + 7*m + 5*n + 2*o + 15*p ;
+      }
+    }
+  };
+
+  // Rank-8 grow functor.  Each team thread calls a.grow() with the end of its
+  // team's row range, accumulates a reference sum, and stores the same terms in its row.
+  template<class ViewType , class ExecutionSpace>
+  struct GrowTest<ViewType , ExecutionSpace , 8> {
+    typedef ExecutionSpace execution_space;
+    typedef Kokkos::TeamPolicy<execution_space> Policy;
+    typedef typename Policy::member_type team_type;
+    typedef double value_type;
+
+    ViewType a;
+    GrowTest(ViewType in):a(in) {}
+
+    KOKKOS_INLINE_FUNCTION
+    void operator() (team_type team_member, double& value) const {
+      typedef typename ExecutionSpace::size_type idx_t;
+      const unsigned int team_idx = team_member.league_rank() * team_member.team_size();
+      a.grow(team_member , team_idx + team_member.team_size());
+      const unsigned int row = team_idx + team_member.team_rank();
+      // Reference sum over the fixed extents 7x3x2x3x2x4x3, independent of the view.
+      for( idx_t k=0;k<7;k++)
+        for( idx_t l=0;l<3;l++)
+          for( idx_t m=0;m<2;m++)
+            for( idx_t n=0;n<3;n++)
+              for( idx_t o=0;o<2;o++)
+                for( idx_t p=0;p<4;p++)
+                  for( idx_t q=0;q<3;q++)
+                    value += row + 13*k + 3*l + 7*m + 5*n + 2*o + 15*p + 17*q;
+
+      // Only rows inside the current extent are written; both accessors are checked.
+      if( a.dimension_0() <= row || a.dimension(0) <= row ) return;
+      for( idx_t k=0;k<a.dimension_1();k++)
+        for( idx_t l=0;l<a.dimension_2();l++)
+          for( idx_t m=0;m<a.dimension_3();m++)
+            for( idx_t n=0;n<a.dimension_4();n++)
+              for( idx_t o=0;o<a.dimension_5();o++)
+                for( idx_t p=0;p<a.dimension_6();p++)
+                  for( idx_t q=0;q<a.dimension_7();q++)
+                    a(row,k,l,m,n,o,p,q) = row + 13*k + 3*l + 7*m + 5*n + 2*o + 15*p + 17*q;
+    }
+  };
+
+  template<class ViewType , class ExecutionSpace, int Rank = ViewType::Rank>
+  struct VerifyTest;  // primary template, declared only; rank-specific specializations follow
+
+  // Rank-1 verification functor: each team thread adds the entry stored at its
+  // own index to the reduction value; out-of-range indices add nothing.
+  template<class ViewType , class ExecutionSpace>
+  struct VerifyTest<ViewType , ExecutionSpace , 1> {
+    typedef ExecutionSpace execution_space;
+    typedef Kokkos::TeamPolicy<execution_space> Policy;
+    typedef typename Policy::member_type team_type;
+    typedef double value_type;
+
+    ViewType a;
+    VerifyTest(ViewType in):a(in) {}
+
+    KOKKOS_INLINE_FUNCTION
+    void operator() (team_type team_member, double& value) const {
+      const unsigned int team_idx = team_member.league_rank() * team_member.team_size();
+      const unsigned int row = team_idx + team_member.team_rank();
+      // Skip rows beyond the current extent (checked through both accessors).
+      if( a.dimension_0() <= row || a.dimension(0) <= row ) return;
+      value += a(row);
+    }
+  };
+
+  // Rank-2 verification functor: each team thread adds every entry stored under
+  // its own leading index to the reduction value; out-of-range rows add nothing.
+  template<class ViewType , class ExecutionSpace>
+  struct VerifyTest<ViewType , ExecutionSpace , 2> {
+    typedef ExecutionSpace execution_space;
+    typedef Kokkos::TeamPolicy<execution_space> Policy;
+    typedef typename Policy::member_type team_type;
+    typedef double value_type;
+
+    ViewType a;
+    VerifyTest(ViewType in):a(in) {}
+
+    KOKKOS_INLINE_FUNCTION
+    void operator() (team_type team_member, double& value) const {
+      const unsigned int team_idx = team_member.league_rank() * team_member.team_size();
+      const unsigned int row = team_idx + team_member.team_rank();
+      // Skip rows beyond the current extent (checked through both accessors).
+      if( a.dimension_0() <= row || a.dimension(0) <= row ) return;
+      for( typename ExecutionSpace::size_type k=0;k<a.dimension_1();k++)
+        value += a(row,k);
+    }
+  };
+
+  // Rank-3 verification functor: each team thread adds every entry stored under
+  // its own leading index to the reduction value; out-of-range rows add nothing.
+  template<class ViewType , class ExecutionSpace>
+  struct VerifyTest<ViewType , ExecutionSpace , 3> {
+    typedef ExecutionSpace execution_space;
+    typedef Kokkos::TeamPolicy<execution_space> Policy;
+    typedef typename Policy::member_type team_type;
+    typedef double value_type;
+
+    ViewType a;
+    VerifyTest(ViewType in):a(in) {}
+
+    KOKKOS_INLINE_FUNCTION
+    void operator() (team_type team_member, double& value) const {
+      const unsigned int team_idx = team_member.league_rank() * team_member.team_size();
+      const unsigned int row = team_idx + team_member.team_rank();
+      // Skip rows beyond the current extent (checked through both accessors).
+      if( a.dimension_0() <= row || a.dimension(0) <= row ) return;
+      for( typename ExecutionSpace::size_type k=0;k<a.dimension_1();k++)
+        for( typename ExecutionSpace::size_type l=0;l<a.dimension_2();l++)
+          value += a(row,k,l);
+    }
+  };
+
+  // Rank-4 verification functor: each team thread adds every entry stored under
+  // its own leading index to the reduction value; out-of-range rows add nothing.
+  template<class ViewType , class ExecutionSpace>
+  struct VerifyTest<ViewType , ExecutionSpace , 4> {
+    typedef ExecutionSpace execution_space;
+    typedef Kokkos::TeamPolicy<execution_space> Policy;
+    typedef typename Policy::member_type team_type;
+    typedef double value_type;
+
+    ViewType a;
+    VerifyTest(ViewType in):a(in) {}
+
+    KOKKOS_INLINE_FUNCTION
+    void operator() (team_type team_member, double& value) const {
+      const unsigned int team_idx = team_member.league_rank() * team_member.team_size();
+      const unsigned int row = team_idx + team_member.team_rank();
+      // Skip rows beyond the current extent (checked through both accessors).
+      if( a.dimension_0() <= row || a.dimension(0) <= row ) return;
+      for( typename ExecutionSpace::size_type k=0;k<a.dimension_1();k++)
+        for( typename ExecutionSpace::size_type l=0;l<a.dimension_2();l++)
+          for( typename ExecutionSpace::size_type m=0;m<a.dimension_3();m++)
+            value += a(row,k,l,m);
+    }
+  };
+
+  // Rank-5 verification functor: each team thread adds every entry stored under
+  // its own leading index to the reduction value; out-of-range rows add nothing.
+  template<class ViewType , class ExecutionSpace>
+  struct VerifyTest<ViewType , ExecutionSpace , 5> {
+    typedef ExecutionSpace execution_space;
+    typedef Kokkos::TeamPolicy<execution_space> Policy;
+    typedef typename Policy::member_type team_type;
+    typedef double value_type;
+
+    ViewType a;
+    VerifyTest(ViewType in):a(in) {}
+
+    KOKKOS_INLINE_FUNCTION
+    void operator() (team_type team_member, double& value) const {
+      const unsigned int team_idx = team_member.league_rank() * team_member.team_size();
+      const unsigned int row = team_idx + team_member.team_rank();
+      // Skip rows beyond the current extent (checked through both accessors).
+      if( a.dimension_0() <= row || a.dimension(0) <= row ) return;
+      for( typename ExecutionSpace::size_type k=0;k<a.dimension_1();k++)
+        for( typename ExecutionSpace::size_type l=0;l<a.dimension_2();l++)
+          for( typename ExecutionSpace::size_type m=0;m<a.dimension_3();m++)
+            for( typename ExecutionSpace::size_type n=0;n<a.dimension_4();n++)
+              value += a(row,k,l,m,n);
+    }
+  };
+
+  // Rank-6 verification functor: each team thread adds every entry stored under
+  // its own leading index to the reduction value; out-of-range rows add nothing.
+  template<class ViewType , class ExecutionSpace>
+  struct VerifyTest<ViewType , ExecutionSpace , 6> {
+    typedef ExecutionSpace execution_space;
+    typedef Kokkos::TeamPolicy<execution_space> Policy;
+    typedef typename Policy::member_type team_type;
+    typedef double value_type;
+
+    ViewType a;
+    VerifyTest(ViewType in):a(in) {}
+
+    KOKKOS_INLINE_FUNCTION
+    void operator() (team_type team_member, double& value) const {
+      const unsigned int team_idx = team_member.league_rank() * team_member.team_size();
+      const unsigned int row = team_idx + team_member.team_rank();
+      // Skip rows beyond the current extent (checked through both accessors).
+      if( a.dimension_0() <= row || a.dimension(0) <= row ) return;
+      for( typename ExecutionSpace::size_type k=0;k<a.dimension_1();k++)
+        for( typename ExecutionSpace::size_type l=0;l<a.dimension_2();l++)
+          for( typename ExecutionSpace::size_type m=0;m<a.dimension_3();m++)
+            for( typename ExecutionSpace::size_type n=0;n<a.dimension_4();n++)
+              for( typename ExecutionSpace::size_type o=0;o<a.dimension_5();o++)
+                value += a(row,k,l,m,n,o);
+    }
+  };
+
+  // Rank-7 verification functor: each team thread adds every entry stored under
+  // its own leading index to the reduction value; out-of-range rows add nothing.
+  template<class ViewType , class ExecutionSpace>
+  struct VerifyTest<ViewType , ExecutionSpace , 7> {
+    typedef ExecutionSpace execution_space;
+    typedef Kokkos::TeamPolicy<execution_space> Policy;
+    typedef typename Policy::member_type team_type;
+    typedef double value_type;
+
+    ViewType a;
+    VerifyTest(ViewType in):a(in) {}
+
+    KOKKOS_INLINE_FUNCTION
+    void operator() (team_type team_member, double& value) const {
+      const unsigned int team_idx = team_member.league_rank() * team_member.team_size();
+      const unsigned int row = team_idx + team_member.team_rank();
+      // Skip rows beyond the current extent (checked through both accessors).
+      if( a.dimension_0() <= row || a.dimension(0) <= row ) return;
+      for( typename ExecutionSpace::size_type k=0;k<a.dimension_1();k++)
+        for( typename ExecutionSpace::size_type l=0;l<a.dimension_2();l++)
+          for( typename ExecutionSpace::size_type m=0;m<a.dimension_3();m++)
+            for( typename ExecutionSpace::size_type n=0;n<a.dimension_4();n++)
+              for( typename ExecutionSpace::size_type o=0;o<a.dimension_5();o++)
+                for( typename ExecutionSpace::size_type p=0;p<a.dimension_6();p++)
+                  value += a(row,k,l,m,n,o,p);
+    }
+  };
+
+  // Rank-8 verification functor: each team thread adds every entry stored under
+  // its own leading index to the reduction value; out-of-range rows add nothing.
+  template<class ViewType , class ExecutionSpace>
+  struct VerifyTest<ViewType , ExecutionSpace , 8> {
+    typedef ExecutionSpace execution_space;
+    typedef Kokkos::TeamPolicy<execution_space> Policy;
+    typedef typename Policy::member_type team_type;
+    typedef double value_type;
+
+    ViewType a;
+    VerifyTest(ViewType in):a(in) {}
+
+    KOKKOS_INLINE_FUNCTION
+    void operator() (team_type team_member, double& value) const {
+      const unsigned int team_idx = team_member.league_rank() * team_member.team_size();
+      const unsigned int row = team_idx + team_member.team_rank();
+      // Skip rows beyond the current extent (checked through both accessors).
+      if( a.dimension_0() <= row || a.dimension(0) <= row ) return;
+      for( typename ExecutionSpace::size_type k=0;k<a.dimension_1();k++)
+        for( typename ExecutionSpace::size_type l=0;l<a.dimension_2();l++)
+          for( typename ExecutionSpace::size_type m=0;m<a.dimension_3();m++)
+            for( typename ExecutionSpace::size_type n=0;n<a.dimension_4();n++)
+              for( typename ExecutionSpace::size_type o=0;o<a.dimension_5();o++)
+                for( typename ExecutionSpace::size_type p=0;p<a.dimension_6();p++)
+                  for( typename ExecutionSpace::size_type q=0;q<a.dimension_7();q++)
+                    value += a(row,k,l,m,n,o,p,q);
+    }
+  };
+
+  // Runs the grow/verify round trip for one view rank.  The constructor builds a
+  // SegmentedView of the requested rank and hands it to run_me(); the two
+  // reduction results are left in reference and result for the caller.
+  template <typename Scalar, class ExecutionSpace>
+  struct test_segmented_view
+  {
+    typedef test_segmented_view<Scalar,ExecutionSpace> self_type;
+    typedef Scalar scalar_type;
+    typedef ExecutionSpace execution_space;
+    typedef Kokkos::TeamPolicy<execution_space> Policy;
+
+    double result;     // reduction result of the VerifyTest pass
+    double reference;  // reduction result of the GrowTest pass
+
+    // Run GrowTest and then VerifyTest on `a`, using the same league/team
+    // configuration for both reductions.
+    template <class ViewType>
+    void run_me(ViewType a, int max_length){
+      const int threads_per_team = Policy::team_size_max( GrowTest<ViewType,execution_space>(a) );
+      // Integer division: a trailing partial team (if any) is not launched.
+      const int league_size = max_length/threads_per_team;
+      const Policy policy(league_size,threads_per_team);
+
+      reference = 0;
+      result = 0;
+
+      Kokkos::parallel_reduce(policy,GrowTest<ViewType,execution_space>(a),reference);
+      Kokkos::fence();
+      Kokkos::parallel_reduce(policy,VerifyTest<ViewType,execution_space>(a),result);
+      Kokkos::fence();
+    }
+
+    // `size` is passed to the view constructor and to run_me() as max_length;
+    // `rank` selects the view type (1 through 8).  Any other rank runs nothing.
+    test_segmented_view(unsigned int size,int rank) : result(0), reference(0)
+    {
+      const int dim_1 = 7, dim_2 = 3, dim_3 = 2;
+      const int dim_4 = 3, dim_5 = 2, dim_6 = 4;
+
+      switch(rank) {
+      case 1: {
+        typedef Kokkos::Experimental::SegmentedView<Scalar*,Kokkos::LayoutLeft,ExecutionSpace> rank1_view;
+        run_me< rank1_view >(rank1_view("Rank1",128,size), size);
+      } break;
+      case 2: {
+        typedef Kokkos::Experimental::SegmentedView<Scalar**,Kokkos::LayoutLeft,ExecutionSpace> rank2_view;
+        run_me< rank2_view >(rank2_view("Rank2",128,size,dim_1), size);
+      } break;
+      case 3: {
+        typedef Kokkos::Experimental::SegmentedView<Scalar*[7][3][2],Kokkos::LayoutRight,ExecutionSpace> rank3_view;
+        run_me< rank3_view >(rank3_view("Rank3",128,size), size);
+      } break;
+      case 4: {
+        typedef Kokkos::Experimental::SegmentedView<Scalar****,Kokkos::LayoutRight,ExecutionSpace> rank4_view;
+        run_me< rank4_view >(rank4_view("Rank4",128,size,dim_1,dim_2,dim_3), size);
+      } break;
+      case 5: {
+        typedef Kokkos::Experimental::SegmentedView<Scalar*[7][3][2][3],Kokkos::LayoutLeft,ExecutionSpace> rank5_view;
+        run_me< rank5_view >(rank5_view("Rank5",128,size), size);
+      } break;
+      case 6: {
+        typedef Kokkos::Experimental::SegmentedView<Scalar*****[2],Kokkos::LayoutRight,ExecutionSpace> rank6_view;
+        run_me< rank6_view >(rank6_view("Rank6",128,size,dim_1,dim_2,dim_3,dim_4), size);
+      } break;
+      case 7: {
+        typedef Kokkos::Experimental::SegmentedView<Scalar*******,Kokkos::LayoutLeft,ExecutionSpace> rank7_view;
+        run_me< rank7_view >(rank7_view("Rank7",128,size,dim_1,dim_2,dim_3,dim_4,dim_5,dim_6), size);
+      } break;
+      case 8: {
+        typedef Kokkos::Experimental::SegmentedView<Scalar*****[2][4][3],Kokkos::LayoutLeft,ExecutionSpace> rank8_view;
+        run_me< rank8_view >(rank8_view("Rank8",128,size,dim_1,dim_2,dim_3,dim_4), size);
+      } break;
+      }
+    }
+  };
+
+} // namespace Impl
+
+
+
+
+// Segmented-view test entry point: checks the extents a rank-8 view reports
+// after a GrowTest pass, then runs the grow/verify round trip for ranks 1 to 8.
+template <typename Scalar, class ExecutionSpace>
+void test_segmented_view(unsigned int size)
+{
+  typedef Kokkos::TeamPolicy<ExecutionSpace> policy_type;
+  typedef Impl::test_segmented_view<Scalar,ExecutionSpace> rank_test;
+  {
+    typedef Kokkos::Experimental::SegmentedView<Scalar*****[2][4][3],Kokkos::LayoutLeft,ExecutionSpace> view_type;
+    view_type a("A",128,size,7,3,2,3);
+    double reference;
+    Impl::GrowTest<view_type,ExecutionSpace> grow(a);
+    const int team_size = policy_type::team_size_max( grow );
+    const int nteams = (size+team_size-1)/team_size;  // round up so every row is covered
+    Kokkos::parallel_reduce(policy_type(nteams,team_size),grow,reference);
+
+    // The first extent is expected to be `size` rounded up to a multiple of 128.
+    size_t real_size = ((size+127)/128)*128;
+    ASSERT_EQ(real_size,a.dimension_0());
+    ASSERT_EQ(7,a.dimension_1());
+    ASSERT_EQ(3,a.dimension_2());
+    ASSERT_EQ(2,a.dimension_3());
+    ASSERT_EQ(3,a.dimension_4());
+    ASSERT_EQ(2,a.dimension_5());
+    ASSERT_EQ(4,a.dimension_6());
+    ASSERT_EQ(3,a.dimension_7());
+    ASSERT_EQ(real_size,a.dimension(0));
+    ASSERT_EQ(7,a.dimension(1));
+    ASSERT_EQ(3,a.dimension(2));
+    ASSERT_EQ(2,a.dimension(3));
+    ASSERT_EQ(3,a.dimension(4));
+    ASSERT_EQ(2,a.dimension(5));
+    ASSERT_EQ(4,a.dimension(6));
+    ASSERT_EQ(3,a.dimension(7));
+    ASSERT_EQ(8,a.Rank);
+  }
+  {
+    rank_test test(size,1);
+    ASSERT_EQ(test.reference,test.result);
+  }
+  {
+    rank_test test(size,2);
+    ASSERT_EQ(test.reference,test.result);
+  }
+  {
+    rank_test test(size,3);
+    ASSERT_EQ(test.reference,test.result);
+  }
+  {
+    rank_test test(size,4);
+    ASSERT_EQ(test.reference,test.result);
+  }
+  {
+    rank_test test(size,5);
+    ASSERT_EQ(test.reference,test.result);
+  }
+  {
+    rank_test test(size,6);
+    ASSERT_EQ(test.reference,test.result);
+  }
+  {
+    rank_test test(size,7);
+    ASSERT_EQ(test.reference,test.result);
+  }
+  {
+    rank_test test(size,8);
+    ASSERT_EQ(test.reference,test.result);
+  }
+}
+
+
+} // namespace Test
+
+#else
+
+template <typename Scalar, typename ExecutionSpace>
+void test_segmented_view(unsigned int /* size */) { /* no-op in the #else configuration */ }
+
+#endif
+
+#endif /* #ifndef KOKKOS_TEST_SEGMENTEDVIEW_HPP */
+
diff --git a/lib/kokkos/containers/unit_tests/TestSerial.cpp b/lib/kokkos/containers/unit_tests/TestSerial.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..a7c42d27987d2938fb6b10254d72045732e0f74c
--- /dev/null
+++ b/lib/kokkos/containers/unit_tests/TestSerial.cpp
@@ -0,0 +1,175 @@
+/*
+//@HEADER
+// ************************************************************************
+// 
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+// 
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+// 
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact  H. Carter Edwards (hcedwar@sandia.gov)
+// 
+// ************************************************************************
+//@HEADER
+*/
+
+#include <gtest/gtest.h>
+
+#include <Kokkos_Core.hpp>
+
+#if ! defined(KOKKOS_HAVE_SERIAL)
+#  error "It doesn't make sense to build this file unless the Kokkos::Serial device is enabled.  If you see this message, it probably means that there is an error in Kokkos' CMake build infrastructure."
+#else
+
+#include <Kokkos_Bitset.hpp>
+#include <Kokkos_UnorderedMap.hpp>
+#include <Kokkos_Vector.hpp>
+
+#include <TestBitset.hpp>
+#include <TestUnorderedMap.hpp>
+#include <TestStaticCrsGraph.hpp>
+#include <TestVector.hpp>
+#include <TestDualView.hpp>
+#include <TestSegmentedView.hpp>
+#include <TestDynamicView.hpp>
+#include <TestComplex.hpp>
+
+#include <iomanip>
+
+#include <Kokkos_DynRankView.hpp>
+#include <TestDynViewAPI.hpp>
+
+namespace Test {
+
+// Fixture shared by every Serial container test in this file.
+class serial : public ::testing::Test {
+protected:
+  // Per-test-case setup: configure std::cout float formatting, start the backend.
+  static void SetUpTestCase () {
+    std::cout << std::scientific << std::setprecision(5);
+    Kokkos::Serial::initialize ();
+  }
+  // Per-test-case teardown: shut the backend down again.
+  static void TearDownTestCase () { Kokkos::Serial::finalize (); }
+};
+
+// Plain forwarding tests: each one calls a templated test helper instantiated
+// for Kokkos::Serial (with double entries where the helper takes a scalar
+// type).  All checks live inside the helpers themselves.
+TEST_F( serial , dyn_view_api ) {
+  TestDynViewAPI< double , Kokkos::Serial >();
+}
+
+TEST_F( serial , staticcrsgraph ) {
+  TestStaticCrsGraph::run_test_graph< Kokkos::Serial >();
+  TestStaticCrsGraph::run_test_graph2< Kokkos::Serial >();
+}
+
+TEST_F( serial , complex ) {
+  testComplex<Kokkos::Serial> ();
+}
+
+TEST_F( serial , bitset ) {
+  test_bitset<Kokkos::Serial> ();
+}
+// Defines test UnorderedMap_insert_<args...>x: calls test_insert `repeat` times with the given arguments.
+#define SERIAL_INSERT_TEST( name, num_nodes, num_inserts, num_duplicates, repeat, near ) \
+  TEST_F( serial, UnorderedMap_insert_##name##_##num_nodes##_##num_inserts##_##num_duplicates##_##repeat##x) { \
+    for (int i=0; i<repeat; ++i)                                        \
+      test_insert<Kokkos::Serial> (num_nodes, num_inserts, num_duplicates, near); \
+  }
+// Defines test UnorderedMap_failed_insert_<num_nodes>_<repeat>x: calls test_failed_insert `repeat` times.
+#define SERIAL_FAILED_INSERT_TEST( num_nodes, repeat )                  \
+  TEST_F( serial, UnorderedMap_failed_insert_##num_nodes##_##repeat##x) { \
+    for (int i=0; i<repeat; ++i)                                        \
+      test_failed_insert<Kokkos::Serial> (num_nodes);                   \
+  }
+// NOTE(review): this macro is defined and later undefined but never instantiated in this file.
+#define SERIAL_ASSIGNEMENT_TEST( num_nodes, repeat )                    \
+  TEST_F( serial, UnorderedMap_assignment_operators_##num_nodes##_##repeat##x) { \
+    for (int i=0; i<repeat; ++i)                                        \
+      test_assignement_operators<Kokkos::Serial> (num_nodes);           \
+  }
+// Calls test_deep_copy `repeat` times.  NOTE(review): the pasted name has no '_' between deep_copy and num_nodes.
+#define SERIAL_DEEP_COPY( num_nodes, repeat )                           \
+  TEST_F( serial, UnorderedMap_deep_copy##num_nodes##_##repeat##x) {    \
+    for (int i=0; i<repeat; ++i)                                        \
+      test_deep_copy<Kokkos::Serial> (num_nodes);                       \
+  }
+// Defines test vector_combination<size>x: one call to test_vector_combinations<int,...>(size).
+#define SERIAL_VECTOR_COMBINE_TEST( size )             \
+  TEST_F( serial, vector_combination##size##x) {                        \
+    test_vector_combinations<int,Kokkos::Serial>(size);                 \
+  }
+// Defines test dualview_combination<size>x: one call to test_dualview_combinations<int,...>(size).
+#define SERIAL_DUALVIEW_COMBINE_TEST( size )             \
+  TEST_F( serial, dualview_combination##size##x) {                      \
+    test_dualview_combinations<int,Kokkos::Serial>(size);               \
+  }
+// Defines test segmentedview_<size>x: one call to test_segmented_view<double,...>(size).
+#define SERIAL_SEGMENTEDVIEW_TEST( size )                               \
+  TEST_F( serial, segmentedview_##size##x) {                            \
+    test_segmented_view<double,Kokkos::Serial>(size);                   \
+  }
+// Instantiations: each line below expands to one TEST_F with the given arguments.
+SERIAL_INSERT_TEST(close, 100000, 90000, 100, 500, true)
+SERIAL_INSERT_TEST(far, 100000, 90000, 100, 500, false)
+SERIAL_FAILED_INSERT_TEST( 10000, 1000 )
+SERIAL_DEEP_COPY( 10000, 1 )
+
+SERIAL_VECTOR_COMBINE_TEST( 10 )
+SERIAL_VECTOR_COMBINE_TEST( 3057 )
+SERIAL_DUALVIEW_COMBINE_TEST( 10 )
+SERIAL_SEGMENTEDVIEW_TEST( 10000 )
+// The helper macros are not needed past this point; undefine all of them.
+#undef SERIAL_INSERT_TEST
+#undef SERIAL_FAILED_INSERT_TEST
+#undef SERIAL_ASSIGNEMENT_TEST
+#undef SERIAL_DEEP_COPY
+#undef SERIAL_VECTOR_COMBINE_TEST
+#undef SERIAL_DUALVIEW_COMBINE_TEST
+#undef SERIAL_SEGMENTEDVIEW_TEST
+
+// Runs TestDynamicView at ten sizes: 100000, 100100, ..., 100900.
+TEST_F( serial , dynamic_view )
+{
+  typedef TestDynamicView< double , Kokkos::Serial > TestDynView ;
+
+  for ( int size = 100000 ; size < 101000 ; size += 100 ) {
+    TestDynView::run( size );
+  }
+}
+
+} // namespace Test
+
+#endif // KOKKOS_HAVE_SERIAL
+
+
diff --git a/lib/kokkos/containers/unit_tests/TestStaticCrsGraph.hpp b/lib/kokkos/containers/unit_tests/TestStaticCrsGraph.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..52b45b786562efcfbaf10a4db3ac280eb644b09b
--- /dev/null
+++ b/lib/kokkos/containers/unit_tests/TestStaticCrsGraph.hpp
@@ -0,0 +1,149 @@
+/*
+//@HEADER
+// ************************************************************************
+// 
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+// 
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+// 
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact  H. Carter Edwards (hcedwar@sandia.gov)
+// 
+// ************************************************************************
+//@HEADER
+*/
+
+#include <gtest/gtest.h>
+
+#include <vector>
+
+#include <Kokkos_StaticCrsGraph.hpp>
+
+/*--------------------------------------------------------------------------*/
+
+namespace TestStaticCrsGraph {
+
+// Build a 1000-row graph with 8 entries per row from nested std::vectors,
+// convert it to a StaticCrsGraph and compare its host mirror with the input.
+template< class Space >
+void run_test_graph()
+{
+  typedef Kokkos::StaticCrsGraph< unsigned , Space > dView ;
+  typedef typename dView::HostMirror hView ;
+
+  const unsigned LENGTH = 1000 ;
+  std::vector< std::vector< int > > graph( LENGTH );
+
+  // Row i holds i, i+3, ..., i+21.
+  for ( size_t i = 0 ; i < LENGTH ; ++i ) {
+    std::vector< int > & row = graph[i] ;
+    row.reserve(8);
+    for ( size_t j = 0 ; j < 8 ; ++j ) row.push_back( i + j * 3 );
+  }
+
+  dView dx = Kokkos::create_staticcrsgraph<dView>( "dx" , graph );
+  hView hx = Kokkos::create_mirror( dx );
+
+  ASSERT_EQ( hx.row_map.dimension_0() - 1 , LENGTH );
+
+  // row_map[i] .. row_map[i+1] delimits row i inside the flat entries array.
+  for ( size_t i = 0 ; i < LENGTH ; ++i ) {
+    const size_t begin = hx.row_map[i];
+    const size_t n = hx.row_map[i+1] - begin ;
+    ASSERT_EQ( n , graph[i].size() );
+    for ( size_t j = 0 ; j < n ; ++j ) {
+      ASSERT_EQ( (int) hx.entries( j + begin ) , graph[i][j] );
+    }
+  }
+}
+
+// Build a StaticCrsGraph with three unsigned values per entry from a list of
+// row lengths, then copy entry data hx -> dx -> mx and check what arrives in mx.
+template< class Space >
+void run_test_graph2()
+{
+  typedef Kokkos::StaticCrsGraph< unsigned[3] , Space > dView ;
+  typedef typename dView::HostMirror hView ;
+
+  const unsigned LENGTH = 10 ;
+  std::vector< size_t > sizes( LENGTH );
+  size_t total_length = 0 ;
+
+  for ( size_t i = 0 ; i < LENGTH ; ++i ) {
+    sizes[i] = 6 + i % 4 ;  // row lengths cycle 6, 7, 8, 9
+    total_length += sizes[i] ;
+  }
+
+  dView dx = Kokkos::create_staticcrsgraph<dView>( "test" , sizes );
+  hView hx = Kokkos::create_mirror( dx );
+  hView mx = Kokkos::create_mirror( dx );
+
+  ASSERT_EQ( (size_t) dx.row_map.dimension_0() , (size_t) LENGTH + 1 );
+  ASSERT_EQ( (size_t) hx.row_map.dimension_0() , (size_t) LENGTH + 1 );
+  ASSERT_EQ( (size_t) mx.row_map.dimension_0() , (size_t) LENGTH + 1 );
+
+  ASSERT_EQ( (size_t) dx.entries.dimension_0() , (size_t) total_length );
+  ASSERT_EQ( (size_t) hx.entries.dimension_0() , (size_t) total_length );
+  ASSERT_EQ( (size_t) mx.entries.dimension_0() , (size_t) total_length );
+
+  ASSERT_EQ( (size_t) dx.entries.dimension_1() , (size_t) 3 );
+  ASSERT_EQ( (size_t) hx.entries.dimension_1() , (size_t) 3 );
+  ASSERT_EQ( (size_t) mx.entries.dimension_1() , (size_t) 3 );
+
+  // Stamp entry j of hx with ( j+1 , j+2 , j+3 ).
+  for ( size_t i = 0 ; i < LENGTH ; ++i ) {
+    const size_t entry_end = hx.row_map[i+1];
+    for ( size_t j = hx.row_map[i] ; j < entry_end ; ++j ) {
+      hx.entries(j,0) = j + 1 ;
+      hx.entries(j,1) = j + 2 ;
+      hx.entries(j,2) = j + 3 ;
+    }
+  }
+
+  Kokkos::deep_copy( dx.entries , hx.entries );
+  Kokkos::deep_copy( mx.entries , dx.entries );
+
+  ASSERT_EQ( mx.row_map.dimension_0() , (size_t) LENGTH + 1 );
+  for ( size_t i = 0 ; i < LENGTH ; ++i ) {
+    const size_t entry_begin = mx.row_map[i];
+    const size_t entry_end   = mx.row_map[i+1];
+    ASSERT_EQ( ( entry_end - entry_begin ) , sizes[i] );
+    for ( size_t j = entry_begin ; j < entry_end ; ++j ) {
+      ASSERT_EQ( (size_t) mx.entries( j , 0 ) , ( j + 1 ) );
+      ASSERT_EQ( (size_t) mx.entries( j , 1 ) , ( j + 2 ) );
+      ASSERT_EQ( (size_t) mx.entries( j , 2 ) , ( j + 3 ) );
+    }
+  }
+}
+
+} /* namespace TestStaticCrsGraph */
+
+
diff --git a/lib/kokkos/containers/unit_tests/TestThreads.cpp b/lib/kokkos/containers/unit_tests/TestThreads.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..58277528d31d6ea6adae2996f5e8329b2c63b791
--- /dev/null
+++ b/lib/kokkos/containers/unit_tests/TestThreads.cpp
@@ -0,0 +1,188 @@
+/*
+//@HEADER
+// ************************************************************************
+// 
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+// 
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+// 
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact  H. Carter Edwards (hcedwar@sandia.gov)
+// 
+// ************************************************************************
+//@HEADER
+*/
+
+#include <gtest/gtest.h>
+
+#include <Kokkos_Core.hpp>
+
+#if defined( KOKKOS_HAVE_PTHREAD )
+
+#include <Kokkos_Bitset.hpp>
+#include <Kokkos_UnorderedMap.hpp>
+
+#include <Kokkos_Vector.hpp>
+#include <iomanip>
+
+
+//----------------------------------------------------------------------------
+#include <TestBitset.hpp>
+#include <TestUnorderedMap.hpp>
+#include <TestStaticCrsGraph.hpp>
+
+#include <TestVector.hpp>
+#include <TestDualView.hpp>
+#include <TestDynamicView.hpp>
+#include <TestSegmentedView.hpp>
+
+#include <Kokkos_DynRankView.hpp>
+#include <TestDynViewAPI.hpp>
+
+namespace Test {
+
+// Fixture for the Kokkos::Threads container tests: the backend is started
+// once before the first test in the case and stopped after the last one.
+class threads : public ::testing::Test {
+protected:
+  static void SetUpTestCase()
+  {
+    std::cout << std::setprecision(5) << std::scientific;
+
+    // Default when hwloc is unavailable; otherwise NUMA count times cores
+    // per NUMA (threads-per-core is intentionally left out of the product).
+    unsigned thread_count = 4;
+    if ( Kokkos::hwloc::available() ) {
+      const unsigned numa_count     = Kokkos::hwloc::get_available_numa_count();
+      const unsigned cores_per_numa = Kokkos::hwloc::get_available_cores_per_numa();
+      thread_count = numa_count * cores_per_numa;
+    }
+
+    std::cout << "Threads: " << thread_count << std::endl;
+    Kokkos::Threads::initialize( thread_count );
+  }
+
+  static void TearDownTestCase()
+  {
+    Kokkos::Threads::finalize();
+  }
+};
+
+// Runs the TestDynViewAPI checks (TestDynViewAPI.hpp) for double on Threads.
+TEST_F( threads , dyn_view_api )
+{ TestDynViewAPI< double , Kokkos::Threads >(); }
+
+TEST_F( threads , staticcrsgraph ) {
+  typedef Kokkos::Threads exec_space ;
+  TestStaticCrsGraph::run_test_graph< exec_space >();
+  TestStaticCrsGraph::run_test_graph2< exec_space >();
+}
+
+/*TEST_F( threads, bitset )
+{
+  test_bitset<Kokkos::Threads>();
+}*/
+// Generates a TEST_F that calls test_insert<Kokkos::Threads> 'repeat' times.
+#define THREADS_INSERT_TEST( name, num_nodes, num_inserts, num_duplicates, repeat, near )                                \
+  TEST_F( threads, UnorderedMap_insert_##name##_##num_nodes##_##num_inserts##_##num_duplicates##_##repeat##x) {   \
+    for (int i=0; i<repeat; ++i)                                                                                \
+      test_insert<Kokkos::Threads>(num_nodes,num_inserts,num_duplicates, near);                                   \
+  }
+// Generates a TEST_F that calls test_failed_insert<Kokkos::Threads> 'repeat' times.
+#define THREADS_FAILED_INSERT_TEST( num_nodes, repeat )                            \
+  TEST_F( threads, UnorderedMap_failed_insert_##num_nodes##_##repeat##x) {       \
+    for (int i=0; i<repeat; ++i)                                               \
+      test_failed_insert<Kokkos::Threads>(num_nodes);                             \
+  }
+// NOTE(review): unused in this file; it calls test_assignement_operators, which TestUnorderedMap.hpp does not define -- confirm before instantiating.
+#define THREADS_ASSIGNEMENT_TEST( num_nodes, repeat )                             \
+  TEST_F( threads, UnorderedMap_assignment_operators_##num_nodes##_##repeat##x) {       \
+    for (int i=0; i<repeat; ++i)                                               \
+      test_assignement_operators<Kokkos::Threads>(num_nodes);                     \
+  }
+// Generates a TEST_F that calls test_deep_copy<Kokkos::Threads> 'repeat' times.
+#define THREADS_DEEP_COPY( num_nodes, repeat )                             \
+  TEST_F( threads, UnorderedMap_deep_copy##num_nodes##_##repeat##x) {       \
+    for (int i=0; i<repeat; ++i)                                               \
+      test_deep_copy<Kokkos::Threads>(num_nodes);                     \
+  }
+// Generates a TEST_F that calls test_vector_combinations<int,Kokkos::Threads>(size).
+#define THREADS_VECTOR_COMBINE_TEST( size )                             \
+  TEST_F( threads, vector_combination##size##x) {       \
+      test_vector_combinations<int,Kokkos::Threads>(size);                     \
+  }
+// Generates a TEST_F that calls test_dualview_combinations<int,Kokkos::Threads>(size).
+#define THREADS_DUALVIEW_COMBINE_TEST( size )                             \
+  TEST_F( threads, dualview_combination##size##x) {       \
+      test_dualview_combinations<int,Kokkos::Threads>(size);                     \
+  }
+// Generates a TEST_F that calls test_segmented_view<double,Kokkos::Threads>(size).
+#define THREADS_SEGMENTEDVIEW_TEST( size )                             \
+  TEST_F( threads, segmentedview_##size##x) {       \
+      test_segmented_view<double,Kokkos::Threads>(size);                     \
+  }
+
+// Concrete tests produced by the generator macros above.
+THREADS_INSERT_TEST(far, 100000, 90000, 100, 500, false)
+THREADS_FAILED_INSERT_TEST( 10000, 1000 )
+THREADS_DEEP_COPY( 10000, 1 )
+
+THREADS_VECTOR_COMBINE_TEST( 10 )
+THREADS_VECTOR_COMBINE_TEST( 3057 )
+THREADS_DUALVIEW_COMBINE_TEST( 10 )
+THREADS_SEGMENTEDVIEW_TEST( 10000 )
+
+// The generator macros are local to this file; drop them again.
+#undef THREADS_INSERT_TEST
+#undef THREADS_FAILED_INSERT_TEST
+#undef THREADS_ASSIGNEMENT_TEST
+#undef THREADS_DEEP_COPY
+#undef THREADS_VECTOR_COMBINE_TEST
+#undef THREADS_DUALVIEW_COMBINE_TEST
+#undef THREADS_SEGMENTEDVIEW_TEST
+
+
+
+// Runs TestDynamicView ten times with sizes 100000, 100100, ..., 100900.
+TEST_F( threads , dynamic_view )
+{
+  typedef TestDynamicView< double , Kokkos::Threads > dyn_view_test ;
+
+  for ( int size = 100000 ; size < 101000 ; size += 100 ) {
+    dyn_view_test::run( size );
+  }
+}
+
+} // namespace Test
+
+
+#endif /* #if defined( KOKKOS_HAVE_PTHREAD ) */
+
diff --git a/lib/kokkos/containers/unit_tests/TestUnorderedMap.hpp b/lib/kokkos/containers/unit_tests/TestUnorderedMap.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..ff0328548dee0a3458faa82ab44a16e5a081d29b
--- /dev/null
+++ b/lib/kokkos/containers/unit_tests/TestUnorderedMap.hpp
@@ -0,0 +1,313 @@
+//@HEADER
+// ************************************************************************
+// 
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+// 
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+// 
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact  H. Carter Edwards (hcedwar@sandia.gov)
+// 
+// ************************************************************************
+//@HEADER
+
+#ifndef KOKKOS_TEST_UNORDERED_MAP_HPP
+#define KOKKOS_TEST_UNORDERED_MAP_HPP
+
+#include <gtest/gtest.h>
+#include <iostream>
+
+
+namespace Test {
+
+namespace Impl {
+
+// Functor that performs 'inserts' parallel insertions into a map and reduces
+// the number of failed ones.  Key for index i: i/collisions when Near is
+// true, otherwise i%(inserts/collisions).
+template <typename MapType, bool Near = false>
+struct TestInsert
+{
+  typedef MapType map_type;
+  typedef typename map_type::execution_space execution_space;
+  typedef uint32_t value_type;
+
+  map_type map;
+  uint32_t inserts;
+  uint32_t collisions;
+
+  TestInsert( map_type arg_map, uint32_t arg_inserts, uint32_t arg_collisions)
+    : map(arg_map) , inserts(arg_inserts) , collisions(arg_collisions) {}
+
+  // Runs the insert pass; while rehash_on_fail is set and a pass reports
+  // failures, the map is enlarged and the pass is repeated.
+  void testit( bool rehash_on_fail = true )
+  {
+    execution_space::fence();
+
+    for (;;) {
+      uint32_t num_failed = 0;
+      Kokkos::parallel_reduce(inserts, *this, num_failed);
+      if ( !rehash_on_fail || num_failed == 0u ) break;
+
+      // New capacity: current + 15% of current + num_failed/collisions.
+      const uint32_t grown = map.capacity() + ((map.capacity()*3ull)/20u) + num_failed/collisions ;
+      map.rehash( grown );
+    }
+
+    execution_space::fence();
+  }
+
+  KOKKOS_INLINE_FUNCTION
+  void init( value_type & failed_count ) const { failed_count = 0; }
+
+  KOKKOS_INLINE_FUNCTION
+  void join( volatile value_type & failed_count, const volatile value_type & count ) const
+  { failed_count += count; }
+
+  KOKKOS_INLINE_FUNCTION
+  void operator()(uint32_t i, value_type & failed_count) const
+  {
+    const uint32_t key = Near ? ( i / collisions ) : ( i % ( inserts / collisions ) );
+    if ( map.insert( key , i ).failed() ) { ++failed_count; }
+  }
+
+};
+
+  // Functor that erases keys from a map in parallel.  Key for index i:
+  // i/m_num_duplicates when Near, else i%(m_num_erase/m_num_duplicates).
+  template <typename MapType, bool Near>
+  struct TestErase
+  {
+    typedef TestErase<MapType, Near> self_type;
+
+    typedef MapType map_type;
+    typedef typename MapType::execution_space execution_space;
+
+    map_type m_map;
+    uint32_t m_num_erase;
+    uint32_t m_num_duplicates;
+
+    TestErase(map_type map, uint32_t num_erases, uint32_t num_duplicates)
+      : m_map(map) , m_num_erase(num_erases) , m_num_duplicates(num_duplicates)
+    {}
+
+    // Launches one erase per index in [0, m_num_erase), fenced on both sides.
+    void testit()
+    {
+      execution_space::fence();
+      Kokkos::parallel_for(m_num_erase, *this);
+      execution_space::fence();
+    }
+
+    KOKKOS_INLINE_FUNCTION
+    void operator()(typename execution_space::size_type i) const
+    {
+      if (!Near) {
+        m_map.erase(i%(m_num_erase/m_num_duplicates));
+        return;
+      }
+
+      m_map.erase(i/m_num_duplicates);
+    }
+  };
+
+  // Functor that scans indices [0, capacity) and counts every index whose
+  // presence in the map disagrees with the expectation "index < m_max_key".
+  template <typename MapType>
+  struct TestFind
+  {
+    typedef MapType map_type;
+    typedef typename MapType::execution_space::execution_space execution_space;
+    typedef uint32_t value_type;
+
+    map_type m_map;
+    uint32_t m_num_insert;
+    uint32_t m_num_duplicates;
+    uint32_t m_max_key;  // ceil(num_inserts / num_duplicates)
+
+    TestFind(map_type map, uint32_t num_inserts, uint32_t num_duplicates)
+      : m_map(map)
+      , m_num_insert(num_inserts)
+      , m_num_duplicates(num_duplicates)
+      , m_max_key( ((num_inserts + num_duplicates) - 1)/num_duplicates )
+    {}
+
+    // Reduces the mismatch count over the whole capacity into 'errors'.
+    void testit(value_type &errors)
+    {
+      execution_space::execution_space::fence();
+      Kokkos::parallel_reduce(m_map.capacity(), *this, errors);
+      execution_space::execution_space::fence();
+    }
+
+    KOKKOS_INLINE_FUNCTION
+    static void init( value_type & dst) { dst = 0; }
+
+    KOKKOS_INLINE_FUNCTION
+    static void join( volatile value_type & dst, const volatile value_type & src)
+    {
+      dst += src;
+    }
+
+    KOKKOS_INLINE_FUNCTION
+    void operator()(typename execution_space::size_type i, value_type & errors) const
+    {
+      const bool should_exist = (i < m_max_key);
+      const bool does_exist   = m_map.exists(i);
+
+      if (should_exist != does_exist)  ++errors;
+    }
+  };
+
+} // namespace Impl
+
+
+
+// Fills a map (rehashed to num_nodes) with num_inserts parallel inserts, then
+// verifies size and lookups and erases the keys again; 'near' picks the key pattern.
+template <typename Device>
+void test_insert( uint32_t num_nodes , uint32_t num_inserts , uint32_t num_duplicates , bool near )
+{
+  typedef Kokkos::UnorderedMap<uint32_t,uint32_t, Device> map_type;
+  typedef Kokkos::UnorderedMap<const uint32_t,const uint32_t, Device> const_map_type;
+
+  // Expected map size: ceil(num_inserts / num_duplicates).
+  const uint32_t expected_inserts = (num_inserts + num_duplicates -1u) / num_duplicates;
+
+  map_type map;
+  map.rehash(num_nodes,false);
+
+  if (!near) {
+    Impl::TestInsert<map_type,false> inserter(map, num_inserts, num_duplicates);
+    inserter.testit();
+  } else {
+    Impl::TestInsert<map_type,true> inserter(map, num_inserts, num_duplicates);
+    inserter.testit();
+  }
+
+  const bool print_list = false;  // flip to dump the map contents
+  if (print_list) {
+    Kokkos::Impl::UnorderedMapPrint<map_type> printer(map);
+    printer.apply();
+  }
+
+  const uint32_t map_size = map.size();
+
+  ASSERT_FALSE( map.failed_insert());
+  EXPECT_EQ(expected_inserts, map_size);
+
+  {
+    uint32_t find_errors = 0;
+    Impl::TestFind<const_map_type> finder(map, num_inserts, num_duplicates);
+    finder.testit(find_errors);
+    EXPECT_EQ( 0u, find_errors);
+  }
+
+  map.begin_erase();
+  Impl::TestErase<map_type,false> eraser(map, num_inserts, num_duplicates);
+  eraser.testit();
+  map.end_erase();
+  EXPECT_EQ(0u, map.size());
+}
+
+// Inserts 2*num_nodes distinct keys without rehashing; the map must report failure.
+template <typename Device>
+void test_failed_insert( uint32_t num_nodes)
+{
+  typedef Kokkos::UnorderedMap<uint32_t,uint32_t, Device> map_type;
+  const bool rehash_on_fail = false;
+  map_type map(num_nodes);
+  Impl::TestInsert<map_type> overfill(map, 2u*num_nodes, 1u);
+  overfill.testit(rehash_on_fail);
+  Device::execution_space::fence();
+  EXPECT_TRUE( map.failed_insert() );
+}
+
+
+
+// Copies a filled map to its HostMirror, back into a fresh device map and
+// into a const-key/value map, checking size and lookups after every copy.
+template <typename Device>
+void test_deep_copy( uint32_t num_nodes )
+{
+  typedef Kokkos::UnorderedMap<uint32_t,uint32_t, Device> map_type;
+  typedef Kokkos::UnorderedMap<const uint32_t, const uint32_t, Device> const_map_type;
+  typedef typename map_type::HostMirror host_map_type ;
+
+  map_type map;
+  map.rehash(num_nodes,false);
+
+  // Source map: num_nodes inserts with a duplicate count of 1, i.e. distinct keys.
+  {
+    Impl::TestInsert<map_type> filler(map, num_nodes, 1);
+    filler.testit();
+    ASSERT_EQ( map.size(), num_nodes);
+    ASSERT_FALSE( map.failed_insert() );
+
+    uint32_t find_errors = 0;
+    Impl::TestFind<map_type> finder(map, num_nodes, 1);
+    finder.testit(find_errors);
+    EXPECT_EQ( find_errors, 0u);
+  }
+
+  // The host mirror must match the source in size, capacity and lookups.
+  host_map_type hmap;
+  Kokkos::deep_copy(hmap, map);
+
+  ASSERT_EQ( map.size(), hmap.size());
+  ASSERT_EQ( map.capacity(), hmap.capacity());
+  {
+    uint32_t find_errors = 0;
+    Impl::TestFind<host_map_type> host_finder(hmap, num_nodes, 1);
+    host_finder.testit(find_errors);
+    EXPECT_EQ( find_errors, 0u);
+  }
+
+  // Copy back into a new map and view it through const key/value types.
+  map_type mmap;
+  Kokkos::deep_copy(mmap, hmap);
+
+  const_map_type cmap = mmap;
+
+  EXPECT_EQ( cmap.size(), num_nodes);
+
+  {
+    uint32_t find_errors = 0;
+    Impl::TestFind<const_map_type> const_finder(cmap, num_nodes, 1);
+    const_finder.testit(find_errors);
+    EXPECT_EQ( find_errors, 0u);
+  }
+}
+
+} // namespace Test
+
+#endif //KOKKOS_TEST_UNORDERED_MAP_HPP
diff --git a/lib/kokkos/containers/unit_tests/TestVector.hpp b/lib/kokkos/containers/unit_tests/TestVector.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..f9f4564898edf32e0030d0ca135ff9f43909f397
--- /dev/null
+++ b/lib/kokkos/containers/unit_tests/TestVector.hpp
@@ -0,0 +1,131 @@
+//@HEADER
+// ************************************************************************
+// 
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+// 
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+// 
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact  H. Carter Edwards (hcedwar@sandia.gov)
+// 
+// ************************************************************************
+//@HEADER
+
+#ifndef KOKKOS_TEST_VECTOR_HPP
+#define KOKKOS_TEST_VECTOR_HPP
+
+#include <gtest/gtest.h>
+#include <iostream>
+#include <cstdlib>
+#include <cstdio>
+#include <impl/Kokkos_Timer.hpp>
+
+namespace Test {
+
+namespace Impl {
+
+  // Runs one fixed sequence of vector operations on std::vector and on
+  // Kokkos::vector and stores the two resulting checksums for comparison.
+  template <typename Scalar, class Device>
+  struct test_vector_combinations
+  {
+    typedef test_vector_combinations<Scalar,Device> self_type;
+
+    typedef Scalar scalar_type;
+    typedef Device execution_space;
+
+    Scalar reference;
+    Scalar result;
+
+    // Sum of all elements currently stored in v.
+    template <typename Vector>
+    static Scalar sum_of(Vector & v){
+      Scalar total = 0;
+      for(unsigned int i=0; i<v.size(); i++)
+        total+=v[i];
+      return total;
+    }
+
+    template <typename Vector>
+    Scalar run_me(unsigned int n){
+      Vector a(n,1);
+
+      // Grow by one via push_back, then to n+4 via resize, and set the tail.
+      a.push_back(2);
+      a.resize(n+4);
+      a[n+1] = 3;
+      a[n+2] = 4;
+      a[n+3] = 5;
+
+      // Remember three elements, overwrite everything, then restore them.
+      const Scalar saved_2  = a[2];
+      const Scalar saved_n  = a[n];
+      const Scalar saved_n1 = a[n+1];
+      a.assign(n+2,-1);
+      a[2] = saved_2;
+      a[n] = saved_n;
+      a[n+1] = saved_n1;
+      const Scalar test1 = sum_of(a);
+
+      a.assign(n+1,-2);
+      const Scalar test2 = sum_of(a);
+
+      // Sum once more after reserving extra capacity.
+      a.reserve(n+10);
+      const Scalar test3 = sum_of(a);
+
+      return (test1*test2+test3)*test2+test1*test3;
+    }
+
+    test_vector_combinations(unsigned int size)
+    {
+      reference = run_me<std::vector<Scalar> >(size);
+      result = run_me<Kokkos::vector<Scalar,Device> >(size);
+    }
+
+   };
+
+} // namespace Impl
+
+
+
+
+// Fails unless Kokkos::vector reproduces the std::vector checksum for 'size'.
+template <typename Scalar, typename Device>
+void test_vector_combinations(unsigned int size) {
+  const Impl::test_vector_combinations<Scalar,Device> combos(size);
+  ASSERT_EQ( combos.reference, combos.result);
+}
+
+
+} // namespace Test
+
+#endif //KOKKOS_TEST_VECTOR_HPP
diff --git a/lib/kokkos/containers/unit_tests/UnitTestMain.cpp b/lib/kokkos/containers/unit_tests/UnitTestMain.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..f952ab3db51028aff0a0ebfe313b2639e353ab87
--- /dev/null
+++ b/lib/kokkos/containers/unit_tests/UnitTestMain.cpp
@@ -0,0 +1,50 @@
+/*
+//@HEADER
+// ************************************************************************
+// 
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+// 
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+// 
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact  H. Carter Edwards (hcedwar@sandia.gov)
+// 
+// ************************************************************************
+//@HEADER
+*/
+
+#include <gtest/gtest.h>
+
+// Test driver: hands the command line to googletest, then runs all tests.
+int main(int argc, char **argv) {
+  ::testing::InitGoogleTest(&argc, argv);
+  return RUN_ALL_TESTS();
+}
diff --git a/lib/kokkos/core/CMakeLists.txt b/lib/kokkos/core/CMakeLists.txt
new file mode 100644
index 0000000000000000000000000000000000000000..42fce6b2f210a73befefbb6c2a768fca5f9440df
--- /dev/null
+++ b/lib/kokkos/core/CMakeLists.txt
@@ -0,0 +1,11 @@
+
+
+TRIBITS_SUBPACKAGE(Core)
+# Sources of the Core subpackage.
+ADD_SUBDIRECTORY(src)
+# Test directories: unit tests and performance tests.
+TRIBITS_ADD_TEST_DIRECTORIES(unit_test)
+TRIBITS_ADD_TEST_DIRECTORIES(perf_test)
+
+TRIBITS_SUBPACKAGE_POSTPROCESS()
+
diff --git a/lib/kokkos/core/cmake/Dependencies.cmake b/lib/kokkos/core/cmake/Dependencies.cmake
new file mode 100644
index 0000000000000000000000000000000000000000..34ff0be5d3c6d26761b4758fda5d7217d66660e6
--- /dev/null
+++ b/lib/kokkos/core/cmake/Dependencies.cmake
@@ -0,0 +1,4 @@
+TRIBITS_PACKAGE_DEFINE_DEPENDENCIES(
+  LIB_OPTIONAL_TPLS Pthread CUDA HWLOC QTHREAD DLlib  # optional TPLs for the library
+  TEST_OPTIONAL_TPLS CUSPARSE                         # optional TPL for the tests only
+  )
diff --git a/lib/kokkos/core/cmake/KokkosCore_config.h.in b/lib/kokkos/core/cmake/KokkosCore_config.h.in
new file mode 100644
index 0000000000000000000000000000000000000000..27e3ba1c31f56aa35c6487488d96fa71f7b25d99
--- /dev/null
+++ b/lib/kokkos/core/cmake/KokkosCore_config.h.in
@@ -0,0 +1,57 @@
+#ifndef KOKKOS_CORE_CONFIG_H
+#define KOKKOS_CORE_CONFIG_H
+
+/* The trivial 'src/build_common.sh' creates a config
+ * that must stay in sync with this file.
+ */
+#cmakedefine KOKKOS_FOR_SIERRA
+
+#if !defined( KOKKOS_FOR_SIERRA )
+
+#cmakedefine KOKKOS_HAVE_MPI
+#cmakedefine KOKKOS_HAVE_CUDA
+
+// mfh 16 Sep 2014: If passed in on the command line, that overrides
+// any value of KOKKOS_USE_CUDA_UVM here.  Doing this should prevent build
+// warnings like this one:
+//
+// packages/kokkos/core/src/KokkosCore_config.h:13:1: warning: "KOKKOS_USE_CUDA_UVM" redefined
+//
+// At some point, we should edit the test-build scripts in
+// Trilinos/cmake/ctest/drivers/perseus/, and take
+// -DKOKKOS_USE_CUDA_UVM from the command-line arguments there.  I
+// hesitate to do that now, because I'm not sure if all the files are
+// including KokkosCore_config.h (or a header file that includes it) like
+// they should.
+
+#if ! defined(KOKKOS_USE_CUDA_UVM)
+#cmakedefine KOKKOS_USE_CUDA_UVM
+#endif // ! defined(KOKKOS_USE_CUDA_UVM)
+
+#cmakedefine KOKKOS_HAVE_PTHREAD
+#cmakedefine KOKKOS_HAVE_SERIAL
+#cmakedefine KOKKOS_HAVE_QTHREAD
+#cmakedefine KOKKOS_HAVE_Winthread
+#cmakedefine KOKKOS_HAVE_OPENMP
+#cmakedefine KOKKOS_HAVE_HWLOC
+#cmakedefine KOKKOS_HAVE_DEBUG
+#cmakedefine KOKKOS_ENABLE_DEBUG_BOUNDS_CHECK
+#cmakedefine KOKKOS_HAVE_CXX11
+#cmakedefine KOKKOS_HAVE_CUSPARSE
+#cmakedefine KOKKOS_ENABLE_PROFILING_INTERNAL
+#ifdef KOKKOS_ENABLE_PROFILING_INTERNAL
+#define KOKKOS_ENABLE_PROFILING 1
+#else
+#define KOKKOS_ENABLE_PROFILING 0
+#endif
+
+// Don't forbid users from defining this macro on the command line,
+// but still make sure that CMake logic can control its definition.
+#if ! defined(KOKKOS_HAVE_CXX11_DISPATCH_LAMBDA)
+#cmakedefine KOKKOS_HAVE_CXX11_DISPATCH_LAMBDA 1
+#endif // ! defined(KOKKOS_HAVE_CXX11_DISPATCH_LAMBDA)
+
+#cmakedefine KOKKOS_USING_DEPRECATED_VIEW
+
+#endif // ! defined( KOKKOS_FOR_SIERRA )
+#endif // KOKKOS_CORE_CONFIG_H
diff --git a/lib/kokkos/core/perf_test/CMakeLists.txt b/lib/kokkos/core/perf_test/CMakeLists.txt
new file mode 100644
index 0000000000000000000000000000000000000000..d93ca14d96fe159def46c29165e743313f91c9c4
--- /dev/null
+++ b/lib/kokkos/core/perf_test/CMakeLists.txt
@@ -0,0 +1,29 @@
+
+INCLUDE_DIRECTORIES(${CMAKE_CURRENT_BINARY_DIR})
+INCLUDE_DIRECTORIES(${CMAKE_CURRENT_SOURCE_DIR})
+
+SET(SOURCES
+  PerfTestMain.cpp
+  PerfTestHost.cpp
+  PerfTestCuda.cpp
+  )
+
+# Per #374, we always want to build this test, but we only want to run
+# it as a PERFORMANCE test.  That's why we separate building the test
+# from running the test.
+
+TRIBITS_ADD_EXECUTABLE(
+  PerfTestExec
+  SOURCES ${SOURCES}
+  COMM serial mpi
+  TESTONLYLIBS kokkos_gtest
+  )
+# NOTE(review): no SOURCES are passed below although the executable is built above; TRIBITS_ADD_TEST may have been intended -- confirm.
+TRIBITS_ADD_EXECUTABLE_AND_TEST(
+  PerfTest
+  NAME PerfTestExec
+  COMM serial mpi
+  NUM_MPI_PROCS 1
+  CATEGORIES PERFORMANCE
+  FAIL_REGULAR_EXPRESSION "  FAILED  "
+  )
diff --git a/lib/kokkos/core/perf_test/Makefile b/lib/kokkos/core/perf_test/Makefile
new file mode 100644
index 0000000000000000000000000000000000000000..8fa1fbfc3c00795cf0739a95f1fd23a988b30fa6
--- /dev/null
+++ b/lib/kokkos/core/perf_test/Makefile
@@ -0,0 +1,66 @@
+KOKKOS_PATH = ../..
+# Location of the googletest sources (gtest/gtest-all.cc is compiled from here).
+GTEST_PATH = ../../tpls/gtest
+
+vpath %.cpp ${KOKKOS_PATH}/core/perf_test
+
+default: build_all
+	echo "End Build"
+
+# Presumably defines the KOKKOS_* variables and kokkos-clean used in this file -- confirm.
+include $(KOKKOS_PATH)/Makefile.kokkos
+# CUDA builds force the nvcc wrapper as compiler and linker; otherwise only defaults are set.
+ifeq ($(KOKKOS_INTERNAL_USE_CUDA), 1)
+	CXX = $(NVCC_WRAPPER)
+	CXXFLAGS ?= -O3
+	LINK = $(CXX)
+	LDFLAGS ?= -lpthread
+else
+	CXX ?= g++
+	CXXFLAGS ?= -O3
+	LINK ?= $(CXX)
+	LDFLAGS ?=  -lpthread
+endif
+
+KOKKOS_CXXFLAGS += -I$(GTEST_PATH) -I${KOKKOS_PATH}/core/perf_test
+# TARGETS: executables to link; TEST_TARGETS: make targets that run them.
+TEST_TARGETS = 
+TARGETS = 
+
+OBJ_PERF = PerfTestHost.o PerfTestCuda.o PerfTestMain.o gtest-all.o
+TARGETS += KokkosCore_PerformanceTest
+TEST_TARGETS += test-performance
+
+OBJ_ATOMICS = test_atomic.o 
+TARGETS += KokkosCore_PerformanceTest_Atomics
+TEST_TARGETS += test-atomic
+
+
+KokkosCore_PerformanceTest: $(OBJ_PERF) $(KOKKOS_LINK_DEPENDS)
+	$(LINK) $(KOKKOS_LDFLAGS) $(LDFLAGS) $(EXTRA_PATH) $(OBJ_PERF) $(KOKKOS_LIBS) $(LIB) -o KokkosCore_PerformanceTest
+
+KokkosCore_PerformanceTest_Atomics: $(OBJ_ATOMICS) $(KOKKOS_LINK_DEPENDS)
+	$(LINK) $(KOKKOS_LDFLAGS) $(LDFLAGS) $(EXTRA_PATH) $(OBJ_ATOMICS) $(KOKKOS_LIBS) $(LIB) -o KokkosCore_PerformanceTest_Atomics
+
+test-performance: KokkosCore_PerformanceTest
+	./KokkosCore_PerformanceTest
+
+test-atomic: KokkosCore_PerformanceTest_Atomics
+	./KokkosCore_PerformanceTest_Atomics
+# Command-only targets: keep them runnable even if a file of the same name exists.
+.PHONY: default build_all test clean test-performance test-atomic
+build_all: $(TARGETS)
+
+test: $(TEST_TARGETS)
+
+clean: kokkos-clean 
+	rm -f *.o $(TARGETS)
+
+# Compilation rules
+
+%.o:%.cpp $(KOKKOS_CPP_DEPENDS)
+	$(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) $(EXTRA_INC) -c $<
+
+gtest-all.o:$(GTEST_PATH)/gtest/gtest-all.cc 
+	$(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) $(EXTRA_INC) -c $(GTEST_PATH)/gtest/gtest-all.cc
+
diff --git a/lib/kokkos/core/perf_test/PerfTestBlasKernels.hpp b/lib/kokkos/core/perf_test/PerfTestBlasKernels.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..aa4046cbf047defd47a89141d960ad330622d9b7
--- /dev/null
+++ b/lib/kokkos/core/perf_test/PerfTestBlasKernels.hpp
@@ -0,0 +1,309 @@
+/*
+//@HEADER
+// ************************************************************************
+// 
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+// 
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+// 
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact  H. Carter Edwards (hcedwar@sandia.gov)
+// 
+// ************************************************************************
+//@HEADER
+*/
+
+#ifndef KOKKOS_BLAS_KERNELS_HPP
+#define KOKKOS_BLAS_KERNELS_HPP
+
+namespace Kokkos {
+
+// Forward declarations of the functors defined further down in this header.
+// In every case the Device parameter defaults to the execution space
+// advertised by the (output) vector view type.
+
+/** \brief  Reduction functor for the dot product of two vectors. */
+template< class ConstVectorType
+        , class Device = typename ConstVectorType::execution_space
+        >
+struct Dot ;
+
+/** \brief  Reduction functor for the dot product of a vector with itself. */
+template< class ConstVectorType
+        , class Device = typename ConstVectorType::execution_space
+        >
+struct DotSingle ;
+
+/** \brief  Functor for Y *= alpha. */
+template< class ConstScalarType
+        , class VectorType
+        , class Device = typename VectorType::execution_space
+        >
+struct Scale ;
+
+/** \brief  Functor for Y = alpha * X + beta * Y. */
+template< class ConstScalarType
+        , class ConstVectorType
+        , class VectorType
+        , class Device = typename VectorType::execution_space
+        >
+struct AXPBY ;
+
+/** \brief  Y = alpha * X + beta * Y
+ *
+ *  Launches an AXPBY functor through parallel_for over Y.dimension_0()
+ *  entries.  Both coefficients share the type ConstScalarType.
+ */
+template< class ConstScalarType , class ConstVectorType , class VectorType >
+void axpby( const ConstScalarType & alpha ,
+            const ConstVectorType & X ,
+            const ConstScalarType & beta ,
+            const VectorType      & Y )
+{
+  parallel_for( Y.dimension_0() ,
+                AXPBY< ConstScalarType , ConstVectorType , VectorType >( alpha , X , beta , Y ) );
+}
+
+/** \brief  Y *= alpha
+ *
+ *  Launches a Scale functor through parallel_for over Y.dimension_0()
+ *  entries.
+ */
+template< class ConstScalarType , class VectorType >
+void scale( const ConstScalarType & alpha ,
+            const VectorType      & Y )
+{
+  parallel_for( Y.dimension_0() ,
+                Scale< ConstScalarType , VectorType >( alpha , Y ) );
+}
+
+/** \brief  Dot product of X and Y.
+ *
+ *  Runs a Dot functor through parallel_reduce over X.dimension_0() entries;
+ *  'finalize' is handed to parallel_reduce unchanged as its third argument.
+ */
+template< class ConstVectorType , class Finalize >
+void dot( const ConstVectorType & X ,
+          const ConstVectorType & Y ,
+          const Finalize        & finalize )
+{
+  parallel_reduce( X.dimension_0() ,
+                   Dot< ConstVectorType >( X , Y ) ,
+                   finalize );
+}
+
+/** \brief  Dot product of X with itself.
+ *
+ *  Runs a DotSingle functor through parallel_reduce over X.dimension_0()
+ *  entries; 'finalize' is handed to parallel_reduce unchanged as its third
+ *  argument.
+ */
+template< class ConstVectorType , class Finalize >
+void dot( const ConstVectorType & X ,
+          const Finalize        & finalize )
+{
+  parallel_reduce( X.dimension_0() ,
+                   DotSingle< ConstVectorType >( X ) ,
+                   finalize );
+}
+
+} /* namespace Kokkos */
+
+
+//----------------------------------------------------------------------------
+//----------------------------------------------------------------------------
+
+namespace Kokkos {
+
+/** \brief  Reduction functor that accumulates sum_i X[i] * Y[i] in a double.
+ *
+ *  Type must be a rank-one view type; this is checked at compile time.
+ *  Both operands are stored as Type::const_type.
+ */
+template< class Type , class Device >
+struct Dot
+{
+  typedef typename Device::execution_space  execution_space ;
+  typedef double                            value_type ;
+
+  // Compile-time check: Type has rank one.
+  typedef typename
+    Impl::StaticAssertSame< Impl::unsigned_< 1 > ,
+                            Impl::unsigned_< Type::Rank > >::type
+      ok_rank ;
+
+  // Note: no check is made that Type::execution_space matches
+  // execution_space.
+
+  typename Type::const_type X ;
+  typename Type::const_type Y ;
+
+  Dot( const Type & lhs , const Type & rhs )
+    : X( lhs )
+    , Y( rhs )
+    {}
+
+  // Contribution of entry i to the running sum.
+  KOKKOS_INLINE_FUNCTION
+  void operator()( int i , value_type & update ) const
+  {
+    update += X[i] * Y[i];
+  }
+
+  // Combine two partial sums.
+  KOKKOS_INLINE_FUNCTION
+  static void join( volatile value_type & update ,
+                    const volatile value_type & source )
+  {
+    update += source;
+  }
+
+  // Identity element of the sum.
+  KOKKOS_INLINE_FUNCTION
+  static void init( value_type & update )
+  {
+    update = 0 ;
+  }
+};
+
+/** \brief  Reduction functor that accumulates sum_i X[i] * X[i] in a double.
+ *
+ *  Type must be a rank-one view type; this is checked at compile time.
+ *  The operand is stored as Type::const_type.
+ */
+template< class Type , class Device >
+struct DotSingle
+{
+  typedef typename Device::execution_space  execution_space ;
+  typedef double                            value_type ;
+
+  // Compile-time check: Type has rank one.
+  typedef typename
+    Impl::StaticAssertSame< Impl::unsigned_< 1 > ,
+                            Impl::unsigned_< Type::Rank > >::type
+      ok_rank ;
+
+  // Note: no check is made that Type::execution_space matches
+  // execution_space.
+
+  typename Type::const_type X ;
+
+  DotSingle( const Type & vec )
+    : X( vec )
+    {}
+
+  // Contribution of entry i: its square.
+  KOKKOS_INLINE_FUNCTION
+  void operator()( int i , value_type & update ) const
+  {
+    const typename Type::value_type & xi = X[i];
+    update += xi * xi ;
+  }
+
+  // Combine two partial sums.
+  KOKKOS_INLINE_FUNCTION
+  static void join( volatile value_type & update ,
+                    const volatile value_type & source )
+  {
+    update += source;
+  }
+
+  // Identity element of the sum.
+  KOKKOS_INLINE_FUNCTION
+  static void init( value_type & update )
+  {
+    update = 0 ;
+  }
+};
+
+
+/** \brief  Functor computing Y[i] *= alpha() for every index i.
+ *
+ *  ScalarType must be a rank-zero view type and VectorType a rank-one view
+ *  type; both ranks are checked at compile time.  The coefficient is stored
+ *  as ScalarType::const_type and read through its call operator.
+ */
+template< class ScalarType , class VectorType , class Device >
+struct Scale
+{
+  typedef typename Device::execution_space  execution_space ;
+
+  // Note: no check is made that the execution spaces of ScalarType and
+  // VectorType match execution_space.
+
+  // Compile-time check: the coefficient view has rank zero.
+  typedef typename
+    Impl::StaticAssertSame< Impl::unsigned_< 0 > ,
+                            Impl::unsigned_< ScalarType::Rank > >::type
+      ok_scalar_rank ;
+
+  // Compile-time check: the vector view has rank one.
+  typedef typename
+    Impl::StaticAssertSame< Impl::unsigned_< 1 > ,
+                            Impl::unsigned_< VectorType::Rank > >::type
+      ok_vector_rank ;
+
+  typename ScalarType::const_type alpha ;
+  VectorType                      Y ;
+
+  Scale( const ScalarType & coeff , const VectorType & vec )
+    : alpha( coeff )
+    , Y( vec )
+    {}
+
+  KOKKOS_INLINE_FUNCTION
+  void operator()( int i ) const
+  {
+    Y[i] *= alpha() ;
+  }
+};
+
+
+/** \brief  Functor computing Y[i] = alpha() * X[i] + beta() * Y[i].
+ *
+ *  ScalarType must be a rank-zero view type; ConstVectorType and VectorType
+ *  must be rank-one view types.  All three ranks are checked at compile
+ *  time.  The coefficients and X are stored through their const_type.
+ */
+template< class ScalarType , class ConstVectorType , class VectorType , class Device >
+struct AXPBY
+{
+  typedef typename Device::execution_space  execution_space ;
+
+  // Note: no check is made that the execution spaces of the three view
+  // types match execution_space.
+
+  // Compile-time check: the coefficient views have rank zero.
+  typedef typename
+    Impl::StaticAssertSame< Impl::unsigned_< 0 > ,
+                            Impl::unsigned_< ScalarType::Rank > >::type
+      ok_scalar_rank ;
+
+  // Compile-time check: the input vector has rank one.
+  typedef typename
+    Impl::StaticAssertSame< Impl::unsigned_< 1 > ,
+                            Impl::unsigned_< ConstVectorType::Rank > >::type
+      ok_const_vector_rank ;
+
+  // Compile-time check: the output vector has rank one.
+  typedef typename
+    Impl::StaticAssertSame< Impl::unsigned_< 1 > ,
+                            Impl::unsigned_< VectorType::Rank > >::type
+      ok_vector_rank ;
+
+  typename ScalarType::const_type       alpha ;
+  typename ScalarType::const_type       beta ;
+  typename ConstVectorType::const_type  X ;
+  VectorType                            Y ;
+
+  AXPBY( const ScalarType      & coeff_x ,
+         const ConstVectorType & vec_x ,
+         const ScalarType      & coeff_y ,
+         const VectorType      & vec_y )
+    : alpha( coeff_x )
+    , beta( coeff_y )
+    , X( vec_x )
+    , Y( vec_y )
+    {}
+
+  KOKKOS_INLINE_FUNCTION
+  void operator()( int i ) const
+  {
+    Y[i] = alpha() * X[i] + beta() * Y[i] ;
+  }
+};
+
+} /* namespace Kokkos */
+
+#endif /* #ifndef KOKKOS_BLAS_KERNELS_HPP */
diff --git a/lib/kokkos/core/perf_test/PerfTestCuda.cpp b/lib/kokkos/core/perf_test/PerfTestCuda.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..524beb8b90303ba21fe06bb2cf60b0100b480169
--- /dev/null
+++ b/lib/kokkos/core/perf_test/PerfTestCuda.cpp
@@ -0,0 +1,189 @@
+/*
+//@HEADER
+// ************************************************************************
+// 
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+// 
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+// 
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact  H. Carter Edwards (hcedwar@sandia.gov)
+// 
+// ************************************************************************
+//@HEADER
+*/
+
+#include <algorithm>
+#include <cstdio>
+#include <iomanip>
+#include <iostream>
+#include <gtest/gtest.h>
+
+#include <Kokkos_Core.hpp>
+
+#if defined( KOKKOS_HAVE_CUDA )
+
+#include <impl/Kokkos_Timer.hpp>
+
+#include <PerfTestHexGrad.hpp>
+#include <PerfTestBlasKernels.hpp>
+#include <PerfTestGramSchmidt.hpp>
+#include <PerfTestDriver.hpp>
+
+
+namespace Test {
+
+/** \brief  Google Test fixture shared by the CUDA performance tests.
+ *
+ *  The host execution space and CUDA device 0 are initialized in
+ *  SetUpTestCase and finalized, in reverse order, in TearDownTestCase.
+ */
+class cuda : public ::testing::Test {
+protected:
+
+  static void SetUpTestCase()
+  {
+    Kokkos::HostSpace::execution_space::initialize();
+    Kokkos::Cuda::initialize( Kokkos::Cuda::SelectDevice(0) );
+  }
+
+  static void TearDownTestCase()
+  {
+    Kokkos::Cuda::finalize();
+    Kokkos::HostSpace::execution_space::finalize();
+  }
+};
+
+// Runs the HexGrad benchmark on Kokkos::Cuda with the exponent range
+// [10,20); the test fails only if the run throws.
+TEST_F( cuda, hexgrad )
+{
+  EXPECT_NO_THROW(
+    run_test_hexgrad< Kokkos::Cuda >( 10 , 20 , "Kokkos::Cuda" ) );
+}
+
+// Runs the Gram-Schmidt benchmark on Kokkos::Cuda with the exponent range
+// [10,20); the test fails only if the run throws.
+TEST_F( cuda, gramschmidt )
+{
+  EXPECT_NO_THROW(
+    run_test_gramschmidt< Kokkos::Cuda >( 10 , 20 , "Kokkos::Cuda" ) );
+}
+
+namespace {
+
+/** \brief  Benchmark helper: reduce a device array in shuffled index order.
+ *
+ *  The values are read through a const view carrying the
+ *  Kokkos::MemoryRandomAccess trait (presumably what routes the loads
+ *  through the texture path the test name refers to).
+ */
+template <typename T>
+struct TextureFetch
+{
+  typedef Kokkos::View< T * , Kokkos::CudaSpace >
+    array_type ;
+  typedef Kokkos::View< const T * , Kokkos::CudaSpace , Kokkos::MemoryRandomAccess >
+    const_array_type ;
+  typedef Kokkos::View< int * , Kokkos::CudaSpace >
+    index_array_type ;
+  typedef Kokkos::View< const int * , Kokkos::CudaSpace >
+    const_index_array_type ;
+
+  /** \brief  Sets m_array(i) = i for every entry, on the Cuda space. */
+  struct FillArray
+  {
+    array_type m_array ;
+
+    FillArray( const array_type & values )
+      : m_array( values )
+      {}
+
+    void apply() const
+    {
+      typedef Kokkos::RangePolicy< Kokkos::Cuda , int > policy_type ;
+
+      Kokkos::parallel_for( policy_type( 0 , m_array.dimension_0() ) , *this );
+    }
+
+    KOKKOS_INLINE_FUNCTION
+    void operator()( int i ) const
+    {
+      m_array(i) = i ;
+    }
+  };
+
+  /** \brief  Fills the index array with a shuffled sequence 0, 1, 2, ...
+   *
+   *  The sequence is generated and shuffled in a host mirror and then
+   *  copied to the device view.
+   */
+  struct RandomIndexes
+  {
+    index_array_type                       m_indexes ;
+    typename index_array_type::HostMirror  m_host_indexes ;
+
+    RandomIndexes( const index_array_type & device_indexes )
+      : m_indexes( device_indexes )
+      , m_host_indexes( Kokkos::create_mirror( m_indexes ) )
+      {}
+
+    void apply() const
+    {
+      typedef Kokkos::RangePolicy< Kokkos::HostSpace::execution_space , int > policy_type ;
+
+      // Identity sequence, written on the host.
+      Kokkos::parallel_for( policy_type( 0 , m_host_indexes.dimension_0() ) , *this );
+
+      // The fill has to be complete before the host data is shuffled.
+      Kokkos::HostSpace::execution_space::fence();
+
+      std::random_shuffle( m_host_indexes.ptr_on_device() ,
+                           m_host_indexes.ptr_on_device() + m_host_indexes.dimension_0() );
+
+      Kokkos::deep_copy( m_indexes , m_host_indexes );
+    }
+
+    KOKKOS_INLINE_FUNCTION
+    void operator()( int i ) const
+    {
+      m_host_indexes(i) = i ;
+    }
+  };
+
+  /** \brief  Sums m_array( m_indexes(i) ) over all i, on the Cuda space. */
+  struct RandomReduce
+  {
+    const_array_type        m_array ;
+    const_index_array_type  m_indexes ;
+
+    RandomReduce( const const_array_type       & values ,
+                  const const_index_array_type & order )
+      : m_array( values )
+      , m_indexes( order )
+      {}
+
+    void apply( T & reduce ) const
+    {
+      typedef Kokkos::RangePolicy< Kokkos::Cuda , int > policy_type ;
+
+      Kokkos::parallel_reduce( policy_type( 0 , m_array.dimension_0() ) , *this , reduce );
+    }
+
+    KOKKOS_INLINE_FUNCTION
+    void operator()( int i , T & reduce ) const
+    {
+      reduce += m_array( m_indexes(i) );
+    }
+  };
+
+  /** \brief  Time ten shuffled reductions over 'size' entries.
+   *
+   *  \param size         number of entries in the value and index arrays
+   *  \param reduce_time  receives the seconds spent in the ten reductions
+   *  \param reduce       passed to every reduction as its result argument
+   */
+  static void run( int size , double & reduce_time , T & reduce )
+  {
+    array_type        values( "array" , size );
+    index_array_type  order( "indexes" , size );
+
+    // Set-up work, excluded from the timing.
+    {
+      FillArray fill( values );
+      fill.apply();
+    }
+    {
+      RandomIndexes shuffle( order );
+      shuffle.apply();
+    }
+
+    Kokkos::Cuda::fence();
+
+    Kokkos::Timer timer ;
+
+    for ( int pass = 0 ; pass < 10 ; ++pass ) {
+      RandomReduce reducer( values , order );
+      reducer.apply( reduce );
+    }
+
+    Kokkos::Cuda::fence();
+
+    reduce_time = timer.seconds();
+  }
+};
+
+} // unnamed namespace
+
+// Times TextureFetch<double>::run for array lengths 2^1 through 2^26 and
+// prints one line per length.  Nothing is asserted: the reduction result is
+// not inspected, only the elapsed time is reported.
+TEST_F( cuda, texture_double )
+{
+  // printf is taken from <cstdio> explicitly rather than from whatever the
+  // other headers happen to pull in.
+  std::printf("Random reduce of double through texture fetch\n");
+  for (int i=1; i<=26; ++i) {
+    const int size = 1<<i;
+    double time = 0;
+    double reduce = 0;
+    TextureFetch<double>::run(size,time,reduce);
+    std::printf("   time = %1.3e   size = 2^%d\n", time, i);
+  }
+}
+
+} // namespace Test
+
+#endif /* #if defined( KOKKOS_HAVE_CUDA ) */
+
diff --git a/lib/kokkos/core/perf_test/PerfTestDriver.hpp b/lib/kokkos/core/perf_test/PerfTestDriver.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..e3dd3b4123a2dae6fd4f69f77a046796f9c040c8
--- /dev/null
+++ b/lib/kokkos/core/perf_test/PerfTestDriver.hpp
@@ -0,0 +1,152 @@
+/*
+//@HEADER
+// ************************************************************************
+// 
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+// 
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+// 
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact  H. Carter Edwards (hcedwar@sandia.gov)
+// 
+// ************************************************************************
+//@HEADER
+*/
+
+#include <iostream>
+#include <string>
+
+// mfh 06 Jun 2013: This macro doesn't work like one might think it
+// should.  It doesn't take the template parameter DeviceType and
+// print its actual type name; it just literally prints out
+// "DeviceType".  I've worked around this below without using the
+// macro, so I'm commenting out the macro to avoid compiler complaints
+// about an unused macro.
+
+// #define KOKKOS_MACRO_IMPL_TO_STRING( X ) #X
+// #define KOKKOS_MACRO_TO_STRING( X )  KOKKOS_MACRO_IMPL_TO_STRING( X )
+
+//------------------------------------------------------------------------
+
+namespace Test {
+
+// Number of timed repetitions performed for each problem size.
+enum {
+  NUMBER_OF_TRIALS = 5
+};
+
+
+
+/** \brief  Time HexGrad< DeviceType > over a range of problem sizes.
+ *
+ *  For every exponent i in [ exp_beg , exp_end ) the kernel is run
+ *  NUMBER_OF_TRIALS times with a work length of 2^i, and one line
+ *
+ *    label , work length , fastest seconds , fastest seconds / work length
+ *
+ *  is written to std::cout.
+ *
+ *  \param exp_beg         first exponent (inclusive)
+ *  \param exp_end         last exponent (exclusive)
+ *  \param deviceTypeName  text placed in the label as the device name; it is
+ *                         passed in because stringizing the template
+ *                         parameter would only yield "DeviceType"
+ */
+template< class DeviceType >
+void run_test_hexgrad( int exp_beg , int exp_end, const char deviceTypeName[] )
+{
+  std::string label_hexgrad ;
+  label_hexgrad.append( "\"HexGrad< double , " );
+  label_hexgrad.append( deviceTypeName );
+  label_hexgrad.append( " >\"" );
+
+  for (int i = exp_beg ; i < exp_end ; ++i) {
+    const int parallel_work_length = 1<<i;
+
+    // Only the fastest trial is reported, so only the minimum is tracked.
+    double min_seconds = 0.0 ;
+
+    for ( int j = 0 ; j < NUMBER_OF_TRIALS ; ++j ) {
+      const double seconds = HexGrad< DeviceType >::test(parallel_work_length) ;
+
+      // The first trial seeds the minimum; later trials can only lower it.
+      if ( 0 == j || seconds < min_seconds ) min_seconds = seconds ;
+    }
+
+    std::cout << label_hexgrad
+      << " , " << parallel_work_length
+      << " , " << min_seconds
+      << " , " << ( min_seconds / parallel_work_length )
+      << std::endl ;
+  }
+}
+
+/** \brief  Time ModifiedGramSchmidt< double , DeviceType > over a range of
+ *          problem sizes.
+ *
+ *  For every exponent i in [ exp_beg , exp_end ) the factorization test is
+ *  run NUMBER_OF_TRIALS times with a length of 2^i and a count of 32, and
+ *  one line
+ *
+ *    label , work length , fastest seconds , fastest seconds / work length
+ *
+ *  is written to std::cout.
+ *
+ *  \param exp_beg         first exponent (inclusive)
+ *  \param exp_end         last exponent (exclusive)
+ *  \param deviceTypeName  text placed in the label as the device name; it is
+ *                         passed in because stringizing the template
+ *                         parameter would only yield "DeviceType"
+ */
+template< class DeviceType >
+void run_test_gramschmidt( int exp_beg , int exp_end, const char deviceTypeName[] )
+{
+  std::string label_gramschmidt ;
+  label_gramschmidt.append( "\"GramSchmidt< double , " );
+  label_gramschmidt.append( deviceTypeName );
+  label_gramschmidt.append( " >\"" );
+
+  for (int i = exp_beg ; i < exp_end ; ++i) {
+    const int parallel_work_length = 1<<i;
+
+    // Only the fastest trial is reported, so only the minimum is tracked.
+    double min_seconds = 0.0 ;
+
+    for ( int j = 0 ; j < NUMBER_OF_TRIALS ; ++j ) {
+      const double seconds = ModifiedGramSchmidt< double , DeviceType >::test(parallel_work_length, 32 ) ;
+
+      // The first trial seeds the minimum; later trials can only lower it.
+      if ( 0 == j || seconds < min_seconds ) min_seconds = seconds ;
+    }
+
+    std::cout << label_gramschmidt
+      << " , " << parallel_work_length
+      << " , " << min_seconds
+      << " , " << ( min_seconds / parallel_work_length )
+      << std::endl ;
+  }
+}
+
+}
+
diff --git a/lib/kokkos/core/perf_test/PerfTestGramSchmidt.hpp b/lib/kokkos/core/perf_test/PerfTestGramSchmidt.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..516696b141d22ab5ac0662ef2c6d78fae8c9b8ad
--- /dev/null
+++ b/lib/kokkos/core/perf_test/PerfTestGramSchmidt.hpp
@@ -0,0 +1,226 @@
+/*
+//@HEADER
+// ************************************************************************
+// 
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+// 
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+// 
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact  H. Carter Edwards (hcedwar@sandia.gov)
+// 
+// ************************************************************************
+//@HEADER
+*/
+
+#include <cmath>
+#include <PerfTestBlasKernels.hpp>
+
+//----------------------------------------------------------------------------
+//----------------------------------------------------------------------------
+
+namespace Test {
+
+// Reduction   : result = dot( Q(:,j) , Q(:,j) );
+// PostProcess : result = sqrt( result ); R(j,j) = result ;
+//               inv = 1 / result, or 0 when result is not positive
+template< class VectorView , class ValueView >
+struct InvNorm2 : public Kokkos::DotSingle< VectorView >
+{
+  typedef typename Kokkos::DotSingle< VectorView >::value_type  value_type ;
+
+  ValueView  Rjj ;   // receives the norm
+  ValueView  inv ;   // receives the reciprocal of the norm
+
+  InvNorm2( const VectorView & vec ,
+            const ValueView  & norm_out ,
+            const ValueView  & inv_out )
+    : Kokkos::DotSingle< VectorView >( vec )
+    , Rjj( norm_out )
+    , inv( inv_out )
+    {}
+
+  // Post-processing of the reduced sum of squares.
+  KOKKOS_INLINE_FUNCTION
+  void final( value_type & result ) const
+  {
+    result = sqrt( result );
+
+    Rjj() = result ;
+
+    // A norm that is not positive yields a zero reciprocal instead of a
+    // division by zero.
+    if ( 0 < result ) {
+      inv() = 1.0 / result ;
+    }
+    else {
+      inv() = 0.0 ;
+    }
+  }
+};
+
+// Launches an InvNorm2 reduction over x.dimension_0() entries; the functor
+// stores its results in r and r_inv.
+template< class VectorView , class ValueView >
+inline
+void invnorm2( const VectorView & x ,
+               const ValueView  & r ,
+               const ValueView  & r_inv )
+{
+  typedef InvNorm2< VectorView , ValueView > functor_type ;
+
+  Kokkos::parallel_reduce( x.dimension_0() , functor_type( x , r , r_inv ) );
+}
+
+// Reduction   : result = dot( X , Y );
+// PostProcess : R(j,k) = result ; tmp = - result ;
+template< class VectorView , class ValueView >
+struct DotM : public Kokkos::Dot< VectorView >
+{
+  typedef typename Kokkos::Dot< VectorView >::value_type  value_type ;
+
+  ValueView  Rjk ;   // receives the dot product
+  ValueView  tmp ;   // receives the negated dot product
+
+  DotM( const VectorView & lhs ,
+        const VectorView & rhs ,
+        const ValueView  & dot_out ,
+        const ValueView  & neg_out )
+    : Kokkos::Dot< VectorView >( lhs , rhs )
+    , Rjk( dot_out )
+    , tmp( neg_out )
+    {}
+
+  // Post-processing of the reduced dot product.
+  KOKKOS_INLINE_FUNCTION
+  void final( value_type & result ) const
+  {
+    Rjk() = result ;
+    tmp() = - result ;
+  }
+};
+
+// Launches a DotM reduction over x.dimension_0() entries; the functor stores
+// the dot product of x and y in r and its negation in r_neg.
+template< class VectorView , class ValueView >
+inline
+void dot_neg( const VectorView & x ,
+              const VectorView & y ,
+              const ValueView  & r ,
+              const ValueView  & r_neg )
+{
+  typedef DotM< VectorView , ValueView > functor_type ;
+
+  Kokkos::parallel_reduce( x.dimension_0() , functor_type( x , y , r , r_neg ) );
+}
+
+
+/** \brief  Timing harness for a modified Gram-Schmidt factorization A = Q * R
+ *          built from the invnorm2 / scale / dot_neg / axpby kernels.
+ */
+template< typename Scalar , class DeviceType >
+struct ModifiedGramSchmidt
+{
+  typedef DeviceType                           execution_space ;
+  typedef typename execution_space::size_type  size_type ;
+
+  typedef Kokkos::View< Scalar** , Kokkos::LayoutLeft , execution_space >
+    multivector_type ;
+
+  typedef Kokkos::View< Scalar* , Kokkos::LayoutLeft , execution_space >
+    vector_type ;
+
+  typedef Kokkos::View< Scalar , Kokkos::LayoutLeft , execution_space >
+    value_view ;
+
+
+  multivector_type Q ;
+  multivector_type R ;
+
+  /** \brief  Factor the columns of Q_ in place and fill R_.
+   *
+   *  \return seconds elapsed between the start of the column loop and the
+   *          completion of the final execution-space fence
+   */
+  static double factorization( const multivector_type Q_ ,
+                               const multivector_type R_ )
+  {
+    const size_type ncol = Q_.dimension_1();
+
+    // Scratch scalar shared by the kernels, and the constant one used as
+    // the beta coefficient of axpby.
+    value_view tmp("tmp");
+    value_view one("one");
+
+    Kokkos::deep_copy( one , (Scalar) 1 );
+
+    Kokkos::Timer timer ;
+
+    for ( size_type j = 0 ; j < ncol ; ++j ) {
+
+      const vector_type Qj  = Kokkos::subview( Q_ , Kokkos::ALL() , j );
+      const value_view  Rjj = Kokkos::subview( R_ , j , j );
+
+      // R(j,j) = sqrt( dot( Q(:,j) , Q(:,j) ) ); tmp = 1 / R(j,j)
+      invnorm2( Qj , Rjj , tmp );
+
+      // Q(:,j) *= tmp
+      Kokkos::scale( tmp , Qj );
+
+      for ( size_t k = j + 1 ; k < ncol ; ++k ) {
+
+        const vector_type Qk  = Kokkos::subview( Q_ , Kokkos::ALL() , k );
+        const value_view  Rjk = Kokkos::subview( R_ , j , k );
+
+        // R(j,k) = dot( Q(:,j) , Q(:,k) ); tmp = - R(j,k)
+        dot_neg( Qj , Qk , Rjk , tmp );
+
+        // Q(:,k) = tmp * Q(:,j) + Q(:,k)
+        Kokkos::axpby( tmp , Qj , one , Qk );
+      }
+    }
+
+    // Wait for all dispatched work before reading the timer.
+    execution_space::fence();
+
+    return timer.seconds();
+  }
+
+  //--------------------------------------------------------------------------
+
+  /** \brief  Build a length-by-count test matrix and time its factorization.
+   *
+   *  \param length  number of rows of the matrix
+   *  \param count   number of columns of the matrix
+   *  \param iter    number of timed factorizations
+   *  \return the smallest time returned by factorization(), or 0 when
+   *          iter is 0
+   */
+  static double test( const size_t length ,
+                      const size_t count ,
+                      const size_t iter = 1 )
+  {
+    multivector_type Q_( "Q" , length , count );
+    multivector_type R_( "R" , count , count );
+
+    typename multivector_type::HostMirror A = Kokkos::create_mirror( Q_ );
+
+    // Fill the host copy: A(i,j) = (i+1) * (j+1)
+    for ( size_type j = 0 ; j < count ; ++j ) {
+      for ( size_type i = 0 ; i < length ; ++i ) {
+        A(i,j) = ( i + 1 ) * ( j + 1 );
+      }
+    }
+
+    double fastest = 0 ;
+
+    for ( size_t trial = 0 ; trial < iter ; ++trial ) {
+
+      // The factorization overwrites Q_, so reload it before every trial.
+      Kokkos::deep_copy( Q_ , A );
+
+      // A = Q * R
+      const double dt = factorization( Q_ , R_ );
+
+      // The first trial seeds the minimum; later trials can only lower it.
+      if ( 0 == trial || dt < fastest ) fastest = dt ;
+    }
+
+    return fastest ;
+  }
+};
+
+}
+
diff --git a/lib/kokkos/core/perf_test/PerfTestHexGrad.hpp b/lib/kokkos/core/perf_test/PerfTestHexGrad.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..ed5371f29c4db0fc2af4613d301006b1e96a0f28
--- /dev/null
+++ b/lib/kokkos/core/perf_test/PerfTestHexGrad.hpp
@@ -0,0 +1,268 @@
+/*
+//@HEADER
+// ************************************************************************
+// 
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+// 
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+// 
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact  H. Carter Edwards (hcedwar@sandia.gov)
+// 
+// ************************************************************************
+//@HEADER
+*/
+
+namespace Test {
+
+// Perf-test functor: fills grad_op( element , space , node ) from coords( element , space , node ).
+template< class DeviceType ,
+          typename CoordScalarType = double , typename GradScalarType = float >
+struct HexGrad
+{
+  typedef DeviceType execution_space ;
+  typedef typename execution_space::size_type  size_type ;
+
+  typedef HexGrad<DeviceType,CoordScalarType,GradScalarType> self_type;
+
+  // 3D array : ( ParallelWork , Space , Node )
+
+  enum { NSpace = 3 , NNode = 8 };
+
+  typedef Kokkos::View< CoordScalarType*[NSpace][NNode] , execution_space >
+    elem_coord_type ;
+
+  typedef Kokkos::View< GradScalarType*[NSpace][NNode] , execution_space >
+    elem_grad_type ;
+
+  elem_coord_type  coords ;   // input  : node coordinates of every element
+  elem_grad_type   grad_op ;  // output : values written by operator()
+
+  enum { FLOPS  = 318 }; // = 3 * ( 18 + 8 * 11 ) };
+  enum { READS  = 18 };
+  enum { WRITES = 18 };
+  // Stores the two view handles the functor reads from and writes to.
+  HexGrad( const elem_coord_type  & arg_coords ,
+           const elem_grad_type   & arg_grad_op )
+    : coords( arg_coords )
+    , grad_op( arg_grad_op )
+    {}
+  // Computes the NNode entries of one gradient component from two coordinate arrays.
+  KOKKOS_INLINE_FUNCTION static
+  void grad( const CoordScalarType x[] ,       // [in]  NNode values , only differenced
+             const CoordScalarType z[] ,       // [in]  NNode values , used as weights
+                   GradScalarType grad_y[] )   // [out] NNode results
+  {
+    const GradScalarType R42=(x[3] - x[1]);
+    const GradScalarType R52=(x[4] - x[1]);
+    const GradScalarType R54=(x[4] - x[3]);
+    // Naming: Rab = x[a-1] - x[b-1] , i.e. a and b are one-based node labels.
+    const GradScalarType R63=(x[5] - x[2]);
+    const GradScalarType R83=(x[7] - x[2]);
+    const GradScalarType R86=(x[7] - x[5]);
+
+    const GradScalarType R31=(x[2] - x[0]);
+    const GradScalarType R61=(x[5] - x[0]);
+    const GradScalarType R74=(x[6] - x[3]);
+
+    const GradScalarType R72=(x[6] - x[1]);
+    const GradScalarType R75=(x[6] - x[4]);
+    const GradScalarType R81=(x[7] - x[0]);
+    // Sums of two differences that are reused by several of the results below.
+    const GradScalarType t1=(R63 + R54);
+    const GradScalarType t2=(R61 + R74);
+    const GradScalarType t3=(R72 + R81);
+
+    const GradScalarType t4 =(R86 + R42);
+    const GradScalarType t5 =(R83 + R52);
+    const GradScalarType t6 =(R75 + R31);
+
+    //  Calculate Y gradient from X and Z data
+
+    grad_y[0] = (z[1] *  t1) - (z[2] * R42) - (z[3] *  t5)  + (z[4] *  t4) + (z[5] * R52) - (z[7] * R54);
+    grad_y[1] = (z[2] *  t2) + (z[3] * R31) - (z[0] *  t1)  - (z[5] *  t6) + (z[6] * R63) - (z[4] * R61);
+    grad_y[2] = (z[3] *  t3) + (z[0] * R42) - (z[1] *  t2)  - (z[6] *  t4) + (z[7] * R74) - (z[5] * R72);
+    grad_y[3] = (z[0] *  t5) - (z[1] * R31) - (z[2] *  t3)  + (z[7] *  t6) + (z[4] * R81) - (z[6] * R83);
+    grad_y[4] = (z[5] *  t3) + (z[6] * R86) - (z[7] *  t2)  - (z[0] *  t4) - (z[3] * R81) + (z[1] * R61);
+    grad_y[5] = (z[6] *  t5) - (z[4] *  t3)  - (z[7] * R75) + (z[1] *  t6) - (z[0] * R52) + (z[2] * R72);
+    grad_y[6] = (z[7] *  t1) - (z[5] *  t5)  - (z[4] * R86) + (z[2] *  t4) - (z[1] * R63) + (z[3] * R83);
+    grad_y[7] = (z[4] *  t2) - (z[6] *  t1)  + (z[5] * R75) - (z[3] *  t6) - (z[2] * R74) + (z[0] * R54);
+  }
+  // Work item for element ielem: gathers its coordinates, then writes all NSpace components.
+  KOKKOS_INLINE_FUNCTION
+  void operator()( size_type ielem ) const
+  {
+    GradScalarType g[NNode] ;  // scratch for one component at a time
+
+    const CoordScalarType x[NNode] = {
+      coords(ielem,0,0),
+      coords(ielem,0,1),
+      coords(ielem,0,2),
+      coords(ielem,0,3),
+      coords(ielem,0,4),
+      coords(ielem,0,5),
+      coords(ielem,0,6),
+      coords(ielem,0,7)
+    };
+
+    const CoordScalarType y[NNode] = {
+      coords(ielem,1,0),
+      coords(ielem,1,1),
+      coords(ielem,1,2),
+      coords(ielem,1,3),
+      coords(ielem,1,4),
+      coords(ielem,1,5),
+      coords(ielem,1,6),
+      coords(ielem,1,7)
+    };
+
+    const CoordScalarType z[NNode] = {
+      coords(ielem,2,0),
+      coords(ielem,2,1),
+      coords(ielem,2,2),
+      coords(ielem,2,3),
+      coords(ielem,2,4),
+      coords(ielem,2,5),
+      coords(ielem,2,6),
+      coords(ielem,2,7)
+    };
+    // Each component is computed from the other two coordinate directions.
+    grad( z , y , g );
+
+    grad_op(ielem,0,0) = g[0];
+    grad_op(ielem,0,1) = g[1];
+    grad_op(ielem,0,2) = g[2];
+    grad_op(ielem,0,3) = g[3];
+    grad_op(ielem,0,4) = g[4];
+    grad_op(ielem,0,5) = g[5];
+    grad_op(ielem,0,6) = g[6];
+    grad_op(ielem,0,7) = g[7];
+
+    grad( x , z , g );
+
+    grad_op(ielem,1,0) = g[0];
+    grad_op(ielem,1,1) = g[1];
+    grad_op(ielem,1,2) = g[2];
+    grad_op(ielem,1,3) = g[3];
+    grad_op(ielem,1,4) = g[4];
+    grad_op(ielem,1,5) = g[5];
+    grad_op(ielem,1,6) = g[6];
+    grad_op(ielem,1,7) = g[7];
+
+    grad( y , x , g );
+
+    grad_op(ielem,2,0) = g[0];
+    grad_op(ielem,2,1) = g[1];
+    grad_op(ielem,2,2) = g[2];
+    grad_op(ielem,2,3) = g[3];
+    grad_op(ielem,2,4) = g[4];
+    grad_op(ielem,2,5) = g[5];
+    grad_op(ielem,2,6) = g[6];
+    grad_op(ielem,2,7) = g[7];
+  }
+
+  //--------------------------------------------------------------------------
+  // Functor that gives every element the eight corner coordinates of the unit cube.
+  struct Init {
+    typedef typename self_type::execution_space execution_space ;
+
+    elem_coord_type coords ;
+
+    Init( const elem_coord_type & arg_coords )
+      : coords( arg_coords ) {}
+
+    KOKKOS_INLINE_FUNCTION
+    void operator()( size_type ielem ) const
+    {
+      coords(ielem,0,0) = 0.;
+      coords(ielem,1,0) = 0.;
+      coords(ielem,2,0) = 0.;
+
+      coords(ielem,0,1) = 1.;
+      coords(ielem,1,1) = 0.;
+      coords(ielem,2,1) = 0.;
+
+      coords(ielem,0,2) = 1.;
+      coords(ielem,1,2) = 1.;
+      coords(ielem,2,2) = 0.;
+
+      coords(ielem,0,3) = 0.;
+      coords(ielem,1,3) = 1.;
+      coords(ielem,2,3) = 0.;
+
+
+      coords(ielem,0,4) = 0.;
+      coords(ielem,1,4) = 0.;
+      coords(ielem,2,4) = 1.;
+
+      coords(ielem,0,5) = 1.;
+      coords(ielem,1,5) = 0.;
+      coords(ielem,2,5) = 1.;
+
+      coords(ielem,0,6) = 1.;
+      coords(ielem,1,6) = 1.;
+      coords(ielem,2,6) = 1.;
+
+      coords(ielem,0,7) = 0.;
+      coords(ielem,1,7) = 1.;
+      coords(ielem,2,7) = 1.;
+    }
+  };
+
+  //--------------------------------------------------------------------------
+  // Runs the kernel iter times over count elements; returns the smallest timer.seconds() value.
+  static double test( const int count , const int iter = 1 )
+  {
+    elem_coord_type coord( "coord" , count );
+    elem_grad_type  grad ( "grad" , count );
+
+    // Execute the parallel kernels on the arrays:
+
+    double dt_min = 0 ;
+
+    Kokkos::parallel_for( count , Init( coord ) );
+    execution_space::fence();
+    // Dispatch self_type so that non-default scalar types match the coord / grad views.
+    for ( int i = 0 ; i < iter ; ++i ) {
+      Kokkos::Timer timer ;
+      Kokkos::parallel_for( count , self_type( coord , grad ) );
+      execution_space::fence();
+      const double dt = timer.seconds();
+      if ( 0 == i ) dt_min = dt ;
+      else dt_min = dt < dt_min ? dt : dt_min ;
+    }
+
+    return dt_min ;
+  }
+};
+
+}
+
diff --git a/lib/kokkos/core/perf_test/PerfTestHost.cpp b/lib/kokkos/core/perf_test/PerfTestHost.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..6a0f2efadacd01e979d3beefd23b617b81acff48
--- /dev/null
+++ b/lib/kokkos/core/perf_test/PerfTestHost.cpp
@@ -0,0 +1,104 @@
+/*
+//@HEADER
+// ************************************************************************
+// 
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+// 
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+// 
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact  H. Carter Edwards (hcedwar@sandia.gov)
+// 
+// ************************************************************************
+//@HEADER
+*/
+
+#include <gtest/gtest.h>
+
+#include <Kokkos_Core.hpp>
+
+#if defined( KOKKOS_HAVE_OPENMP )
+// Host execution space under test: the first enabled of OpenMP, Threads (pthread), Serial.
+typedef Kokkos::OpenMP TestHostDevice ;
+const char TestHostDeviceName[] = "Kokkos::OpenMP" ;
+
+#elif defined( KOKKOS_HAVE_PTHREAD )
+
+typedef Kokkos::Threads TestHostDevice ;
+const char TestHostDeviceName[] = "Kokkos::Threads" ;
+
+#elif defined( KOKKOS_HAVE_SERIAL )
+
+typedef Kokkos::Serial TestHostDevice ;
+const char TestHostDeviceName[] = "Kokkos::Serial" ;
+
+#else
+#  error "You must enable at least one of the following execution spaces in order to build this test: Kokkos::Threads, Kokkos::OpenMP, or Kokkos::Serial."
+#endif
+
+#include <impl/Kokkos_Timer.hpp>
+
+#include <PerfTestHexGrad.hpp>
+#include <PerfTestBlasKernels.hpp>
+#include <PerfTestGramSchmidt.hpp>
+#include <PerfTestDriver.hpp>
+
+//------------------------------------------------------------------------
+
+namespace Test {
+// Fixture: starts the host execution space once per test case and finalizes it afterwards.
+class host : public ::testing::Test {
+protected:
+  static void SetUpTestCase()
+  {
+    // Four threads for each NUMA region counted by Kokkos::hwloc.
+    const unsigned threads_per_numa = 4 ;
+    const unsigned numa_count = Kokkos::hwloc::get_available_numa_count();
+    TestHostDevice::initialize( numa_count * threads_per_numa );
+  }
+
+  static void TearDownTestCase() { TestHostDevice::finalize(); }
+};
+
+// The ( 10 , 20 ) arguments go unchanged to the drivers (presumably from PerfTestDriver.hpp).
+TEST_F( host , hexgrad )
+{
+  EXPECT_NO_THROW( run_test_hexgrad< TestHostDevice >( 10 , 20 , TestHostDeviceName ) );
+}
+
+TEST_F( host , gramschmidt )
+{
+  EXPECT_NO_THROW( run_test_gramschmidt< TestHostDevice >( 10 , 20 , TestHostDeviceName ) );
+}
+
+} // namespace Test
+
+
diff --git a/lib/kokkos/core/perf_test/PerfTestMain.cpp b/lib/kokkos/core/perf_test/PerfTestMain.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..ac916308292076fc27231968715518b3f5c02f80
--- /dev/null
+++ b/lib/kokkos/core/perf_test/PerfTestMain.cpp
@@ -0,0 +1,49 @@
+/*
+//@HEADER
+// ************************************************************************
+// 
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+// 
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+// 
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact  H. Carter Edwards (hcedwar@sandia.gov)
+// 
+// ************************************************************************
+//@HEADER
+*/
+
+#include <gtest/gtest.h>
+
+int main(int argc, char *argv[]) {  // entry point of the perf-test executable
+  ::testing::InitGoogleTest(&argc,argv);  // hand the command line to Google Test first
+  return RUN_ALL_TESTS();  // the macro's result becomes the process exit status
+}
diff --git a/lib/kokkos/core/perf_test/test_atomic.cpp b/lib/kokkos/core/perf_test/test_atomic.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..ab73f2505e28df6bda1c8f4a43d66fc20093bf2a
--- /dev/null
+++ b/lib/kokkos/core/perf_test/test_atomic.cpp
@@ -0,0 +1,507 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+//
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact  H. Carter Edwards (hcedwar@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#include <cstdio>
+#include <cstring>
+#include <cstdlib>
+
+#include <Kokkos_Core.hpp>
+#include <impl/Kokkos_Timer.hpp>
+
+typedef Kokkos::DefaultExecutionSpace exec_space;
+
+#define RESET		0	// attribute codes: passed as the first argument of textcolor()
+#define BRIGHT 		1
+#define DIM		2
+#define UNDERLINE 	3
+#define BLINK		4
+#define REVERSE		7
+#define HIDDEN		8
+// colour codes: textcolor() adds 30 for its fg argument and 40 for its bg argument
+#define BLACK 		0
+#define RED		1
+#define GREEN		2
+#define YELLOW		3
+#define BLUE		4
+#define MAGENTA		5
+#define CYAN		6
+#define GREY		7
+#define	WHITE		8
+
+// Writes the terminal control sequence ESC[<attr>;<fg+30>;<bg+40>m to stdout.
+void textcolor(int attr, int fg, int bg)
+{
+	// Format straight to stdout: staging the text in the former 13-byte
+	// buffer overflowed once the three numbers needed more than 7 characters.
+	printf("%c[%d;%d;%dm", 0x1B, attr, fg + 30, bg + 40);
+}
+void textcolor_standard() {textcolor(RESET, BLACK, WHITE);} // RESET attribute, BLACK as fg, WHITE as bg
+
+
+// Functor that stores zero into a scalar view; also names the view types the loops use.
+template<class T,class DEVICE_TYPE>
+struct ZeroFunctor{
+  typedef DEVICE_TYPE execution_space;
+  typedef Kokkos::View<T,execution_space> type;      // scalar view that gets cleared
+  typedef typename type::HostMirror       h_type;    // matching host mirror type
+  type data;
+
+  KOKKOS_INLINE_FUNCTION
+  void operator()(int) const { data() = 0; }  // the work index is not used
+};
+
+//---------------------------------------------------
+//--------------atomic_fetch_add---------------------
+//---------------------------------------------------
+
+// Functor whose every invocation adds one to the scalar view via atomic_fetch_add.
+template<class T,class DEVICE_TYPE>
+struct AddFunctor{
+  typedef DEVICE_TYPE execution_space;
+  typedef Kokkos::View<T,execution_space> type;
+  type data;
+  KOKKOS_INLINE_FUNCTION
+  void operator()(int) const {
+    Kokkos::atomic_fetch_add(&data(),static_cast<T>(1));
+  }
+};
+
+// Zeroes a scalar, runs AddFunctor over `loop` work items and returns the final value.
+template<class T>
+T AddLoop(int loop) {
+  typedef ZeroFunctor<T,exec_space> zero_functor;
+  typename zero_functor::type   counter("Data");
+  typename zero_functor::h_type host_counter("HData");
+  zero_functor zero_op;                 // clears the counter with a one-item kernel
+  zero_op.data = counter;
+  Kokkos::parallel_for(1,zero_op);
+  exec_space::fence();
+
+  AddFunctor<T,exec_space> add_op;      // one atomic increment per work item
+  add_op.data = counter;
+  Kokkos::parallel_for(loop,add_op);
+  exec_space::fence();
+  Kokkos::deep_copy(host_counter,counter);
+  return host_counter();
+}
+
+// Non-atomic counterpart of AddFunctor: a plain `+=` on the scalar view.
+template<class T,class DEVICE_TYPE>
+struct AddNonAtomicFunctor{
+  typedef DEVICE_TYPE execution_space;
+  typedef Kokkos::View<T,execution_space> type;
+  type data;
+  KOKKOS_INLINE_FUNCTION
+  void operator()(int) const {
+    data() += static_cast<T>(1);
+  }
+};
+
+// Same sequence as AddLoop, but the increments come from AddNonAtomicFunctor.
+template<class T>
+T AddLoopNonAtomic(int loop) {
+  typedef ZeroFunctor<T,exec_space> zero_functor;
+  typename zero_functor::type   counter("Data");
+  typename zero_functor::h_type host_counter("HData");
+
+  zero_functor zero_op;                       // clears the counter first
+  zero_op.data = counter;
+  Kokkos::parallel_for(1,zero_op);
+  exec_space::fence();
+
+  AddNonAtomicFunctor<T,exec_space> add_op;   // unprotected increment per work item
+  add_op.data = counter;
+  Kokkos::parallel_for(loop,add_op);
+  exec_space::fence();
+
+  Kokkos::deep_copy(host_counter,counter);
+  return host_counter();
+}
+
+// Host-only reference: increments a heap-allocated scalar `loop` times and returns it.
+template<class T>
+T AddLoopSerial(int loop) {
+  T* const cell = new T[1];
+  cell[0] = 0;
+  for(int remaining=loop; remaining>0; --remaining) {
+    cell[0] += static_cast<T>(1);
+  }
+  const T total = cell[0];
+  delete [] cell;
+  return total;
+}
+
+// Functor that increments the scalar view with a compare-exchange retry loop.
+template<class T,class DEVICE_TYPE>
+struct CASFunctor{
+  typedef DEVICE_TYPE execution_space;
+  typedef Kokkos::View<T,execution_space> type;
+  type data;
+  KOKKOS_INLINE_FUNCTION
+  void operator()(int) const {
+    T observed = data();
+    for(;;) {
+      const T expected = observed;
+      const T desired  = expected + (T)1;
+      // Retry until the value handed back equals the one the update was based on.
+      observed = Kokkos::atomic_compare_exchange(&data(), expected, desired);
+      if( !( observed != expected ) ) break;
+    }
+  }
+};
+
+// Zeroes a scalar, runs CASFunctor over `loop` work items and returns the final value.
+template<class T>
+T CASLoop(int loop) {
+  typedef ZeroFunctor<T,exec_space> zero_functor;
+  typename zero_functor::type   counter("Data");
+  typename zero_functor::h_type host_counter("HData");
+  zero_functor zero_op;                 // clears the counter with a one-item kernel
+  zero_op.data = counter;
+  Kokkos::parallel_for(1,zero_op);
+  exec_space::fence();
+
+  CASFunctor<T,exec_space> cas_op;      // compare-exchange increment per work item
+  cas_op.data = counter;
+  Kokkos::parallel_for(loop,cas_op);
+  exec_space::fence();
+
+  Kokkos::deep_copy(host_counter,counter);
+  return host_counter();
+}
+
+template<class T,class DEVICE_TYPE>  // non-atomic imitation of CASFunctor: no atomic call is made
+struct CASNonAtomicFunctor{
+  typedef DEVICE_TYPE execution_space;
+  typedef Kokkos::View<T,execution_space> type;
+  type data;
+  // Increments the scalar with a plain check-then-store sequence, repeated until a store happens.
+  KOKKOS_INLINE_FUNCTION
+  void operator()(int i) const {
+	  volatile T assumed;
+	  volatile T newval;
+	  bool fail=1;
+	  do {
+	    assumed = data();
+	    newval = assumed + (T)1;
+	    if(data()==assumed) {  // NOTE(review): nothing guards the gap between this re-read and the store
+	    	data() = newval;
+	    	fail = 0;
+	    }
+	  }
+	  while(fail);
+  }
+};
+
+// Same sequence as CASLoop, but the increments come from CASNonAtomicFunctor.
+template<class T>
+T CASLoopNonAtomic(int loop) {
+  typedef ZeroFunctor<T,exec_space> zero_functor;
+  typename zero_functor::type   counter("Data");
+  typename zero_functor::h_type host_counter("HData");
+  zero_functor zero_op;                       // clears the counter first
+  zero_op.data = counter;
+  Kokkos::parallel_for(1,zero_op);
+  exec_space::fence();
+
+  CASNonAtomicFunctor<T,exec_space> cas_op;   // unprotected check-then-store per work item
+  cas_op.data = counter;
+  Kokkos::parallel_for(loop,cas_op);
+  exec_space::fence();
+
+  Kokkos::deep_copy(host_counter,counter);
+  return host_counter();
+}
+
+// Host-only reference for the CAS test: performs the read / compare / write
+// sequence `loop` times on a heap-allocated scalar and returns its final value.
+template<class T>
+T CASLoopSerial(int loop) {
+  T* const cell = new T[1];
+  cell[0] = 0;
+
+  for(int pass=0; pass<loop; ++pass) {
+    bool swapped = false;
+    while(!swapped) {
+      const T expected = cell[0];
+      const T desired  = expected + static_cast<T>(1);
+      const T seen     = cell[0];
+      cell[0] = desired;
+      swapped = (expected == seen);
+    }
+  }
+
+  const T total = cell[0];
+  delete [] cell;
+  return total;
+}
+
+// Functor: atomically swaps index i into `data`, then adds the displaced value to `data2`.
+template<class T,class DEVICE_TYPE>
+struct ExchFunctor{
+  typedef DEVICE_TYPE execution_space;
+  typedef Kokkos::View<T,execution_space> type;
+  type data, data2;
+  KOKKOS_INLINE_FUNCTION
+  void operator()(int i) const {
+    const T displaced = Kokkos::atomic_exchange(&data(),static_cast<T>(i));
+    Kokkos::atomic_fetch_add(&data2(),displaced);
+  }
+};
+
+// Zeroes two scalars, runs ExchFunctor over `loop` work items and returns their sum.
+template<class T>
+T ExchLoop(int loop) {
+  typedef ZeroFunctor<T,exec_space> zero_functor;
+  typename zero_functor::type   latest("Data");
+  typename zero_functor::h_type host_latest("HData");
+  typename zero_functor::type   displaced_sum("Data");
+  typename zero_functor::h_type host_displaced_sum("HData");
+
+  zero_functor zero_op;                 // reused to clear both scalars in turn
+  zero_op.data = latest;
+  Kokkos::parallel_for(1,zero_op);
+  exec_space::fence();
+  zero_op.data = displaced_sum;
+  Kokkos::parallel_for(1,zero_op);
+  exec_space::fence();
+
+  ExchFunctor<T,exec_space> exch_op;    // atomic exchange plus atomic accumulate
+  exch_op.data  = latest;
+  exch_op.data2 = displaced_sum;
+  Kokkos::parallel_for(loop,exch_op);
+  exec_space::fence();
+
+  Kokkos::deep_copy(host_latest,latest);
+  Kokkos::deep_copy(host_displaced_sum,displaced_sum);
+  return host_latest() + host_displaced_sum();
+}
+
+// Non-atomic counterpart of ExchFunctor: the same swap and accumulate done with
+// an ordinary load, store and `+=`.
+template<class T,class DEVICE_TYPE>
+struct ExchNonAtomicFunctor{
+  typedef DEVICE_TYPE execution_space;
+  typedef Kokkos::View<T,execution_space> type;
+  type data, data2;
+  KOKKOS_INLINE_FUNCTION
+  void operator()(int i) const {
+    const T displaced = data();
+    data() = static_cast<T>(i);
+    data2() += displaced;
+  }
+};
+
+
+// Same sequence as ExchLoop, but the work is done by ExchNonAtomicFunctor.
+template<class T>
+T ExchLoopNonAtomic(int loop) {
+  typedef ZeroFunctor<T,exec_space> zero_functor;
+  typename zero_functor::type   latest("Data");
+  typename zero_functor::h_type host_latest("HData");
+  typename zero_functor::type   displaced_sum("Data");
+  typename zero_functor::h_type host_displaced_sum("HData");
+
+  zero_functor zero_op;                         // reused to clear both scalars in turn
+  zero_op.data = latest;
+  Kokkos::parallel_for(1,zero_op);
+  exec_space::fence();
+  zero_op.data = displaced_sum;
+  Kokkos::parallel_for(1,zero_op);
+  exec_space::fence();
+
+  ExchNonAtomicFunctor<T,exec_space> exch_op;   // unprotected swap and accumulate
+  exch_op.data  = latest;
+  exch_op.data2 = displaced_sum;
+  Kokkos::parallel_for(loop,exch_op);
+  exec_space::fence();
+
+  Kokkos::deep_copy(host_latest,latest);
+  Kokkos::deep_copy(host_displaced_sum,displaced_sum);
+  return host_latest() + host_displaced_sum();
+}
+
+// Host-only reference for the exchange test: returns the last stored index
+// plus the sum of every value that was displaced along the way.
+template<class T>
+T ExchLoopSerial(int loop) {
+  T* const current   = new T[1];
+  T* const displaced = new T[1];
+  current[0]   = 0;
+  displaced[0] = 0;
+  for(int i=0;i<loop;++i) {
+    displaced[0] += current[0];
+    current[0] = static_cast<T>(i);
+  }
+  const T total = displaced[0] + current[0];
+  delete [] current;
+  delete [] displaced;
+  return total;
+}
+
+// Runs the atomic kernel chosen by `test`: 1 = add, 2 = compare-exchange, 3 = exchange.
+template<class T>
+T LoopVariant(int loop, int test) {
+  if (test == 1) return AddLoop<T>(loop);
+  if (test == 2) return CASLoop<T>(loop);
+  if (test == 3) return ExchLoop<T>(loop);
+  // Any other selector runs nothing and reports zero.
+  return 0;
+}
+
+// Runs the host-only reference chosen by `test`: 1 = add, 2 = compare-exchange, 3 = exchange.
+template<class T>
+T LoopVariantSerial(int loop, int test) {
+  if (test == 1) return AddLoopSerial<T>(loop);
+  if (test == 2) return CASLoopSerial<T>(loop);
+  if (test == 3) return ExchLoopSerial<T>(loop);
+  // Any other selector runs nothing and reports zero.
+  return 0;
+}
+
+// Runs the non-atomic kernel chosen by `test`: 1 = add, 2 = compare-exchange, 3 = exchange.
+template<class T>
+T LoopVariantNonAtomic(int loop, int test) {
+  if (test == 1) return AddLoopNonAtomic<T>(loop);
+  if (test == 2) return CASLoopNonAtomic<T>(loop);
+  if (test == 3) return ExchLoopNonAtomic<T>(loop);
+  // Any other selector runs nothing and reports zero.
+  return 0;
+}
+
+// Times one test variant three ways (atomic, non-atomic, serial) and prints a
+// one-line report; the run counts as PASSED when atomic and serial results agree.
+template<class T>
+void Loop(int loop, int test, const char* type_name) {
+  // Untimed first run of the atomic variant.
+  LoopVariant<T>(loop,test);
+
+  Kokkos::Impl::Timer stopwatch;
+  const T atomic_result = LoopVariant<T>(loop,test);
+  double atomic_time = stopwatch.seconds();
+
+  stopwatch.reset();
+  const T nonatomic_result = LoopVariantNonAtomic<T>(loop,test);
+  double nonatomic_time = stopwatch.seconds();
+
+  stopwatch.reset();
+  const T serial_result = LoopVariantSerial<T>(loop,test);
+  double serial_time = stopwatch.seconds();
+  // Scale each total from seconds to microseconds per iteration.
+  const double usec_per_iteration = 1e6/loop;
+  atomic_time    *= usec_per_iteration;
+  nonatomic_time *= usec_per_iteration;
+  serial_time    *= usec_per_iteration;
+  const bool passed = !(serial_result!=atomic_result);
+  printf("%s Test %i %s  --- Loop: %i Value (S,A,NA): %e %e %e Time: %7.4e %7.4e %7.4e Size of Type %i)",
+         type_name,test,passed?"PASSED":"FAILED",loop,
+         1.0*serial_result,1.0*atomic_result,1.0*nonatomic_result,
+         serial_time,atomic_time,nonatomic_time,(int)sizeof(T));
+  printf("\n");
+}
+
+
+// Runs the selected test for type T; the selector -1 means "run tests 1, 2 and 3".
+template<class T>
+void Test(int loop, int test, const char* type_name) {
+  if(test!=-1) {
+    Loop<T>(loop,test,type_name);
+    return;
+  }
+  for(int variant=1; variant<=3; ++variant) {
+    Loop<T>(loop,variant,type_name);
+  }
+}
+
+// Command line: [--test N] [--type N] [-l|--loop N].  --test and --type default to -1
+// (run every test / every type); --loop defaults to 100000 iterations per measurement.
+int main(int argc, char* argv[])
+{
+  int type = -1;
+  int loop = 100000;
+  int test = -1;
+
+  // Each option consumes the next argument as its value, so only look at argv[i]
+  // while argv[i+1] exists; a trailing option no longer passes argv[argc] to atoi().
+  for(int i=1;i+1<argc;i++)
+  {
+     if((strcmp(argv[i],"--test")==0)) {test=atoi(argv[++i]); continue;}
+     if((strcmp(argv[i],"--type")==0)) {type=atoi(argv[++i]); continue;}
+     if((strcmp(argv[i],"-l")==0)||(strcmp(argv[i],"--loop")==0)) {loop=atoi(argv[++i]); continue;}
+  }
+
+  Kokkos::initialize(argc,argv);
+  printf("Using %s\n",Kokkos::atomic_query_version());
+  const bool all_tests = (type==-1);  // -1 walks through every selector; otherwise one pass
+  while(type<100) {
+    if(type==1) {
+     Test<int>(loop,test,"int                    ");
+    }
+    if(type==2) {
+     Test<long int>(loop,test,"long int               ");
+    }
+    if(type==3) {
+     Test<long long int>(loop,test,"long long int          ");
+    }
+    if(type==4) {
+     Test<unsigned int>(loop,test,"unsigned int           ");
+    }
+    if(type==5) {
+     Test<unsigned long int>(loop,test,"unsigned long int      ");
+    }
+    if(type==6) {
+     Test<unsigned long long int>(loop,test,"unsigned long long int ");
+    }
+    if(type==10) {
+     //Test<float>(loop,test,"float                  ");
+    }
+    if(type==11) {
+     Test<double>(loop,test,"double                 ");
+    }
+    if(!all_tests) type=100;
+    else type++;
+  }
+
+  Kokkos::finalize();
+  return 0;
+}
+
diff --git a/lib/kokkos/core/src/CMakeLists.txt b/lib/kokkos/core/src/CMakeLists.txt
new file mode 100644
index 0000000000000000000000000000000000000000..807a01ed01b128c531b87df0c27e1d406525b603
--- /dev/null
+++ b/lib/kokkos/core/src/CMakeLists.txt
@@ -0,0 +1,113 @@
+
+TRIBITS_ADD_OPTION_AND_DEFINE(
+  Kokkos_ENABLE_Serial
+  KOKKOS_HAVE_SERIAL
+  "Whether to enable the Kokkos::Serial device.  This device executes \"parallel\" kernels sequentially on a single CPU thread.  It is enabled by default.  If you disable this device, please enable at least one other CPU device, such as Kokkos::OpenMP or Kokkos::Threads."
+  ON
+  )
+
+# The default computed below reads both of these switches, so they are
+# asserted to be defined before they are used.
+ASSERT_DEFINED(${PROJECT_NAME}_ENABLE_CXX11)
+ASSERT_DEFINED(${PACKAGE_NAME}_ENABLE_CUDA)
+
+# Kokkos_ENABLE_CXX11_DISPATCH_LAMBDA governs whether Kokkos allows
+# use of lambdas at the outer level of parallel dispatch (that is, as
+# the argument to an outer parallel_for, parallel_reduce, or
+# parallel_scan).  This works with non-CUDA execution spaces if C++11
+# is enabled.  It does not currently work with public releases of
+# CUDA.  If that changes, please change the default here to ON if CUDA
+# and C++11 are ON.
+#
+# Resulting default: ON only when C++11 is enabled and CUDA is not.
+IF (${PROJECT_NAME}_ENABLE_CXX11 AND NOT ${PACKAGE_NAME}_ENABLE_CUDA)
+  SET(Kokkos_ENABLE_CXX11_DISPATCH_LAMBDA_DEFAULT ON)
+ELSE ()
+  SET(Kokkos_ENABLE_CXX11_DISPATCH_LAMBDA_DEFAULT OFF)
+ENDIF ()
+
+TRIBITS_ADD_OPTION_AND_DEFINE(
+  Kokkos_ENABLE_CXX11_DISPATCH_LAMBDA
+  KOKKOS_HAVE_CXX11_DISPATCH_LAMBDA
+  "Whether Kokkos allows use of lambdas at the outer level of parallel dispatch (that is, as the argument to an outer parallel_for, parallel_reduce, or parallel_scan).  This requires C++11.  It also does not currently work with public releases of CUDA.  As a result, even if C++11 is enabled, this will be OFF by default if CUDA is enabled.  If this option is ON, the macro KOKKOS_HAVE_CXX11_DISPATCH_LAMBDA will be defined.  For compatibility with Kokkos' Makefile build system, it is also possible to define that macro on the command line."
+  ${Kokkos_ENABLE_CXX11_DISPATCH_LAMBDA_DEFAULT}
+  )
+
+TRIBITS_CONFIGURE_FILE(${PACKAGE_NAME}_config.h)
+# The config header is picked up from the binary dir below, so both dirs are on the include path.
+INCLUDE_DIRECTORIES(${CMAKE_CURRENT_BINARY_DIR})
+INCLUDE_DIRECTORIES(${CMAKE_CURRENT_SOURCE_DIR})
+
+#-----------------------------------------------------------------------------
+# Absolute install location used for the headers of the backend sub-directories.
+SET(TRILINOS_INCDIR ${CMAKE_INSTALL_PREFIX}/${${PROJECT_NAME}_INSTALL_INCLUDE_DIR})
+
+#-----------------------------------------------------------------------------
+# Lists accumulated by the sections below and handed to TRIBITS_ADD_LIBRARY at the end.
+SET(HEADERS_PUBLIC "")
+SET(HEADERS_PRIVATE "")
+SET(SOURCES "")
+# Public headers: every Kokkos*.hpp in this directory plus the config header.
+FILE(GLOB HEADERS_PUBLIC Kokkos*.hpp)
+LIST( APPEND HEADERS_PUBLIC ${CMAKE_CURRENT_BINARY_DIR}/${PACKAGE_NAME}_config.h )
+
+#-----------------------------------------------------------------------------
+# Each sub-directory section follows one pattern: glob, append to the lists, install headers.
+FILE(GLOB HEADERS_IMPL impl/*.hpp)
+FILE(GLOB SOURCES_IMPL impl/*.cpp)
+
+LIST(APPEND HEADERS_PRIVATE ${HEADERS_IMPL} )
+LIST(APPEND SOURCES         ${SOURCES_IMPL} )
+
+INSTALL(FILES ${HEADERS_IMPL} DESTINATION ${TRILINOS_INCDIR}/impl/)
+
+#-----------------------------------------------------------------------------
+# NOTE(review): every backend is globbed unconditionally - confirm the sources guard themselves.
+FILE(GLOB HEADERS_THREADS Threads/*.hpp)
+FILE(GLOB SOURCES_THREADS Threads/*.cpp)
+
+LIST(APPEND HEADERS_PRIVATE ${HEADERS_THREADS} )
+LIST(APPEND SOURCES         ${SOURCES_THREADS} )
+
+INSTALL(FILES ${HEADERS_THREADS} DESTINATION ${TRILINOS_INCDIR}/Threads/)
+
+#-----------------------------------------------------------------------------
+
+FILE(GLOB HEADERS_OPENMP OpenMP/*.hpp)
+FILE(GLOB SOURCES_OPENMP OpenMP/*.cpp)
+
+LIST(APPEND HEADERS_PRIVATE ${HEADERS_OPENMP} )
+LIST(APPEND SOURCES         ${SOURCES_OPENMP} )
+
+INSTALL(FILES ${HEADERS_OPENMP} DESTINATION ${TRILINOS_INCDIR}/OpenMP/)
+
+#-----------------------------------------------------------------------------
+
+FILE(GLOB HEADERS_CUDA Cuda/*.hpp)
+FILE(GLOB SOURCES_CUDA Cuda/*.cpp)
+
+LIST(APPEND HEADERS_PRIVATE ${HEADERS_CUDA} )
+LIST(APPEND SOURCES         ${SOURCES_CUDA} )
+
+INSTALL(FILES ${HEADERS_CUDA} DESTINATION ${TRILINOS_INCDIR}/Cuda/)
+
+#-----------------------------------------------------------------------------
+FILE(GLOB HEADERS_QTHREAD Qthread/*.hpp)
+FILE(GLOB SOURCES_QTHREAD Qthread/*.cpp)
+
+LIST(APPEND HEADERS_PRIVATE ${HEADERS_QTHREAD} )
+LIST(APPEND SOURCES         ${SOURCES_QTHREAD} )
+
+INSTALL(FILES ${HEADERS_QTHREAD} DESTINATION ${TRILINOS_INCDIR}/Qthread/)
+
+#-----------------------------------------------------------------------------
+# Private headers go in as NOINSTALLHEADERS; presumably because INSTALL(FILES ...) above covers them.
+TRIBITS_ADD_LIBRARY(
+    kokkoscore
+    HEADERS ${HEADERS_PUBLIC}
+    NOINSTALLHEADERS ${HEADERS_PRIVATE}
+    SOURCES ${SOURCES}
+    DEPLIBS
+    )
+
+
diff --git a/lib/kokkos/core/src/Cuda/KokkosExp_Cuda_View.hpp b/lib/kokkos/core/src/Cuda/KokkosExp_Cuda_View.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..4ed7d8e2a8a40ef6434637f3e0ae72266e4c76bb
--- /dev/null
+++ b/lib/kokkos/core/src/Cuda/KokkosExp_Cuda_View.hpp
@@ -0,0 +1,334 @@
+/*
+//@HEADER
+// ************************************************************************
+// 
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+// 
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+// 
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact  H. Carter Edwards (hcedwar@sandia.gov)
+// 
+// ************************************************************************
+//@HEADER
+*/
+
+#ifndef KOKKOS_EXPERIMENTAL_CUDA_VIEW_HPP
+#define KOKKOS_EXPERIMENTAL_CUDA_VIEW_HPP
+
+/* only compile this file if CUDA is enabled for Kokkos */
+#if defined( KOKKOS_HAVE_CUDA )
+
+//----------------------------------------------------------------------------
+//----------------------------------------------------------------------------
+
+namespace Kokkos {
+namespace Experimental {
+namespace Impl {
+
+template<>
+struct ViewOperatorBoundsErrorAbort< Kokkos::CudaSpace > {
+  KOKKOS_INLINE_FUNCTION
+  static void apply( const size_t rank
+                   , const size_t n0 , const size_t n1
+                   , const size_t n2 , const size_t n3
+                   , const size_t n4 , const size_t n5
+                   , const size_t n6 , const size_t n7
+                   , const size_t i0 , const size_t i1
+                   , const size_t i2 , const size_t i3
+                   , const size_t i4 , const size_t i5
+                   , const size_t i6 , const size_t i7 )
+    {
+      const int r =
+        ( n0 <= i0 ? 0 :
+        ( n1 <= i1 ? 1 :
+        ( n2 <= i2 ? 2 :
+        ( n3 <= i3 ? 3 :
+        ( n4 <= i4 ? 4 :
+        ( n5 <= i5 ? 5 :
+        ( n6 <= i6 ? 6 : 7 )))))));
+      const size_t n =
+        ( n0 <= i0 ? n0 :
+        ( n1 <= i1 ? n1 :
+        ( n2 <= i2 ? n2 :
+        ( n3 <= i3 ? n3 :
+        ( n4 <= i4 ? n4 :
+        ( n5 <= i5 ? n5 :
+        ( n6 <= i6 ? n6 : n7 )))))));
+      const size_t i =
+        ( n0 <= i0 ? i0 :
+        ( n1 <= i1 ? i1 :
+        ( n2 <= i2 ? i2 :
+        ( n3 <= i3 ? i3 :
+        ( n4 <= i4 ? i4 :
+        ( n5 <= i5 ? i5 :
+        ( n6 <= i6 ? i6 : i7 )))))));
+      printf("Cuda view array bounds error index %d : FAILED %lu < %lu\n" , r , i , n );
+      Kokkos::Impl::cuda_abort("Cuda view array bounds error");
+    }
+};
+
+} // namespace Impl
+} // namespace Experimental
+} // namespace Kokkos
+
+//----------------------------------------------------------------------------
+//----------------------------------------------------------------------------
+
+namespace Kokkos {
+namespace Experimental {
+namespace Impl {
+
+// Cuda Texture fetches can be performed for 4, 8 and 16 byte objects (int,int2,int4)
+// Via reinterpret_cast this can be used to support all scalar types of those sizes.
+// Any other scalar type falls back to either normal reads out of global memory,
+// or using the __ldg intrinsic on Kepler GPUs or newer (Compute Capability >= 3.0)
+
+/** \brief  Read-only element access that goes through a Cuda texture object.
+ *
+ *  Holds a texture object, the raw pointer, and the offset that is added to
+ *  an index when fetching through the texture.  'AliasType' is the type that
+ *  is fetched through the texture; its bytes are then reinterpreted as
+ *  'ValueType'.
+ */
+template< typename ValueType , typename AliasType >
+struct CudaTextureFetch {
+
+  // m_obj    : texture object used by operator[] in the texture path
+  // m_ptr    : raw pointer, used by the non-texture path and the pointer conversion
+  // m_offset : added to the index for every texture fetch
+  ::cudaTextureObject_t   m_obj ;
+  const ValueType       * m_ptr ;
+  int                     m_offset ;
+
+  // Dereference operator pulls through texture object and returns by value.
+  // The texture path is compiled only for device code with
+  // __CUDA_ARCH__ >= 300; every other compilation reads m_ptr[i] directly.
+  template< typename iType >
+  KOKKOS_INLINE_FUNCTION
+  ValueType operator[]( const iType & i ) const
+    {
+#if defined( __CUDA_ARCH__ ) && ( 300 <= __CUDA_ARCH__ )
+      AliasType v = tex1Dfetch<AliasType>( m_obj , i + m_offset );
+      return  *(reinterpret_cast<ValueType*> (&v));
+#else
+      return m_ptr[ i ];
+#endif
+    }
+
+  // Pointer to referenced memory
+  KOKKOS_INLINE_FUNCTION
+  operator const ValueType * () const { return m_ptr ; }
+
+
+  // Default construction value-initializes all three members.
+  KOKKOS_INLINE_FUNCTION
+  CudaTextureFetch() : m_obj() , m_ptr() , m_offset() {}
+
+  KOKKOS_INLINE_FUNCTION
+  ~CudaTextureFetch() {}
+
+  // Copy and move operations all perform a member-wise copy; the move
+  // variants leave 'rhs' unchanged.
+  KOKKOS_INLINE_FUNCTION
+  CudaTextureFetch( const CudaTextureFetch & rhs )
+    : m_obj(     rhs.m_obj )
+    , m_ptr(     rhs.m_ptr )
+    , m_offset(  rhs.m_offset )
+    {}
+
+  KOKKOS_INLINE_FUNCTION
+  CudaTextureFetch( CudaTextureFetch && rhs )
+    : m_obj(     rhs.m_obj )
+    , m_ptr(     rhs.m_ptr )
+    , m_offset(  rhs.m_offset )
+    {}
+
+  KOKKOS_INLINE_FUNCTION
+  CudaTextureFetch & operator = ( const CudaTextureFetch & rhs )
+    {
+      m_obj     = rhs.m_obj ;
+      m_ptr     = rhs.m_ptr ;
+      m_offset  = rhs.m_offset ;
+      return *this ;
+    }
+
+  KOKKOS_INLINE_FUNCTION
+  CudaTextureFetch & operator = ( CudaTextureFetch && rhs )
+    {
+      m_obj     = rhs.m_obj ;
+      m_ptr     = rhs.m_ptr ;
+      m_offset  = rhs.m_offset ;
+      return *this ;
+    }
+
+  // Texture object spans the entire allocation.
+  // This handle may view a subset of the allocation, so an offset is required.
+  // Both the texture object and the offset are obtained from 'record'.
+  // Not marked KOKKOS_INLINE_FUNCTION, unlike the other members.
+  template< class CudaMemorySpace >
+  inline explicit
+  CudaTextureFetch( const ValueType * const arg_ptr
+                  , Kokkos::Experimental::Impl::SharedAllocationRecord< CudaMemorySpace , void > & record
+                  )
+    : m_obj( record.template attach_texture_object< AliasType >() )
+    , m_ptr( arg_ptr )
+    , m_offset( record.attach_texture_object_offset( reinterpret_cast<const AliasType*>( arg_ptr ) ) )
+    {}
+};
+
+#if defined( KOKKOS_CUDA_USE_LDG_INTRINSIC )
+
+template< typename ValueType , typename AliasType >
+struct CudaLDGFetch {
+
+  const ValueType * m_ptr ;
+
+  template< typename iType >
+  KOKKOS_INLINE_FUNCTION
+  ValueType operator[]( const iType & i ) const
+    {
+      AliasType v = __ldg(reinterpret_cast<AliasType*>(&m_ptr[i]));
+      return  *(reinterpret_cast<ValueType*> (&v));
+    }
+
+  KOKKOS_INLINE_FUNCTION
+  operator const ValueType * () const { return m_ptr ; }
+
+  KOKKOS_INLINE_FUNCTION
+  CudaLDGFetch() : m_ptr() {}
+
+  KOKKOS_INLINE_FUNCTION
+  ~CudaLDGFetch() {}
+
+  KOKKOS_INLINE_FUNCTION
+  CudaLDGFetch( const CudaLDGFetch & rhs )
+    : m_ptr( rhs.m_ptr )
+    {}
+
+  KOKKOS_INLINE_FUNCTION
+  CudaLDGFetch( CudaLDGFetch && rhs )
+    : m_ptr( rhs.m_ptr )
+    {}
+
+  KOKKOS_INLINE_FUNCTION
+  CudaLDGFetch & operator = ( const CudaLDGFetch & rhs )
+    {
+      m_ptr = rhs.m_ptr ;
+      return *this ;
+    }
+
+  KOKKOS_INLINE_FUNCTION
+  CudaLDGFetch & operator = ( CudaLDGFetch && rhs )
+    {
+      m_ptr = rhs.m_ptr ;
+      return *this ;
+    }
+
+  template< class CudaMemorySpace >
+  inline explicit
+  CudaTextureFetch( const ValueType * const arg_ptr
+                  , Kokkos::Experimental::Impl::SharedAllocationRecord< CudaMemorySpace , void > const &
+                  )
+    : m_ptr( arg_data_ptr )
+    {}
+};
+
+#endif
+
+} // namespace Impl
+} // namespace Experimental
+} // namespace Kokkos
+
+//----------------------------------------------------------------------------
+//----------------------------------------------------------------------------
+
+namespace Kokkos {
+namespace Experimental {
+namespace Impl {
+
+/** \brief  Replace Default ViewDataHandle with Cuda texture fetch specialization
+ *          if 'const' value type, CudaSpace and random access.
+ *
+ *  The specialization is selected only when every term of the enable_if
+ *  condition below holds: memory space is CudaSpace or CudaUVMSpace, the
+ *  value type is trivial and already const, its size is 4, 8 or 16 bytes,
+ *  and the RandomAccess memory trait is set.
+ */
+template< class Traits >
+class ViewDataHandle< Traits ,
+  typename std::enable_if<(
+    // Is Cuda memory space
+    ( std::is_same< typename Traits::memory_space,Kokkos::CudaSpace>::value ||
+      std::is_same< typename Traits::memory_space,Kokkos::CudaUVMSpace>::value )
+    &&
+    // Is a trivial const value of 4, 8, or 16 bytes
+    std::is_trivial<typename Traits::const_value_type>::value
+    &&
+    std::is_same<typename Traits::const_value_type,typename Traits::value_type>::value
+    &&
+    ( sizeof(typename Traits::const_value_type) ==  4 ||
+      sizeof(typename Traits::const_value_type) ==  8 ||
+      sizeof(typename Traits::const_value_type) == 16 )
+    &&
+    // Random access trait
+    ( Traits::memory_traits::RandomAccess != 0 )
+  )>::type >
+{
+public:
+
+  using track_type  = Kokkos::Experimental::Impl::SharedAllocationTracker ;
+
+  using value_type  = typename Traits::const_value_type ;
+  using return_type = typename Traits::const_value_type ; // NOT a reference
+
+  // Type of matching size used for the fetch: int (4 bytes), int2 (8 bytes)
+  // or int4 (16 bytes).  The 'void' fallback cannot be reached because the
+  // enable_if condition above already restricts the size to 4, 8 or 16.
+  using alias_type = typename std::conditional< ( sizeof(value_type) ==  4 ) , int ,
+                     typename std::conditional< ( sizeof(value_type) ==  8 ) , ::int2 ,
+                     typename std::conditional< ( sizeof(value_type) == 16 ) , ::int4 , void
+                     >::type
+                     >::type
+                     >::type ;
+
+  // The __ldg based handle is used when KOKKOS_CUDA_USE_LDG_INTRINSIC is
+  // defined, the texture object based handle otherwise.
+#if defined( KOKKOS_CUDA_USE_LDG_INTRINSIC )
+  using handle_type = Kokkos::Experimental::Impl::CudaLDGFetch< value_type , alias_type > ;
+#else
+  using handle_type = Kokkos::Experimental::Impl::CudaTextureFetch< value_type , alias_type > ;
+#endif
+
+  // Handle-to-handle assignment: the existing handle is returned unchanged
+  // and the tracker is not used.
+  KOKKOS_INLINE_FUNCTION
+  static handle_type const & assign( handle_type const & arg_handle , track_type const & /* arg_tracker */ )
+    {
+      return arg_handle ;
+    }
+
+  // Pointer-to-handle assignment: builds a new handle from the pointer and
+  // the allocation record obtained from the tracker.  When the active
+  // execution space is not the host this aborts instead and the trailing
+  // return only satisfies the signature.
+  KOKKOS_INLINE_FUNCTION
+  static handle_type assign( value_type * arg_data_ptr, track_type const & arg_tracker )
+    {
+#if defined( KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST )
+      // Assignment of texture = non-texture requires creation of a texture object
+      // which can only occur on the host.  In addition, 'get_record' is only valid
+      // if called in a host execution space
+      return handle_type( arg_data_ptr , arg_tracker.template get_record< typename Traits::memory_space >() );
+#else
+      Kokkos::Impl::cuda_abort("Cannot create Cuda texture object from within a Cuda kernel");
+      return handle_type();
+#endif
+    }
+};
+
+}
+}
+}
+
+//----------------------------------------------------------------------------
+//----------------------------------------------------------------------------
+
+#endif /* #if defined( KOKKOS_HAVE_CUDA ) */
+#endif /* #ifndef KOKKOS_EXPERIMENTAL_CUDA_VIEW_HPP */
+
diff --git a/lib/kokkos/core/src/Cuda/Kokkos_CudaExec.hpp b/lib/kokkos/core/src/Cuda/Kokkos_CudaExec.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..d1a560ee04648dc8d34b9ec82cb44abddc9ae6e8
--- /dev/null
+++ b/lib/kokkos/core/src/Cuda/Kokkos_CudaExec.hpp
@@ -0,0 +1,318 @@
+/*
+//@HEADER
+// ************************************************************************
+// 
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+// 
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+// 
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact  H. Carter Edwards (hcedwar@sandia.gov)
+// 
+// ************************************************************************
+//@HEADER
+*/
+
+#ifndef KOKKOS_CUDAEXEC_HPP
+#define KOKKOS_CUDAEXEC_HPP
+
+#include <Kokkos_Macros.hpp>
+
+/* only compile this file if CUDA is enabled for Kokkos */
+#ifdef KOKKOS_HAVE_CUDA
+
+#include <string>
+#include <Kokkos_Parallel.hpp>
+#include <impl/Kokkos_Error.hpp>
+#include <Cuda/Kokkos_Cuda_abort.hpp>
+#include <Cuda/Kokkos_Cuda_Error.hpp>
+
+//----------------------------------------------------------------------------
+//----------------------------------------------------------------------------
+
+namespace Kokkos {
+namespace Impl {
+
+/** \brief  Compile-time constants and small helpers describing the Cuda device
+ *          as assumed by the Kokkos Cuda backend.
+ */
+struct CudaTraits {
+  enum { WarpSize       = 32      /* 0x0020 */ };
+  enum { WarpIndexMask  = 0x001f  /* Mask for warpindex */ };
+  enum { WarpIndexShift = 5       /* WarpSize == 1 << WarpShift */ };
+
+  enum { SharedMemoryBanks    = 32      /* Compute device 2.0 */ };
+  enum { SharedMemoryCapacity = 0x0C000 /* 48k shared / 16k L1 Cache */ };
+  enum { SharedMemoryUsage    = 0x04000 /* 16k shared / 48k L1 Cache */ };
+
+  enum { UpperBoundGridCount    = 65535 /* Hard upper bound */ };
+  enum { ConstantMemoryCapacity = 0x010000 /* 64k bytes */ };
+  enum { ConstantMemoryUsage    = 0x008000 /* 32k bytes */ };
+  enum { ConstantMemoryCache    = 0x002000 /*  8k bytes */ };
+
+  // Array type occupying ConstantMemoryUsage bytes, expressed in units of
+  // unsigned long.
+  typedef unsigned long
+    ConstantGlobalBufferType[ ConstantMemoryUsage / sizeof(unsigned long) ];
+
+  // Functors larger than this are launched through constant memory
+  // (default of the 'Large' parameter of CudaParallelLaunch).
+  enum { ConstantMemoryUseThreshold = 0x000200 /* 512 bytes */ };
+
+  // Number of warps needed to hold 'i' threads: 'i' divided by WarpSize,
+  // rounded up.
+  KOKKOS_INLINE_FUNCTION static
+  CudaSpace::size_type warp_count( CudaSpace::size_type i )
+    { return ( i + WarpIndexMask ) >> WarpIndexShift ; }
+
+  // Round 'i' up to the next multiple of WarpSize.
+  KOKKOS_INLINE_FUNCTION static
+  CudaSpace::size_type warp_align( CudaSpace::size_type i )
+    {
+      enum { Mask = ~CudaSpace::size_type( WarpIndexMask ) };
+      return ( i + WarpIndexMask ) & Mask ;
+    }
+};
+
+//----------------------------------------------------------------------------
+
+CudaSpace::size_type cuda_internal_multiprocessor_count();
+CudaSpace::size_type cuda_internal_maximum_warp_count();
+CudaSpace::size_type cuda_internal_maximum_grid_count();
+CudaSpace::size_type cuda_internal_maximum_shared_words();
+
+CudaSpace::size_type * cuda_internal_scratch_flags( const CudaSpace::size_type size );
+CudaSpace::size_type * cuda_internal_scratch_space( const CudaSpace::size_type size );
+CudaSpace::size_type * cuda_internal_scratch_unified( const CudaSpace::size_type size );
+
+} // namespace Impl
+} // namespace Kokkos
+
+//----------------------------------------------------------------------------
+//----------------------------------------------------------------------------
+
+#if defined( __CUDACC__ )
+
+/** \brief  Access to constant memory on the device
+ *
+ *  Buffer into which CudaParallelLaunch< DriverType , true > copies a functor
+ *  and from which cuda_parallel_launch_constant_memory reads it.
+ *
+ *  With KOKKOS_CUDA_USE_RELOCATABLE_DEVICE_CODE the buffer is only declared
+ *  here ('extern'); otherwise it is defined here with a size of
+ *  CudaTraits::ConstantMemoryUsage bytes.
+ */
+#ifdef KOKKOS_CUDA_USE_RELOCATABLE_DEVICE_CODE
+
+__device__ __constant__
+extern unsigned long kokkos_impl_cuda_constant_memory_buffer[] ;
+
+#else
+
+__device__ __constant__
+unsigned long kokkos_impl_cuda_constant_memory_buffer[ Kokkos::Impl::CudaTraits::ConstantMemoryUsage / sizeof(unsigned long) ] ;
+
+#endif
+
+
+namespace Kokkos {
+namespace Impl {
+  // Bundle of the three lock array pointers that is copied to the device
+  // symbol kokkos_impl_cuda_lock_arrays before a kernel launch.
+  //   atomic   : indexed by lock_address_cuda_space / unlock_address_cuda_space
+  //   scratch  : filled from scratch_lock_array_cuda_space_ptr by the launchers
+  //   threadid : filled from threadid_lock_array_cuda_space_ptr by the launchers
+  struct CudaLockArraysStruct {
+    int* atomic;
+    int* scratch;
+    int* threadid;
+  };
+}
+}
+// Device-side copy of the lock array pointers.  With relocatable device code
+// this is only a declaration ('extern'); otherwise it is defined here.
+__device__ __constant__
+#ifdef KOKKOS_CUDA_USE_RELOCATABLE_DEVICE_CODE
+extern
+#endif
+Kokkos::Impl::CudaLockArraysStruct kokkos_impl_cuda_lock_arrays ;
+
+// Mask applied to (address >> 2) to select a slot of the 'atomic' lock array
+// in lock_address_cuda_space / unlock_address_cuda_space.
+#define CUDA_SPACE_ATOMIC_MASK 0x1FFFF
+// NOTE(review): not referenced anywhere in this header; presumably used by
+// other code that includes it -- confirm.
+#define CUDA_SPACE_ATOMIC_XOR_MASK 0x15A39
+
+namespace Kokkos {
+namespace Impl {
+  void* cuda_resize_scratch_space(size_t bytes, bool force_shrink = false);
+}
+}
+
+namespace Kokkos {
+namespace Impl {
+/** Try to take the lock slot associated with the address 'ptr'.
+ *
+ *  The slot is (address >> 2) masked with CUDA_SPACE_ATOMIC_MASK.
+ *  Returns true when the slot held 0 and this call changed it to 1.
+ */
+__device__ inline
+bool lock_address_cuda_space(void* ptr) {
+  const size_t slot = ( size_t(ptr) >> 2 ) & CUDA_SPACE_ATOMIC_MASK ;
+  const int previous = atomicCAS( &kokkos_impl_cuda_lock_arrays.atomic[slot] , 0 , 1 );
+  return previous == 0 ;
+}
+
+/** Release the lock slot associated with the address 'ptr' by storing 0.
+ *
+ *  The slot is (address >> 2) masked with CUDA_SPACE_ATOMIC_MASK, the same
+ *  mapping that lock_address_cuda_space uses.
+ */
+__device__ inline
+void unlock_address_cuda_space(void* ptr) {
+  const size_t slot = ( size_t(ptr) >> 2 ) & CUDA_SPACE_ATOMIC_MASK ;
+  atomicExch( &kokkos_impl_cuda_lock_arrays.atomic[slot] , 0 );
+}
+
+}
+}
+
+/** Return the unsized 'extern __shared__' array viewed as a pointer to T. */
+template< typename T >
+inline
+__device__
+T * kokkos_impl_cuda_shared_memory()
+{
+  extern __shared__ Kokkos::CudaSpace::size_type sh[];
+
+  return (T*) sh ;
+}
+
+namespace Kokkos {
+namespace Impl {
+
+//----------------------------------------------------------------------------
+// See section B.17 of Cuda C Programming Guide Version 3.2
+// for discussion of
+//   __launch_bounds__(maxThreadsPerBlock,minBlocksPerMultiprocessor)
+// function qualifier which could be used to improve performance.
+//----------------------------------------------------------------------------
+// Maximize L1 cache and minimize shared memory:
+//   cudaFuncSetCacheConfig(MyKernel, cudaFuncCachePreferL1 );
+// For 2.0 capability: 48 KB L1 and 16 KB shared
+//----------------------------------------------------------------------------
+
+/** Kernel entry point for a functor staged in constant memory.
+ *
+ *  Interprets the start of kokkos_impl_cuda_constant_memory_buffer as a
+ *  'const DriverType' and invokes its call operator.
+ */
+template< class DriverType >
+__global__
+static void cuda_parallel_launch_constant_memory()
+{
+  const DriverType * const staged_driver =
+    reinterpret_cast< const DriverType * >( kokkos_impl_cuda_constant_memory_buffer );
+
+  (*staged_driver)();
+}
+
+/** Kernel entry point for a functor passed by value as the kernel argument;
+ *  simply invokes its call operator.
+ */
+template< class DriverType >
+__global__
+static void cuda_parallel_launch_local_memory( const DriverType driver )
+{
+  driver();
+}
+
+/** \brief  Launches a functor on the device.
+ *
+ *  'Large' defaults to true when sizeof(DriverType) exceeds
+ *  CudaTraits::ConstantMemoryUseThreshold.  The 'true' specialization copies
+ *  the functor to constant memory; the 'false' specialization passes it as
+ *  the kernel argument.
+ */
+template < class DriverType ,
+           bool Large = ( CudaTraits::ConstantMemoryUseThreshold < sizeof(DriverType) ) >
+struct CudaParallelLaunch ;
+
+/** \brief  Launch path for "large" functors: the functor is copied into
+ *          kokkos_impl_cuda_constant_memory_buffer and the kernel
+ *          cuda_parallel_launch_constant_memory reads it from there.
+ */
+template < class DriverType >
+struct CudaParallelLaunch< DriverType , true > {
+
+  /** \param driver  Functor to run on the device.
+   *  \param grid    Grid dimensions.
+   *  \param block   Block dimensions.
+   *  \param shmem   Shared memory size passed as the third launch parameter.
+   *  \param stream  Stream to launch into.
+   *
+   *  Nothing happens when grid.x or the product of the block dimensions is
+   *  zero.  A runtime exception is thrown when the functor does not fit in
+   *  the constant memory buffer or when 'shmem' exceeds
+   *  CudaTraits::SharedMemoryCapacity.  Failing Cuda runtime calls are
+   *  reported through CUDA_SAFE_CALL.
+   */
+  inline
+  CudaParallelLaunch( const DriverType & driver
+                    , const dim3       & grid
+                    , const dim3       & block
+                    , const int          shmem
+                    , const cudaStream_t stream = 0 )
+  {
+    if ( grid.x && ( block.x * block.y * block.z ) ) {
+
+      if ( sizeof( Kokkos::Impl::CudaTraits::ConstantGlobalBufferType ) <
+           sizeof( DriverType ) ) {
+        Kokkos::Impl::throw_runtime_exception( std::string("CudaParallelLaunch FAILED: Functor is too large") );
+      }
+
+      // Fence before changing settings and copying closure
+      Kokkos::Cuda::fence();
+
+      if ( CudaTraits::SharedMemoryCapacity < shmem ) {
+        Kokkos::Impl::throw_runtime_exception( std::string("CudaParallelLaunch FAILED: shared memory request is too large") );
+      }
+      #ifndef KOKKOS_ARCH_KEPLER //On Kepler the L1 has no benefit since it doesn't cache reads
+      else if ( shmem ) {
+        CUDA_SAFE_CALL( cudaFuncSetCacheConfig( cuda_parallel_launch_constant_memory< DriverType > , cudaFuncCachePreferShared ) );
+      } else {
+        CUDA_SAFE_CALL( cudaFuncSetCacheConfig( cuda_parallel_launch_constant_memory< DriverType > , cudaFuncCachePreferL1 ) );
+      }
+      #endif
+
+      // Copy functor to constant memory on the device.  The result is
+      // checked: launching after a failed copy would run whatever the
+      // buffer happened to contain.
+      CUDA_SAFE_CALL( cudaMemcpyToSymbol( kokkos_impl_cuda_constant_memory_buffer , & driver , sizeof(DriverType) ) );
+
+      #ifndef KOKKOS_CUDA_USE_RELOCATABLE_DEVICE_CODE
+      // Without relocatable device code the lock array symbol is defined by
+      // this header, so its pointers are refreshed before every launch.
+      Kokkos::Impl::CudaLockArraysStruct locks;
+      locks.atomic = atomic_lock_array_cuda_space_ptr(false);
+      locks.scratch = scratch_lock_array_cuda_space_ptr(false);
+      locks.threadid = threadid_lock_array_cuda_space_ptr(false);
+      CUDA_SAFE_CALL( cudaMemcpyToSymbol( kokkos_impl_cuda_lock_arrays , & locks , sizeof(CudaLockArraysStruct) ) );
+      #endif
+
+      // Invoke the driver function on the device
+      cuda_parallel_launch_constant_memory< DriverType ><<< grid , block , shmem , stream >>>();
+
+#if defined( KOKKOS_ENABLE_DEBUG_BOUNDS_CHECK )
+      CUDA_SAFE_CALL( cudaGetLastError() );
+      Kokkos::Cuda::fence();
+#endif
+    }
+  }
+};
+
+/** \brief  Launch path for "small" functors: the functor is passed by value
+ *          as the argument of the kernel cuda_parallel_launch_local_memory.
+ */
+template < class DriverType >
+struct CudaParallelLaunch< DriverType , false > {
+
+  /** \param driver  Functor to run on the device.
+   *  \param grid    Grid dimensions.
+   *  \param block   Block dimensions.
+   *  \param shmem   Shared memory size passed as the third launch parameter.
+   *  \param stream  Stream to launch into.
+   *
+   *  Nothing happens when grid.x or the product of the block dimensions is
+   *  zero.  A runtime exception is thrown when 'shmem' exceeds
+   *  CudaTraits::SharedMemoryCapacity.  Failing Cuda runtime calls are
+   *  reported through CUDA_SAFE_CALL.
+   */
+  inline
+  CudaParallelLaunch( const DriverType & driver
+                    , const dim3       & grid
+                    , const dim3       & block
+                    , const int          shmem
+                    , const cudaStream_t stream = 0 )
+  {
+    if ( grid.x && ( block.x * block.y * block.z ) ) {
+
+      if ( CudaTraits::SharedMemoryCapacity < shmem ) {
+        Kokkos::Impl::throw_runtime_exception( std::string("CudaParallelLaunch FAILED: shared memory request is too large") );
+      }
+      #ifndef KOKKOS_ARCH_KEPLER //On Kepler the L1 has no benefit since it doesn't cache reads
+      else if ( shmem ) {
+        CUDA_SAFE_CALL( cudaFuncSetCacheConfig( cuda_parallel_launch_local_memory< DriverType > , cudaFuncCachePreferShared ) );
+      } else {
+        CUDA_SAFE_CALL( cudaFuncSetCacheConfig( cuda_parallel_launch_local_memory< DriverType > , cudaFuncCachePreferL1 ) );
+      }
+      #endif
+
+      #ifndef KOKKOS_CUDA_USE_RELOCATABLE_DEVICE_CODE
+      // Without relocatable device code the lock array symbol is defined by
+      // this header, so its pointers are refreshed before every launch.
+      // The copy is checked so that a failure is not silently ignored.
+      Kokkos::Impl::CudaLockArraysStruct locks;
+      locks.atomic = atomic_lock_array_cuda_space_ptr(false);
+      locks.scratch = scratch_lock_array_cuda_space_ptr(false);
+      locks.threadid = threadid_lock_array_cuda_space_ptr(false);
+      CUDA_SAFE_CALL( cudaMemcpyToSymbol( kokkos_impl_cuda_lock_arrays , & locks , sizeof(CudaLockArraysStruct) ) );
+      #endif
+
+      // Invoke the driver function on the device
+      cuda_parallel_launch_local_memory< DriverType ><<< grid , block , shmem , stream >>>( driver );
+
+#if defined( KOKKOS_ENABLE_DEBUG_BOUNDS_CHECK )
+      CUDA_SAFE_CALL( cudaGetLastError() );
+      Kokkos::Cuda::fence();
+#endif
+    }
+  }
+};
+
+//----------------------------------------------------------------------------
+
+} // namespace Impl
+} // namespace Kokkos
+
+//----------------------------------------------------------------------------
+//----------------------------------------------------------------------------
+
+#endif /* defined( __CUDACC__ ) */
+#endif /* defined( KOKKOS_HAVE_CUDA ) */
+#endif /* #ifndef KOKKOS_CUDAEXEC_HPP */
diff --git a/lib/kokkos/core/src/Cuda/Kokkos_CudaSpace.cpp b/lib/kokkos/core/src/Cuda/Kokkos_CudaSpace.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..a4f372d65d1ee6456d9ff6d21cd4775d6fb6c448
--- /dev/null
+++ b/lib/kokkos/core/src/Cuda/Kokkos_CudaSpace.cpp
@@ -0,0 +1,829 @@
+/*
+//@HEADER
+// ************************************************************************
+// 
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+// 
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+// 
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact  H. Carter Edwards (hcedwar@sandia.gov)
+// 
+// ************************************************************************
+//@HEADER
+*/
+
+#include <stdlib.h>
+#include <iostream>
+#include <sstream>
+#include <stdexcept>
+#include <algorithm>
+#include <Kokkos_Macros.hpp>
+
+/* only compile this file if CUDA is enabled for Kokkos */
+#ifdef KOKKOS_HAVE_CUDA
+
+#include <Kokkos_Core.hpp>
+#include <Kokkos_Cuda.hpp>
+#include <Kokkos_CudaSpace.hpp>
+
+#include <Cuda/Kokkos_Cuda_Internal.hpp>
+#include <impl/Kokkos_Error.hpp>
+
+/*--------------------------------------------------------------------------*/
+/*--------------------------------------------------------------------------*/
+
+namespace Kokkos {
+namespace Impl {
+
+namespace {
+   // Returns the stream used by DeepCopyAsyncCuda, creating it on first use.
+   // A failed creation is reported through CUDA_SAFE_CALL rather than
+   // ignored, so callers never silently fall back to stream 0.
+   cudaStream_t get_deep_copy_stream() {
+     static cudaStream_t s = 0;
+     if( s == 0) {
+       CUDA_SAFE_CALL( cudaStreamCreate ( &s ) );
+     }
+     return s;
+   }
+}
+
+// Deep copies without an execution space instance: each copies 'n' bytes
+// from 'src' to 'dst' with cudaMemcpy, letting the runtime infer the
+// direction (cudaMemcpyDefault).
+
+// Device to device.
+DeepCopy<CudaSpace,CudaSpace,Cuda>::DeepCopy( void * dst , const void * src , size_t n )
+{
+  CUDA_SAFE_CALL( cudaMemcpy( dst , src , n , cudaMemcpyDefault ) );
+}
+
+// Device to host.
+DeepCopy<HostSpace,CudaSpace,Cuda>::DeepCopy( void * dst , const void * src , size_t n )
+{
+  CUDA_SAFE_CALL( cudaMemcpy( dst , src , n , cudaMemcpyDefault ) );
+}
+
+// Host to device.
+DeepCopy<CudaSpace,HostSpace,Cuda>::DeepCopy( void * dst , const void * src , size_t n )
+{
+  CUDA_SAFE_CALL( cudaMemcpy( dst , src , n , cudaMemcpyDefault ) );
+}
+
+// Deep copies with an execution space instance: each enqueues a copy of 'n'
+// bytes from 'src' to 'dst' with cudaMemcpyAsync on the instance's stream.
+
+// Device to device.
+DeepCopy<CudaSpace,CudaSpace,Cuda>::DeepCopy( const Cuda & instance , void * dst , const void * src , size_t n )
+{
+  CUDA_SAFE_CALL( cudaMemcpyAsync( dst , src , n , cudaMemcpyDefault , instance.cuda_stream() ) );
+}
+
+// Device to host.
+DeepCopy<HostSpace,CudaSpace,Cuda>::DeepCopy( const Cuda & instance , void * dst , const void * src , size_t n )
+{
+  CUDA_SAFE_CALL( cudaMemcpyAsync( dst , src , n , cudaMemcpyDefault , instance.cuda_stream() ) );
+}
+
+// Host to device.
+DeepCopy<CudaSpace,HostSpace,Cuda>::DeepCopy( const Cuda & instance , void * dst , const void * src , size_t n )
+{
+  CUDA_SAFE_CALL( cudaMemcpyAsync( dst , src , n , cudaMemcpyDefault , instance.cuda_stream() ) );
+}
+
+// Copies 'n' bytes from 'src' to 'dst' on the dedicated deep-copy stream and
+// waits for that stream before returning.  The synchronization result is
+// checked because an asynchronous copy failure is only reported there.
+void DeepCopyAsyncCuda( void * dst , const void * src , size_t n) {
+  cudaStream_t s = get_deep_copy_stream();
+  CUDA_SAFE_CALL( cudaMemcpyAsync( dst , src , n , cudaMemcpyDefault , s ) );
+  CUDA_SAFE_CALL( cudaStreamSynchronize(s) );
+}
+
+} // namespace Impl
+} // namespace Kokkos
+
+/*--------------------------------------------------------------------------*/
+/*--------------------------------------------------------------------------*/
+
+
+namespace Kokkos {
+
+// Raises a runtime exception reporting an attempt to execute a Cuda
+// function from a non-Cuda space.
+void CudaSpace::access_error()
+{
+  Kokkos::Impl::throw_runtime_exception(
+    std::string("Kokkos::CudaSpace::access_error attempt to execute Cuda function from non-Cuda space" ) );
+}
+
+// Overload taking a pointer; the pointer is not used and the same runtime
+// exception is raised.
+void CudaSpace::access_error( const void * const )
+{
+  Kokkos::Impl::throw_runtime_exception(
+    std::string("Kokkos::CudaSpace::access_error attempt to execute Cuda function from non-Cuda space" ) );
+}
+
+/*--------------------------------------------------------------------------*/
+
+// Compile-time answer: true when CUDA_VERSION is defined and at least 6000
+// and the target is not Apple, false otherwise.
+bool CudaUVMSpace::available()
+{
+#if defined( CUDA_VERSION ) && ( 6000 <= CUDA_VERSION ) && !defined(__APPLE__)
+  return true ;
+#else
+  return false ;
+#endif
+}
+
+/*--------------------------------------------------------------------------*/
+
+} // namespace Kokkos
+
+/*--------------------------------------------------------------------------*/
+/*--------------------------------------------------------------------------*/
+
+namespace Kokkos {
+
+// CudaSpace and CudaUVMSpace record the device reported by a
+// default-constructed Kokkos::Cuda instance.
+CudaSpace::CudaSpace()
+  : m_device( Kokkos::Cuda().cuda_device() )
+{}
+
+CudaUVMSpace::CudaUVMSpace()
+  : m_device( Kokkos::Cuda().cuda_device() )
+{}
+
+// CudaHostPinnedSpace initializes nothing explicitly.
+CudaHostPinnedSpace::CudaHostPinnedSpace()
+{}
+
+// Allocates 'arg_alloc_size' bytes with cudaMalloc and returns the pointer.
+void * CudaSpace::allocate( const size_t arg_alloc_size ) const
+{
+  void * allocation = NULL ;
+  CUDA_SAFE_CALL( cudaMalloc( &allocation, arg_alloc_size ) );
+  return allocation ;
+}
+
+// Allocates 'arg_alloc_size' bytes with cudaMallocManaged
+// (cudaMemAttachGlobal) and returns the pointer.
+void * CudaUVMSpace::allocate( const size_t arg_alloc_size ) const
+{
+  void * allocation = NULL ;
+  CUDA_SAFE_CALL( cudaMallocManaged( &allocation, arg_alloc_size , cudaMemAttachGlobal ) );
+  return allocation ;
+}
+
+// Allocates 'arg_alloc_size' bytes with cudaHostAlloc (cudaHostAllocDefault)
+// and returns the pointer.
+void * CudaHostPinnedSpace::allocate( const size_t arg_alloc_size ) const
+{
+  void * allocation = NULL ;
+  CUDA_SAFE_CALL( cudaHostAlloc( &allocation, arg_alloc_size , cudaHostAllocDefault ) );
+  return allocation ;
+}
+
+// Releases memory with cudaFree.  The size argument is unused.  Anything
+// thrown by CUDA_SAFE_CALL is caught and discarded, so this function never
+// propagates an exception.
+void CudaSpace::deallocate( void * const arg_alloc_ptr , const size_t /* arg_alloc_size */ ) const
+{
+  try {
+    CUDA_SAFE_CALL( cudaFree( arg_alloc_ptr ) );
+  } catch(...) {}
+}
+
+// Releases memory with cudaFree; same error handling as CudaSpace::deallocate.
+void CudaUVMSpace::deallocate( void * const arg_alloc_ptr , const size_t /* arg_alloc_size */ ) const
+{
+  try {
+    CUDA_SAFE_CALL( cudaFree( arg_alloc_ptr ) );
+  } catch(...) {}
+}
+
+// Releases memory with cudaFreeHost; same error handling as above.
+void CudaHostPinnedSpace::deallocate( void * const arg_alloc_ptr , const size_t /* arg_alloc_size */ ) const
+{
+  try {
+    CUDA_SAFE_CALL( cudaFreeHost( arg_alloc_ptr ) );
+  } catch(...) {}
+}
+
+} // namespace Kokkos
+
+//----------------------------------------------------------------------------
+//----------------------------------------------------------------------------
+
+namespace Kokkos {
+namespace Experimental {
+namespace Impl {
+
+// Definitions of the static 's_root_record' member of the
+// SharedAllocationRecord specialization for each Cuda memory space.
+SharedAllocationRecord< void , void >
+SharedAllocationRecord< Kokkos::CudaSpace , void >::s_root_record ;
+
+SharedAllocationRecord< void , void >
+SharedAllocationRecord< Kokkos::CudaUVMSpace , void >::s_root_record ;
+
+SharedAllocationRecord< void , void >
+SharedAllocationRecord< Kokkos::CudaHostPinnedSpace , void >::s_root_record ;
+
+// Creates a texture object over the linear allocation
+// [alloc_ptr, alloc_ptr + alloc_size) whose channel format is chosen from
+// 'sizeof_alias' (4 -> int, 8 -> int2, anything else -> int4).
+//
+// A runtime exception is thrown, after the message has been written to
+// std::cerr, when 'alloc_ptr' is null or 'alloc_size' is not smaller than
+// sizeof_alias * 2^27.
+::cudaTextureObject_t
+SharedAllocationRecord< Kokkos::CudaSpace , void >::
+attach_texture_object( const unsigned sizeof_alias
+                     , void *   const alloc_ptr
+                     , size_t   const alloc_size )
+{
+  enum { TEXTURE_BOUND_1D = 1u << 27 };
+
+  const bool is_null   = ( alloc_ptr == 0 );
+  const bool too_large = ( sizeof_alias * TEXTURE_BOUND_1D <= alloc_size );
+
+  if ( is_null || too_large ) {
+    std::ostringstream msg ;
+    msg << "Kokkos::CudaSpace ERROR: Cannot attach texture object to"
+        << " alloc_ptr(" << alloc_ptr << ")"
+        << " alloc_size(" << alloc_size << ")"
+        << " max_size(" << ( sizeof_alias * TEXTURE_BOUND_1D ) << ")" ;
+    std::cerr << msg.str() << std::endl ;
+    std::cerr.flush();
+    Kokkos::Impl::throw_runtime_exception( msg.str() );
+  }
+
+  // Start from zeroed descriptors and fill in only the linear-resource fields.
+  struct cudaResourceDesc resDesc ;
+  struct cudaTextureDesc  texDesc ;
+
+  memset( & resDesc , 0 , sizeof(resDesc) );
+  memset( & texDesc , 0 , sizeof(texDesc) );
+
+  resDesc.resType                = cudaResourceTypeLinear ;
+  resDesc.res.linear.devPtr      = alloc_ptr ;
+  resDesc.res.linear.sizeInBytes = alloc_size ;
+
+  if ( sizeof_alias == 4 ) {
+    resDesc.res.linear.desc = cudaCreateChannelDesc< int >();
+  }
+  else if ( sizeof_alias == 8 ) {
+    resDesc.res.linear.desc = cudaCreateChannelDesc< ::int2 >();
+  }
+  else { /* sizeof_alias == 16 */
+    resDesc.res.linear.desc = cudaCreateChannelDesc< ::int4 >();
+  }
+
+  ::cudaTextureObject_t tex_obj ;
+
+  CUDA_SAFE_CALL( cudaCreateTextureObject( & tex_obj , & resDesc, & texDesc, NULL ) );
+
+  return tex_obj ;
+}
+
+// Returns the label stored in this record's allocation header.
+// The header is first copied from CudaSpace into a host-side
+// SharedAllocationHeader, then its m_label is converted to std::string.
+std::string
+SharedAllocationRecord< Kokkos::CudaSpace , void >::get_label() const
+{
+  SharedAllocationHeader host_header ;
+
+  Kokkos::Impl::DeepCopy< Kokkos::HostSpace , Kokkos::CudaSpace >(
+    & host_header , RecordBase::head() , sizeof(SharedAllocationHeader) );
+
+  std::string label( host_header.m_label );
+
+  return label ;
+}
+
+// Returns the label stored in this record's allocation header,
+// read directly through the pointer returned by RecordBase::head().
+std::string
+SharedAllocationRecord< Kokkos::CudaUVMSpace , void >::get_label() const
+{
+  std::string label( RecordBase::head()->m_label );
+
+  return label ;
+}
+
+// Returns the label stored in this record's allocation header,
+// read directly through the pointer returned by RecordBase::head().
+std::string
+SharedAllocationRecord< Kokkos::CudaHostPinnedSpace , void >::get_label() const
+{
+  std::string label( RecordBase::head()->m_label );
+
+  return label ;
+}
+
+// Heap-allocates a new CudaSpace record for arg_alloc_size bytes labelled
+// arg_label and returns it.  The record is released by deallocate().
+SharedAllocationRecord< Kokkos::CudaSpace , void > *
+SharedAllocationRecord< Kokkos::CudaSpace , void >::
+allocate( const Kokkos::CudaSpace &  arg_space
+        , const std::string       &  arg_label
+        , const size_t               arg_alloc_size
+        )
+{
+  SharedAllocationRecord * const new_record =
+    new SharedAllocationRecord( arg_space , arg_label , arg_alloc_size );
+
+  return new_record ;
+}
+
+// Heap-allocates a new CudaUVMSpace record for arg_alloc_size bytes labelled
+// arg_label and returns it.  The record is released by deallocate().
+SharedAllocationRecord< Kokkos::CudaUVMSpace , void > *
+SharedAllocationRecord< Kokkos::CudaUVMSpace , void >::
+allocate( const Kokkos::CudaUVMSpace &  arg_space
+        , const std::string          &  arg_label
+        , const size_t                  arg_alloc_size
+        )
+{
+  SharedAllocationRecord * const new_record =
+    new SharedAllocationRecord( arg_space , arg_label , arg_alloc_size );
+
+  return new_record ;
+}
+
+// Heap-allocates a new CudaHostPinnedSpace record for arg_alloc_size bytes
+// labelled arg_label and returns it.  The record is released by deallocate().
+SharedAllocationRecord< Kokkos::CudaHostPinnedSpace , void > *
+SharedAllocationRecord< Kokkos::CudaHostPinnedSpace , void >::
+allocate( const Kokkos::CudaHostPinnedSpace &  arg_space
+        , const std::string                 &  arg_label
+        , const size_t                         arg_alloc_size
+        )
+{
+  SharedAllocationRecord * const new_record =
+    new SharedAllocationRecord( arg_space , arg_label , arg_alloc_size );
+
+  return new_record ;
+}
+
+// Deletes arg_rec after casting it back to the CudaSpace record type,
+// so that this specialization's destructor runs.
+void
+SharedAllocationRecord< Kokkos::CudaSpace , void >::
+deallocate( SharedAllocationRecord< void , void > * arg_rec )
+{
+  SharedAllocationRecord * const typed_record =
+    static_cast< SharedAllocationRecord * >( arg_rec );
+
+  delete typed_record ;
+}
+
+// Deletes arg_rec after casting it back to the CudaUVMSpace record type,
+// so that this specialization's destructor runs.
+void
+SharedAllocationRecord< Kokkos::CudaUVMSpace , void >::
+deallocate( SharedAllocationRecord< void , void > * arg_rec )
+{
+  SharedAllocationRecord * const typed_record =
+    static_cast< SharedAllocationRecord * >( arg_rec );
+
+  delete typed_record ;
+}
+
+// Deletes arg_rec after casting it back to the CudaHostPinnedSpace record
+// type, so that this specialization's destructor runs.
+void
+SharedAllocationRecord< Kokkos::CudaHostPinnedSpace , void >::
+deallocate( SharedAllocationRecord< void , void > * arg_rec )
+{
+  SharedAllocationRecord * const typed_record =
+    static_cast< SharedAllocationRecord * >( arg_rec );
+
+  delete typed_record ;
+}
+
+// Destructor: hands the block recorded in the base class
+// (m_alloc_ptr , m_alloc_size) back to the stored memory space.
+SharedAllocationRecord< Kokkos::CudaSpace , void >::
+~SharedAllocationRecord()
+{
+  m_space.deallocate( RecordBase::m_alloc_ptr , RecordBase::m_alloc_size );
+}
+
+// Destructor: hands the block recorded in the base class
+// (m_alloc_ptr , m_alloc_size) back to the stored memory space.
+SharedAllocationRecord< Kokkos::CudaUVMSpace , void >::
+~SharedAllocationRecord()
+{
+  m_space.deallocate( RecordBase::m_alloc_ptr , RecordBase::m_alloc_size );
+}
+
+// Destructor: hands the block recorded in the base class
+// (m_alloc_ptr , m_alloc_size) back to the stored memory space.
+SharedAllocationRecord< Kokkos::CudaHostPinnedSpace , void >::
+~SharedAllocationRecord()
+{
+  m_space.deallocate( RecordBase::m_alloc_ptr , RecordBase::m_alloc_size );
+}
+
+// Constructs a record owning a newly allocated block laid out as
+// [ SharedAllocationHeader , user_memory ] in arg_space.
+//
+//   arg_space      : space used for the allocation; a copy is kept in m_space
+//   arg_label      : label written into the header; at most
+//                    maximum_label_length - 1 characters are kept
+//   arg_alloc_size : user-memory size in bytes (the header size is added)
+//   arg_dealloc    : deallocation function passed through to the base record
+//
+// The header is assembled on the host and then deep-copied into the block.
+SharedAllocationRecord< Kokkos::CudaSpace , void >::
+SharedAllocationRecord( const Kokkos::CudaSpace & arg_space
+                      , const std::string       & arg_label
+                      , const size_t              arg_alloc_size
+                      , const SharedAllocationRecord< void , void >::function_type arg_dealloc
+                      )
+  // Pass through allocated [ SharedAllocationHeader , user_memory ]
+  // Pass through deallocation function
+  : SharedAllocationRecord< void , void >
+      ( & SharedAllocationRecord< Kokkos::CudaSpace , void >::s_root_record
+      , reinterpret_cast<SharedAllocationHeader*>( arg_space.allocate( sizeof(SharedAllocationHeader) + arg_alloc_size ) )
+      , sizeof(SharedAllocationHeader) + arg_alloc_size
+      , arg_dealloc
+      )
+  , m_tex_obj( 0 )
+  , m_space( arg_space )
+{
+  SharedAllocationHeader header ;
+
+  // Fill in the Header information
+  header.m_record = static_cast< SharedAllocationRecord< void , void > * >( this );
+
+  strncpy( header.m_label
+          , arg_label.c_str()
+          , SharedAllocationHeader::maximum_label_length
+          );
+
+  // strncpy does not write a terminator when the label has
+  // maximum_label_length or more characters; force one so that the label
+  // can always be read back as a C string.
+  header.m_label[ SharedAllocationHeader::maximum_label_length - 1 ] = (char) 0 ;
+
+  // Copy to device memory
+  Kokkos::Impl::DeepCopy<CudaSpace,HostSpace>::DeepCopy( RecordBase::m_alloc_ptr , & header , sizeof(SharedAllocationHeader) );
+}
+
+// Constructs a record owning a newly allocated block laid out as
+// [ SharedAllocationHeader , user_memory ] in arg_space.
+//
+//   arg_space      : space used for the allocation; a copy is kept in m_space
+//   arg_label      : label written into the header; at most
+//                    maximum_label_length - 1 characters are kept
+//   arg_alloc_size : user-memory size in bytes (the header size is added)
+//   arg_dealloc    : deallocation function passed through to the base record
+//
+// The header is written in place through the allocation pointer.
+SharedAllocationRecord< Kokkos::CudaUVMSpace , void >::
+SharedAllocationRecord( const Kokkos::CudaUVMSpace & arg_space
+                      , const std::string          & arg_label
+                      , const size_t                 arg_alloc_size
+                      , const SharedAllocationRecord< void , void >::function_type arg_dealloc
+                      )
+  // Pass through allocated [ SharedAllocationHeader , user_memory ]
+  // Pass through deallocation function
+  : SharedAllocationRecord< void , void >
+      ( & SharedAllocationRecord< Kokkos::CudaUVMSpace , void >::s_root_record
+      , reinterpret_cast<SharedAllocationHeader*>( arg_space.allocate( sizeof(SharedAllocationHeader) + arg_alloc_size ) )
+      , sizeof(SharedAllocationHeader) + arg_alloc_size
+      , arg_dealloc
+      )
+  , m_tex_obj( 0 )
+  , m_space( arg_space )
+{
+  // Fill in the Header information, directly accessible via UVM
+
+  RecordBase::m_alloc_ptr->m_record = this ;
+
+  strncpy( RecordBase::m_alloc_ptr->m_label
+          , arg_label.c_str()
+          , SharedAllocationHeader::maximum_label_length
+          );
+
+  // strncpy does not write a terminator when the label has
+  // maximum_label_length or more characters; force one so that the label
+  // can always be read back as a C string.
+  RecordBase::m_alloc_ptr->m_label[ SharedAllocationHeader::maximum_label_length - 1 ] = (char) 0 ;
+}
+
+// Constructs a record owning a newly allocated block laid out as
+// [ SharedAllocationHeader , user_memory ] in arg_space.
+//
+//   arg_space      : space used for the allocation; a copy is kept in m_space
+//   arg_label      : label written into the header; at most
+//                    maximum_label_length - 1 characters are kept
+//   arg_alloc_size : user-memory size in bytes (the header size is added)
+//   arg_dealloc    : deallocation function passed through to the base record
+//
+// The header is written in place through the allocation pointer.
+SharedAllocationRecord< Kokkos::CudaHostPinnedSpace , void >::
+SharedAllocationRecord( const Kokkos::CudaHostPinnedSpace & arg_space
+                      , const std::string                 & arg_label
+                      , const size_t                        arg_alloc_size
+                      , const SharedAllocationRecord< void , void >::function_type arg_dealloc
+                      )
+  // Pass through allocated [ SharedAllocationHeader , user_memory ]
+  // Pass through deallocation function
+  : SharedAllocationRecord< void , void >
+      ( & SharedAllocationRecord< Kokkos::CudaHostPinnedSpace , void >::s_root_record
+      , reinterpret_cast<SharedAllocationHeader*>( arg_space.allocate( sizeof(SharedAllocationHeader) + arg_alloc_size ) )
+      , sizeof(SharedAllocationHeader) + arg_alloc_size
+      , arg_dealloc
+      )
+  , m_space( arg_space )
+{
+  // Fill in the Header information, written directly from the host
+
+  RecordBase::m_alloc_ptr->m_record = this ;
+
+  strncpy( RecordBase::m_alloc_ptr->m_label
+          , arg_label.c_str()
+          , SharedAllocationHeader::maximum_label_length
+          );
+
+  // strncpy does not write a terminator when the label has
+  // maximum_label_length or more characters; force one so that the label
+  // can always be read back as a C string.
+  RecordBase::m_alloc_ptr->m_label[ SharedAllocationHeader::maximum_label_length - 1 ] = (char) 0 ;
+}
+
+//----------------------------------------------------------------------------
+
+// Allocates a tracked block of arg_alloc_size bytes in arg_space.
+// Returns a null pointer when arg_alloc_size is zero; otherwise creates a
+// record, applies RecordBase::increment to it and returns its data pointer.
+void * SharedAllocationRecord< Kokkos::CudaSpace , void >::
+allocate_tracked( const Kokkos::CudaSpace & arg_space
+                , const std::string & arg_alloc_label
+                , const size_t arg_alloc_size )
+{
+  if ( arg_alloc_size == 0 ) {
+    return (void *) 0 ;
+  }
+
+  SharedAllocationRecord * const record =
+    allocate( arg_space , arg_alloc_label , arg_alloc_size );
+
+  RecordBase::increment( record );
+
+  return record->data();
+}
+
+// Releases one reference to a tracked block: looks up the record for
+// arg_alloc_ptr and applies RecordBase::decrement to it.
+// A null pointer is ignored.
+void SharedAllocationRecord< Kokkos::CudaSpace , void >::
+deallocate_tracked( void * const arg_alloc_ptr )
+{
+  if ( arg_alloc_ptr == 0 ) {
+    return ;
+  }
+
+  SharedAllocationRecord * const record = get_record( arg_alloc_ptr );
+
+  RecordBase::decrement( record );
+}
+
+// Replaces a tracked block with a new one of arg_alloc_size bytes.
+// The new record is allocated in the old record's space with the old label,
+// min( old size , new size ) bytes are deep-copied across, then
+// RecordBase::increment is applied to the new record and
+// RecordBase::decrement to the old one.  Returns the new data pointer.
+void * SharedAllocationRecord< Kokkos::CudaSpace , void >::
+reallocate_tracked( void * const arg_alloc_ptr
+                  , const size_t arg_alloc_size )
+{
+  SharedAllocationRecord * const old_record = get_record( arg_alloc_ptr );
+
+  SharedAllocationRecord * const new_record =
+    allocate( old_record->m_space , old_record->get_label() , arg_alloc_size );
+
+  Kokkos::Impl::DeepCopy<CudaSpace,CudaSpace>(
+      new_record->data()
+    , old_record->data()
+    , std::min( old_record->size() , new_record->size() ) );
+
+  RecordBase::increment( new_record );
+  RecordBase::decrement( old_record );
+
+  return new_record->data();
+}
+
+// Allocates a tracked block of arg_alloc_size bytes in arg_space.
+// Returns a null pointer when arg_alloc_size is zero; otherwise creates a
+// record, applies RecordBase::increment to it and returns its data pointer.
+void * SharedAllocationRecord< Kokkos::CudaUVMSpace , void >::
+allocate_tracked( const Kokkos::CudaUVMSpace & arg_space
+                , const std::string & arg_alloc_label
+                , const size_t arg_alloc_size )
+{
+  if ( arg_alloc_size == 0 ) {
+    return (void *) 0 ;
+  }
+
+  SharedAllocationRecord * const record =
+    allocate( arg_space , arg_alloc_label , arg_alloc_size );
+
+  RecordBase::increment( record );
+
+  return record->data();
+}
+
+// Releases one reference to a tracked block: looks up the record for
+// arg_alloc_ptr and applies RecordBase::decrement to it.
+// A null pointer is ignored.
+void SharedAllocationRecord< Kokkos::CudaUVMSpace , void >::
+deallocate_tracked( void * const arg_alloc_ptr )
+{
+  if ( arg_alloc_ptr == 0 ) {
+    return ;
+  }
+
+  SharedAllocationRecord * const record = get_record( arg_alloc_ptr );
+
+  RecordBase::decrement( record );
+}
+
+// Replaces a tracked block with a new one of arg_alloc_size bytes.
+// The new record is allocated in the old record's space with the old label,
+// min( old size , new size ) bytes are deep-copied across, then
+// RecordBase::increment is applied to the new record and
+// RecordBase::decrement to the old one.  Returns the new data pointer.
+void * SharedAllocationRecord< Kokkos::CudaUVMSpace , void >::
+reallocate_tracked( void * const arg_alloc_ptr
+                  , const size_t arg_alloc_size )
+{
+  SharedAllocationRecord * const old_record = get_record( arg_alloc_ptr );
+
+  SharedAllocationRecord * const new_record =
+    allocate( old_record->m_space , old_record->get_label() , arg_alloc_size );
+
+  Kokkos::Impl::DeepCopy<CudaUVMSpace,CudaUVMSpace>(
+      new_record->data()
+    , old_record->data()
+    , std::min( old_record->size() , new_record->size() ) );
+
+  RecordBase::increment( new_record );
+  RecordBase::decrement( old_record );
+
+  return new_record->data();
+}
+
+// Allocates a tracked block of arg_alloc_size bytes in arg_space.
+// Returns a null pointer when arg_alloc_size is zero; otherwise creates a
+// record, applies RecordBase::increment to it and returns its data pointer.
+void * SharedAllocationRecord< Kokkos::CudaHostPinnedSpace , void >::
+allocate_tracked( const Kokkos::CudaHostPinnedSpace & arg_space
+                , const std::string & arg_alloc_label
+                , const size_t arg_alloc_size )
+{
+  if ( arg_alloc_size == 0 ) {
+    return (void *) 0 ;
+  }
+
+  SharedAllocationRecord * const record =
+    allocate( arg_space , arg_alloc_label , arg_alloc_size );
+
+  RecordBase::increment( record );
+
+  return record->data();
+}
+
+// Releases one reference to a tracked block: looks up the record for
+// arg_alloc_ptr and applies RecordBase::decrement to it.
+// A null pointer is ignored.
+void SharedAllocationRecord< Kokkos::CudaHostPinnedSpace , void >::
+deallocate_tracked( void * const arg_alloc_ptr )
+{
+  if ( arg_alloc_ptr == 0 ) {
+    return ;
+  }
+
+  SharedAllocationRecord * const record = get_record( arg_alloc_ptr );
+
+  RecordBase::decrement( record );
+}
+
+// Replaces a tracked block with a new one of arg_alloc_size bytes.
+// The new record is allocated in the old record's space with the old label,
+// min( old size , new size ) bytes are deep-copied across, then
+// RecordBase::increment is applied to the new record and
+// RecordBase::decrement to the old one.  Returns the new data pointer.
+void * SharedAllocationRecord< Kokkos::CudaHostPinnedSpace , void >::
+reallocate_tracked( void * const arg_alloc_ptr
+                  , const size_t arg_alloc_size )
+{
+  SharedAllocationRecord * const old_record = get_record( arg_alloc_ptr );
+
+  SharedAllocationRecord * const new_record =
+    allocate( old_record->m_space , old_record->get_label() , arg_alloc_size );
+
+  Kokkos::Impl::DeepCopy<CudaHostPinnedSpace,CudaHostPinnedSpace>(
+      new_record->data()
+    , old_record->data()
+    , std::min( old_record->size() , new_record->size() ) );
+
+  RecordBase::increment( new_record );
+  RecordBase::decrement( old_record );
+
+  return new_record->data();
+}
+
+//----------------------------------------------------------------------------
+
+// Returns the CudaSpace record that corresponds to alloc_ptr.
+// Passes an error message to Kokkos::Impl::throw_runtime_exception when no
+// record is found.
+//
+// Two strategies are present; the preprocessor selects the second:
+//   (disabled, '#if 0') copy the header that precedes alloc_ptr from the
+//       device and follow its m_record back-pointer;
+//   (active) look the pointer up with RecordBase::find starting at
+//       s_root_record.
+SharedAllocationRecord< Kokkos::CudaSpace , void > *
+SharedAllocationRecord< Kokkos::CudaSpace , void >::get_record( void * alloc_ptr )
+{
+  // NOTE(review): Header is only referenced by the disabled branch below.
+  using Header     = SharedAllocationHeader ;
+  using RecordBase = SharedAllocationRecord< void , void > ;
+  using RecordCuda = SharedAllocationRecord< Kokkos::CudaSpace , void > ;
+
+#if 0
+  // Copy the header from the allocation
+  Header head ;
+
+  Header const * const head_cuda = alloc_ptr ? Header::get_header( alloc_ptr ) : (Header*) 0 ;
+
+  if ( alloc_ptr ) {
+    Kokkos::Impl::DeepCopy<HostSpace,CudaSpace>::DeepCopy( & head , head_cuda , sizeof(SharedAllocationHeader) );
+  }
+
+  RecordCuda * const record = alloc_ptr ? static_cast< RecordCuda * >( head.m_record ) : (RecordCuda *) 0 ;
+
+  if ( ! alloc_ptr || record->m_alloc_ptr != head_cuda ) {
+    Kokkos::Impl::throw_runtime_exception( std::string("Kokkos::Experimental::Impl::SharedAllocationRecord< Kokkos::CudaSpace , void >::get_record ERROR" ) );
+  }
+
+#else
+
+  // Iterate the list to search for the record among all allocations
+  // requires obtaining the root of the list and then locking the list.
+
+  RecordCuda * const record = static_cast< RecordCuda * >( RecordBase::find( & s_root_record , alloc_ptr ) );
+
+  // A null result means alloc_ptr does not belong to any record on the list.
+  if ( record == 0 ) {
+    Kokkos::Impl::throw_runtime_exception( std::string("Kokkos::Experimental::Impl::SharedAllocationRecord< Kokkos::CudaSpace , void >::get_record ERROR" ) );
+  }
+
+#endif
+
+  return record ;
+}
+
+// Returns the CudaUVMSpace record that corresponds to alloc_ptr.
+// The header is taken to sit immediately before alloc_ptr; it is accepted
+// only if its record's m_alloc_ptr points back at that header.  A null
+// alloc_ptr or a failed check is reported through
+// Kokkos::Impl::throw_runtime_exception.
+SharedAllocationRecord< Kokkos::CudaUVMSpace , void > *
+SharedAllocationRecord< Kokkos::CudaUVMSpace , void >::get_record( void * alloc_ptr )
+{
+  typedef SharedAllocationHeader                                    Header ;
+  typedef SharedAllocationRecord< Kokkos::CudaUVMSpace , void >     RecordCuda ;
+
+  Header * header = (Header *) 0 ;
+
+  if ( alloc_ptr ) {
+    header = reinterpret_cast< Header * >( alloc_ptr ) - 1 ;
+  }
+
+  // Short-circuit keeps the header from being read when alloc_ptr is null.
+  const bool is_valid = alloc_ptr && ( header->m_record->m_alloc_ptr == header );
+
+  if ( ! is_valid ) {
+    Kokkos::Impl::throw_runtime_exception( std::string("Kokkos::Experimental::Impl::SharedAllocationRecord< Kokkos::CudaUVMSpace , void >::get_record ERROR" ) );
+  }
+
+  return static_cast< RecordCuda * >( header->m_record );
+}
+
+// Returns the CudaHostPinnedSpace record that corresponds to alloc_ptr.
+// The header is taken to sit immediately before alloc_ptr; it is accepted
+// only if its record's m_alloc_ptr points back at that header.  A null
+// alloc_ptr or a failed check is reported through
+// Kokkos::Impl::throw_runtime_exception.
+SharedAllocationRecord< Kokkos::CudaHostPinnedSpace , void > *
+SharedAllocationRecord< Kokkos::CudaHostPinnedSpace , void >::get_record( void * alloc_ptr )
+{
+  typedef SharedAllocationHeader                                         Header ;
+  typedef SharedAllocationRecord< Kokkos::CudaHostPinnedSpace , void >   RecordCuda ;
+
+  Header * header = (Header *) 0 ;
+
+  if ( alloc_ptr ) {
+    header = reinterpret_cast< Header * >( alloc_ptr ) - 1 ;
+  }
+
+  // Short-circuit keeps the header from being read when alloc_ptr is null.
+  const bool is_valid = alloc_ptr && ( header->m_record->m_alloc_ptr == header );
+
+  if ( ! is_valid ) {
+    Kokkos::Impl::throw_runtime_exception( std::string("Kokkos::Experimental::Impl::SharedAllocationRecord< Kokkos::CudaHostPinnedSpace , void >::get_record ERROR" ) );
+  }
+
+  return static_cast< RecordCuda * >( header->m_record );
+}
+
+// Iterate records to print orphaned memory ...
+//
+// Walks the CudaSpace record list, starting at s_root_record and stopping
+// when the walk returns to it, and writes one formatted line per record to
+// the stream 's'.
+//
+//   s      : destination stream for the report
+//   space  : not used
+//   detail : true  -> record address, list links, extent, count, dealloc
+//                     function and label;
+//            false -> data address, size and label only
+//
+// The label of each record is obtained by deep-copying its header from
+// CudaSpace into a host-side SharedAllocationHeader.
+void
+SharedAllocationRecord< Kokkos::CudaSpace , void >::
+print_records( std::ostream & s , const Kokkos::CudaSpace & space , bool detail )
+{
+  SharedAllocationRecord< void , void > * r = & s_root_record ;
+
+  char buffer[256] ;
+
+  SharedAllocationHeader head ;
+
+  // Formatting dependent on sizeof(uintptr_t): use the 'l' length modifier
+  // when uintptr_t has the size of unsigned long, otherwise 'll'.  Choosing
+  // with a two-way test guarantees the format pointer is always initialized.
+  const bool uintptr_is_ulong = ( sizeof(uintptr_t) == sizeof(unsigned long) );
+
+  if ( detail ) {
+    const char * const format_string = uintptr_is_ulong
+      ? "Cuda addr( 0x%.12lx ) list( 0x%.12lx 0x%.12lx ) extent[ 0x%.12lx + %.8ld ] count(%d) dealloc(0x%.12lx) %s\n"
+      : "Cuda addr( 0x%.12llx ) list( 0x%.12llx 0x%.12llx ) extent[ 0x%.12llx + %.8ld ] count(%d) dealloc(0x%.12llx) %s\n" ;
+
+    do {
+      if ( r->m_alloc_ptr ) {
+        Kokkos::Impl::DeepCopy<HostSpace,CudaSpace>::DeepCopy( & head , r->m_alloc_ptr , sizeof(SharedAllocationHeader) );
+      }
+      else {
+        // No allocation attached to this record: print an empty label.
+        head.m_label[0] = 0 ;
+      }
+
+      snprintf( buffer , 256
+              , format_string
+              , reinterpret_cast<uintptr_t>( r )
+              , reinterpret_cast<uintptr_t>( r->m_prev )
+              , reinterpret_cast<uintptr_t>( r->m_next )
+              , reinterpret_cast<uintptr_t>( r->m_alloc_ptr )
+              , r->m_alloc_size
+              , r->m_count
+              , reinterpret_cast<uintptr_t>( r->m_dealloc )
+              , head.m_label
+              );
+      // Write to the caller-supplied stream rather than std::cout.
+      s << buffer ;
+      r = r->m_next ;
+    } while ( r != & s_root_record );
+  }
+  else {
+    const char * const format_string = uintptr_is_ulong
+      ? "Cuda [ 0x%.12lx + %ld ] %s\n"
+      : "Cuda [ 0x%.12llx + %ld ] %s\n" ;
+
+    do {
+      if ( r->m_alloc_ptr ) {
+
+        Kokkos::Impl::DeepCopy<HostSpace,CudaSpace>::DeepCopy( & head , r->m_alloc_ptr , sizeof(SharedAllocationHeader) );
+
+        snprintf( buffer , 256
+                , format_string
+                , reinterpret_cast< uintptr_t >( r->data() )
+                , r->size()
+                , head.m_label
+                );
+      }
+      else {
+        snprintf( buffer , 256 , "Cuda [ 0 + 0 ]\n" );
+      }
+      // Write to the caller-supplied stream rather than std::cout.
+      s << buffer ;
+      r = r->m_next ;
+    } while ( r != & s_root_record );
+  }
+}
+
+// Prints the CudaUVMSpace record list to 's' by delegating to the base
+// class helper print_host_accessible_records with the tag "CudaUVM".
+// The 'space' argument is not used.
+void
+SharedAllocationRecord< Kokkos::CudaUVMSpace , void >::
+print_records( std::ostream & s , const Kokkos::CudaUVMSpace & space , bool detail )
+{
+  RecordBase::print_host_accessible_records( s , "CudaUVM" , & s_root_record , detail );
+}
+
+// Prints the CudaHostPinnedSpace record list to 's' by delegating to the
+// base class helper print_host_accessible_records with the tag
+// "CudaHostPinned".  The 'space' argument is not used.
+void
+SharedAllocationRecord< Kokkos::CudaHostPinnedSpace , void >::
+print_records( std::ostream & s , const Kokkos::CudaHostPinnedSpace & space , bool detail )
+{
+  RecordBase::print_host_accessible_records( s , "CudaHostPinned" , & s_root_record , detail );
+}
+
+} // namespace Impl
+} // namespace Experimental
+} // namespace Kokkos
+
+/*--------------------------------------------------------------------------*/
+/*--------------------------------------------------------------------------*/
+
+namespace Kokkos {
+namespace {
+  // Kernel: each thread with a global index below CUDA_SPACE_ATOMIC_MASK+1
+  // writes 0 to its entry of kokkos_impl_cuda_lock_arrays.atomic.
+  __global__ void init_lock_array_kernel_atomic() {
+    const unsigned idx = blockIdx.x*blockDim.x + threadIdx.x;
+
+    if( !(idx<CUDA_SPACE_ATOMIC_MASK+1) ) return;
+
+    kokkos_impl_cuda_lock_arrays.atomic[idx] = 0;
+  }
+
+  // Kernel: each thread with a global index below N writes 0 to its entry
+  // of both kokkos_impl_cuda_lock_arrays.scratch and .threadid.
+  __global__ void init_lock_array_kernel_scratch_threadid(int N) {
+    const unsigned idx = blockIdx.x*blockDim.x + threadIdx.x;
+
+    if( !(idx<N) ) return;
+
+    kokkos_impl_cuda_lock_arrays.scratch[idx] = 0;
+    kokkos_impl_cuda_lock_arrays.threadid[idx] = 0;
+  }
+}
+
+
+namespace Impl {
+// Accessor for the function-local device array of
+// (CUDA_SPACE_ATOMIC_MASK+1) ints.
+//
+//   deallocate == true  : frees the array, if any, and returns NULL.
+//   deallocate == false : allocates the array on first use and returns it.
+//
+// An allocation failure is reported through CUDA_SAFE_CALL instead of being
+// silently turned into a NULL return value.
+int* atomic_lock_array_cuda_space_ptr(bool deallocate) {
+  static int* ptr = NULL;
+  if(deallocate) {
+    // Best-effort release; the result of cudaFree is intentionally not checked.
+    cudaFree(ptr);
+    ptr = NULL;
+  }
+
+  if(ptr==NULL && !deallocate)
+    CUDA_SAFE_CALL( cudaMalloc(&ptr,sizeof(int)*(CUDA_SPACE_ATOMIC_MASK+1)) );
+  return ptr;
+}
+
+// Accessor for the function-local device array of Cuda::concurrency() ints
+// that init_lock_arrays_cuda_space() publishes as the 'scratch' lock array.
+//
+//   deallocate == true  : frees the array, if any, and returns NULL.
+//   deallocate == false : allocates the array on first use and returns it.
+//
+// An allocation failure is reported through CUDA_SAFE_CALL instead of being
+// silently turned into a NULL return value.
+int* scratch_lock_array_cuda_space_ptr(bool deallocate) {
+  static int* ptr = NULL;
+  if(deallocate) {
+    // Best-effort release; the result of cudaFree is intentionally not checked.
+    cudaFree(ptr);
+    ptr = NULL;
+  }
+
+  if(ptr==NULL && !deallocate)
+    CUDA_SAFE_CALL( cudaMalloc(&ptr,sizeof(int)*(Cuda::concurrency())) );
+  return ptr;
+}
+
+// Accessor for the function-local device array of Cuda::concurrency() ints
+// that init_lock_arrays_cuda_space() publishes as the 'threadid' lock array.
+//
+//   deallocate == true  : frees the array, if any, and returns NULL.
+//   deallocate == false : allocates the array on first use and returns it.
+//
+// An allocation failure is reported through CUDA_SAFE_CALL instead of being
+// silently turned into a NULL return value.
+int* threadid_lock_array_cuda_space_ptr(bool deallocate) {
+  static int* ptr = NULL;
+  if(deallocate) {
+    // Best-effort release; the result of cudaFree is intentionally not checked.
+    cudaFree(ptr);
+    ptr = NULL;
+  }
+
+  if(ptr==NULL && !deallocate)
+    CUDA_SAFE_CALL( cudaMalloc(&ptr,sizeof(int)*(Cuda::concurrency())) );
+  return ptr;
+}
+
+// Obtains the three device lock arrays (allocating them if necessary),
+// copies their pointers into the device symbol kokkos_impl_cuda_lock_arrays
+// and launches the kernels that set every entry to 0.
+//
+// A failure of the symbol copy is reported through CUDA_SAFE_CALL; without
+// it the kernels would run against an unset device symbol.
+//
+// NOTE(review): is_initialized is never assigned a non-zero value, so the
+// body runs on every call.  Left unchanged because the pointer accessors can
+// free the arrays and a later call may need to set them up again; confirm
+// the intended behaviour.
+void init_lock_arrays_cuda_space() {
+  static int is_initialized = 0;
+  if(! is_initialized) {
+    Kokkos::Impl::CudaLockArraysStruct locks;
+    locks.atomic = atomic_lock_array_cuda_space_ptr(false);
+    locks.scratch = scratch_lock_array_cuda_space_ptr(false);
+    locks.threadid = threadid_lock_array_cuda_space_ptr(false);
+    CUDA_SAFE_CALL( cudaMemcpyToSymbol( kokkos_impl_cuda_lock_arrays , & locks , sizeof(CudaLockArraysStruct) ) );
+    init_lock_array_kernel_atomic<<<(CUDA_SPACE_ATOMIC_MASK+255)/256,256>>>();
+    init_lock_array_kernel_scratch_threadid<<<(Kokkos::Cuda::concurrency()+255)/256,256>>>(Kokkos::Cuda::concurrency());
+  }
+}
+
+// Returns a CudaSpace scratch buffer that holds at least 'bytes' bytes.
+// The buffer and its size are kept in function-local statics:
+//   - recorded size is zero              : allocate 'bytes';
+//   - 'bytes' exceeds the recorded size  : grow with kokkos_realloc;
+//   - 'bytes' is smaller and force_shrink: free and allocate 'bytes' anew;
+//   - otherwise                          : return the existing buffer.
+void* cuda_resize_scratch_space(size_t bytes, bool force_shrink) {
+  static void* ptr = NULL;
+  static size_t current_size = 0;
+
+  if(current_size == 0) {
+    current_size = bytes;
+    ptr = Kokkos::kokkos_malloc<Kokkos::CudaSpace>("CudaSpace::ScratchMemory",current_size);
+  }
+  else if(bytes > current_size) {
+    current_size = bytes;
+    ptr = Kokkos::kokkos_realloc<Kokkos::CudaSpace>(ptr,current_size);
+  }
+  else if(force_shrink && (bytes < current_size)) {
+    current_size = bytes;
+    Kokkos::kokkos_free<Kokkos::CudaSpace>(ptr);
+    ptr = Kokkos::kokkos_malloc<Kokkos::CudaSpace>("CudaSpace::ScratchMemory",current_size);
+  }
+
+  return ptr;
+}
+
+}
+}
+#endif // KOKKOS_HAVE_CUDA
+
diff --git a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Alloc.hpp b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Alloc.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..10999ee57bfd39e81e28b64a5f5a0df5ee877c42
--- /dev/null
+++ b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Alloc.hpp
@@ -0,0 +1,182 @@
+/*
+//@HEADER
+// ************************************************************************
+// 
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+// 
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+// 
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact  H. Carter Edwards (hcedwar@sandia.gov)
+// 
+// ************************************************************************
+//@HEADER
+*/
+
+#ifndef KOKKOS_CUDA_ALLOCATION_TRACKING_HPP
+#define KOKKOS_CUDA_ALLOCATION_TRACKING_HPP
+
+#include <Kokkos_Macros.hpp>
+
+/* only compile this file if CUDA is enabled for Kokkos */
+#ifdef KOKKOS_HAVE_CUDA
+
+#include <impl/Kokkos_Traits.hpp>
+
+namespace Kokkos {
+namespace Impl {
+
+/// Attaches a copy of arg_destruct to the allocation record found for
+/// arg_alloc_ptr and returns that record.
+///
+/// The functor copy is placement-constructed at the address
+/// record + sizeof(SharedAllocationRecord), and the record's
+/// m_destruct_functor member is set to
+/// shared_allocation_destroy< DestructFunctor >.
+/// arg_space is not used.
+///
+/// NOTE(review): the conditions written as "assert:" comments in the body
+/// are not checked by the code.
+template< class DestructFunctor >
+SharedAllocationRecord *
+shared_allocation_record( Kokkos::CudaSpace const & arg_space
+                        , void *            const   arg_alloc_ptr
+                        , DestructFunctor   const & arg_destruct )
+{
+  SharedAllocationRecord * const record = SharedAllocationRecord::get_record( arg_alloc_ptr );
+
+  // assert: record != 0
+
+  // assert: sizeof(DestructFunctor) <= record->m_destruct_size
+
+  // assert: record->m_destruct_function == 0
+
+  // Storage for the functor: the bytes directly following the record object.
+  DestructFunctor * const functor =
+    reinterpret_cast< DestructFunctor * >(
+    reinterpret_cast< uintptr_t >( record ) + sizeof(SharedAllocationRecord) );
+
+  // Copy-construct the functor in place.
+  new( functor ) DestructFunctor( arg_destruct );
+
+  record->m_destruct_functor = & shared_allocation_destroy< DestructFunctor > ;
+  
+  return record ;
+}
+
+
+/// class CudaUnmanagedAllocator
+/// does nothing when deallocate(ptr,size) is called
+struct CudaUnmanagedAllocator
+{
+  /// Name string identifying this allocator.
+  static const char * name()
+  {
+    return "Cuda Unmanaged Allocator";
+  }
+
+  /// No-op: both arguments are ignored.
+  static void deallocate(void * /*ptr*/, size_t /*size*/) {}
+
+  /// Always returns true.
+  static bool support_texture_binding() { return true; }
+};
+
+/// class CudaUnmanagedUVMAllocator
+/// does nothing when deallocate(ptr,size) is called
+struct CudaUnmanagedUVMAllocator
+{
+  /// Name string identifying this allocator.
+  static const char * name()
+  {
+    return "Cuda Unmanaged UVM Allocator";
+  }
+
+  /// No-op: both arguments are ignored.
+  static void deallocate(void * /*ptr*/, size_t /*size*/) {}
+
+  /// Always returns true.
+  static bool support_texture_binding() { return true; }
+};
+
+/// class CudaUnmanagedHostAllocator
+/// does nothing when deallocate(ptr,size) is called
+/// (unlike the other allocators in this header it declares no
+/// support_texture_binding())
+class CudaUnmanagedHostAllocator
+{
+public:
+  /// Name string identifying this allocator.
+  static const char * name()
+  {
+    return "Cuda Unmanaged Host Allocator";
+  }
+  // Unmanaged deallocate does nothing
+  static void deallocate(void * /*ptr*/, size_t /*size*/) {}
+};
+
+/// class CudaMallocAllocator
+/// Static allocator interface; allocate, deallocate and reallocate are only
+/// declared here and defined elsewhere.
+class CudaMallocAllocator
+{
+public:
+  /// Name string identifying this allocator.
+  static const char * name()
+  {
+    return "Cuda Malloc Allocator";
+  }
+
+  /// Allocates 'size' bytes and returns the pointer.
+  static void* allocate(size_t size);
+
+  /// Releases 'ptr'; the second (size) argument is unnamed.
+  static void deallocate(void * ptr, size_t);
+
+  /// Resizes the block 'old_ptr' from 'old_size' to 'new_size' bytes and
+  /// returns the resulting pointer.
+  static void * reallocate(void * old_ptr, size_t old_size, size_t new_size);
+
+  /// Always returns true.
+  static bool support_texture_binding() { return true; }
+};
+
+/// class CudaUVMAllocator
+/// Static allocator interface; allocate, deallocate and reallocate are only
+/// declared here and defined elsewhere.
+class CudaUVMAllocator
+{
+public:
+  /// Name string identifying this allocator.
+  static const char * name()
+  {
+    return "Cuda UVM Allocator";
+  }
+
+  /// Allocates 'size' bytes and returns the pointer.
+  static void* allocate(size_t size);
+
+  /// Releases 'ptr'; the second (size) argument is unnamed.
+  static void deallocate(void * ptr, size_t);
+
+  /// Resizes the block 'old_ptr' from 'old_size' to 'new_size' bytes and
+  /// returns the resulting pointer.
+  static void * reallocate(void * old_ptr, size_t old_size, size_t new_size);
+
+  /// Always returns true.
+  static bool support_texture_binding() { return true; }
+};
+
+/// class CudaHostAllocator
+/// Static allocator interface; allocate, deallocate and reallocate are only
+/// declared here and defined elsewhere.  No support_texture_binding() is
+/// declared for this allocator.
+class CudaHostAllocator
+{
+public:
+  /// Name string identifying this allocator.
+  static const char * name()
+  {
+    return "Cuda Host Allocator";
+  }
+
+  /// Allocates 'size' bytes and returns the pointer.
+  static void* allocate(size_t size);
+
+  /// Releases 'ptr'; the second (size) argument is unnamed.
+  static void deallocate(void * ptr, size_t);
+
+  /// Resizes the block 'old_ptr' from 'old_size' to 'new_size' bytes and
+  /// returns the resulting pointer.
+  static void * reallocate(void * old_ptr, size_t old_size, size_t new_size);
+};
+
+
+}} // namespace Kokkos::Impl
+
+#endif //KOKKOS_HAVE_CUDA
+
+#endif // #ifndef KOKKOS_CUDA_ALLOCATION_TRACKING_HPP
+
diff --git a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Error.hpp b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Error.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..a0b29ddc2b270212f9c8b9d18e6ee394b9a61b39
--- /dev/null
+++ b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Error.hpp
@@ -0,0 +1,69 @@
+/*
+//@HEADER
+// ************************************************************************
+// 
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+// 
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+// 
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact  H. Carter Edwards (hcedwar@sandia.gov)
+// 
+// ************************************************************************
+//@HEADER
+*/
+
+#ifndef KOKKOS_CUDA_ERROR_HPP
+#define KOKKOS_CUDA_ERROR_HPP
+
+#include <Kokkos_Macros.hpp>
+
+/* only compile this file if CUDA is enabled for Kokkos */
+#ifdef KOKKOS_HAVE_CUDA
+
+namespace Kokkos { namespace Impl {
+
+// Declared here, defined elsewhere.
+void cuda_device_synchronize();
+
+// Error reporter invoked by cuda_internal_safe_call for a failed call.
+//   e    : CUDA error code
+//   name : text of the failed call
+//   file : source file of the call site (optional)
+//   line : source line of the call site (optional)
+void cuda_internal_error_throw( cudaError e , const char * name, const char * file = NULL, const int line = 0 );
+
+// Forwards to cuda_internal_error_throw when 'e' is not cudaSuccess;
+// does nothing otherwise.
+inline void cuda_internal_safe_call( cudaError e , const char * name, const char * file = NULL, const int line = 0)
+{
+  if ( cudaSuccess != e ) { cuda_internal_error_throw( e , name, file, line ); }
+}
+
+// Evaluates 'call' and checks its result with cuda_internal_safe_call,
+// passing the stringified call text and the call site's __FILE__ / __LINE__.
+#define CUDA_SAFE_CALL( call )  \
+	Kokkos::Impl::cuda_internal_safe_call( call , #call, __FILE__, __LINE__ )
+
+}} // namespace Kokkos::Impl
+
+#endif //KOKKOS_HAVE_CUDA
+#endif //KOKKOS_CUDA_ERROR_HPP
diff --git a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Impl.cpp b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Impl.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..2d8d07d0772f2dd2d27a73a4b804f3000953c824
--- /dev/null
+++ b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Impl.cpp
@@ -0,0 +1,778 @@
+/*
+//@HEADER
+// ************************************************************************
+// 
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+// 
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+// 
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact  H. Carter Edwards (hcedwar@sandia.gov)
+// 
+// ************************************************************************
+//@HEADER
+*/
+
+/*--------------------------------------------------------------------------*/
+/* Kokkos interfaces */
+
+#include <Kokkos_Core.hpp>
+
+/* only compile this file if CUDA is enabled for Kokkos */
+#ifdef KOKKOS_HAVE_CUDA
+
+#include <Cuda/Kokkos_Cuda_Error.hpp>
+#include <Cuda/Kokkos_Cuda_Internal.hpp>
+#include <impl/Kokkos_Error.hpp>
+#include <impl/Kokkos_Profiling_Interface.hpp>
+
+/*--------------------------------------------------------------------------*/
+/* Standard 'C' libraries */
+#include <stdlib.h>
+
+/* Standard 'C++' libraries */
+#include <vector>
+#include <iostream>
+#include <sstream>
+#include <string>
+
+#ifdef KOKKOS_CUDA_USE_RELOCATABLE_DEVICE_CODE
+
+__device__ __constant__
+unsigned long kokkos_impl_cuda_constant_memory_buffer[ Kokkos::Impl::CudaTraits::ConstantMemoryUsage / sizeof(unsigned long) ] ;
+
+__device__ __constant__
+Kokkos::Impl::CudaLockArraysStruct kokkos_impl_cuda_lock_arrays ;
+
+#endif
+
+/*--------------------------------------------------------------------------*/
+
+namespace Kokkos {
+namespace Impl {
+
+namespace {
+
+__global__ // device kernel: writes the __CUDA_ARCH__ value it was compiled with into *d_arch
+void query_cuda_kernel_arch( int * d_arch )
+{
+#if defined( __CUDA_ARCH__ )
+  *d_arch = __CUDA_ARCH__ ;
+#else
+  *d_arch = 0 ; // __CUDA_ARCH__ is not defined in this compilation pass
+#endif
+}
+
+/** Query what compute capability is actually launched to the device (0 when the probe kernel did not run): */
+int cuda_kernel_arch()
+{
+  int arch = 0 , * d_arch = 0 ;
+  CUDA_SAFE_CALL( cudaMalloc( (void **) & d_arch , sizeof(int) ) ); // a failed allocation must not reach the kernel
+  CUDA_SAFE_CALL( cudaMemcpy( d_arch , & arch , sizeof(int) , cudaMemcpyDefault ) ); // pre-zero: never read back uninitialized device memory
+  query_cuda_kernel_arch<<<1,1>>>( d_arch );
+  cudaMemcpy( & arch , d_arch , sizeof(int) , cudaMemcpyDefault ); // status unchecked, as before: callers get a value, not an exception
+  cudaFree( d_arch );
+  return arch ;
+}
+
+// True when the CUDA_LAUNCH_BLOCKING environment variable is set and atoi() of it is non-zero.
+bool cuda_launch_blocking()
+{
+  const char * const setting = getenv("CUDA_LAUNCH_BLOCKING");
+  const bool is_set = ( setting != 0 );
+
+  return is_set ? ( atoi(setting) != 0 ) : false ;
+}
+
+}
+
+void cuda_device_synchronize()
+{
+  // Calls cudaDeviceSynchronize() and reports any error through
+  // CUDA_SAFE_CALL.  A variant that skipped the call when
+  // cuda_launch_blocking() returned true is disabled; the synchronize
+  // below is unconditional.
+    CUDA_SAFE_CALL( cudaDeviceSynchronize() );
+}
+
+// Format a CUDA failure as text and raise it through throw_runtime_exception.
+void cuda_internal_error_throw( cudaError e , const char * name, const char * file, const int line )
+{
+  std::ostringstream message ;
+  message << name << " error( " << cudaGetErrorName(e) << "): " << cudaGetErrorString(e);
+  // The source location is optional; it is appended only when a file name was supplied.
+  if ( file != 0 ) { message << " " << file << ":" << line; }
+  throw_runtime_exception( message.str() );
+}
+
+//----------------------------------------------------------------------------
+// Some significant cuda device properties:
+//
+// cudaDeviceProp::name                : Text label for device
+// cudaDeviceProp::major               : Device major number
+// cudaDeviceProp::minor               : Device minor number
+// cudaDeviceProp::warpSize            : number of threads per warp
+// cudaDeviceProp::multiProcessorCount : number of multiprocessors
+// cudaDeviceProp::sharedMemPerBlock   : capacity of shared memory per block
+// cudaDeviceProp::totalConstMem       : capacity of constant memory
+// cudaDeviceProp::totalGlobalMem      : capacity of global memory
+// cudaDeviceProp::maxGridSize[3]      : maximum grid size
+
+//
+//  Section 4.4.2.4 of the CUDA Toolkit Reference Manual
+//
+// struct cudaDeviceProp {
+//   char name[256];
+//   size_t totalGlobalMem;
+//   size_t sharedMemPerBlock;
+//   int regsPerBlock;
+//   int warpSize;
+//   size_t memPitch;
+//   int maxThreadsPerBlock;
+//   int maxThreadsDim[3];
+//   int maxGridSize[3];
+//   size_t totalConstMem;
+//   int major;
+//   int minor;
+//   int clockRate;
+//   size_t textureAlignment;
+//   int deviceOverlap;
+//   int multiProcessorCount;
+//   int kernelExecTimeoutEnabled;
+//   int integrated;
+//   int canMapHostMemory;
+//   int computeMode;
+//   int concurrentKernels;
+//   int ECCEnabled;
+//   int pciBusID;
+//   int pciDeviceID;
+//   int tccDriver;
+//   int asyncEngineCount;
+//   int unifiedAddressing;
+//   int memoryClockRate;
+//   int memoryBusWidth;
+//   int l2CacheSize;
+//   int maxThreadsPerMultiProcessor;
+// };
+
+
+namespace {
+
+
+
+class CudaInternalDevices { // Device count and per-device properties, filled in by the constructor.
+public:
+  enum { MAXIMUM_DEVICE_COUNT = 64 }; // capacity of m_cudaProp
+  struct cudaDeviceProp  m_cudaProp[ MAXIMUM_DEVICE_COUNT ] ; // entry i holds the properties of device i
+  int                    m_cudaDevCount ; // value reported by cudaGetDeviceCount
+
+  CudaInternalDevices();
+
+  static const CudaInternalDevices & singleton();
+};
+
+CudaInternalDevices::CudaInternalDevices()
+{
+  // See 'cudaSetDeviceFlags' for host-device thread interaction
+  // Section 4.4.2.6 of the CUDA Toolkit Reference Manual
+  // Query the device count first, then cache the properties of every device.
+  CUDA_SAFE_CALL (cudaGetDeviceCount( & m_cudaDevCount ) );
+  // m_cudaProp is a fixed-size array, so a larger count cannot be stored.
+  if(m_cudaDevCount > MAXIMUM_DEVICE_COUNT) {
+    Kokkos::abort("Sorry, you have more GPUs per node than we thought anybody would ever have. Please report this to github.com/kokkos/kokkos.");
+  }
+  for ( int i = 0 ; i < m_cudaDevCount ; ++i ) {
+    CUDA_SAFE_CALL( cudaGetDeviceProperties( m_cudaProp + i , i ) );
+  }
+}
+
+const CudaInternalDevices & CudaInternalDevices::singleton()
+{ // Function-local static: the constructor (and its device queries) runs on the first call.
+  static CudaInternalDevices self ; return self ;
+}
+
+}
+
+//----------------------------------------------------------------------------
+
+class CudaInternal { // State for the CUDA device chosen by initialize(); accessed through singleton().
+private:
+  // Copy construction and copy assignment are declared private.
+  CudaInternal( const CudaInternal & );
+  CudaInternal & operator = ( const CudaInternal & );
+
+
+public:
+
+  typedef Cuda::size_type size_type ;
+
+  int         m_cudaDev ;                 // selected device id; -1 while not initialized
+  int         m_cudaArch ;                // value returned by cuda_kernel_arch() in initialize(); -1 before that
+  unsigned    m_multiProcCount ;          // cudaDeviceProp::multiProcessorCount of the selected device
+  unsigned    m_maxWarpCount ;            // initialize() sets 8, capped at CudaTraits::WarpSize
+  unsigned    m_maxBlock ;                // 65535 when m_cudaArch < 300, otherwise cudaDeviceProp::maxGridSize[0]
+  unsigned    m_maxSharedWords ;          // cudaDeviceProp::sharedMemPerBlock / sizeof(size_type)
+  size_type   m_scratchSpaceCount ;       // size of m_scratchSpace, counted in ScratchGrain units
+  size_type   m_scratchFlagsCount ;       // size of m_scratchFlags, counted in ScratchGrain units
+  size_type   m_scratchUnifiedCount ;     // size of m_scratchUnified, counted in ScratchGrain units
+  size_type   m_scratchUnifiedSupported ; // copy of cudaDeviceProp::unifiedAddressing
+  size_type   m_streamCount ;             // number of entries in m_stream
+  size_type * m_scratchSpace ;            // CudaSpace allocation, see scratch_space()
+  size_type * m_scratchFlags ;            // CudaSpace allocation, zero-filled when allocated, see scratch_flags()
+  size_type * m_scratchUnified ;          // CudaHostPinnedSpace allocation, see scratch_unified()
+  cudaStream_t * m_stream ;               // malloc'ed array; initialize() sets every entry to 0
+
+  static int was_initialized;             // set to 1 by initialize()
+  static int was_finalized;               // set to 1 by finalize(); initialize() aborts once it is set
+
+  static CudaInternal & singleton();
+
+  int verify_is_initialized( const char * const label ) const ; // prints to std::cerr and returns 0 when m_cudaDev < 0
+
+  int is_initialized() const
+    { return 0 != m_scratchSpace && 0 != m_scratchFlags ; }
+
+  void initialize( int cuda_device_id , int stream_count );
+  void finalize();
+
+  void print_configuration( std::ostream & ) const ;
+
+  ~CudaInternal();
+
+  CudaInternal()
+    : m_cudaDev( -1 )
+    , m_cudaArch( -1 )
+    , m_multiProcCount( 0 )
+    , m_maxWarpCount( 0 )
+    , m_maxBlock( 0 )
+    , m_maxSharedWords( 0 )
+    , m_scratchSpaceCount( 0 )
+    , m_scratchFlagsCount( 0 )
+    , m_scratchUnifiedCount( 0 )
+    , m_scratchUnifiedSupported( 0 )
+    , m_streamCount( 0 )
+    , m_scratchSpace( 0 )
+    , m_scratchFlags( 0 )
+    , m_scratchUnified( 0 )
+    , m_stream( 0 )
+    {}
+
+  size_type * scratch_space( const size_type size );   // each of these grows its buffer when 'size' bytes do not fit
+  size_type * scratch_flags( const size_type size );
+  size_type * scratch_unified( const size_type size );
+};
+
+int CudaInternal::was_initialized = 0;
+int CudaInternal::was_finalized = 0;
+//----------------------------------------------------------------------------
+
+
+void CudaInternal::print_configuration( std::ostream & s ) const
+{
+  // Prints the build macros, then one line per detected device; the device in use is tagged " : Selected".
+  const CudaInternalDevices & devices = CudaInternalDevices::singleton();
+#if defined( KOKKOS_HAVE_CUDA )
+    s << "macro  KOKKOS_HAVE_CUDA      : defined" << std::endl ;
+#endif
+#if defined( CUDA_VERSION )
+    s << "macro  CUDA_VERSION          = " << CUDA_VERSION
+      << " = version " << CUDA_VERSION / 1000
+      << "." << ( CUDA_VERSION % 1000 ) / 10
+      << std::endl ;
+#endif
+
+  for ( int dev = 0 ; dev < devices.m_cudaDevCount ; ++dev ) {
+    const struct cudaDeviceProp & prop = devices.m_cudaProp[ dev ] ;
+    s << "Kokkos::Cuda[ " << dev << " ] " << prop.name
+      << " capability " << prop.major << "." << prop.minor
+      << ", Total Global Memory: " << human_memory_size(prop.totalGlobalMem)
+      << ", Shared Memory per Block: " << human_memory_size(prop.sharedMemPerBlock);
+    if ( dev == m_cudaDev ) { s << " : Selected" ; }
+    s << std::endl ;
+  }
+}
+
+//----------------------------------------------------------------------------
+
+CudaInternal::~CudaInternal()
+{
+  // Resources still held here mean finalize() was never called.  This only
+  // reports the problem on std::cerr and clears the fields; it releases nothing.
+  const bool never_finalized = m_stream || m_scratchSpace || m_scratchFlags || m_scratchUnified ;
+  if ( never_finalized ) {
+    std::cerr << "Kokkos::Cuda ERROR: Failed to call Kokkos::Cuda::finalize()"
+              << std::endl ;
+    std::cerr.flush();
+  }
+
+  m_stream                  = 0 ;
+  m_scratchUnified          = 0 ;
+  m_scratchFlags            = 0 ;
+  m_scratchSpace            = 0 ;
+  m_streamCount             = 0 ;
+  m_scratchUnifiedSupported = 0 ;
+  m_scratchUnifiedCount     = 0 ;
+  m_scratchFlagsCount       = 0 ;
+  m_scratchSpaceCount       = 0 ;
+  m_maxSharedWords          = 0 ;
+  m_maxBlock                = 0 ;
+  m_maxWarpCount            = 0 ;
+  m_multiProcCount          = 0 ;
+  m_cudaArch                = -1 ;
+  m_cudaDev                 = -1 ;
+}
+
+int CudaInternal::verify_is_initialized( const char * const label ) const
+{
+  // "Initialized" here means a device id has been recorded; otherwise complain on std::cerr.
+  const bool ready = ( m_cudaDev >= 0 );
+  if ( ! ready ) std::cerr << "Kokkos::Cuda::" << label << " : ERROR device not initialized" << std::endl ;
+  return ready ;
+}
+
+CudaInternal & CudaInternal::singleton()
+{ // Returns the single function-local static instance, constructed on first use.
+  static CudaInternal self ;
+  return self ;
+}
+
+void CudaInternal::initialize( int cuda_device_id , int stream_count )
+{ // Selects cuda_device_id, caches its limits and allocates the initial scratch buffers; throws on failure.
+  if ( was_finalized ) Kokkos::abort("Calling Cuda::initialize after Cuda::finalize is illegal\n");
+  was_initialized = 1;
+  if ( is_initialized() ) return; // a repeated call after a successful one is a no-op
+
+  enum { WordSize = sizeof(size_type) };
+
+  if ( ! HostSpace::execution_space::is_initialized() ) {
+    const std::string msg("Cuda::initialize ERROR : HostSpace::execution_space is not initialized");
+    throw_runtime_exception( msg );
+  }
+
+  const CudaInternalDevices & dev_info = CudaInternalDevices::singleton();
+
+  const bool ok_init = 0 == m_scratchSpace || 0 == m_scratchFlags ;
+
+  const bool ok_id   = 0 <= cuda_device_id &&
+                            cuda_device_id < dev_info.m_cudaDevCount ;
+
+  // Need device capability 2.0 or better
+
+  const bool ok_dev = ok_id &&
+    ( 2 <= dev_info.m_cudaProp[ cuda_device_id ].major &&
+      0 <= dev_info.m_cudaProp[ cuda_device_id ].minor );
+
+  if ( ok_init && ok_dev ) {
+
+    const struct cudaDeviceProp & cudaProp =
+      dev_info.m_cudaProp[ cuda_device_id ];
+
+    m_cudaDev = cuda_device_id ;
+
+    CUDA_SAFE_CALL( cudaSetDevice( m_cudaDev ) );
+    CUDA_SAFE_CALL( cudaDeviceReset() );
+    Kokkos::Impl::cuda_device_synchronize();
+
+    // Query what compute capability architecture a kernel executes:
+    m_cudaArch = cuda_kernel_arch();
+
+    if ( m_cudaArch != cudaProp.major * 100 + cudaProp.minor * 10 ) {
+      std::cerr << "Kokkos::Cuda::initialize WARNING: running kernels compiled for compute capability "
+                << ( m_cudaArch / 100 ) << "." << ( ( m_cudaArch % 100 ) / 10 )
+                << " on device with compute capability "
+                << cudaProp.major << "." << cudaProp.minor
+                << " , this will likely reduce potential performance."
+                << std::endl ;
+    }
+
+    // number of multiprocessors
+
+    m_multiProcCount = cudaProp.multiProcessorCount ;
+
+    //----------------------------------
+    // Maximum number of warps,
+    // at most one warp per thread in a warp for reduction.
+
+    // HCE 2012-February :
+    // Found bug in CUDA 4.1 that sometimes a kernel launch would fail
+    // if the thread count == 1024 and a functor is passed to the kernel.
+    // Copying the kernel to constant memory and then launching with
+    // thread count == 1024 would work fine.
+    //
+    // HCE 2012-October :
+    // All compute capabilities support at least 16 warps (512 threads).
+    // However, we have found that 8 warps typically gives better performance.
+
+    m_maxWarpCount = 8 ;
+
+    // m_maxWarpCount = cudaProp.maxThreadsPerBlock / Impl::CudaTraits::WarpSize ;
+
+    if ( Impl::CudaTraits::WarpSize < m_maxWarpCount ) {
+      m_maxWarpCount = Impl::CudaTraits::WarpSize ;
+    }
+
+    m_maxSharedWords = cudaProp.sharedMemPerBlock / WordSize ;
+
+    //----------------------------------
+    // Maximum number of blocks:
+
+    m_maxBlock = m_cudaArch < 300 ? 65535 : cudaProp.maxGridSize[0] ;
+
+    //----------------------------------
+
+    m_scratchUnifiedSupported = cudaProp.unifiedAddressing ;
+
+    if ( ! m_scratchUnifiedSupported ) {
+      std::cout << "Kokkos::Cuda device "
+                << cudaProp.name << " capability "
+                << cudaProp.major << "." << cudaProp.minor
+                << " does not support unified virtual address space"
+                << std::endl ;
+    }
+
+    //----------------------------------
+    // Multiblock reduction uses scratch flags for counters
+    // and scratch space for partial reduction values.
+    // Allocate some initial space.  This will grow as needed.
+
+    {
+      const unsigned reduce_block_count = m_maxWarpCount * Impl::CudaTraits::WarpSize ;
+
+      (void) scratch_unified( 16 * sizeof(size_type) );
+      (void) scratch_flags( reduce_block_count * 2  * sizeof(size_type) );
+      (void) scratch_space( reduce_block_count * 16 * sizeof(size_type) );
+    }
+    //----------------------------------
+
+    if ( 0 < stream_count ) { // a negative count would otherwise turn into a huge malloc size
+      m_stream = (cudaStream_t*) ::malloc( stream_count * sizeof(cudaStream_t) );
+      m_streamCount = stream_count ;
+      for ( size_type i = 0 ; i < m_streamCount ; ++i ) m_stream[i] = 0 ;
+    }
+  }
+  else {
+
+    std::ostringstream msg ;
+    msg << "Kokkos::Cuda::initialize(" << cuda_device_id << ") FAILED" ;
+
+    if ( ! ok_init ) {
+      msg << " : Already initialized" ;
+    }
+    if ( ! ok_id ) {
+      msg << " : Device identifier out of range "
+          << "[0.." << dev_info.m_cudaDevCount << ")" ; // half-open: the device count itself is not a valid id
+    }
+    else if ( ! ok_dev ) {
+      msg << " : Device " ;
+      msg << dev_info.m_cudaProp[ cuda_device_id ].major ;
+      msg << "." ;
+      msg << dev_info.m_cudaProp[ cuda_device_id ].minor ;
+      msg << " has insufficient capability, required 2.0 or better" ;
+    }
+    Kokkos::Impl::throw_runtime_exception( msg.str() );
+  }
+
+  #ifdef KOKKOS_CUDA_USE_UVM
+    if(!cuda_launch_blocking()) {
+      std::cout << "Kokkos::Cuda::initialize WARNING: Cuda is allocating into UVMSpace by default" << std::endl;
+      std::cout << "                                  without setting CUDA_LAUNCH_BLOCKING=1." << std::endl;
+      std::cout << "                                  The code must call Cuda::fence() after each kernel" << std::endl;
+      std::cout << "                                  or will likely crash when accessing data on the host." << std::endl;
+    }
+
+    const char * env_force_device_alloc = getenv("CUDA_MANAGED_FORCE_DEVICE_ALLOC");
+    bool force_device_alloc;
+    if (env_force_device_alloc == 0) force_device_alloc=false;
+    else force_device_alloc=atoi(env_force_device_alloc)!=0;
+
+    const char * env_visible_devices = getenv("CUDA_VISIBLE_DEVICES");
+    bool visible_devices_one=true;
+    if (env_visible_devices == 0) visible_devices_one=false;
+
+    if(!visible_devices_one && !force_device_alloc) {
+      std::cout << "Kokkos::Cuda::initialize WARNING: Cuda is allocating into UVMSpace by default" << std::endl;
+      std::cout << "                                  without setting CUDA_MANAGED_FORCE_DEVICE_ALLOC=1 or " << std::endl;
+      std::cout << "                                  setting CUDA_VISIBLE_DEVICES." << std::endl;
+      std::cout << "                                  This could on multi GPU systems lead to severe performance" << std::endl;
+      std::cout << "                                  penalties." << std::endl;
+    }
+  #endif
+
+  cudaDeviceSetCacheConfig(cudaFuncCachePreferShared); // replaces the deprecated cudaThreadSetCacheConfig
+
+  // Initialize the lock arrays used for arbitrarily sized atomics
+  Impl::init_lock_arrays_cuda_space();
+
+  #ifdef KOKKOS_CUDA_USE_RELOCATABLE_DEVICE_CODE
+  Kokkos::Impl::CudaLockArraysStruct locks;
+  locks.atomic = atomic_lock_array_cuda_space_ptr(false);
+  locks.scratch = scratch_lock_array_cuda_space_ptr(false);
+  locks.threadid = threadid_lock_array_cuda_space_ptr(false);
+  CUDA_SAFE_CALL( cudaMemcpyToSymbol( kokkos_impl_cuda_lock_arrays , & locks , sizeof(CudaLockArraysStruct) ) );
+  #endif
+}
+
+//----------------------------------------------------------------------------
+
+typedef Cuda::size_type ScratchGrain[ Impl::CudaTraits::WarpSize ] ;
+enum { sizeScratchGrain = sizeof(ScratchGrain) };
+
+
+Cuda::size_type *
+CudaInternal::scratch_flags( const Cuda::size_type size )
+{ // Returns the flags buffer, first growing it (zero-filled) when 'size' bytes do not fit.
+  if ( verify_is_initialized("scratch_flags") && m_scratchFlagsCount * sizeScratchGrain < size ) {
+    typedef Kokkos::Experimental::Impl::SharedAllocationRecord< Kokkos::CudaSpace , void > Record ;
+
+    // Growing: drop our reference to the previous, smaller buffer so it is not leaked.
+    if ( m_scratchFlags ) Record::decrement( Record::get_record( m_scratchFlags ) );
+
+    m_scratchFlagsCount = ( size + sizeScratchGrain - 1 ) / sizeScratchGrain ;
+    Record * const r = Record::allocate( Kokkos::CudaSpace()
+                                       , "InternalScratchFlags"
+                                       , ( sizeof( ScratchGrain ) * m_scratchFlagsCount ) );
+
+    Record::increment( r );
+
+    m_scratchFlags = reinterpret_cast<size_type *>( r->data() );
+
+    CUDA_SAFE_CALL( cudaMemset( m_scratchFlags , 0 , m_scratchFlagsCount * sizeScratchGrain ) );
+  }
+
+  return m_scratchFlags ;
+}
+
+Cuda::size_type *
+CudaInternal::scratch_space( const Cuda::size_type size )
+{ // Returns the scratch buffer, first growing it when 'size' bytes do not fit (contents are not preserved).
+  if ( verify_is_initialized("scratch_space") && m_scratchSpaceCount * sizeScratchGrain < size ) {
+    typedef Kokkos::Experimental::Impl::SharedAllocationRecord< Kokkos::CudaSpace , void > Record ;
+    // Growing: drop our reference to the previous, smaller buffer so it is not leaked.
+    if ( m_scratchSpace ) Record::decrement( Record::get_record( m_scratchSpace ) );
+
+    m_scratchSpaceCount = ( size + sizeScratchGrain - 1 ) / sizeScratchGrain ;
+    Record * const r = Record::allocate( Kokkos::CudaSpace()
+                                       , "InternalScratchSpace"
+                                       , ( sizeof( ScratchGrain ) * m_scratchSpaceCount ) );
+
+    Record::increment( r );
+
+    m_scratchSpace = reinterpret_cast<size_type *>( r->data() );
+  }
+
+  return m_scratchSpace ;
+}
+
+Cuda::size_type *
+CudaInternal::scratch_unified( const Cuda::size_type size )
+{ // Returns the host-pinned scratch buffer (null when unified addressing is unsupported), growing it if needed.
+  if ( verify_is_initialized("scratch_unified") &&
+       m_scratchUnifiedSupported && m_scratchUnifiedCount * sizeScratchGrain < size ) {
+    typedef Kokkos::Experimental::Impl::SharedAllocationRecord< Kokkos::CudaHostPinnedSpace , void > Record ;
+    // Growing: drop our reference to the previous, smaller buffer so it is not leaked.
+    if ( m_scratchUnified ) Record::decrement( Record::get_record( m_scratchUnified ) );
+
+    m_scratchUnifiedCount = ( size + sizeScratchGrain - 1 ) / sizeScratchGrain ;
+    Record * const r = Record::allocate( Kokkos::CudaHostPinnedSpace()
+                                       , "InternalScratchUnified"
+                                       , ( sizeof( ScratchGrain ) * m_scratchUnifiedCount ) );
+
+    Record::increment( r );
+
+    m_scratchUnified = reinterpret_cast<size_type *>( r->data() );
+  }
+
+  return m_scratchUnified ;
+}
+
+//----------------------------------------------------------------------------
+
+void CudaInternal::finalize()
+{
+  was_finalized = 1;
+  if ( 0 != m_scratchSpace || 0 != m_scratchFlags ) {
+    // Release the stream array and the scratch buffers, then reset the cached device state.
+    atomic_lock_array_cuda_space_ptr(false);
+    scratch_lock_array_cuda_space_ptr(false);
+    threadid_lock_array_cuda_space_ptr(false);
+
+    if ( m_stream ) {
+      for ( size_type i = 1 ; i < m_streamCount ; ++i ) {
+        if ( m_stream[i] ) cudaStreamDestroy( m_stream[i] ); // entries still 0 were never created
+        m_stream[i] = 0 ;
+      }
+      ::free( m_stream );
+    }
+
+    typedef Kokkos::Experimental::Impl::SharedAllocationRecord< CudaSpace > RecordCuda ;
+    typedef Kokkos::Experimental::Impl::SharedAllocationRecord< CudaHostPinnedSpace > RecordHost ;
+    // Any buffer may be absent: unified scratch is never allocated without unified addressing.
+    if ( m_scratchFlags   ) RecordCuda::decrement( RecordCuda::get_record( m_scratchFlags ) );
+    if ( m_scratchSpace   ) RecordCuda::decrement( RecordCuda::get_record( m_scratchSpace ) );
+    if ( m_scratchUnified ) RecordHost::decrement( RecordHost::get_record( m_scratchUnified ) );
+
+    m_cudaDev             = -1 ;
+    m_multiProcCount      = 0 ;
+    m_maxWarpCount        = 0 ;
+    m_maxBlock            = 0 ;
+    m_maxSharedWords      = 0 ;
+    m_scratchSpaceCount   = 0 ;
+    m_scratchFlagsCount   = 0 ;
+    m_scratchUnifiedCount = 0 ;
+    m_streamCount         = 0 ;
+    m_scratchSpace        = 0 ;
+    m_scratchFlags        = 0 ;
+    m_scratchUnified      = 0 ;
+    m_stream              = 0 ;
+  }
+}
+
+//----------------------------------------------------------------------------
+
+Cuda::size_type cuda_internal_multiprocessor_count() // the accessors below forward to the CudaInternal singleton
+{ return CudaInternal::singleton().m_multiProcCount ; }
+
+Cuda::size_type cuda_internal_maximum_warp_count()
+{ return CudaInternal::singleton().m_maxWarpCount ; }
+
+Cuda::size_type cuda_internal_maximum_grid_count() // returns m_maxBlock
+{ return CudaInternal::singleton().m_maxBlock ; }
+
+Cuda::size_type cuda_internal_maximum_shared_words()
+{ return CudaInternal::singleton().m_maxSharedWords ; }
+
+Cuda::size_type * cuda_internal_scratch_space( const Cuda::size_type size ) // 'size' is in bytes; may (re)allocate
+{ return CudaInternal::singleton().scratch_space( size ); }
+
+Cuda::size_type * cuda_internal_scratch_flags( const Cuda::size_type size ) // 'size' is in bytes; may (re)allocate
+{ return CudaInternal::singleton().scratch_flags( size ); }
+
+Cuda::size_type * cuda_internal_scratch_unified( const Cuda::size_type size ) // 'size' is in bytes; may (re)allocate
+{ return CudaInternal::singleton().scratch_unified( size ); }
+
+
+} // namespace Impl
+} // namespace Kokkos
+
+//----------------------------------------------------------------------------
+
+namespace Kokkos {
+
+Cuda::size_type Cuda::detect_device_count()
+{ return Impl::CudaInternalDevices::singleton().m_cudaDevCount ; }
+
+int Cuda::concurrency() {
+  return 131072;
+}
+
+int Cuda::is_initialized()
+{ return Impl::CudaInternal::singleton().is_initialized(); }
+
+void Cuda::initialize( const Cuda::SelectDevice config , size_t num_instances )
+{ // num_instances is forwarded as the int stream_count argument of CudaInternal::initialize.
+  Impl::CudaInternal::singleton().initialize( config.cuda_device_id , num_instances );
+
+  #if (KOKKOS_ENABLE_PROFILING)
+    Kokkos::Profiling::initialize();
+  #endif
+}
+
+std::vector<unsigned>
+Cuda::detect_device_arch()
+{
+  // One entry per detected device, encoded as major * 100 + minor.
+  const Impl::CudaInternalDevices & devices = Impl::CudaInternalDevices::singleton();
+  std::vector<unsigned> arch_codes( devices.m_cudaDevCount );
+
+  for ( int dev = 0 ; dev < devices.m_cudaDevCount ; ++dev ) {
+    const struct cudaDeviceProp & prop = devices.m_cudaProp[ dev ] ;
+    arch_codes[ dev ] = prop.major * 100 + prop.minor ;
+  }
+  return arch_codes ;
+}
+
+Cuda::size_type Cuda::device_arch()
+{
+  // Encodes the selected device as major * 100 + minor; 0 when no device is selected.
+  const int selected = Impl::CudaInternal::singleton().m_cudaDev ;
+
+  if ( selected < 0 ) {
+    return 0 ;
+  }
+
+  const struct cudaDeviceProp & prop =
+    Impl::CudaInternalDevices::singleton().m_cudaProp[ selected ] ;
+
+  const int arch_code = prop.major * 100 + prop.minor ;
+  return arch_code ;
+}
+
+void Cuda::finalize()
+{
+  Impl::CudaInternal::singleton().finalize();
+
+  #if (KOKKOS_ENABLE_PROFILING)
+    Kokkos::Profiling::finalize();
+  #endif
+}
+
+Cuda::Cuda()
+  : m_device( Impl::CudaInternal::singleton().m_cudaDev )
+  , m_stream( 0 )
+{
+  Impl::CudaInternal::singleton().verify_is_initialized( "Cuda instance constructor" );
+}
+
+Cuda::Cuda( const int instance_id ) // picks m_stream[ instance_id % m_streamCount ]; stream 0 when none were requested
+  : m_device( Impl::CudaInternal::singleton().m_cudaDev )
+  , m_stream(
+      ( Impl::CudaInternal::singleton().verify_is_initialized( "Cuda instance constructor" ) && 0 != Impl::CudaInternal::singleton().m_streamCount ) // avoid % 0 and a null m_stream
+        ? Impl::CudaInternal::singleton().m_stream[ instance_id % Impl::CudaInternal::singleton().m_streamCount ]
+        : 0 )
+{}
+
+void Cuda::print_configuration( std::ostream & s , const bool )
+{ Impl::CudaInternal::singleton().print_configuration( s ); }
+
+bool Cuda::sleep() { return false ; }
+
+bool Cuda::wake() { return true ; }
+
+void Cuda::fence()
+{
+  Kokkos::Impl::cuda_device_synchronize();
+}
+
+} // namespace Kokkos
+
+#endif // KOKKOS_HAVE_CUDA
+//----------------------------------------------------------------------------
+
diff --git a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Internal.hpp b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Internal.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..8b10d47f8857e9ca19b4ae962659f7e9137d78a1
--- /dev/null
+++ b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Internal.hpp
@@ -0,0 +1,202 @@
+/*
+//@HEADER
+// ************************************************************************
+// 
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+// 
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+// 
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact  H. Carter Edwards (hcedwar@sandia.gov)
+// 
+// ************************************************************************
+//@HEADER
+*/
+
+#ifndef KOKKOS_CUDA_INTERNAL_HPP
+#define KOKKOS_CUDA_INTERNAL_HPP
+#include<iostream>
+#include <Kokkos_Macros.hpp>
+
+/* only compile this file if CUDA is enabled for Kokkos */
+#ifdef KOKKOS_HAVE_CUDA
+
+#include <Cuda/Kokkos_Cuda_Error.hpp>
+
+namespace Kokkos { namespace Impl {
+
+template<class DriverType, bool Large>
+struct CudaGetMaxBlockSize;
+
+template<class DriverType, bool Large = (CudaTraits::ConstantMemoryUseThreshold < sizeof(DriverType))> // Large selects the CudaGetMaxBlockSize specialization
+int cuda_get_max_block_size(const typename DriverType::functor_type & f, const size_t vector_length,
+                            const size_t shmem_extra_block, const size_t shmem_extra_thread) {
+  return CudaGetMaxBlockSize<DriverType,Large>::get_block_size(f,vector_length, shmem_extra_block,shmem_extra_thread);
+}
+
+
+template<class DriverType>
+struct CudaGetMaxBlockSize<DriverType,true> { // Large == true: probes cuda_parallel_launch_constant_memory
+  static int get_block_size(const typename DriverType::functor_type & f, const size_t vector_length,
+                            const size_t shmem_extra_block, const size_t shmem_extra_thread) {
+    int numBlocks = 0; // stays 0 if the unchecked occupancy query fails, instead of being read uninitialized
+    int blockSize=32;
+    int sharedmem = shmem_extra_block + shmem_extra_thread*(blockSize/vector_length) +
+                    FunctorTeamShmemSize< typename DriverType::functor_type  >::value( f , blockSize/vector_length );
+    cudaOccupancyMaxActiveBlocksPerMultiprocessor(
+        &numBlocks,
+        cuda_parallel_launch_constant_memory<DriverType>,
+        blockSize,
+        sharedmem);
+    // Keep doubling while the previous size still fit at least one block per multiprocessor.
+    while (blockSize<1024 && numBlocks>0) {
+      blockSize*=2;
+      sharedmem = shmem_extra_block + shmem_extra_thread*(blockSize/vector_length) +
+                  FunctorTeamShmemSize< typename DriverType::functor_type  >::value( f , blockSize/vector_length );
+      numBlocks = 0; // do not let a failed query reuse the previous iteration's answer
+      cudaOccupancyMaxActiveBlocksPerMultiprocessor(
+          &numBlocks,
+          cuda_parallel_launch_constant_memory<DriverType>,
+          blockSize,
+          sharedmem);
+    }
+    if(numBlocks>0) return blockSize;
+    else return blockSize/2;
+  }
+};
+
+template<class DriverType>
+struct CudaGetMaxBlockSize<DriverType,false> { // Large == false: probes cuda_parallel_launch_local_memory
+  static int get_block_size(const typename DriverType::functor_type & f, const size_t vector_length,
+                            const size_t shmem_extra_block, const size_t shmem_extra_thread) {
+    int numBlocks = 0; // stays 0 if the unchecked occupancy query fails, instead of being read uninitialized
+
+    int blockSize=32;
+    int sharedmem = shmem_extra_block + shmem_extra_thread*(blockSize/vector_length) +
+                    FunctorTeamShmemSize< typename DriverType::functor_type  >::value( f , blockSize/vector_length );
+    cudaOccupancyMaxActiveBlocksPerMultiprocessor(
+        &numBlocks,
+        cuda_parallel_launch_local_memory<DriverType>,
+        blockSize,
+        sharedmem);
+    // Keep doubling while the previous size still fit at least one block per multiprocessor.
+    while (blockSize<1024 && numBlocks>0) {
+      blockSize*=2;
+      sharedmem = shmem_extra_block + shmem_extra_thread*(blockSize/vector_length) +
+                  FunctorTeamShmemSize< typename DriverType::functor_type  >::value( f , blockSize/vector_length );
+      numBlocks = 0; // do not let a failed query reuse the previous iteration's answer
+      cudaOccupancyMaxActiveBlocksPerMultiprocessor(
+          &numBlocks,
+          cuda_parallel_launch_local_memory<DriverType>,
+          blockSize,
+          sharedmem);
+    }
+    if(numBlocks>0) return blockSize;
+    else return blockSize/2;
+  }
+};
+
+
+
+template<class DriverType, bool Large>
+struct CudaGetOptBlockSize;
+
+template<class DriverType, bool Large = (CudaTraits::ConstantMemoryUseThreshold < sizeof(DriverType))> // Large selects the CudaGetOptBlockSize specialization
+int cuda_get_opt_block_size(const typename DriverType::functor_type & f, const size_t vector_length,
+                            const size_t shmem_extra_block, const size_t shmem_extra_thread) {
+  return CudaGetOptBlockSize<DriverType,Large>::get_block_size(f,vector_length,shmem_extra_block,shmem_extra_thread);
+}
+
+template<class DriverType>
+struct CudaGetOptBlockSize<DriverType,true> { // Large == true: probes cuda_parallel_launch_constant_memory
+  static int get_block_size(const typename DriverType::functor_type & f, const size_t vector_length,
+                            const size_t shmem_extra_block, const size_t shmem_extra_thread) {
+    int blockSize=16;
+    int numBlocks = 0;
+    int sharedmem;
+    int maxOccupancy=0;
+    int bestBlockSize=0;
+
+    while(blockSize<1024) {
+      blockSize*=2;
+      numBlocks = 0; // a failed (unchecked) query then counts as "no occupancy" rather than a stale value
+      //calculate the occupancy with that optBlockSize and check whether its larger than the largest one found so far
+      sharedmem = shmem_extra_block + shmem_extra_thread*(blockSize/vector_length) +
+                  FunctorTeamShmemSize< typename DriverType::functor_type  >::value( f , blockSize/vector_length );
+      cudaOccupancyMaxActiveBlocksPerMultiprocessor(
+              &numBlocks,
+              cuda_parallel_launch_constant_memory<DriverType>,
+              blockSize,
+              sharedmem);
+      if(maxOccupancy < numBlocks*blockSize) {
+         maxOccupancy = numBlocks*blockSize;
+         bestBlockSize = blockSize;
+      }
+    }
+    return bestBlockSize;
+  }
+};
+
+template<class DriverType>
+struct CudaGetOptBlockSize<DriverType,false> { // Large == false: probes cuda_parallel_launch_local_memory
+  static int get_block_size(const typename DriverType::functor_type & f, const size_t vector_length,
+                            const size_t shmem_extra_block, const size_t shmem_extra_thread) {
+    int blockSize=16;
+    int numBlocks = 0;
+    int sharedmem;
+    int maxOccupancy=0;
+    int bestBlockSize=0;
+    // Try 32, 64, ... 1024 threads and keep the size with the largest numBlocks*blockSize.
+    while(blockSize<1024) {
+      blockSize*=2;
+      sharedmem = shmem_extra_block + shmem_extra_thread*(blockSize/vector_length) +
+                  FunctorTeamShmemSize< typename DriverType::functor_type  >::value( f , blockSize/vector_length );
+      numBlocks = 0; // a failed (unchecked) query then counts as "no occupancy" rather than a stale value
+      cudaOccupancyMaxActiveBlocksPerMultiprocessor(
+              &numBlocks,
+              cuda_parallel_launch_local_memory<DriverType>,
+              blockSize,
+              sharedmem);
+
+      if(maxOccupancy < numBlocks*blockSize) {
+        maxOccupancy = numBlocks*blockSize;
+        bestBlockSize = blockSize;
+      }
+    }
+    return bestBlockSize;
+  }
+};
+
+}} // namespace Kokkos::Impl
+
+#endif // KOKKOS_HAVE_CUDA
+#endif /* #ifndef KOKKOS_CUDA_INTERNAL_HPP */
+
diff --git a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Parallel.hpp b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Parallel.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..7afa06fdf5582cd3543294b4156ac90a906a6ce7
--- /dev/null
+++ b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Parallel.hpp
@@ -0,0 +1,1926 @@
+/*
+//@HEADER
+// ************************************************************************
+// 
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+// 
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+// 
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact  H. Carter Edwards (hcedwar@sandia.gov)
+// 
+// ************************************************************************
+//@HEADER
+*/
+
+#ifndef KOKKOS_CUDA_PARALLEL_HPP
+#define KOKKOS_CUDA_PARALLEL_HPP
+
+#include <iostream>
+#include <algorithm>
+#include <stdio.h>
+
+#include <Kokkos_Macros.hpp>
+
+/* only compile this file if CUDA is enabled for Kokkos */
+#if defined( __CUDACC__ ) && defined( KOKKOS_HAVE_CUDA )
+
+#include <utility>
+#include <Kokkos_Parallel.hpp>
+
+#include <Cuda/Kokkos_CudaExec.hpp>
+#include <Cuda/Kokkos_Cuda_ReduceScan.hpp>
+#include <Cuda/Kokkos_Cuda_Internal.hpp>
+#include <Kokkos_Vectorization.hpp>
+
+#if (KOKKOS_ENABLE_PROFILING)
+#include <impl/Kokkos_Profiling_Interface.hpp>
+#include <typeinfo>
+#endif
+
+//----------------------------------------------------------------------------
+//----------------------------------------------------------------------------
+
+namespace Kokkos {
+namespace Impl {
+
+/** \brief  Join functor that accumulates with operator+=.
+ *
+ *  join() adds \c input into \c update ; both operands are accessed through
+ *  volatile references.
+ */
+template< typename Type >
+struct CudaJoinFunctor {
+
+  using value_type = Type ;
+
+  KOKKOS_INLINE_FUNCTION
+  static void join( volatile value_type & update , const volatile value_type & input )
+  {
+    update += input ;
+  }
+};
+
+/** \brief  Team member handle for the Cuda execution space.
+ *
+ *  Two variants of the public interface are compiled:
+ *   - with __CUDA_ARCH__ defined, team rank and size are read from
+ *     threadIdx.y and blockDim.y, barriers are __syncthreads(), and the
+ *     collectives use the buffer passed to the constructor as \c shared ;
+ *   - otherwise stub members are compiled that return fixed values
+ *     (league_rank 0, league_size 1, team_rank 0, team_size 1,
+ *     value-initialized results) and the constructor is only declared
+ *     in this class.
+ */
+class CudaTeamMember {
+private:
+
+  typedef Kokkos::Cuda                           execution_space ;
+  typedef execution_space::scratch_memory_space  scratch_memory_space ;
+
+  // Base of the buffer used by team_reduce() and team_scan().
+  void                * m_team_reduce ;
+  // Scratch handle built from (shared + shared_begin) and the level-1 scratch pointer.
+  scratch_memory_space  m_team_shared ;
+  int                   m_league_rank ;
+  int                   m_league_size ;
+
+public:
+
+#if defined( __CUDA_ARCH__ )
+
+  // Scratch accessors: team-wide views pass (1,0) as team size / rank,
+  // per-thread views pass the calling thread's team_size() / team_rank().
+  __device__ inline
+  const execution_space::scratch_memory_space & team_shmem() const
+    { return m_team_shared.set_team_thread_mode(0,1,0) ; }
+  __device__ inline
+  const execution_space::scratch_memory_space & team_scratch(const int& level) const
+    { return m_team_shared.set_team_thread_mode(level,1,0) ; }
+  __device__ inline
+  const execution_space::scratch_memory_space & thread_scratch(const int& level) const
+    { return m_team_shared.set_team_thread_mode(level,team_size(),team_rank()) ; }
+
+  __device__ inline int league_rank() const { return m_league_rank ; }
+  __device__ inline int league_size() const { return m_league_size ; }
+  __device__ inline int team_rank() const { return threadIdx.y ; }
+  __device__ inline int team_size() const { return blockDim.y ; }
+
+  __device__ inline void team_barrier() const { __syncthreads(); }
+
+  /** \brief  Copy \c value from one thread to every thread of the block.
+   *
+   *  The thread with threadIdx.x == 0 and threadIdx.y == thread_id stores its
+   *  value in a __shared__ staging variable; after a barrier every thread
+   *  reads it back.  The trailing barrier presumably protects the staging
+   *  variable from being overwritten by a following call before all threads
+   *  have read it.
+   */
+  template<class ValueType>
+  __device__ inline void team_broadcast(ValueType& value, const int& thread_id) const {
+    __shared__ ValueType sh_val;
+    if(threadIdx.x == 0 && threadIdx.y == thread_id) {
+      sh_val = value;
+    }
+    team_barrier();
+    value = sh_val;
+    team_barrier();
+  }
+
+  /** \brief  Intra-team reduction of one value per team thread.
+   *
+   *  Each thread stores its value at index threadIdx.y of the team reduce
+   *  buffer, cuda_intra_block_reduce_scan is run over that buffer with the
+   *  join operation, and every thread returns the entry at index
+   *  blockDim.y - 1.
+   *
+   *  With KOKKOS_HAVE_CXX11 the join operation is wrapped in
+   *  JoinLambdaAdapter; otherwise JoinOp is used directly and the value type
+   *  is taken from JoinOp::value_type.
+   */
+#ifdef KOKKOS_HAVE_CXX11
+  template< class ValueType, class JoinOp >
+  __device__ inline
+  typename JoinOp::value_type team_reduce( const ValueType & value
+                                         , const JoinOp & op_in ) const
+    {
+      typedef JoinLambdaAdapter<ValueType,JoinOp> JoinOpFunctor ;
+      const JoinOpFunctor op(op_in);
+      ValueType * const base_data = (ValueType *) m_team_reduce ;
+#else
+  template< class JoinOp >
+  __device__ inline
+  typename JoinOp::value_type team_reduce( const typename JoinOp::value_type & value
+                                         , const JoinOp & op ) const
+    {
+      typedef JoinOp JoinOpFunctor ;
+      typename JoinOp::value_type * const base_data = (typename JoinOp::value_type *) m_team_reduce ;
+#endif
+
+      __syncthreads(); // Don't write in to shared data until all threads have entered this function
+
+      // NOTE(review): for threadIdx.y == 0 this store is overwritten by the
+      // same thread's store of `value` on the next statement.
+      if ( 0 == threadIdx.y ) { base_data[0] = 0 ; }
+
+      base_data[ threadIdx.y ] = value ;
+
+      Impl::cuda_intra_block_reduce_scan<false,JoinOpFunctor,void>( op , base_data );
+
+      return base_data[ blockDim.y - 1 ];
+    }
+
+  /** \brief  Intra-team exclusive prefix sum with team_rank() ordering
+   *          with intra-team non-deterministic ordering accumulation.
+   *
+   *  The global inter-team accumulation value will, at the end of the
+   *  league's parallel execution, be the scan's total.
+   *  Parallel execution ordering of the league's teams is non-deterministic.
+   *  As such the base value for each team's scan operation is similarly
+   *  non-deterministic.
+   */
+  template< typename Type >
+  __device__ inline Type team_scan( const Type & value , Type * const global_accum ) const
+    {
+      Type * const base_data = (Type *) m_team_reduce ;
+
+      __syncthreads(); // Don't write in to shared data until all threads have entered this function
+
+      // Slot 0 is the zero base; each thread's contribution is stored one slot
+      // higher, so after the scan over base_data + 1 the entry at threadIdx.y
+      // is the exclusive result for that thread.
+      if ( 0 == threadIdx.y ) { base_data[0] = 0 ; }
+
+      base_data[ threadIdx.y + 1 ] = value ;
+
+      Impl::cuda_intra_block_reduce_scan<true,Impl::CudaJoinFunctor<Type>,void>( Impl::CudaJoinFunctor<Type>() , base_data + 1 );
+
+      if ( global_accum ) {
+        // The last team thread adds the team total (slot blockDim.y) to the
+        // global accumulator and replaces it with the value fetched by the
+        // atomic, which is then added to every thread's entry as an offset.
+        if ( blockDim.y == threadIdx.y + 1 ) {
+          base_data[ blockDim.y ] = atomic_fetch_add( global_accum , base_data[ blockDim.y ] );
+        }
+        __syncthreads(); // Wait for atomic
+        base_data[ threadIdx.y ] += base_data[ blockDim.y ] ;
+      }
+
+      return base_data[ threadIdx.y ];
+    }
+
+  /** \brief  Intra-team exclusive prefix sum with team_rank() ordering.
+   *
+   *  The highest rank thread can compute the reduction total as
+   *    reduction_total = dev.team_scan( value ) + value ;
+   */
+  template< typename Type >
+  __device__ inline Type team_scan( const Type & value ) const
+    { return this->template team_scan<Type>( value , 0 ); }
+
+  //----------------------------------------
+  // Private for the driver
+
+  // `shared` is used as the team reduce buffer; the team scratch handle starts
+  // shared_begin bytes (char offset) past it and is given shared_size together
+  // with the level-1 scratch pointer and size.
+  __device__ inline
+  CudaTeamMember( void * shared
+                , const int shared_begin
+                , const int shared_size
+                , void*     scratch_level_1_ptr
+                , const int scratch_level_1_size
+                , const int arg_league_rank
+                , const int arg_league_size )
+    : m_team_reduce( shared )
+    , m_team_shared( ((char *)shared) + shared_begin , shared_size,  scratch_level_1_ptr, scratch_level_1_size)
+    , m_league_rank( arg_league_rank ) 
+    , m_league_size( arg_league_size ) 
+    {}
+
+#else
+
+  // Variant compiled when __CUDA_ARCH__ is not defined: same member names,
+  // fixed return values, no synchronization.
+
+  const execution_space::scratch_memory_space & team_shmem() const
+    { return m_team_shared.set_team_thread_mode(0, 1,0) ; }
+  const execution_space::scratch_memory_space & team_scratch(const int& level) const
+    { return m_team_shared.set_team_thread_mode(level,1,0) ; }
+  const execution_space::scratch_memory_space & thread_scratch(const int& level) const
+    { return m_team_shared.set_team_thread_mode(level,team_size(),team_rank()) ; }
+
+  int league_rank() const {return 0;}
+  int league_size() const {return 1;}
+  int team_rank() const {return 0;}
+  int team_size() const {return 1;}
+
+  void team_barrier() const {}
+  template<class ValueType>
+  void team_broadcast(ValueType& value, const int& thread_id) const {}
+
+  // NOTE(review): this stub always has the single-template-parameter form,
+  // whereas the device variant takes (ValueType, JoinOp) when
+  // KOKKOS_HAVE_CXX11 is defined.
+  template< class JoinOp >
+  typename JoinOp::value_type team_reduce( const typename JoinOp::value_type & value
+                                         , const JoinOp & op ) const {return typename JoinOp::value_type();}
+
+  template< typename Type >
+  Type team_scan( const Type & value , Type * const global_accum ) const {return Type();}
+
+  template< typename Type >
+  Type team_scan( const Type & value ) const {return Type();}
+
+  //----------------------------------------
+  // Private for the driver
+
+  // Declaration only; the third parameter is named shared_end here but
+  // shared_size in the device variant.
+  CudaTeamMember( void * shared
+                , const int shared_begin
+                , const int shared_end
+                , void*     scratch_level_1_ptr
+                , const int scratch_level_1_size
+                , const int arg_league_rank
+                , const int arg_league_size );
+
+#endif /* #if ! defined( __CUDA_ARCH__ ) */
+
+};
+
+} // namespace Impl
+
+namespace Impl {
+/** \brief  Team execution policy specialization for the Cuda execution space.
+ *
+ *  Stores the league size, the team size (-1 when Kokkos::AUTO was given),
+ *  the vector length, the per-team and per-thread scratch sizes for scratch
+ *  levels 0 and 1, and a chunk size (32 unless changed with set_chunk_size()).
+ *
+ *  Every constructor that takes a league size reports an error through
+ *  Impl::throw_runtime_exception when the vector length fails
+ *  is_integral_power_of_two or when the league size is not below
+ *  Impl::cuda_internal_maximum_grid_count().  Constructors that take an
+ *  explicit team size additionally report an error when
+ *  team size x vector length exceeds 1024.
+ */
+template< class ... Properties >
+class TeamPolicyInternal< Kokkos::Cuda , Properties ... >: public PolicyTraits<Properties ... >
+{
+public:
+
+  //! Tag this class as a kokkos execution policy
+  typedef TeamPolicyInternal      execution_policy ;
+
+  typedef PolicyTraits<Properties ... > traits;
+
+private:
+
+  enum { MAX_WARP = 8 };
+
+  int m_league_size ;
+  int m_team_size ;
+  int m_vector_length ;
+  int m_team_scratch_size[2] ;
+  int m_thread_scratch_size[2] ;
+  int m_chunk_size;
+
+  /** \brief  Checks shared by all constructors taking a league size:
+   *          vector length first, then league size.
+   */
+  static void verify_vector_length_and_league_size( const int league_size_
+                                                  , const int vector_length_request )
+    {
+      // Allow only power-of-two vector_length
+      if ( ! Kokkos::Impl::is_integral_power_of_two( vector_length_request ) ) {
+        Impl::throw_runtime_exception( "Requested non-power-of-two vector length for TeamPolicy.");
+      }
+
+      // Make sure league size is permissible
+      if(league_size_ >= int(Impl::cuda_internal_maximum_grid_count()))
+        Impl::throw_runtime_exception( "Requested too large league_size for TeamPolicy on Cuda execution space.");
+    }
+
+  /** \brief  Check used when the team size is given explicitly:
+   *          the total block size may be at most 1024 threads.
+   */
+  static void verify_block_size( const int team_size_ , const int vector_length_ )
+    {
+      // Make sure total block size is permissible (exactly 1024 is accepted)
+      if ( team_size_ * vector_length_ > 1024 ) {
+        Impl::throw_runtime_exception(std::string("Kokkos::TeamPolicy< Cuda > the team size is too large. Team size x vector length must not exceed 1024."));
+      }
+    }
+
+public:
+
+  //! Execution space of this execution policy
+  typedef Kokkos::Cuda  execution_space ;
+
+  //! Member-wise copy of every stored setting.
+  TeamPolicyInternal& operator = (const TeamPolicyInternal& p) {
+    m_league_size = p.m_league_size;
+    m_team_size = p.m_team_size;
+    m_vector_length = p.m_vector_length;
+    m_team_scratch_size[0] = p.m_team_scratch_size[0];
+    m_team_scratch_size[1] = p.m_team_scratch_size[1];
+    m_thread_scratch_size[0] = p.m_thread_scratch_size[0];
+    m_thread_scratch_size[1] = p.m_thread_scratch_size[1];
+    m_chunk_size = p.m_chunk_size;
+    return *this;
+  }
+
+  //----------------------------------------
+
+  /** \brief  Largest candidate team size whose shared memory estimate fits.
+   *
+   *  Starts at MAX_WARP * WarpSize and halves until the estimate (global
+   *  reduce + team reduce + functor team shared memory) is strictly below
+   *  CudaTraits::SharedMemoryCapacity.  Returns 0 when no candidate fits.
+   */
+  template< class FunctorType >
+  inline static
+  int team_size_max( const FunctorType & functor )
+    {
+      int n = MAX_WARP * Impl::CudaTraits::WarpSize ;
+
+      for ( ; n ; n >>= 1 ) {
+        const int shmem_size =
+          /* for global reduce */ Impl::cuda_single_inter_block_reduce_scan_shmem<false,FunctorType,typename traits::work_tag>( functor , n )
+          /* for team   reduce */ + ( n + 2 ) * sizeof(double)
+          /* for team   shared */ + Impl::FunctorTeamShmemSize< FunctorType >::value( functor , n );
+
+        if ( shmem_size < Impl::CudaTraits::SharedMemoryCapacity ) break ;
+      }
+
+      return n ;
+    }
+
+  //! Recommended team size: the same value as team_size_max().
+  template< class FunctorType >
+  static int team_size_recommended( const FunctorType & functor )
+    { return team_size_max( functor ); }
+
+  //! Recommended team size for a given vector length: team_size_max() / vector_length, but at least 1.
+  template< class FunctorType >
+  static int team_size_recommended( const FunctorType & functor , const int vector_length)
+    {
+      int max = team_size_max( functor )/vector_length;
+      if(max<1) max = 1;
+      return max;
+    }
+
+  inline static
+  int vector_length_max()
+    { return Impl::CudaTraits::WarpSize; }
+
+  //----------------------------------------
+
+  inline int vector_length()   const { return m_vector_length ; }
+  inline int team_size()   const { return m_team_size ; }
+  inline int league_size() const { return m_league_size ; }
+
+  /** \brief  Total scratch size of a level: per-team size plus team_size_ times the per-thread size.
+   *          A negative team_size_ selects the stored team size.
+   */
+  inline int scratch_size(int level, int team_size_ = -1) const {
+    if(team_size_<0) team_size_ = m_team_size;
+    return m_team_scratch_size[level] + team_size_*m_thread_scratch_size[level];
+  }
+  inline size_t team_scratch_size(int level) const {
+    return m_team_scratch_size[level];
+  }
+  inline size_t thread_scratch_size(int level) const {
+    return m_thread_scratch_size[level];
+  }
+
+  //! Default policy: zero league size, team size and vector length; no validation is performed.
+  TeamPolicyInternal()
+    : m_league_size( 0 )
+    , m_team_size( 0 )
+    , m_vector_length( 0 )
+    , m_team_scratch_size {0,0}
+    , m_thread_scratch_size {0,0}
+    , m_chunk_size ( 32 ) 
+   {}
+
+  /** \brief  Specify league size, request team size */
+  TeamPolicyInternal( execution_space &
+            , int league_size_
+            , int team_size_request
+            , int vector_length_request = 1 )
+    : m_league_size( league_size_ )
+    , m_team_size( team_size_request )
+    , m_vector_length( vector_length_request )
+    , m_team_scratch_size {0,0}
+    , m_thread_scratch_size {0,0}
+    , m_chunk_size ( 32 )
+    {
+      verify_vector_length_and_league_size( league_size_ , vector_length_request );
+      verify_block_size( m_team_size , m_vector_length );
+    }
+
+  /** \brief  Specify league size, let the team size be chosen later (stored as -1) */
+  TeamPolicyInternal( execution_space &
+            , int league_size_
+            , const Kokkos::AUTO_t & /* team_size_request */
+            , int vector_length_request = 1 )
+    : m_league_size( league_size_ )
+    , m_team_size( -1 )
+    , m_vector_length( vector_length_request )
+    , m_team_scratch_size {0,0}
+    , m_thread_scratch_size {0,0}
+    , m_chunk_size ( 32 )
+    {
+      verify_vector_length_and_league_size( league_size_ , vector_length_request );
+    }
+
+  /** \brief  Specify league size, request team size */
+  TeamPolicyInternal( int league_size_
+            , int team_size_request
+            , int vector_length_request = 1 )
+    : m_league_size( league_size_ )
+    , m_team_size( team_size_request )
+    , m_vector_length ( vector_length_request )
+    , m_team_scratch_size {0,0}
+    , m_thread_scratch_size {0,0}
+    , m_chunk_size ( 32 )
+    {
+      verify_vector_length_and_league_size( league_size_ , vector_length_request );
+      verify_block_size( m_team_size , m_vector_length );
+    }
+
+  /** \brief  Specify league size, let the team size be chosen later (stored as -1) */
+  TeamPolicyInternal( int league_size_
+            , const Kokkos::AUTO_t & /* team_size_request */
+            , int vector_length_request = 1 )
+    : m_league_size( league_size_ )
+    , m_team_size( -1 )
+    , m_vector_length ( vector_length_request )
+    , m_team_scratch_size {0,0}
+    , m_thread_scratch_size {0,0}
+    , m_chunk_size ( 32 )
+    {
+      verify_vector_length_and_league_size( league_size_ , vector_length_request );
+    }
+
+  inline int chunk_size() const { return m_chunk_size ; }
+
+  /** \brief set chunk_size to a discrete value; returns a modified copy */
+  inline TeamPolicyInternal set_chunk_size(typename traits::index_type chunk_size_) const {
+    TeamPolicyInternal p = *this;
+    p.m_chunk_size = chunk_size_;
+    return p;
+  }
+
+  /** \brief set per team scratch size for a specific level of the scratch hierarchy; returns a modified copy */
+  inline TeamPolicyInternal set_scratch_size(const int& level, const PerTeamValue& per_team) const {
+    TeamPolicyInternal p = *this;
+    p.m_team_scratch_size[level] = per_team.value;
+    return p;
+  }
+
+  /** \brief set per thread scratch size for a specific level of the scratch hierarchy; returns a modified copy */
+  inline TeamPolicyInternal set_scratch_size(const int& level, const PerThreadValue& per_thread) const {
+    TeamPolicyInternal p = *this;
+    p.m_thread_scratch_size[level] = per_thread.value;
+    return p;
+  }
+
+  /** \brief set per thread and per team scratch size for a specific level of the scratch hierarchy; returns a modified copy */
+  inline TeamPolicyInternal set_scratch_size(const int& level, const PerTeamValue& per_team, const PerThreadValue& per_thread) const {
+    TeamPolicyInternal p = *this;
+    p.m_team_scratch_size[level] = per_team.value;
+    p.m_thread_scratch_size[level] = per_thread.value;
+    return p;
+  }
+
+  typedef Kokkos::Impl::CudaTeamMember member_type ;
+};
+} // namespace Impl
+} // namespace Kokkos
+
+//----------------------------------------------------------------------------
+//----------------------------------------------------------------------------
+
+namespace Kokkos {
+namespace Impl {
+
+/** \brief  ParallelFor over a Kokkos::RangePolicy on the Cuda execution space.
+ *
+ *  The kernel (operator()) walks the range with a stride of
+ *  blockDim.y * gridDim.x, starting at
+ *  begin + threadIdx.y + blockDim.y * blockIdx.x, and calls the functor once
+ *  per index, passing a default-constructed work tag first when the policy's
+ *  work tag is not void.
+ *
+ *  execute() launches blocks of shape
+ *  ( 1 , WarpSize * cuda_internal_maximum_warp_count() , 1 ) on a
+ *  one-dimensional grid of ceil( nwork / block.y ) blocks, capped at
+ *  cuda_internal_maximum_grid_count(), with no dynamic shared memory.
+ */
+template< class FunctorType , class ... Traits >
+class ParallelFor< FunctorType
+                 , Kokkos::RangePolicy< Traits ... >
+                 , Kokkos::Cuda
+                 >
+{
+private:
+
+  typedef Kokkos::RangePolicy< Traits ... > Policy;
+  typedef typename Policy::member_type  Member ;
+  typedef typename Policy::work_tag     WorkTag ;
+
+  const FunctorType  m_functor ;
+  const Policy       m_policy ;  
+
+  ParallelFor() = delete ;
+  ParallelFor & operator = ( const ParallelFor & ) = delete ;
+
+  // Untagged functor call, selected when the work tag is void.
+  template< class TagType >
+  inline __device__
+  typename std::enable_if< std::is_same< TagType , void >::value >::type
+  exec_range( const Member i ) const
+    { m_functor( i ); }
+
+  // Tagged functor call, selected when the work tag is not void.
+  template< class TagType >
+  inline __device__
+  typename std::enable_if< ! std::is_same< TagType , void >::value >::type
+  exec_range( const Member i ) const
+    { m_functor( TagType() , i ); }
+
+public:
+
+  typedef FunctorType functor_type ;
+
+  //! Kernel body: each thread processes every (blockDim.y * gridDim.x)-th index of the range.
+  inline
+  __device__
+  void operator()(void) const
+    {
+      const Member work_stride = blockDim.y * gridDim.x ;
+      const Member work_end    = m_policy.end();
+
+      for ( Member
+              iwork =  m_policy.begin() + threadIdx.y + blockDim.y * blockIdx.x ;
+              iwork <  work_end ;
+              iwork += work_stride ) {
+        this-> template exec_range< WorkTag >( iwork );
+      }
+    }
+
+  //! Compute the launch configuration from the range length and launch the kernel.
+  inline
+  void execute() const
+    {
+      // Keep the work count in the policy's own index type: storing it in an
+      // int truncates ranges whose index type is wider than 32 bits (a length
+      // of exactly 2^32 would become 0 and nothing would be launched).
+      // An empty or inverted range counts as zero work.
+      const Member nwork = ( m_policy.begin() < m_policy.end() )
+                         ? Member( m_policy.end() - m_policy.begin() )
+                         : Member( 0 );
+
+      const dim3 block(  1 , CudaTraits::WarpSize * cuda_internal_maximum_warp_count(), 1);
+
+      // ceil( nwork / block.y ) written without the "+ block.y - 1" term so the
+      // intermediate value cannot wrap for lengths near the index type's maximum.
+      const Member block_y  = Member( block.y );
+      const Member nblock   = nwork / block_y + ( ( nwork % block_y ) ? 1 : 0 );
+      const Member max_grid = Member( cuda_internal_maximum_grid_count() );
+
+      // The result is bounded by max_grid, so narrowing to the dim3 element type is lossless.
+      const dim3 grid( (unsigned int) std::min( nblock , max_grid ) , 1 , 1);
+
+      CudaParallelLaunch< ParallelFor >( *this , grid , block , 0 );
+    }
+
+  ParallelFor( const FunctorType  & arg_functor ,
+               const Policy       & arg_policy )
+    : m_functor( arg_functor )
+    , m_policy(  arg_policy )
+    { }
+};
+
+/** \brief  ParallelFor over a Kokkos::TeamPolicy on the Cuda execution space.
+ *
+ *  execute() launches a grid of league_size blocks, each of shape
+ *  ( vector_size , team_size , 1 ), requesting m_shmem_begin + m_shmem_size
+ *  of dynamic shared memory.  In the kernel each block visits league ranks
+ *  blockIdx.x, blockIdx.x + gridDim.x, ... and calls the functor with a team
+ *  member handle built for that rank.
+ *
+ *  The constructor resolves the team size (using cuda_get_opt_block_size when
+ *  the policy's team size is negative), sizes the shared memory regions,
+ *  obtains the level-1 scratch buffer, and reports an error through
+ *  throw_runtime_exception when the shared memory request exceeds
+ *  CudaTraits::SharedMemoryCapacity or the team size exceeds the value derived
+ *  from cuda_get_max_block_size.
+ */
+template< class FunctorType , class ... Properties >
+class ParallelFor< FunctorType
+                 , Kokkos::TeamPolicy< Properties ... >
+                 , Kokkos::Cuda
+                 >
+{
+private:
+
+  typedef TeamPolicyInternal< Kokkos::Cuda , Properties ... >   Policy ;
+  typedef typename Policy::member_type  Member ;
+  typedef typename Policy::work_tag     WorkTag ;
+
+public:
+
+  typedef FunctorType      functor_type ;
+  typedef Cuda::size_type  size_type ;
+
+private:
+
+  // Algorithmic constraints: blockDim.y is a power of two AND blockDim.y == blockDim.z == 1
+  // NOTE(review): the constraint above does not match the launch shape used in
+  // execute(), which is ( vector_size , team_size , 1 ) - confirm the intended wording.
+  // shared memory utilization:
+  //
+  //  [ team   reduce space ]
+  //  [ team   shared space ]
+  //
+
+  // Members are initialized in declaration order; m_shmem_begin and
+  // m_shmem_size read m_team_size, so the order below matters.
+  const FunctorType m_functor ;
+  const size_type   m_league_size ;
+  const size_type   m_team_size ;
+  const size_type   m_vector_size ;
+  const size_type   m_shmem_begin ;   // sizeof(double) * ( team_size + 2 ): offset of the team shared space
+  const size_type   m_shmem_size ;    // level-0 scratch plus the functor's team shared memory
+  void*             m_scratch_ptr[2] ; // only index 1 is assigned (in the constructor); index 0 stays NULL
+  const int         m_scratch_size[2] ;
+
+  // Untagged functor call, selected when the work tag is void.
+  template< class TagType >
+  __device__ inline
+  typename std::enable_if< std::is_same< TagType , void >::value >::type
+  exec_team( const Member & member ) const
+    { m_functor( member ); }
+
+  // Tagged functor call, selected when the work tag is not void.
+  template< class TagType >
+  __device__ inline
+  typename std::enable_if< ! std::is_same< TagType , void >::value >::type
+  exec_team( const Member & member ) const
+    { m_functor( TagType() , member ); }
+
+public:
+
+  //! Kernel body: one functor call per league rank assigned to this block.
+  __device__ inline
+  void operator()(void) const
+  {
+    // Iterate this block through the league
+    for ( int league_rank = blockIdx.x ; league_rank < m_league_size ; league_rank += gridDim.x ) {
+
+      this-> template exec_team< WorkTag >(
+        typename Policy::member_type( kokkos_impl_cuda_shared_memory<void>()
+                                    , m_shmem_begin
+                                    , m_shmem_size
+                                    , m_scratch_ptr[1]
+                                    , m_scratch_size[1]
+                                    , league_rank
+                                    , m_league_size ) );
+    }
+  }
+
+  //! Launch with one block per league member and the shared memory computed in the constructor.
+  inline
+  void execute() const
+    {
+      const int shmem_size_total = m_shmem_begin + m_shmem_size ;
+      const dim3 grid( int(m_league_size) , 1 , 1 );
+      const dim3 block( int(m_vector_size) , int(m_team_size) , 1 );
+
+      CudaParallelLaunch< ParallelFor >( *this, grid, block, shmem_size_total ); // copy to device and execute
+
+    }
+
+  ParallelFor( const FunctorType  & arg_functor 
+             , const Policy       & arg_policy 
+             )
+    : m_functor( arg_functor )
+    , m_league_size( arg_policy.league_size() )
+    // A non-negative policy team size is used as given; otherwise the block
+    // size from cuda_get_opt_block_size is divided by the vector length.
+    , m_team_size( 0 <= arg_policy.team_size() ? arg_policy.team_size() :
+        Kokkos::Impl::cuda_get_opt_block_size< ParallelFor >( arg_functor , arg_policy.vector_length(), arg_policy.team_scratch_size(0),arg_policy.thread_scratch_size(0) ) / arg_policy.vector_length() )
+    , m_vector_size( arg_policy.vector_length() )
+    , m_shmem_begin( sizeof(double) * ( m_team_size + 2 ) )
+    , m_shmem_size( arg_policy.scratch_size(0,m_team_size) + FunctorTeamShmemSize< FunctorType >::value( m_functor , m_team_size ) )
+    , m_scratch_ptr{NULL,NULL}
+    , m_scratch_size{arg_policy.scratch_size(0,m_team_size),arg_policy.scratch_size(1,m_team_size)}
+    {
+      // Functor's reduce memory, team scan memory, and team shared memory depend upon team size.
+      // The level-1 scratch buffer is sized as the per-team level-1 size times
+      // Cuda::concurrency() / ( team_size * vector_size ).
+      // NOTE(review): a team size of 0 passes the "0 <=" test above, and a zero
+      // team_size * vector_size makes this an integer division by zero.
+      m_scratch_ptr[1] = cuda_resize_scratch_space(m_scratch_size[1]*(Cuda::concurrency()/(m_team_size*m_vector_size)));
+
+      const int shmem_size_total = m_shmem_begin + m_shmem_size ;
+      if ( CudaTraits::SharedMemoryCapacity < shmem_size_total ) {
+        Kokkos::Impl::throw_runtime_exception(std::string("Kokkos::Impl::ParallelFor< Cuda > insufficient shared memory"));
+      }
+
+      // Reject team sizes above the maximum block size divided by the vector length.
+      if ( int(m_team_size) >
+           int(Kokkos::Impl::cuda_get_max_block_size< ParallelFor >
+                 ( arg_functor , arg_policy.vector_length(), arg_policy.team_scratch_size(0),arg_policy.thread_scratch_size(0) ) / arg_policy.vector_length())) {
+        Kokkos::Impl::throw_runtime_exception(std::string("Kokkos::Impl::ParallelFor< Cuda > requested too large team size."));
+      }
+    }
+};
+
+} // namespace Impl
+} // namespace Kokkos
+
+//----------------------------------------------------------------------------
+//----------------------------------------------------------------------------
+
+namespace Kokkos {
+namespace Impl {
+
+template< class FunctorType , class ReducerType, class ... Traits >
+class ParallelReduce< FunctorType
+                    , Kokkos::RangePolicy< Traits ... >
+                    , ReducerType
+                    , Kokkos::Cuda 
+                    >
+{
+private:
+
+  typedef Kokkos::RangePolicy< Traits ... >         Policy ;
+
+  typedef typename Policy::WorkRange    WorkRange ;
+  typedef typename Policy::work_tag     WorkTag ;
+  typedef typename Policy::member_type  Member ;
+
+  typedef Kokkos::Impl::if_c< std::is_same<InvalidType,ReducerType>::value, FunctorType, ReducerType> ReducerConditional;
+  typedef typename ReducerConditional::type ReducerTypeFwd;
+
+  typedef Kokkos::Impl::FunctorValueTraits< ReducerTypeFwd, WorkTag > ValueTraits ;
+  typedef Kokkos::Impl::FunctorValueInit<   ReducerTypeFwd, WorkTag > ValueInit ;
+  typedef Kokkos::Impl::FunctorValueJoin<   ReducerTypeFwd, WorkTag > ValueJoin ;
+
+public:
+
+  typedef typename ValueTraits::pointer_type    pointer_type ;
+  typedef typename ValueTraits::value_type      value_type ;
+  typedef typename ValueTraits::reference_type  reference_type ;
+  typedef FunctorType                           functor_type ;
+  typedef Cuda::size_type                       size_type ;
+
+  // Algorithmic constraints: blockSize is a power of two AND blockDim.y == blockDim.z == 1
+
+  const FunctorType   m_functor ;
+  const Policy        m_policy ;
+  const ReducerType   m_reducer ;
+  const pointer_type  m_result_ptr ;
+  size_type *         m_scratch_space ;
+  size_type *         m_scratch_flags ;
+  size_type *         m_unified_space ;
+
+  // Shall we use the shfl based reduction or not (only use it for static sized types of more than 128bit
+  enum { UseShflReduction = ((sizeof(value_type)>2*sizeof(double)) && ValueTraits::StaticValueSize) };
+  // Some crutch to do function overloading
+private:
+  typedef double DummyShflReductionType;
+  typedef int DummySHMEMReductionType;
+
+public:
+  template< class TagType >
+  __device__ inline
+  typename std::enable_if< std::is_same< TagType , void >::value >::type
+  exec_range( const Member & i , reference_type update ) const
+    { m_functor( i , update ); }
+
+  template< class TagType >
+  __device__ inline
+  typename std::enable_if< ! std::is_same< TagType , void >::value >::type
+  exec_range( const Member & i , reference_type update ) const
+    { m_functor( TagType() , i , update ); }
+
+  __device__ inline
+  void operator() () const {
+    run(Kokkos::Impl::if_c<UseShflReduction, DummyShflReductionType, DummySHMEMReductionType>::select(1,1.0) );
+  }
+
+  __device__ inline
+  void run(const DummySHMEMReductionType& ) const
+  {
+    const integral_nonzero_constant< size_type , ValueTraits::StaticValueSize / sizeof(size_type) >
+      word_count( ValueTraits::value_size( ReducerConditional::select(m_functor , m_reducer) ) / sizeof(size_type) );
+
+    {
+      reference_type value =
+        ValueInit::init( ReducerConditional::select(m_functor , m_reducer) , kokkos_impl_cuda_shared_memory<size_type>() + threadIdx.y * word_count.value );
+
+      // Number of blocks is bounded so that the reduction can be limited to two passes.
+      // Each thread block is given an approximately equal amount of work to perform.
+      // Accumulate the values for this block.
+      // The accumulation ordering does not match the final pass, but is arithmetically equivalent.
+
+      const WorkRange range( m_policy , blockIdx.x , gridDim.x );
+
+      for ( Member iwork = range.begin() + threadIdx.y , iwork_end = range.end() ;
+            iwork < iwork_end ; iwork += blockDim.y ) {
+        this-> template exec_range< WorkTag >( iwork , value );
+      }
+    }
+
+    // Reduce with final value at blockDim.y - 1 location.
+    if ( cuda_single_inter_block_reduce_scan<false,ReducerTypeFwd,WorkTag>(
+           ReducerConditional::select(m_functor , m_reducer) , blockIdx.x , gridDim.x ,
+           kokkos_impl_cuda_shared_memory<size_type>() , m_scratch_space , m_scratch_flags ) ) {
+
+      // This is the final block with the final result at the final threads' location
+
+      size_type * const shared = kokkos_impl_cuda_shared_memory<size_type>() + ( blockDim.y - 1 ) * word_count.value ;
+      size_type * const global = m_unified_space ? m_unified_space : m_scratch_space ;
+
+      if ( threadIdx.y == 0 ) {
+        Kokkos::Impl::FunctorFinal< ReducerTypeFwd , WorkTag >::final( ReducerConditional::select(m_functor , m_reducer) , shared );
+      }
+
+      if ( CudaTraits::WarpSize < word_count.value ) { __syncthreads(); }
+
+      for ( unsigned i = threadIdx.y ; i < word_count.value ; i += blockDim.y ) { global[i] = shared[i]; }
+    }
+  }
+
+  __device__ inline
+   void run(const DummyShflReductionType&) const
+   {
+
+     value_type value;
+     ValueInit::init( ReducerConditional::select(m_functor , m_reducer) , &value);
+     // Number of blocks is bounded so that the reduction can be limited to two passes.
+     // Each thread block is given an approximately equal amount of work to perform.
+     // Accumulate the values for this block.
+     // The accumulation ordering does not match the final pass, but is arithmetically equivalent.
+
+     const WorkRange range( m_policy , blockIdx.x , gridDim.x );
+
+     for ( Member iwork = range.begin() + threadIdx.y , iwork_end = range.end() ;
+           iwork < iwork_end ; iwork += blockDim.y ) {
+       this-> template exec_range< WorkTag >( iwork , value );
+     }
+
+     pointer_type const result = (pointer_type) (m_unified_space ? m_unified_space : m_scratch_space) ;
+
+     int max_active_thread = range.end()-range.begin() < blockDim.y ? range.end() - range.begin():blockDim.y;
+
+     max_active_thread = (max_active_thread == 0)?blockDim.y:max_active_thread;
+
+    value_type init;
+    ValueInit::init( ReducerConditional::select(m_functor , m_reducer) , &init);
+     if(Impl::cuda_inter_block_reduction<ReducerTypeFwd,ValueJoin,WorkTag>
+            (value,init,ValueJoin(ReducerConditional::select(m_functor , m_reducer)),m_scratch_space,result,m_scratch_flags,max_active_thread)) {
+       const unsigned id = threadIdx.y*blockDim.x + threadIdx.x;
+       if(id==0) {
+         Kokkos::Impl::FunctorFinal< ReducerTypeFwd , WorkTag >::final( ReducerConditional::select(m_functor , m_reducer) , (void*) &value );
+         *result = value;
+       }
+     }
+   }
+
+  // Determine block size constrained by shared memory:
+  static inline
+  unsigned local_block_size( const FunctorType & f )
+    {
+      unsigned n = CudaTraits::WarpSize * 8 ;
+      while ( n && CudaTraits::SharedMemoryCapacity < cuda_single_inter_block_reduce_scan_shmem<false,FunctorType,WorkTag>( f , n ) ) { n >>= 1 ; }
+      return n ;
+    }
+
+  // Host-side driver of the range reduction.
+  // For an empty range the result (if requested) is set to the initial value
+  // and nothing is launched.  Otherwise the scratch buffers are sized, the
+  // kernel is launched and fenced, and the reduced value is copied to
+  // m_result_ptr (if non-null).
+  inline
+  void execute()
+    {
+      const int nwork = m_policy.end() - m_policy.begin();
+
+      if ( ! nwork ) {
+        // Nothing to reduce: report the functor/reducer's initial value.
+        if ( m_result_ptr ) {
+          ValueInit::init( ReducerConditional::select(m_functor , m_reducer) , m_result_ptr );
+        }
+        return ;
+      }
+
+      const int block_size = local_block_size( m_functor );
+
+      // One reduction value per block; block_size == max block_count
+      m_scratch_space = cuda_internal_scratch_space( ValueTraits::value_size( ReducerConditional::select(m_functor , m_reducer) ) * block_size );
+      m_scratch_flags = cuda_internal_scratch_flags( sizeof(size_type) );
+      m_unified_space = cuda_internal_scratch_unified( ValueTraits::value_size( ReducerConditional::select(m_functor , m_reducer) ) );
+
+      // REQUIRED ( 1 , N , 1 )
+      const dim3 block( 1 , block_size , 1 );
+      // Required grid.x <= block.y
+      const dim3 grid( std::min( int(block.y) , int( ( nwork + block.y - 1 ) / block.y ) ) , 1 , 1 );
+
+      // The shuffle-based path requests no dynamic shared memory.
+      const int shmem = UseShflReduction?0:cuda_single_inter_block_reduce_scan_shmem<false,FunctorType,WorkTag>( m_functor , block.y );
+
+      CudaParallelLaunch< ParallelReduce >( *this, grid, block, shmem ); // copy to device and execute
+
+      Cuda::fence();
+
+      if ( ! m_result_ptr ) { return ; }
+
+      if ( m_unified_space ) {
+        // The kernel wrote into the unified buffer: read it element by element from the host.
+        const int count = ValueTraits::value_count( ReducerConditional::select(m_functor , m_reducer)  );
+        const pointer_type unified = pointer_type(m_unified_space);
+        for ( int k = 0 ; k < count ; ++k ) { m_result_ptr[k] = unified[k] ; }
+      }
+      else {
+        // No unified buffer: copy the result out of the scratch space.
+        const int size = ValueTraits::value_size( ReducerConditional::select(m_functor , m_reducer)  );
+        DeepCopy<HostSpace,CudaSpace>( m_result_ptr , m_scratch_space , size );
+      }
+    }
+
+  // Construct from a functor, a policy and a view that receives the result.
+  // Participates in overload resolution only when Kokkos::is_view<HostViewType>
+  // holds.  The reducer member is filled with an InvalidType placeholder and
+  // the three scratch members start out as zero; execute() assigns them.
+  template< class HostViewType >
+  ParallelReduce( const FunctorType  & arg_functor
+                , const Policy       & arg_policy
+                , const HostViewType & arg_result
+                , typename std::enable_if< Kokkos::is_view< HostViewType >::value , void* >::type = nullptr )
+    : m_functor(       arg_functor )
+    , m_policy(        arg_policy )
+    , m_reducer(       InvalidType() )
+    , m_result_ptr(    arg_result.ptr_on_device() )
+    , m_scratch_space( 0 )
+    , m_scratch_flags( 0 )
+    , m_unified_space( 0 )
+    {}
+
+  // Construct from a functor, a policy and a reducer object.
+  // The result pointer is taken from the reducer's result_view(); the three
+  // scratch members start out as zero and are assigned by execute().
+  ParallelReduce( const FunctorType & arg_functor
+                , const Policy      & arg_policy
+                , const ReducerType & reducer )
+    : m_functor(       arg_functor )
+    , m_policy(        arg_policy )
+    , m_reducer(       reducer )
+    , m_result_ptr(    reducer.result_view().ptr_on_device() )
+    , m_scratch_space( 0 )
+    , m_scratch_flags( 0 )
+    , m_unified_space( 0 )
+    {}
+};
+
+//----------------------------------------------------------------------------
+
+// ParallelReduce specialization for a Kokkos::TeamPolicy on Kokkos::Cuda.
+//
+// The constructors capture functor / reducer / policy and compute the
+// shared-memory layout; execute() sizes the scratch buffers, launches the
+// kernel and copies the result back.  On the device, operator() dispatches to
+// one of two run() overloads depending on UseShflReduction.
+template< class FunctorType , class ReducerType, class ... Properties >
+class ParallelReduce< FunctorType
+                    , Kokkos::TeamPolicy< Properties ... >
+                    , ReducerType
+                    , Kokkos::Cuda
+                    >
+{
+private:
+
+  typedef TeamPolicyInternal< Kokkos::Cuda, Properties ... >  Policy ;
+  typedef typename Policy::member_type  Member ;
+  typedef typename Policy::work_tag     WorkTag ;
+
+  // When no reducer is supplied (ReducerType == InvalidType) the functor itself
+  // provides the value traits, init and join.
+  typedef Kokkos::Impl::if_c< std::is_same<InvalidType,ReducerType>::value, FunctorType, ReducerType> ReducerConditional;
+  typedef typename ReducerConditional::type ReducerTypeFwd;
+
+  typedef Kokkos::Impl::FunctorValueTraits< ReducerTypeFwd, WorkTag > ValueTraits ;
+  typedef Kokkos::Impl::FunctorValueInit<   ReducerTypeFwd, WorkTag > ValueInit ;
+  typedef Kokkos::Impl::FunctorValueJoin<   ReducerTypeFwd, WorkTag > ValueJoin ;
+
+  typedef typename ValueTraits::pointer_type    pointer_type ;
+  typedef typename ValueTraits::reference_type  reference_type ;
+  typedef typename ValueTraits::value_type      value_type ;
+
+
+public:
+
+  typedef FunctorType      functor_type ;
+  typedef Cuda::size_type  size_type ;
+
+  // The shuffle-based path is selected whenever the value size is known statically.
+  enum { UseShflReduction = (true && ValueTraits::StaticValueSize) };
+
+private:
+  // Tag types used only to pick a run() overload.
+  typedef double DummyShflReductionType;
+  typedef int DummySHMEMReductionType;
+
+
+  // Algorithmic constraints: blockDim.y is a power of two AND blockDim.y == blockDim.z == 1
+  // shared memory utilization:
+  //
+  //  [ global reduce space ]
+  //  [ team   reduce space ]
+  //  [ team   shared space ]
+  //
+
+  // NOTE: members are initialized in this declaration order, so an initializer
+  // must not read a member that is declared further down.
+  const FunctorType   m_functor ;
+  const ReducerType   m_reducer ;
+  const pointer_type  m_result_ptr ;
+  size_type *         m_scratch_space ;
+  size_type *         m_scratch_flags ;
+  size_type *         m_unified_space ;
+  size_type           m_team_begin ;
+  size_type           m_shmem_begin ;
+  size_type           m_shmem_size ;
+  void*               m_scratch_ptr[2] ;
+  int                 m_scratch_size[2] ;
+  const size_type     m_league_size ;
+  const size_type     m_team_size ;
+  const size_type     m_vector_size ;
+
+  // Invoke the functor without a work tag.
+  template< class TagType >
+  __device__ inline
+  typename std::enable_if< std::is_same< TagType , void >::value >::type
+  exec_team( const Member & member , reference_type update ) const
+    { m_functor( member , update ); }
+
+  // Invoke the functor with a default-constructed work tag as first argument.
+  template< class TagType >
+  __device__ inline
+  typename std::enable_if< ! std::is_same< TagType , void >::value >::type
+  exec_team( const Member & member , reference_type update ) const
+    { m_functor( TagType() , member , update ); }
+
+  // Common tail of both constructors: computes the shared-memory layout and the
+  // scratch sizes, requests the level-1 scratch allocation and validates the
+  // configuration.  Must run after all const members have been initialized.
+  // Throws (via throw_runtime_exception) on an unsupported configuration.
+  inline
+  void configure( const FunctorType & arg_functor , const Policy & arg_policy )
+  {
+    // Return Init value if the number of worksets is zero
+    if( arg_policy.league_size() == 0) {
+      if ( m_result_ptr ) {
+        ValueInit::init( ReducerConditional::select(m_functor , m_reducer) , m_result_ptr );
+      }
+      return ;
+    }
+
+    m_team_begin = UseShflReduction?0:cuda_single_inter_block_reduce_scan_shmem<false,FunctorType,WorkTag>( arg_functor , m_team_size );
+    m_shmem_begin = sizeof(double) * ( m_team_size + 2 );
+    m_shmem_size = arg_policy.scratch_size(0,m_team_size) + FunctorTeamShmemSize< FunctorType >::value( arg_functor , m_team_size );
+
+    // The scratch sizes have to be final BEFORE they are used to size the
+    // level-1 scratch allocation below.
+    m_scratch_size[0] = m_shmem_size;
+    m_scratch_size[1] = arg_policy.scratch_size(1,m_team_size);
+    m_scratch_ptr[1] = cuda_resize_scratch_space(m_scratch_size[1]*(Cuda::concurrency()/(m_team_size*m_vector_size)));
+
+    // The global parallel_reduce does not support vector_length other than 1 at the moment
+    if( (arg_policy.vector_length() > 1) && !UseShflReduction )
+      Impl::throw_runtime_exception( "Kokkos::parallel_reduce with a TeamPolicy using a vector length of greater than 1 is not currently supported for CUDA for dynamic sized reduction types.");
+
+    if( (m_team_size < 32) && !UseShflReduction )
+      Impl::throw_runtime_exception( "Kokkos::parallel_reduce with a TeamPolicy using a team_size smaller than 32 is not currently supported with CUDA for dynamic sized reduction types.");
+
+    // Functor's reduce memory, team scan memory, and team shared memory depend upon team size.
+
+    const int shmem_size_total = m_team_begin + m_shmem_begin + m_shmem_size ;
+
+    if (! Kokkos::Impl::is_integral_power_of_two( m_team_size )  && !UseShflReduction ) {
+      Kokkos::Impl::throw_runtime_exception(std::string("Kokkos::Impl::ParallelReduce< Cuda > bad team size"));
+    }
+
+    if ( CudaTraits::SharedMemoryCapacity < shmem_size_total ) {
+      Kokkos::Impl::throw_runtime_exception(std::string("Kokkos::Impl::ParallelReduce< Cuda > requested too much L0 scratch memory"));
+    }
+
+    if ( int(m_team_size) >
+         int(Kokkos::Impl::cuda_get_max_block_size< ParallelReduce >
+               ( arg_functor , arg_policy.vector_length(), arg_policy.team_scratch_size(0),arg_policy.thread_scratch_size(0) ) / arg_policy.vector_length())) {
+      Kokkos::Impl::throw_runtime_exception(std::string("Kokkos::Impl::ParallelReduce< Cuda > requested too large team size."));
+    }
+  }
+
+public:
+
+  // Kernel entry point: the tag argument selects the shuffle-based or the
+  // shared-memory-based run() overload at compile time.
+  __device__ inline
+  void operator() () const {
+    run(Kokkos::Impl::if_c<UseShflReduction, DummyShflReductionType, DummySHMEMReductionType>::select(1,1.0) );
+  }
+
+  // Shared-memory variant: each thread accumulates into its own slot of shared
+  // memory (indexed by threadIdx.y), followed by an inter-block reduction.
+  __device__ inline
+  void run(const DummySHMEMReductionType&) const
+  {
+    const integral_nonzero_constant< size_type , ValueTraits::StaticValueSize / sizeof(size_type) >
+      word_count( ValueTraits::value_size( ReducerConditional::select(m_functor , m_reducer) ) / sizeof(size_type) );
+
+    reference_type value =
+      ValueInit::init( ReducerConditional::select(m_functor , m_reducer) , kokkos_impl_cuda_shared_memory<size_type>() + threadIdx.y * word_count.value );
+
+    // Iterate this block through the league
+    for ( int league_rank = blockIdx.x ; league_rank < m_league_size ; league_rank += gridDim.x ) {
+      this-> template exec_team< WorkTag >
+        ( Member( kokkos_impl_cuda_shared_memory<char>() + m_team_begin
+                                        , m_shmem_begin
+                                        , m_shmem_size
+                                        , m_scratch_ptr[1]
+                                        , m_scratch_size[1]
+                                        , league_rank
+                                        , m_league_size )
+        , value );
+    }
+
+    // Reduce with final value at blockDim.y - 1 location.
+    if ( cuda_single_inter_block_reduce_scan<false,FunctorType,WorkTag>(
+           ReducerConditional::select(m_functor , m_reducer) , blockIdx.x , gridDim.x ,
+           kokkos_impl_cuda_shared_memory<size_type>() , m_scratch_space , m_scratch_flags ) ) {
+
+      // This is the final block with the final result at the final threads' location
+
+      size_type * const shared = kokkos_impl_cuda_shared_memory<size_type>() + ( blockDim.y - 1 ) * word_count.value ;
+      size_type * const global = m_unified_space ? m_unified_space : m_scratch_space ;
+
+      if ( threadIdx.y == 0 ) {
+        Kokkos::Impl::FunctorFinal< ReducerTypeFwd , WorkTag >::final( ReducerConditional::select(m_functor , m_reducer) , shared );
+      }
+
+      if ( CudaTraits::WarpSize < word_count.value ) { __syncthreads(); }
+
+      // Threads cooperatively copy the result words to the output buffer.
+      for ( unsigned i = threadIdx.y ; i < word_count.value ; i += blockDim.y ) { global[i] = shared[i]; }
+    }
+  }
+
+  // Shuffle variant: each thread accumulates into a local value_type, followed
+  // by cuda_inter_block_reduction.
+  __device__ inline
+  void run(const DummyShflReductionType&) const
+  {
+    value_type value;
+    ValueInit::init( ReducerConditional::select(m_functor , m_reducer) , &value);
+
+    // Iterate this block through the league
+    for ( int league_rank = blockIdx.x ; league_rank < m_league_size ; league_rank += gridDim.x ) {
+      this-> template exec_team< WorkTag >
+        ( Member( kokkos_impl_cuda_shared_memory<char>() + m_team_begin
+                                        , m_shmem_begin
+                                        , m_shmem_size
+                                        , m_scratch_ptr[1]
+                                        , m_scratch_size[1]
+                                        , league_rank
+                                        , m_league_size )
+        , value );
+    }
+
+    pointer_type const result = (pointer_type) (m_unified_space ? m_unified_space : m_scratch_space) ;
+
+    value_type init;
+    ValueInit::init( ReducerConditional::select(m_functor , m_reducer) , &init);
+    // NOTE(review): the first template argument here is FunctorType rather than
+    // ReducerTypeFwd -- confirm this is intended when a reducer is supplied.
+    if(Impl::cuda_inter_block_reduction<FunctorType,ValueJoin,WorkTag>
+           (value,init,ValueJoin(ReducerConditional::select(m_functor , m_reducer)),m_scratch_space,result,m_scratch_flags,blockDim.y)) {
+      // Only one thread of the block for which the reduction returned true
+      // finalizes and stores the value.
+      const unsigned id = threadIdx.y*blockDim.x + threadIdx.x;
+      if(id==0) {
+        Kokkos::Impl::FunctorFinal< ReducerTypeFwd , WorkTag >::final( ReducerConditional::select(m_functor , m_reducer) , (void*) &value );
+        *result = value;
+      }
+    }
+  }
+
+  // Host-side driver: sizes the scratch buffers, launches the kernel, waits for
+  // it and copies the reduced value to m_result_ptr (if non-null).
+  inline
+  void execute()
+    {
+      // An empty league has nothing to launch.  Report the initial value and
+      // return, instead of copying back scratch memory no kernel has written.
+      if ( m_league_size == 0 ) {
+        if ( m_result_ptr ) {
+          ValueInit::init( ReducerConditional::select(m_functor , m_reducer) , m_result_ptr );
+        }
+        return ;
+      }
+
+      const int block_count = UseShflReduction? std::min( m_league_size , size_type(1024) )
+                                               :std::min( m_league_size , m_team_size );
+
+      m_scratch_space = cuda_internal_scratch_space( ValueTraits::value_size( ReducerConditional::select(m_functor , m_reducer) ) * block_count );
+      m_scratch_flags = cuda_internal_scratch_flags( sizeof(size_type) );
+      m_unified_space = cuda_internal_scratch_unified( ValueTraits::value_size( ReducerConditional::select(m_functor , m_reducer) ) );
+
+      const dim3 block( m_vector_size , m_team_size , 1 );
+      const dim3 grid( block_count , 1 , 1 );
+      const int shmem_size_total = m_team_begin + m_shmem_begin + m_shmem_size ;
+
+      CudaParallelLaunch< ParallelReduce >( *this, grid, block, shmem_size_total ); // copy to device and execute
+
+      Cuda::fence();
+
+      if ( m_result_ptr ) {
+        if ( m_unified_space ) {
+          const int count = ValueTraits::value_count( ReducerConditional::select(m_functor , m_reducer) );
+          for ( int i = 0 ; i < count ; ++i ) { m_result_ptr[i] = pointer_type(m_unified_space)[i] ; }
+        }
+        else {
+          const int size = ValueTraits::value_size( ReducerConditional::select(m_functor , m_reducer) );
+          DeepCopy<HostSpace,CudaSpace>( m_result_ptr, m_scratch_space, size );
+        }
+      }
+    }
+
+  // Construct from a functor, a team policy and a view that receives the result.
+  // Enabled only when Kokkos::is_view<HostViewType> holds.  A negative policy
+  // team size selects cuda_get_opt_block_size(...) / vector_length instead.
+  template< class HostViewType >
+  ParallelReduce( const FunctorType  & arg_functor
+                , const Policy       & arg_policy
+                , const HostViewType & arg_result
+                , typename std::enable_if<
+                                   Kokkos::is_view< HostViewType >::value
+                                ,void*>::type = NULL)
+  : m_functor( arg_functor )
+  , m_reducer( InvalidType() )
+  , m_result_ptr( arg_result.ptr_on_device() )
+  , m_scratch_space( 0 )
+  , m_scratch_flags( 0 )
+  , m_unified_space( 0 )
+  , m_team_begin( 0 )
+  , m_shmem_begin( 0 )
+  , m_shmem_size( 0 )
+  , m_scratch_ptr{NULL,NULL}
+  , m_scratch_size{0,0}   // real sizes are computed in configure(), once m_team_size is known
+  , m_league_size( arg_policy.league_size() )
+  , m_team_size( 0 <= arg_policy.team_size() ? arg_policy.team_size() :
+      Kokkos::Impl::cuda_get_opt_block_size< ParallelReduce >( arg_functor , arg_policy.vector_length(),
+                                                               arg_policy.team_scratch_size(0),arg_policy.thread_scratch_size(0) ) /
+      arg_policy.vector_length() )
+  , m_vector_size( arg_policy.vector_length() )
+  {
+    configure( arg_functor , arg_policy );
+  }
+
+  // Construct from a functor, a team policy and a reducer object; the result
+  // pointer is taken from the reducer's result_view().  Team size selection is
+  // the same as in the view constructor.
+  ParallelReduce( const FunctorType  & arg_functor
+                , const Policy       & arg_policy
+                , const ReducerType & reducer)
+  : m_functor( arg_functor )
+  , m_reducer( reducer )
+  , m_result_ptr( reducer.result_view().ptr_on_device() )
+  , m_scratch_space( 0 )
+  , m_scratch_flags( 0 )
+  , m_unified_space( 0 )
+  , m_team_begin( 0 )
+  , m_shmem_begin( 0 )
+  , m_shmem_size( 0 )
+  , m_scratch_ptr{NULL,NULL}
+  , m_scratch_size{0,0}   // real sizes are computed in configure(), once m_team_size is known
+  , m_league_size( arg_policy.league_size() )
+  , m_team_size( 0 <= arg_policy.team_size() ? arg_policy.team_size() :
+      Kokkos::Impl::cuda_get_opt_block_size< ParallelReduce >( arg_functor , arg_policy.vector_length(),
+                                                               arg_policy.team_scratch_size(0),arg_policy.thread_scratch_size(0) ) /
+      arg_policy.vector_length() )
+  , m_vector_size( arg_policy.vector_length() )
+  {
+    configure( arg_functor , arg_policy );
+  }
+};
+
+} // namespace Impl
+} // namespace Kokkos
+
+//----------------------------------------------------------------------------
+//----------------------------------------------------------------------------
+
+namespace Kokkos {
+namespace Impl {
+
+// ParallelScan specialization for a Kokkos::RangePolicy on Kokkos::Cuda.
+//
+// execute() launches this object twice: first with m_final == false, which
+// runs initial(), then with m_final == true, which runs final().
+template< class FunctorType , class ... Traits >
+class ParallelScan< FunctorType
+                  , Kokkos::RangePolicy< Traits ... >
+                  , Kokkos::Cuda
+                  >
+{
+private:
+
+  typedef Kokkos::RangePolicy< Traits ... >  Policy ;
+  typedef typename Policy::member_type  Member ;
+  typedef typename Policy::work_tag     WorkTag ;
+  typedef typename Policy::WorkRange    WorkRange ;
+
+  typedef Kokkos::Impl::FunctorValueTraits< FunctorType, WorkTag > ValueTraits ;
+  typedef Kokkos::Impl::FunctorValueInit<   FunctorType, WorkTag > ValueInit ;
+  typedef Kokkos::Impl::FunctorValueOps<    FunctorType, WorkTag > ValueOps ;
+
+public:
+
+  typedef typename ValueTraits::pointer_type    pointer_type ;
+  typedef typename ValueTraits::reference_type  reference_type ;
+  typedef FunctorType                           functor_type ;
+  typedef Cuda::size_type                       size_type ;
+
+private:
+
+  // Algorithmic constraints:
+  //  (a) blockDim.y is a power of two
+  //  (b) blockDim.y == blockDim.z == 1
+  //  (c) gridDim.x  <= blockDim.y * blockDim.y
+  //  (d) gridDim.y  == gridDim.z == 1
+
+  const FunctorType m_functor ;
+  const Policy      m_policy ;
+  size_type *       m_scratch_space ;   // assigned in execute(); holds the blocks' scan totals
+  size_type *       m_scratch_flags ;   // assigned in execute(); passed to the inter-block reduce/scan
+  size_type         m_final ;           // zero: run initial(); non-zero: run final()
+
+  // Invoke the functor without a work tag.
+  template< class TagType >
+  __device__ inline
+  typename std::enable_if< std::is_same< TagType , void >::value >::type
+  exec_range( const Member & i , reference_type update , const bool final_result ) const
+    { m_functor( i , update , final_result ); }
+
+  // Invoke the functor with a default-constructed work tag as first argument.
+  template< class TagType >
+  __device__ inline
+  typename std::enable_if< ! std::is_same< TagType , void >::value >::type
+  exec_range( const Member & i , reference_type update , const bool final_result ) const
+    { m_functor( TagType() , i , update , final_result ); }
+
+  //----------------------------------------
+
+  // First pass: every thread accumulates its share of the block's work range
+  // into its own shared-memory slot (functor called with final_result == false),
+  // then the blocks' totals are reduced and scanned into m_scratch_space.
+  __device__ inline
+  void initial(void) const
+  {
+    const integral_nonzero_constant< size_type , ValueTraits::StaticValueSize / sizeof(size_type) >
+      word_count( ValueTraits::value_size( m_functor ) / sizeof(size_type) );
+
+    // This thread's slot in shared memory.
+    size_type * const shared_value = kokkos_impl_cuda_shared_memory<size_type>() + word_count.value * threadIdx.y ;
+
+    ValueInit::init( m_functor , shared_value );
+
+    // Number of blocks is bounded so that the reduction can be limited to two passes.
+    // Each thread block is given an approximately equal amount of work to perform.
+    // Accumulate the values for this block.
+    // The accumulation ordering does not match the final pass, but is arithmetically equivalent.
+
+    const WorkRange range( m_policy , blockIdx.x , gridDim.x );
+
+    for ( Member iwork = range.begin() + threadIdx.y , iwork_end = range.end() ;
+          iwork < iwork_end ; iwork += blockDim.y ) {
+      this-> template exec_range< WorkTag >( iwork , ValueOps::reference( shared_value ) , false );
+    }
+
+    // Reduce and scan, writing out scan of blocks' totals and block-groups' totals.
+    // Blocks' scan values are written to 'blockIdx.x' location.
+    // Block-groups' scan values are at: i = ( j * blockDim.y - 1 ) for i < gridDim.x
+    cuda_single_inter_block_reduce_scan<true,FunctorType,WorkTag>( m_functor , blockIdx.x , gridDim.x , kokkos_impl_cuda_shared_memory<size_type>() , m_scratch_space , m_scratch_flags );
+  }
+
+  //----------------------------------------
+
+  // Second pass: starting from the previous block's total, walk this block's
+  // work range blockDim.y items at a time; per chunk the functor is called once
+  // with final_result == false to contribute its value, the chunk is scanned in
+  // shared memory, and the functor is called again with final_result == true on
+  // the scanned prefix.
+  __device__ inline
+  void final(void) const
+  {
+    const integral_nonzero_constant< size_type , ValueTraits::StaticValueSize / sizeof(size_type) >
+      word_count( ValueTraits::value_size( m_functor ) / sizeof(size_type) );
+
+    // Use shared memory as an exclusive scan: { 0 , value[0] , value[1] , value[2] , ... }
+    size_type * const shared_data   = kokkos_impl_cuda_shared_memory<size_type>();
+    size_type * const shared_prefix = shared_data + word_count.value * threadIdx.y ;
+    size_type * const shared_accum  = shared_data + word_count.value * ( blockDim.y + 1 );
+
+    // Starting value for this thread block is the previous block's total.
+    // NOTE(review): this and the copy loops below advance by 1 rather than by
+    // blockDim.y, so several threads copy overlapping words -- confirm intended.
+    if ( blockIdx.x ) {
+      size_type * const block_total = m_scratch_space + word_count.value * ( blockIdx.x - 1 );
+      for ( unsigned i = threadIdx.y ; i < word_count.value ; ++i ) { shared_accum[i] = block_total[i] ; }
+    }
+    else if ( 0 == threadIdx.y ) {
+      ValueInit::init( m_functor , shared_accum );
+    }
+
+    const WorkRange range( m_policy , blockIdx.x , gridDim.x );
+
+    for ( typename Policy::member_type iwork_base = range.begin(); iwork_base < range.end() ; iwork_base += blockDim.y ) {
+
+      // Work item handled by this thread in the current chunk (may lie past range.end()).
+      const typename Policy::member_type iwork = iwork_base + threadIdx.y ;
+
+      __syncthreads(); // Don't overwrite previous iteration values until they are used
+
+      ValueInit::init( m_functor , shared_prefix + word_count.value );
+
+      // Copy previous block's accumulation total into thread[0] prefix and inclusive scan value of this block
+      for ( unsigned i = threadIdx.y ; i < word_count.value ; ++i ) {
+        shared_data[i + word_count.value] = shared_data[i] = shared_accum[i] ;
+      }
+
+      if ( CudaTraits::WarpSize < word_count.value ) { __syncthreads(); } // Protect against large scan values.
+
+      // Call functor to accumulate inclusive scan value for this work item
+      if ( iwork < range.end() ) {
+        this-> template exec_range< WorkTag >( iwork , ValueOps::reference( shared_prefix + word_count.value ) , false );
+      }
+
+      // Scan block values into locations shared_data[1..blockDim.y]
+      cuda_intra_block_reduce_scan<true,FunctorType,WorkTag>( m_functor , ValueTraits::pointer_type(shared_data+word_count.value) );
+
+      // Carry this chunk's total (slot blockDim.y) over as the start of the next chunk.
+      {
+        size_type * const block_total = shared_data + word_count.value * blockDim.y ;
+        for ( unsigned i = threadIdx.y ; i < word_count.value ; ++i ) { shared_accum[i] = block_total[i]; }
+      }
+
+      // Call functor with exclusive scan value
+      if ( iwork < range.end() ) {
+        this-> template exec_range< WorkTag >( iwork , ValueOps::reference( shared_prefix ) , true );
+      }
+    }
+  }
+
+public:
+
+  //----------------------------------------
+
+  // Kernel entry point: selects the pass according to m_final.
+  __device__ inline
+  void operator()(void) const
+  {
+    if ( ! m_final ) {
+      initial();
+    }
+    else {
+      final();
+    }
+  }
+
+  // Determine block size constrained by shared memory:
+  // start at four warps and halve until the requirement reported by
+  // cuda_single_inter_block_reduce_scan_shmem fits; returns 0 if nothing fits.
+  static inline
+  unsigned local_block_size( const FunctorType & f )
+    {
+      // blockDim.y must be power of two = 128 (4 warps) or 256 (8 warps) or 512 (16 warps)
+      // gridDim.x <= blockDim.y * blockDim.y
+      //
+      // 4 warps was 10% faster than 8 warps and 20% faster than 16 warps in unit testing
+
+      unsigned n = CudaTraits::WarpSize * 4 ;
+      while ( n && CudaTraits::SharedMemoryCapacity < cuda_single_inter_block_reduce_scan_shmem<false,FunctorType,WorkTag>( f , n ) ) { n >>= 1 ; }
+      return n ;
+    }
+
+  // Host-side driver: computes the launch configuration, sizes the scratch
+  // buffers and launches the two passes back to back.  Does nothing for an
+  // empty range.
+  inline
+  void execute()
+    {
+      const int nwork    = m_policy.end() - m_policy.begin();
+      if ( nwork ) {
+        enum { GridMaxComputeCapability_2x = 0x0ffff };
+  
+        const int block_size = local_block_size( m_functor );
+  
+        // Upper bound on the grid: constraint (c) above, capped at 0xffff.
+        const int grid_max =
+          ( block_size * block_size ) < GridMaxComputeCapability_2x ?
+          ( block_size * block_size ) : GridMaxComputeCapability_2x ;
+  
+        // At most 'max_grid' blocks:
+        const int max_grid = std::min( int(grid_max) , int(( nwork + block_size - 1 ) / block_size ));
+  
+        // How much work per block:
+        const int work_per_block = ( nwork + max_grid - 1 ) / max_grid ;
+  
+        // How many block are really needed for this much work:
+        const int grid_x = ( nwork + work_per_block - 1 ) / work_per_block ;
+  
+        // One scan value per block.
+        m_scratch_space = cuda_internal_scratch_space( ValueTraits::value_size( m_functor ) * grid_x );
+        m_scratch_flags = cuda_internal_scratch_flags( sizeof(size_type) * 1 );
+  
+        const dim3 grid( grid_x , 1 , 1 );
+        const dim3 block( 1 , block_size , 1 ); // REQUIRED DIMENSIONS ( 1 , N , 1 )
+        // block_size + 2 value slots: final() addresses slots 0 .. blockDim.y + 1.
+        const int shmem = ValueTraits::value_size( m_functor ) * ( block_size + 2 );
+  
+        m_final = false ;
+        CudaParallelLaunch< ParallelScan >( *this, grid, block, shmem ); // copy to device and execute
+  
+        m_final = true ;
+        CudaParallelLaunch< ParallelScan >( *this, grid, block, shmem ); // copy to device and execute
+      }
+    }
+
+  // Capture functor and policy; the scratch pointers are assigned in execute().
+  ParallelScan( const FunctorType  & arg_functor ,
+                const Policy       & arg_policy )
+  : m_functor( arg_functor )
+  , m_policy( arg_policy )
+  , m_scratch_space( 0 )
+  , m_scratch_flags( 0 )
+  , m_final( false )
+  { }
+};
+
+} // namespace Impl
+} // namespace Kokkos
+
+//----------------------------------------------------------------------------
+//----------------------------------------------------------------------------
+
+namespace Kokkos {
+namespace Impl {
+  // Loop bounds for a TeamThreadRange over the threads of a CudaTeamMember.
+  // When compiled for the device (__CUDA_ARCH__ defined) each thread starts at
+  // its threadIdx.y offset and strides by blockDim.y; otherwise the range is
+  // walked with unit stride from its beginning.
+  template<typename iType>
+  struct TeamThreadRangeBoundariesStruct<iType,CudaTeamMember> {
+    typedef iType index_type;
+    const iType start;
+    const iType end;
+    const iType increment;
+    const CudaTeamMember& thread;
+
+#ifdef __CUDA_ARCH__
+    // Range [0,count)
+    __device__ inline
+    TeamThreadRangeBoundariesStruct (const CudaTeamMember& team_member, const iType& count)
+      : start( threadIdx.y ) , end( count ) , increment( blockDim.y ) , thread( team_member )
+    {}
+
+    // Range [begin_,end_)
+    __device__ inline
+    TeamThreadRangeBoundariesStruct (const CudaTeamMember& team_member, const iType& begin_, const iType& end_)
+      : start( begin_+threadIdx.y ) , end( end_ ) , increment( blockDim.y ) , thread( team_member )
+    {}
+#else
+    // Range [0,count)
+    KOKKOS_INLINE_FUNCTION
+    TeamThreadRangeBoundariesStruct (const CudaTeamMember& team_member, const iType& count)
+      : start( 0 ) , end( count ) , increment( 1 ) , thread( team_member )
+    {}
+
+    // Range [begin_,end_)
+    KOKKOS_INLINE_FUNCTION
+    TeamThreadRangeBoundariesStruct (const CudaTeamMember& team_member,  const iType& begin_, const iType& end_)
+      : start( begin_ ) , end( end_ ) , increment( 1 ) , thread( team_member )
+    {}
+#endif
+  };
+
+  // Loop bounds for a ThreadVectorRange [0,count) of a CudaTeamMember.
+  // When compiled for the device (__CUDA_ARCH__ defined) each lane starts at
+  // threadIdx.x and strides by blockDim.x; otherwise the range is walked with
+  // unit stride.  The team member argument is not stored.
+  template<typename iType>
+  struct ThreadVectorRangeBoundariesStruct<iType,CudaTeamMember> {
+    typedef iType index_type;
+    const iType start;
+    const iType end;
+    const iType increment;
+
+#ifdef __CUDA_ARCH__
+    __device__ inline
+    ThreadVectorRangeBoundariesStruct (const CudaTeamMember& /* team_member */, const iType& count)
+      : start( threadIdx.x ) , end( count ) , increment( blockDim.x )
+    {}
+#else
+    KOKKOS_INLINE_FUNCTION
+    ThreadVectorRangeBoundariesStruct (const CudaTeamMember& /* team_member */, const iType& count)
+      : start( 0 ) , end( count ) , increment( 1 )
+    {}
+#endif
+    };
+
+} // namespace Impl
+
+// Build the team-thread loop bounds for the range [0,count).
+template<typename iType>
+KOKKOS_INLINE_FUNCTION
+Impl::TeamThreadRangeBoundariesStruct<iType,Impl::CudaTeamMember>
+  TeamThreadRange(const Impl::CudaTeamMember& thread, const iType& count) {
+  typedef Impl::TeamThreadRangeBoundariesStruct<iType,Impl::CudaTeamMember> boundaries_type ;
+  return boundaries_type(thread,count);
+}
+
+// Build the team-thread loop bounds for the range [begin,end).
+template<typename iType>
+KOKKOS_INLINE_FUNCTION
+Impl::TeamThreadRangeBoundariesStruct<iType,Impl::CudaTeamMember>
+  TeamThreadRange(const Impl::CudaTeamMember& thread, const iType& begin, const iType& end) {
+  typedef Impl::TeamThreadRangeBoundariesStruct<iType,Impl::CudaTeamMember> boundaries_type ;
+  return boundaries_type(thread,begin,end);
+}
+
+// Build the thread-vector loop bounds for the range [0,count).
+template<typename iType>
+KOKKOS_INLINE_FUNCTION
+Impl::ThreadVectorRangeBoundariesStruct<iType,Impl::CudaTeamMember >
+  ThreadVectorRange(const Impl::CudaTeamMember& thread, const iType& count) {
+  typedef Impl::ThreadVectorRangeBoundariesStruct<iType,Impl::CudaTeamMember > boundaries_type ;
+  return boundaries_type(thread,count);
+}
+
+// Wrap a team member in the ThreadSingleStruct tag type.
+KOKKOS_INLINE_FUNCTION
+Impl::ThreadSingleStruct<Impl::CudaTeamMember> PerTeam(const Impl::CudaTeamMember& thread) {
+  typedef Impl::ThreadSingleStruct<Impl::CudaTeamMember> single_type ;
+  return single_type(thread);
+}
+
+// Wrap a team member in the VectorSingleStruct tag type.
+KOKKOS_INLINE_FUNCTION
+Impl::VectorSingleStruct<Impl::CudaTeamMember> PerThread(const Impl::CudaTeamMember& thread) {
+  typedef Impl::VectorSingleStruct<Impl::CudaTeamMember> single_type ;
+  return single_type(thread);
+}
+
+} // namespace Kokkos
+
+namespace Kokkos {
+
+  /** \brief  Inter-thread parallel_for. Executes lambda(iType i) for each i=0..N-1.
+   *
+   * The range i=0..N-1 is mapped to all threads of the calling thread team.
+   * The body is compiled only when __CUDA_ARCH__ is defined; otherwise the call does nothing.
+   * This functionality requires C++11 support.*/
+template<typename iType, class Lambda>
+KOKKOS_INLINE_FUNCTION
+void parallel_for(const Impl::TeamThreadRangeBoundariesStruct<iType,Impl::CudaTeamMember>& loop_boundaries, const Lambda& lambda) {
+  #ifdef __CUDA_ARCH__
+  iType idx = loop_boundaries.start;
+  while ( idx < loop_boundaries.end ) {
+    lambda(idx);
+    idx += loop_boundaries.increment;
+  }
+  #endif
+}
+
+/** \brief  Inter-thread parallel_reduce (sum). Executes lambda(iType i, ValueType & val) for each i=0..N-1.
+ *
+ * The range i=0..N-1 is mapped to all threads of the calling thread team. result is first
+ * reset to ValueType(), then each thread's contribution is combined with += through an
+ * intra-warp followed by an inter-warp reduction. The body is compiled only when
+ * __CUDA_ARCH__ is defined. This functionality requires C++11 support.*/
+template< typename iType, class Lambda, typename ValueType >
+KOKKOS_INLINE_FUNCTION
+void parallel_reduce(const Impl::TeamThreadRangeBoundariesStruct<iType,Impl::CudaTeamMember>& loop_boundaries,
+                     const Lambda & lambda, ValueType& result) {
+
+#ifdef __CUDA_ARCH__
+  result = ValueType();
+
+  iType idx = loop_boundaries.start;
+  while ( idx < loop_boundaries.end ) {
+    lambda(idx,result);
+    idx += loop_boundaries.increment;
+  }
+
+  // Summation join shared by both reduction stages.
+  const auto sum_join = [&] (ValueType& dst, const ValueType& src) { dst+=src; };
+
+  Impl::cuda_intra_warp_reduction(result,sum_join);
+  Impl::cuda_inter_warp_reduction(result,sum_join);
+
+#endif
+}
+
+/** \brief  Inter-thread parallel_reduce with a custom join. Executes lambda(iType i, ValueType & val) for each i=0..N-1.
+ *
+ * The range i=0..N-1 is mapped to all threads of the calling thread team and a reduction of
+ * val is performed using JoinType(ValueType& val, const ValueType& update) and put into init_result.
+ * The input value of init_result seeds each thread's temporary of ValueType, so it should be
+ * the neutral element with respect to the join operation (e.g. '0 for +-' or '1 for *').
+ * The body is compiled only when __CUDA_ARCH__ is defined. This functionality requires C++11 support.*/
+template< typename iType, class Lambda, typename ValueType, class JoinType >
+KOKKOS_INLINE_FUNCTION
+void parallel_reduce(const Impl::TeamThreadRangeBoundariesStruct<iType,Impl::CudaTeamMember>& loop_boundaries,
+                     const Lambda & lambda, const JoinType& join, ValueType& init_result) {
+
+#ifdef __CUDA_ARCH__
+  // Thread-local accumulator seeded with the caller's initial value.
+  ValueType accum = init_result;
+
+  iType idx = loop_boundaries.start;
+  while ( idx < loop_boundaries.end ) {
+    lambda(idx,accum);
+    idx += loop_boundaries.increment;
+  }
+
+  Impl::cuda_intra_warp_reduction(accum, join );
+  Impl::cuda_inter_warp_reduction(accum, join );
+
+  init_result = accum;
+#endif
+}
+
+} //namespace Kokkos
+
+namespace Kokkos {
+/** \brief  Intra-thread vector parallel_for. Executes lambda(iType i) for each i=0..N-1.
+ *
+ * The range i=0..N-1 is mapped to all vector lanes of the calling thread: each lane
+ * walks from loop_boundaries.start to loop_boundaries.end in steps of
+ * loop_boundaries.increment. The body is only compiled for device code.
+ * This functionality requires C++11 support.*/
+template<typename iType, class Lambda>
+KOKKOS_INLINE_FUNCTION
+void parallel_for(const Impl::ThreadVectorRangeBoundariesStruct<iType,Impl::CudaTeamMember >&
+    loop_boundaries, const Lambda& lambda) {
+#ifdef __CUDA_ARCH__
+  iType i = loop_boundaries.start;
+  while( i < loop_boundaries.end ) {
+    lambda(i);
+    i += loop_boundaries.increment;
+  }
+#endif
+}
+
+/** \brief  Intra-thread vector parallel_reduce (summation). Executes lambda(iType i, ValueType & val) for each i=0..N-1.
+ *
+ * The range i=0..N-1 is mapped to all vector lanes of the calling thread and a summation of
+ * val is performed and put into result. The incoming value of result is discarded:
+ * accumulation starts from ValueType(). The body is only compiled for device code.
+ * This functionality requires C++11 support.*/
+template< typename iType, class Lambda, typename ValueType >
+KOKKOS_INLINE_FUNCTION
+void parallel_reduce(const Impl::ThreadVectorRangeBoundariesStruct<iType,Impl::CudaTeamMember >&
+      loop_boundaries, const Lambda & lambda, ValueType& result) {
+#ifdef __CUDA_ARCH__
+  // Lane-local partial sum.
+  result = ValueType();
+  iType i = loop_boundaries.start;
+  while( i < loop_boundaries.end ) {
+    lambda(i,result);
+    i += loop_boundaries.increment;
+  }
+
+  // Shuffle-based tree reduction with offsets 1,2,4,8,16. The stage with offset d
+  // only runs when loop_boundaries.increment > d; increment is also passed as the
+  // shuffle width (presumably the vector length -- confirm against the policy).
+  for( int delta = 1; delta <= 16 && loop_boundaries.increment > delta; delta *= 2 ) {
+    result += shfl_down(result, delta,loop_boundaries.increment);
+  }
+
+  // Every lane picks up the value held by lane 0 of its shuffle group.
+  result = shfl(result,0,loop_boundaries.increment);
+#endif
+}
+
+/** \brief  Intra-thread vector parallel_reduce with a user-provided join. Executes lambda(iType i, ValueType & val) for each i=0..N-1.
+ *
+ * The range i=0..N-1 is mapped to all vector lanes of the calling thread and a reduction of
+ * val is performed using JoinType(ValueType& val, const ValueType& update) and put into init_result.
+ * The input value of init_result is used as initializer for temporary variables of ValueType. Therefore
+ * the input value should be the neutral element with respect to the join operation (e.g. '0 for +-' or
+ * '1 for *'). The body is only compiled for device code.
+ * This functionality requires C++11 support.*/
+template< typename iType, class Lambda, typename ValueType, class JoinType >
+KOKKOS_INLINE_FUNCTION
+void parallel_reduce(const Impl::ThreadVectorRangeBoundariesStruct<iType,Impl::CudaTeamMember >&
+      loop_boundaries, const Lambda & lambda, const JoinType& join, ValueType& init_result) {
+
+#ifdef __CUDA_ARCH__
+  // Lane-local accumulation into a private copy seeded with the caller's value.
+  ValueType partial = init_result;
+  iType i = loop_boundaries.start;
+  while( i < loop_boundaries.end ) {
+    lambda(i,partial);
+    i += loop_boundaries.increment;
+  }
+
+  // Shuffle-based tree reduction with offsets 1,2,4,8,16. The stage with offset d
+  // only runs when loop_boundaries.increment > d; increment is also passed as the
+  // shuffle width (presumably the vector length -- confirm against the policy).
+  for( int delta = 1; delta <= 16 && loop_boundaries.increment > delta; delta *= 2 ) {
+    join( partial, shfl_down(partial, delta,loop_boundaries.increment));
+  }
+
+  // Every lane picks up the value held by lane 0 of its shuffle group.
+  init_result = shfl(partial,0,loop_boundaries.increment);
+#endif
+}
+
+/** \brief  Intra-thread vector parallel exclusive prefix sum. Executes lambda(iType i, ValueType & val, bool final)
+ *          for each i=0..N-1.
+ *
+ * The range i=0..N-1 is mapped to all vector lanes in the thread and a scan operation is performed.
+ * This implementation calls the operator twice for every in-range i: once with final=false
+ * to collect the contribution of i, and once with final=true, at which point val contains
+ * the exclusive prefix sum for i. The contribution of this "i" needs to be added to val no
+ * matter whether final==true or not.
+ *
+ * The value type is taken from FunctorValueTraits<FunctorType,void>. The running total over
+ * all processed chunks is kept in a local variable and is not returned to the caller.
+ * Iteration starts at threadIdx.x with stride blockDim.x; loop_boundaries.start and
+ * loop_boundaries.increment are not read here. The body is only compiled for device code.
+ * This functionality requires C++11 support.*/
+template< typename iType, class FunctorType >
+KOKKOS_INLINE_FUNCTION
+void parallel_scan(const Impl::ThreadVectorRangeBoundariesStruct<iType,Impl::CudaTeamMember >&
+      loop_boundaries, const FunctorType & lambda) {
+
+#ifdef __CUDA_ARCH__
+  typedef Kokkos::Impl::FunctorValueTraits< FunctorType , void > ValueTraits ;
+  typedef typename ValueTraits::value_type value_type ;
+
+  // Running total of all chunks processed so far.
+  value_type scan_val = value_type();
+  const int VectorLength = blockDim.x;
+
+  // Round the upper bound up to a multiple of VectorLength so that every lane executes
+  // the same number of iterations and takes part in every shuffle below.
+  iType loop_bound = ((loop_boundaries.end+VectorLength-1)/VectorLength) * VectorLength;
+  for(int _i = threadIdx.x; _i < loop_bound; _i += VectorLength) {
+    // First pass: collect this index' contribution (stays value_type() for padding lanes).
+    value_type val = value_type();
+    if(_i<loop_boundaries.end)
+      lambda(_i , val , false);
+
+    // Inclusive scan of 'val' across the lanes using shfl_up with offsets 1,2,4,8,16.
+    // 'result_i' captures 'tmp' at the stage after which this lane's prefix is complete:
+    // lane 0 before any step, lane 1 after offset 1, lanes 2-3 after offset 2, and so on.
+    value_type tmp = val;
+    value_type result_i;
+
+    if(threadIdx.x%VectorLength == 0)
+      result_i = tmp;
+    if (VectorLength > 1) {
+      const value_type tmp2 = shfl_up(tmp, 1,VectorLength);
+      if(threadIdx.x > 0)
+        tmp+=tmp2;
+    }
+    if(threadIdx.x%VectorLength == 1)
+      result_i = tmp;
+    if (VectorLength > 3) {
+      const value_type tmp2 = shfl_up(tmp, 2,VectorLength);
+      if(threadIdx.x > 1)
+        tmp+=tmp2;
+    }
+    if ((threadIdx.x%VectorLength >= 2) &&
+        (threadIdx.x%VectorLength < 4))
+      result_i = tmp;
+    if (VectorLength > 7) {
+      const value_type tmp2 = shfl_up(tmp, 4,VectorLength);
+      if(threadIdx.x > 3)
+        tmp+=tmp2;
+    }
+    if ((threadIdx.x%VectorLength >= 4) &&
+        (threadIdx.x%VectorLength < 8))
+      result_i = tmp;
+    if (VectorLength > 15) {
+      const value_type tmp2 = shfl_up(tmp, 8,VectorLength);
+      if(threadIdx.x > 7)
+        tmp+=tmp2;
+    }
+    if ((threadIdx.x%VectorLength >= 8) &&
+        (threadIdx.x%VectorLength < 16))
+      result_i = tmp;
+    if (VectorLength > 31) {
+      const value_type tmp2 = shfl_up(tmp, 16,VectorLength);
+      if(threadIdx.x > 15)
+        tmp+=tmp2;
+    }
+    if (threadIdx.x%VectorLength >= 16)
+      result_i = tmp;
+
+    // Turn the inclusive prefix into an exclusive one (subtract the own contribution)
+    // and offset it by the total of all previous chunks.
+    val = scan_val + result_i - val;
+    // Add 'tmp' as held by the last lane (index VectorLength-1) to the running total.
+    scan_val += shfl(tmp,VectorLength-1,VectorLength);
+    // Second pass: hand the exclusive prefix to the functor.
+    if(_i<loop_boundaries.end)
+      lambda(_i , val , true);
+  }
+#endif
+}
+
+}
+
+namespace Kokkos {
+
+/** \brief  Executes lambda() only on threads with threadIdx.x == 0.
+ *
+ * The VectorSingleStruct argument is used for overload selection only.
+ * The body is only compiled for device code. */
+template<class FunctorType>
+KOKKOS_INLINE_FUNCTION
+void single(const Impl::VectorSingleStruct<Impl::CudaTeamMember>& , const FunctorType& lambda) {
+#ifdef __CUDA_ARCH__
+  const bool on_first_lane = ( threadIdx.x == 0 );
+  if( on_first_lane ) {
+    lambda();
+  }
+#endif
+}
+
+/** \brief  Executes lambda() only on the thread with threadIdx.x == 0 and threadIdx.y == 0.
+ *
+ * The ThreadSingleStruct argument is used for overload selection only.
+ * The body is only compiled for device code. */
+template<class FunctorType>
+KOKKOS_INLINE_FUNCTION
+void single(const Impl::ThreadSingleStruct<Impl::CudaTeamMember>& , const FunctorType& lambda) {
+#ifdef __CUDA_ARCH__
+  // Every thread other than (x,y) == (0,0) has nothing to do.
+  if( threadIdx.x != 0 || threadIdx.y != 0 ) return;
+  lambda();
+#endif
+}
+
+/** \brief  Executes lambda(val) only on threads with threadIdx.x == 0, then shares val.
+ *
+ * After the call every thread overwrites its val with the result of
+ * shfl(val,0,blockDim.x), i.e. the value read from index 0 with width blockDim.x.
+ * The body is only compiled for device code. */
+template<class FunctorType, class ValueType>
+KOKKOS_INLINE_FUNCTION
+void single(const Impl::VectorSingleStruct<Impl::CudaTeamMember>& , const FunctorType& lambda, ValueType& val) {
+#ifdef __CUDA_ARCH__
+  const bool on_first_lane = ( threadIdx.x == 0 );
+  if( on_first_lane ) {
+    lambda(val);
+  }
+  // All threads take part in the shuffle, including those that skipped the lambda.
+  val = shfl(val,0,blockDim.x);
+#endif
+}
+
+/** \brief  Executes lambda(val) only on the thread with threadIdx.x == 0 and threadIdx.y == 0,
+ *          then broadcasts val through the team member.
+ *
+ * Every thread calls single_struct.team_member.team_broadcast(val,0) afterwards.
+ * The body is only compiled for device code. */
+template<class FunctorType, class ValueType>
+KOKKOS_INLINE_FUNCTION
+void single(const Impl::ThreadSingleStruct<Impl::CudaTeamMember>& single_struct, const FunctorType& lambda, ValueType& val) {
+#ifdef __CUDA_ARCH__
+  const bool is_first_thread = ( threadIdx.x == 0 ) && ( threadIdx.y == 0 );
+  if( is_first_thread ) lambda(val);
+
+  // All threads take part in the broadcast, including those that skipped the lambda.
+  single_struct.team_member.team_broadcast(val,0);
+#endif
+}
+
+}
+
+namespace Kokkos {
+
+namespace Impl {
+  // Adapter that stores a copy of a functor together with an explicit value_type.
+  // Primary template: used when the policy's work_tag is not void. The call operator
+  // takes a tag argument and forwards (work_tag(), i, val) to the wrapped functor.
+  template< class FunctorType, class ExecPolicy, class ValueType , class Tag = typename ExecPolicy::work_tag>
+  struct CudaFunctorAdapter {
+    // Value type of the reduction, supplied by the caller of the adapter.
+    typedef ValueType value_type;
+
+    // Private copy of the wrapped functor.
+    const FunctorType f;
+
+    CudaFunctorAdapter(const FunctorType& functor_in)
+      : f(functor_in)
+    {}
+
+    // The incoming tag object is ignored; a default-constructed work_tag is passed on.
+    __device__ inline
+    void operator() (typename ExecPolicy::work_tag,
+                     const typename ExecPolicy::member_type& i,
+                     ValueType& val) const
+    {
+      //Insert Static Assert with decltype on ValueType equals third argument type of FunctorType::operator()
+      f(typename ExecPolicy::work_tag(), i, val);
+    }
+  };
+
+  // Specialization for policies without a work tag (Tag == void): the call operator
+  // forwards (i, val) to the wrapped functor. Two overloads are provided so that both
+  // const and non-const member_type references are accepted.
+  template< class FunctorType, class ExecPolicy, class ValueType >
+  struct CudaFunctorAdapter<FunctorType,ExecPolicy,ValueType,void> {
+    // Value type of the reduction, supplied by the caller of the adapter.
+    typedef ValueType value_type;
+
+    // Private copy of the wrapped functor.
+    const FunctorType f;
+
+    CudaFunctorAdapter(const FunctorType& functor_in)
+      : f(functor_in)
+    {}
+
+    // Overload taking the member by const reference.
+    __device__ inline
+    void operator() (const typename ExecPolicy::member_type& i,
+                     ValueType& val) const
+    {
+      //Insert Static Assert with decltype on ValueType equals second argument type of FunctorType::operator()
+      f(i, val);
+    }
+
+    // Overload taking the member by non-const reference.
+    __device__ inline
+    void operator() (typename ExecPolicy::member_type& i,
+                     ValueType& val) const
+    {
+      //Insert Static Assert with decltype on ValueType equals second argument type of FunctorType::operator()
+      f(i, val);
+    }
+
+  };
+
+  // Compile-time check whether FunctorType has a member named 'init'.
+  // Primary template: fallback, value == false.
+  template< class FunctorType, class Enable = void>
+  struct ReduceFunctorHasInit {
+    enum {value = false};
+  };
+
+  // Selected only when the expression '& FunctorType::init' is well-formed;
+  // otherwise substitution fails and the primary template is used.
+  // (Assumes Impl::enable_if behaves like std::enable_if -- it is declared elsewhere.)
+  template< class FunctorType>
+  struct ReduceFunctorHasInit<FunctorType, typename Impl::enable_if< 0 < sizeof( & FunctorType::init ) >::type > {
+    enum {value = true};
+  };
+
+  // Compile-time check whether FunctorType has a member named 'join'.
+  // Primary template: fallback, value == false.
+  template< class FunctorType, class Enable = void>
+  struct ReduceFunctorHasJoin {
+    enum {value = false};
+  };
+
+  // Selected only when the expression '& FunctorType::join' is well-formed;
+  // otherwise substitution fails and the primary template is used.
+  // (Assumes Impl::enable_if behaves like std::enable_if -- it is declared elsewhere.)
+  template< class FunctorType>
+  struct ReduceFunctorHasJoin<FunctorType, typename Impl::enable_if< 0 < sizeof( & FunctorType::join ) >::type > {
+    enum {value = true};
+  };
+
+  // Compile-time check whether FunctorType has a member named 'final'.
+  // Primary template: fallback, value == false.
+  template< class FunctorType, class Enable = void>
+  struct ReduceFunctorHasFinal {
+    enum {value = false};
+  };
+
+  // Selected only when the expression '& FunctorType::final' is well-formed;
+  // otherwise substitution fails and the primary template is used.
+  // (Assumes Impl::enable_if behaves like std::enable_if -- it is declared elsewhere.)
+  template< class FunctorType>
+  struct ReduceFunctorHasFinal<FunctorType, typename Impl::enable_if< 0 < sizeof( & FunctorType::final ) >::type > {
+    enum {value = true};
+  };
+
+  // Compile-time check whether FunctorType has a member named 'team_shmem_size'.
+  // Primary template: fallback, value == false.
+  template< class FunctorType, class Enable = void>
+    struct ReduceFunctorHasShmemSize {
+      enum {value = false};
+    };
+
+    // Selected only when the expression '& FunctorType::team_shmem_size' is well-formed;
+    // otherwise substitution fails and the primary template is used.
+    // (Assumes Impl::enable_if behaves like std::enable_if -- it is declared elsewhere.)
+    template< class FunctorType>
+    struct ReduceFunctorHasShmemSize<FunctorType, typename Impl::enable_if< 0 < sizeof( & FunctorType::team_shmem_size ) >::type > {
+      enum {value = true};
+    };
+
+  // Classifies a reduction functor as "non-trivial" when at least one of the following
+  // holds: FunctorDeclaresValueType<FunctorType,void>::value is true, or the functor has
+  // a member named init, join, final or team_shmem_size (see the ReduceFunctorHas* traits).
+  // The decision is computed in the default template argument; the primary template
+  // below handles the 'false' case.
+  template< class FunctorType, bool Enable =
+      ( FunctorDeclaresValueType<FunctorType,void>::value) ||
+      ( ReduceFunctorHasInit<FunctorType>::value  ) ||
+      ( ReduceFunctorHasJoin<FunctorType>::value  ) ||
+      ( ReduceFunctorHasFinal<FunctorType>::value ) ||
+      ( ReduceFunctorHasShmemSize<FunctorType>::value )
+      >
+  struct IsNonTrivialReduceFunctor {
+    enum {value = false};
+  };
+
+  // Specialization for the 'true' case.
+  template< class FunctorType>
+  struct IsNonTrivialReduceFunctor<FunctorType, true> {
+    enum {value = true};
+  };
+
+  // Selects the reference type used for the reduction value.
+  // Primary template (functor is not classified as non-trivial by
+  // IsNonTrivialReduceFunctor): a plain reference to ResultType.
+  template<class FunctorType, class ResultType, class Tag, bool Enable = IsNonTrivialReduceFunctor<FunctorType>::value >
+  struct FunctorReferenceType {
+    typedef ResultType& reference_type;
+  };
+
+  // Non-trivial functor: take the reference type from FunctorValueTraits for the
+  // given functor and tag; ResultType is not used in this case.
+  template<class FunctorType, class ResultType, class Tag>
+  struct FunctorReferenceType<FunctorType, ResultType, Tag, true> {
+    typedef typename Kokkos::Impl::FunctorValueTraits< FunctorType ,Tag >::reference_type reference_type;
+  };
+
+  // Cuda specialization of ParallelReduceFunctorType (the primary template is declared
+  // elsewhere). Decides which functor type a reduction is run with:
+  //  - a functor classified as non-trivial by IsNonTrivialReduceFunctor is used as is;
+  //  - any other functor is wrapped in a CudaFunctorAdapter carrying ValueType.
+  template< class FunctorTypeIn, class ExecPolicy, class ValueType>
+  struct ParallelReduceFunctorType<FunctorTypeIn,ExecPolicy,ValueType,Cuda> {
+
+    enum {FunctorHasValueType = IsNonTrivialReduceFunctor<FunctorTypeIn>::value };
+    typedef typename Kokkos::Impl::if_c<FunctorHasValueType, FunctorTypeIn, Impl::CudaFunctorAdapter<FunctorTypeIn,ExecPolicy,ValueType> >::type functor_type;
+    // Returns the functor to execute. Both candidates (the input functor and an adapter
+    // constructed from it) are handed to if_c<...>::select, which presumably returns the
+    // one matching FunctorHasValueType -- if_c is declared elsewhere, confirm there.
+    static functor_type functor(const FunctorTypeIn& functor_in) {
+      return Impl::if_c<FunctorHasValueType,FunctorTypeIn,functor_type>::select(functor_in,functor_type(functor_in));
+    }
+  };
+
+}
+
+} // namespace Kokkos
+#endif /* defined( __CUDACC__ ) */
+
+#endif /* #ifndef KOKKOS_CUDA_PARALLEL_HPP */
+
diff --git a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_ReduceScan.hpp b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_ReduceScan.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..1778f631c0ef07b2bad25ea2c855e65c258e6f57
--- /dev/null
+++ b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_ReduceScan.hpp
@@ -0,0 +1,433 @@
+/*
+//@HEADER
+// ************************************************************************
+// 
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+// 
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+// 
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact  H. Carter Edwards (hcedwar@sandia.gov)
+// 
+// ************************************************************************
+//@HEADER
+*/
+
+#ifndef KOKKOS_CUDA_REDUCESCAN_HPP
+#define KOKKOS_CUDA_REDUCESCAN_HPP
+
+#include <Kokkos_Macros.hpp>
+
+/* only compile this file if CUDA is enabled for Kokkos */
+#if defined( __CUDACC__ ) && defined( KOKKOS_HAVE_CUDA )
+
+#include <utility>
+
+#include <Kokkos_Parallel.hpp>
+#include <impl/Kokkos_FunctorAdapter.hpp>
+#include <impl/Kokkos_Error.hpp>
+#include <Cuda/Kokkos_Cuda_Vectorization.hpp>
+//----------------------------------------------------------------------------
+//----------------------------------------------------------------------------
+
+namespace Kokkos {
+namespace Impl {
+
+
+
+//Shfl based reductions
+/*
+ *  Algorithmic constraints:
+ *   (a) threads with same threadIdx.y have same value
+ *   (b) blockDim.x == power of two
+ *   (c) blockDim.z == 1
+ */
+
+/** \brief  Shuffle-based reduction of 'result' within a warp.
+ *
+ *  Values are fetched with shfl_down at offsets blockDim.x*shift for shift=1,2,4,...
+ *  while blockDim.x*shift < 32 and combined with 'join'. Afterwards every thread
+ *  overwrites 'result' with shfl(result,0,32).
+ *
+ *  \param result             in: this thread's value; out: the shuffled result.
+ *  \param join               binary operation join(dst,src) used to combine values.
+ *  \param max_active_thread  a fetched value is only joined when
+ *                            threadIdx.y + shift < max_active_thread.
+ */
+template< class ValueType , class JoinOp>
+__device__
+inline void cuda_intra_warp_reduction( ValueType& result,
+                                       const JoinOp& join,
+                                       const int max_active_thread = blockDim.y) {
+
+  //Reduce over values from threads with different threadIdx.y
+  for( unsigned int shift = 1; blockDim.x * shift < 32; shift *= 2 ) {
+    // Every thread executes the shuffle; only the join below is conditional.
+    const ValueType upper = shfl_down(result, blockDim.x*shift,32u);
+    //Only join if upper thread is active (this allows non power of two for blockDim.y)
+    if(threadIdx.y + shift < max_active_thread) {
+      join(result , upper);
+    }
+  }
+
+  result = shfl(result,0,32);
+}
+
+/** \brief  Combines per-warp values across the warps of a block through shared memory.
+ *
+ *  A statically sized shared buffer with STEP_WIDTH (4) slots of ValueType is used.
+ *  Threads whose threadIdx.y is a multiple of step = 32/blockDim.x get the slot
+ *  candidate id = threadIdx.y/step; ids below STEP_WIDTH store their value directly,
+ *  higher ids are joined into slot id%STEP_WIDTH in rounds of STEP_WIDTH ids each.
+ *  Finally every thread joins the used slots into 'value'.
+ *
+ *  The function contains __syncthreads(), so it has to be called by all threads
+ *  of the block.
+ *
+ *  \param value              in: this thread's value; out: the combined value.
+ *  \param join               binary operation join(dst,src) used to combine values.
+ *  \param max_active_thread  bounds the number of rounds and of slots that are read.
+ */
+template< class ValueType , class JoinOp>
+__device__
+inline void cuda_inter_warp_reduction( ValueType& value,
+                                       const JoinOp& join,
+                                       const int max_active_thread = blockDim.y) {
+
+  #define STEP_WIDTH 4
+  // The scratch buffer is accessed through a ValueType*, so it has to satisfy the
+  // alignment of ValueType. A char array gives no such guarantee; declaring the storage
+  // as double yields 8-byte alignment. The size is rounded up to whole doubles so that
+  // STEP_WIDTH values of ValueType always fit. ValueType itself is not used as element
+  // type to avoid running constructors on shared memory.
+  // NOTE(review): types requiring more than 8-byte alignment are still not covered.
+  __shared__ double sh_result[(sizeof(ValueType)+7)/8*STEP_WIDTH];
+  ValueType* result = (ValueType*) & sh_result;
+  const unsigned step = 32 / blockDim.x;
+  unsigned shift = STEP_WIDTH;
+  // 65000 marks threads that do not own a slot candidate.
+  const int id = threadIdx.y%step==0?threadIdx.y/step:65000;
+  if(id < STEP_WIDTH ) {
+    result[id] = value;
+  }
+  __syncthreads();
+  // Fold the remaining ids into the STEP_WIDTH slots, one window of ids per round.
+  while (shift<=max_active_thread/step) {
+    if(shift<=id && shift+STEP_WIDTH>id && threadIdx.x==0) {
+      join(result[id%STEP_WIDTH],value);
+    }
+    __syncthreads();
+    shift+=STEP_WIDTH;
+  }
+
+
+  // NOTE(review): there is no barrier after these reads; confirm that a directly
+  // following call cannot overwrite the slots while slower threads still read them.
+  value = result[0];
+  for(int i = 1; (i*step<max_active_thread) && i<STEP_WIDTH; i++)
+    join(value,result[i]);
+  // Keep the helper macro local to this function instead of leaking it to includers.
+  #undef STEP_WIDTH
+}
+
+/** \brief  Block-wide reduction of 'value': intra-warp stage followed by inter-warp stage.
+ *
+ *  Simply chains cuda_intra_warp_reduction and cuda_inter_warp_reduction with the
+ *  same arguments.
+ *
+ *  \param value              in: this thread's value; out: the combined value.
+ *  \param join               binary operation join(dst,src) used to combine values.
+ *  \param max_active_thread  forwarded unchanged to both stages.
+ */
+template< class ValueType , class JoinOp>
+__device__
+inline void cuda_intra_block_reduction( ValueType& value,
+                                        const JoinOp& join,
+                                        const int max_active_thread = blockDim.y)
+{
+  // Stage 1: combine within each warp.
+  cuda_intra_warp_reduction( value, join, max_active_thread );
+
+  // Stage 2: combine the per-warp values.
+  cuda_inter_warp_reduction( value, join, max_active_thread );
+}
+
+/** \brief  Grid-wide reduction: reduce within each block, then let the last block to
+ *          finish combine the per-block values stored in global scratch memory.
+ *
+ *  \param value              in: this thread's value; out: reduced value (see return).
+ *  \param neutral            value the accumulator is reset to before the per-block
+ *                            values are combined.
+ *  \param join               binary operation join(dst,src) used to combine values.
+ *  \param m_scratch_space    global scratch, indexed by blockIdx.x as an array of values.
+ *  \param result             not referenced in this function.
+ *  \param m_scratch_flags    global counter of finished blocks; reset to zero by the
+ *                            last block.
+ *  \param max_active_thread  forwarded to cuda_intra_block_reduction.
+ *
+ *  \return true only for threads with linear id < 32 of the block that observes itself
+ *          as the last one to increment the counter; false for every other thread.
+ */
+template< class FunctorType , class JoinOp , class ArgTag = void >
+__device__
+bool cuda_inter_block_reduction( typename FunctorValueTraits< FunctorType , ArgTag >::reference_type  value,
+                                 typename FunctorValueTraits< FunctorType , ArgTag >::reference_type  neutral,
+                                 const JoinOp& join,
+                                 Cuda::size_type * const m_scratch_space,
+                                 typename FunctorValueTraits< FunctorType , ArgTag >::pointer_type const result,
+                                 Cuda::size_type * const m_scratch_flags,
+                                 const int max_active_thread = blockDim.y) {
+  typedef typename FunctorValueTraits< FunctorType , ArgTag >::pointer_type pointer_type;
+  typedef typename FunctorValueTraits< FunctorType , ArgTag >::value_type value_type;
+
+  //Do the intra-block reduction with shfl operations and static shared memory
+  cuda_intra_block_reduction(value,join,max_active_thread);
+
+  // Linear thread id within the block (x fastest).
+  const unsigned id = threadIdx.y*blockDim.x + threadIdx.x;
+
+  //One thread in the block writes block result to global scratch_memory
+  if(id == 0 ) {
+    pointer_type global = ((pointer_type) m_scratch_space) + blockIdx.x;
+    *global = value;
+  }
+
+  //One warp of last block performs inter block reduction through loading the block values from global scratch_memory
+  bool last_block = false;
+
+  // NOTE(review): only a block-level barrier separates the scratch write above from the
+  // atomic increment below -- confirm that the write is visible to the last block.
+  __syncthreads();
+  if ( id < 32 ) {
+    Cuda::size_type count;
+
+    //Figure out whether this is the last block
+    // Only id 0 performs the atomic; the previous counter value is then shuffled
+    // from index 0 to the other threads taking part.
+    if(id == 0)
+      count = Kokkos::atomic_fetch_add(m_scratch_flags,1);
+    count = Kokkos::shfl(count,0,32);
+
+    //Last block does the inter block reduction
+    if( count == gridDim.x - 1) {
+      //set flag back to zero
+      if(id == 0)
+        *m_scratch_flags = 0;
+      last_block = true;
+      value = neutral;
+
+      pointer_type const volatile global = (pointer_type) m_scratch_space ;
+
+      //Reduce all global values with splitting work over threads in one warp
+      const int step_size = blockDim.x*blockDim.y < 32 ? blockDim.x*blockDim.y : 32;
+      for(int i=id; i<gridDim.x; i+=step_size) {
+        value_type tmp = global[i];
+        join(value, tmp);
+      }
+
+      //Perform shfl reductions within the warp only join if contribution is valid (allows gridDim.x non power of two and <32)
+      // Offsets 1,2,4,8,16; each stage is skipped when the block has no more
+      // threads than the offset.
+      if (blockDim.x*blockDim.y > 1) {
+        value_type tmp = Kokkos::shfl_down(value, 1,32);
+        if( id + 1 < gridDim.x )
+          join(value, tmp);
+      }
+      if (blockDim.x*blockDim.y > 2) {
+        value_type tmp = Kokkos::shfl_down(value, 2,32);
+        if( id + 2 < gridDim.x )
+          join(value, tmp);
+      }
+      if (blockDim.x*blockDim.y > 4) {
+        value_type tmp = Kokkos::shfl_down(value, 4,32);
+        if( id + 4 < gridDim.x )
+          join(value, tmp);
+      }
+      if (blockDim.x*blockDim.y > 8) {
+        value_type tmp = Kokkos::shfl_down(value, 8,32);
+        if( id + 8 < gridDim.x )
+          join(value, tmp);
+      }
+      if (blockDim.x*blockDim.y > 16) {
+        value_type tmp = Kokkos::shfl_down(value, 16,32);
+        if( id + 16 < gridDim.x )
+          join(value, tmp);
+      }
+    }
+  }
+
+  //The last block has in its thread=0 the global reduction value through "value"
+  return last_block;
+}
+
+//----------------------------------------------------------------------------
+// See section B.17 of Cuda C Programming Guide Version 3.2
+// for discussion of
+//   __launch_bounds__(maxThreadsPerBlock,minBlocksPerMultiprocessor)
+// function qualifier which could be used to improve performance.
+//----------------------------------------------------------------------------
+// Maximize shared memory and minimize L1 cache:
+//   cudaFuncSetCacheConfig(MyKernel, cudaFuncCachePreferShared );
+// For 2.0 capability: 48 KB shared and 16 KB L1
+//----------------------------------------------------------------------------
+//----------------------------------------------------------------------------
+/*
+ *  Algorithmic constraints:
+ *   (a) blockDim.y is a power of two
+ *   (b) blockDim.y <= 512
+ *   (c) blockDim.x == blockDim.z == 1
+ */
+
+/** \brief  In-place block-level reduction, and optionally scan, over per-thread values
+ *          stored contiguously at 'base_data'.
+ *
+ *  Thread threadIdx.y owns the entry at base_data + value_count * threadIdx.y, where
+ *  value_count = ValueTraits::value_count(functor). Entries are combined with
+ *  ValueJoin::join( functor , dst , src ), always joining a lower-indexed entry into a
+ *  higher-indexed one, so the total ends up in the last thread's entry.
+ *  When DoScan is true a second sweep distributes the partial results.
+ *
+ *  Aborts when blockDim.y is not a power of two. Contains __syncthreads(), so it has
+ *  to be called by all threads of the block.
+ */
+template< bool DoScan , class FunctorType , class ArgTag >
+__device__
+void cuda_intra_block_reduce_scan( const FunctorType & functor ,
+                                   const typename FunctorValueTraits< FunctorType , ArgTag >::pointer_type base_data )
+{
+  typedef FunctorValueTraits< FunctorType , ArgTag >  ValueTraits ;
+  typedef FunctorValueJoin<   FunctorType , ArgTag >  ValueJoin ;
+
+  typedef typename ValueTraits::pointer_type  pointer_type ;
+
+  const unsigned value_count   = ValueTraits::value_count( functor );
+  const unsigned BlockSizeMask = blockDim.y - 1 ;
+
+  // Must have power of two thread count
+
+  if ( BlockSizeMask & blockDim.y ) { Kokkos::abort("Cuda::cuda_intra_block_scan requires power-of-two blockDim"); }
+
+// Reduce step S: when the low S+1 bits of the reversed thread id R are all zero,
+// join the entry located (1<<S) threads below TD into TD.
+#define BLOCK_REDUCE_STEP( R , TD , S )  \
+  if ( ! ( R & ((1<<(S+1))-1) ) ) { ValueJoin::join( functor , TD , (TD - (value_count<<S)) ); }
+
+// Scan step S: when the selected offset N equals (1<<S),
+// join the entry located (1<<S) threads below TD into TD.
+#define BLOCK_SCAN_STEP( TD , N , S )  \
+  if ( N == (1<<S) ) { ValueJoin::join( functor , TD , (TD - (value_count<<S))); }
+
+  // Reversed thread id: XOR with the all-ones mask maps the last thread to 0.
+  const unsigned     rtid_intra = threadIdx.y ^ BlockSizeMask ;
+  const pointer_type tdata_intra = base_data + value_count * threadIdx.y ;
+
+  // NOTE(review): no barrier or fence separates the five steps below; presumably this
+  // relies on the threads of a warp executing in lock-step -- confirm.
+  { // Intra-warp reduction:
+    BLOCK_REDUCE_STEP(rtid_intra,tdata_intra,0)
+    BLOCK_REDUCE_STEP(rtid_intra,tdata_intra,1)
+    BLOCK_REDUCE_STEP(rtid_intra,tdata_intra,2)
+    BLOCK_REDUCE_STEP(rtid_intra,tdata_intra,3)
+    BLOCK_REDUCE_STEP(rtid_intra,tdata_intra,4)
+  }
+
+  __syncthreads(); // Wait for all warps to reduce
+
+  { // Inter-warp reduce-scan by a single warp to avoid extra synchronizations
+    // Spread the reversed ids by CudaTraits::WarpIndexShift bits (defined elsewhere,
+    // presumably log2 of the warp size); only ids still inside the block take part.
+    const unsigned rtid_inter = ( threadIdx.y ^ BlockSizeMask ) << CudaTraits::WarpIndexShift ;
+
+    if ( rtid_inter < blockDim.y ) {
+
+      const pointer_type tdata_inter = base_data + value_count * ( rtid_inter ^ BlockSizeMask );
+
+      // Steps 5..8 are only needed when the block is large enough for that stride.
+      if ( (1<<5) < BlockSizeMask ) {                        BLOCK_REDUCE_STEP(rtid_inter,tdata_inter,5) }
+      if ( (1<<6) < BlockSizeMask ) { __threadfence_block(); BLOCK_REDUCE_STEP(rtid_inter,tdata_inter,6) }
+      if ( (1<<7) < BlockSizeMask ) { __threadfence_block(); BLOCK_REDUCE_STEP(rtid_inter,tdata_inter,7) }
+      if ( (1<<8) < BlockSizeMask ) { __threadfence_block(); BLOCK_REDUCE_STEP(rtid_inter,tdata_inter,8) }
+
+      if ( DoScan ) {
+
+        // Lowest set bit of rtid_inter among 32,64,128,256 (0 if none is set).
+        int n = ( rtid_inter &  32 ) ?  32 : (
+                ( rtid_inter &  64 ) ?  64 : (
+                ( rtid_inter & 128 ) ? 128 : (
+                ( rtid_inter & 256 ) ? 256 : 0 )));
+
+        // Skip the step when the source entry would lie outside the block.
+        if ( ! ( rtid_inter + n < blockDim.y ) ) n = 0 ;
+
+        BLOCK_SCAN_STEP(tdata_inter,n,8)
+        BLOCK_SCAN_STEP(tdata_inter,n,7)
+        BLOCK_SCAN_STEP(tdata_inter,n,6)
+        BLOCK_SCAN_STEP(tdata_inter,n,5)
+      }
+    }
+  }
+
+  __syncthreads(); // Wait for inter-warp reduce-scan to complete
+
+  if ( DoScan ) {
+    // Lowest set bit of rtid_intra among 1,2,4,8,16 (0 if none is set).
+    int n = ( rtid_intra &  1 ) ?  1 : (
+            ( rtid_intra &  2 ) ?  2 : (
+            ( rtid_intra &  4 ) ?  4 : (
+            ( rtid_intra &  8 ) ?  8 : (
+            ( rtid_intra & 16 ) ? 16 : 0 ))));
+
+    // Skip the step when the source entry would lie outside the block.
+    if ( ! ( rtid_intra + n < blockDim.y ) ) n = 0 ;
+
+    BLOCK_SCAN_STEP(tdata_intra,n,4) __threadfence_block();
+    BLOCK_SCAN_STEP(tdata_intra,n,3) __threadfence_block();
+    BLOCK_SCAN_STEP(tdata_intra,n,2) __threadfence_block();
+    BLOCK_SCAN_STEP(tdata_intra,n,1) __threadfence_block();
+    BLOCK_SCAN_STEP(tdata_intra,n,0)
+  }
+
+#undef BLOCK_SCAN_STEP
+#undef BLOCK_REDUCE_STEP
+}
+
+//----------------------------------------------------------------------------
+/**\brief  Input value-per-thread starting at 'shared_data'.
+ *         Reduction value at last thread's location.
+ *
+ *  If 'DoScan' then write blocks' scan values and block-groups' scan values.
+ *
+ *  Global reduce result is in the last threads' 'shared_data' location.
+ *
+ *  Steps: (1) reduce the block's values in 'shared_data'; (2) copy the block total to
+ *  slot 'block_id' of 'global_data'; (3) count finished blocks through 'global_flags';
+ *  (4) in the block that finishes last, combine all per-block totals and, if 'DoScan',
+ *  write the scanned per-block values back to 'global_data'.
+ *
+ *  \param functor       provides value size, init and join through the Functor* traits.
+ *  \param block_id      index of this block's slot in 'global_data'.
+ *  \param block_count   number of contributing blocks.
+ *  \param shared_data   per-thread values; with 'DoScan' the slot at index blockDim.y
+ *                       is used as well.
+ *  \param global_data   scratch holding one value per block.
+ *  \param global_flags  counter of finished blocks.
+ *
+ *  \return true for all threads of the last block to finish, false otherwise.
+ *
+ *  Aborts when blockDim.y is not a power of two. Contains block-wide barriers, so it
+ *  has to be called by all threads of the block.
+ */
+template< bool DoScan , class FunctorType , class ArgTag >
+__device__
+bool cuda_single_inter_block_reduce_scan( const FunctorType     & functor ,
+                                          const Cuda::size_type   block_id ,
+                                          const Cuda::size_type   block_count ,
+                                          Cuda::size_type * const shared_data ,
+                                          Cuda::size_type * const global_data ,
+                                          Cuda::size_type * const global_flags )
+{
+  typedef Cuda::size_type                  size_type ;
+  typedef FunctorValueTraits< FunctorType , ArgTag >  ValueTraits ;
+  typedef FunctorValueJoin<   FunctorType , ArgTag >  ValueJoin ;
+  typedef FunctorValueInit<   FunctorType , ArgTag >  ValueInit ;
+  typedef FunctorValueOps<    FunctorType , ArgTag >  ValueOps ;
+
+  typedef typename ValueTraits::pointer_type    pointer_type ;
+  typedef typename ValueTraits::reference_type  reference_type ;
+
+  // '__ffs' = position of the least significant bit set to 1.
+  // 'blockDim.y' is guaranteed to be a power of two so this
+  // is the integral shift value that can replace an integral divide.
+  const unsigned BlockSizeShift = __ffs( blockDim.y ) - 1 ;
+  const unsigned BlockSizeMask  = blockDim.y - 1 ;
+
+  // Must have power of two thread count
+  if ( BlockSizeMask & blockDim.y ) { Kokkos::abort("Cuda::cuda_single_inter_block_reduce_scan requires power-of-two blockDim"); }
+
+  // Size of one value measured in size_type words.
+  const integral_nonzero_constant< size_type , ValueTraits::StaticValueSize / sizeof(size_type) >
+    word_count( ValueTraits::value_size( functor ) / sizeof(size_type) );
+
+  // Reduce the accumulation for the entire block.
+  cuda_intra_block_reduce_scan<false,FunctorType,ArgTag>( functor , pointer_type(shared_data) );
+
+  {
+    // Write accumulation total to global scratch space.
+    // Accumulation total is the last thread's data.
+    size_type * const shared = shared_data + word_count.value * BlockSizeMask ;
+    size_type * const global = global_data + word_count.value * block_id ;
+
+    // Below compute capability 5.0 the words are split over the threads of the block;
+    // otherwise every thread copies all words.
+#if (__CUDA_ARCH__ < 500)
+    for ( size_type i = threadIdx.y ; i < word_count.value ; i += blockDim.y ) { global[i] = shared[i] ; }
+#else
+    for ( size_type i = 0 ; i < word_count.value ; i += 1 ) { global[i] = shared[i] ; }
+#endif
+
+  }
+
+  // Contributing blocks note that their contribution has been completed via an atomic-increment flag
+  // If this block is not the last block to contribute to this group then the block is done.
+  // Only thread 0 performs the atomicInc (which wraps back to zero at block_count - 1);
+  // __syncthreads_or distributes the outcome to all threads and acts as a barrier.
+  const bool is_last_block =
+    ! __syncthreads_or( threadIdx.y ? 0 : ( 1 + atomicInc( global_flags , block_count - 1 ) < block_count ) );
+
+  if ( is_last_block ) {
+
+    // Range [b,e) of per-block values handled by this thread; the division by
+    // blockDim.y is done as a shift.
+    const size_type b = ( long(block_count) * long(threadIdx.y) ) >> BlockSizeShift ;
+    const size_type e = ( long(block_count) * long( threadIdx.y + 1 ) ) >> BlockSizeShift ;
+
+    {
+      // Re-initialize this thread's shared entry and accumulate its range into it.
+      // 'shared_value' itself is not read afterwards; only the init call matters.
+      void * const shared_ptr = shared_data + word_count.value * threadIdx.y ;
+      reference_type shared_value = ValueInit::init( functor , shared_ptr );
+
+      for ( size_type i = b ; i < e ; ++i ) {
+        ValueJoin::join( functor , shared_ptr , global_data + word_count.value * i );
+      }
+    }
+
+    cuda_intra_block_reduce_scan<DoScan,FunctorType,ArgTag>( functor , pointer_type(shared_data) );
+
+    if ( DoScan ) {
+
+      // Start from the previous thread's entry; thread 0 has no predecessor and uses
+      // the extra slot at index blockDim.y, which it initializes itself.
+      size_type * const shared_value = shared_data + word_count.value * ( threadIdx.y ? threadIdx.y - 1 : blockDim.y );
+
+      if ( ! threadIdx.y ) { ValueInit::init( functor , shared_value ); }
+
+      // Join previous inclusive scan value to each member
+      for ( size_type i = b ; i < e ; ++i ) {
+        size_type * const global_value = global_data + word_count.value * i ;
+        ValueJoin::join( functor , shared_value , global_value );
+        ValueOps ::copy( functor , global_value , shared_value );
+      }
+    }
+  }
+
+  return is_last_block ;
+}
+
+// Size in bytes required for inter block reduce or scan:
+// ( BlockSize + 2 ) values of the size reported by FunctorValueTraits::value_size.
+template< bool DoScan , class FunctorType , class ArgTag >
+inline
+unsigned cuda_single_inter_block_reduce_scan_shmem( const FunctorType & functor , const unsigned BlockSize )
+{
+  typedef Impl::FunctorValueTraits< FunctorType , ArgTag >  ValueTraits ;
+
+  // One slot per thread plus two additional slots.
+  const unsigned slot_count = BlockSize + 2 ;
+
+  return slot_count * ValueTraits::value_size( functor );
+}
+
+} // namespace Impl
+} // namespace Kokkos
+
+//----------------------------------------------------------------------------
+//----------------------------------------------------------------------------
+
+#endif /* #if defined( __CUDACC__ ) */
+#endif /* KOKKOS_CUDA_REDUCESCAN_HPP */
+
diff --git a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Task.cpp b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Task.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..701d267e1ba39413061afd337ac19c7d6acaacfc
--- /dev/null
+++ b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Task.cpp
@@ -0,0 +1,179 @@
+/*
+//@HEADER
+// ************************************************************************
+// 
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+// 
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+// 
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact  H. Carter Edwards (hcedwar@sandia.gov)
+// 
+// ************************************************************************
+//@HEADER
+*/
+
+#include <Kokkos_Core.hpp>
+
+#if defined( KOKKOS_HAVE_CUDA ) && defined( KOKKOS_ENABLE_TASKPOLICY )
+
+#include <impl/Kokkos_TaskQueue_impl.hpp>
+
+//----------------------------------------------------------------------------
+//----------------------------------------------------------------------------
+
+namespace Kokkos {
+namespace Impl {
+
+template class TaskQueue< Kokkos::Cuda > ;
+
+//----------------------------------------------------------------------------
+// Device-side task loop called by the task kernel; exits once lane 0 reads a non-positive queue->m_ready_count.
+__device__
+void TaskQueueSpecialization< Kokkos::Cuda >::driver
+  ( TaskQueueSpecialization< Kokkos::Cuda >::queue_type * const queue )
+{
+  using Member = TaskExec< Kokkos::Cuda > ;
+  using Queue  = TaskQueue< Kokkos::Cuda > ;
+  using task_root_type = TaskBase< Kokkos::Cuda , void , void > ;
+  // 'end' is a sentinel pointer value: work may exist but no task has been popped yet.
+  task_root_type * const end = (task_root_type *) task_root_type::EndTag ;
+  // Two member handles: team size 1 for single thread tasks, blockDim.y for team tasks.
+  Member single_exec( 1 );
+  Member team_exec( blockDim.y );
+  // Linear index over (threadIdx.x,threadIdx.y); the thread with index 0 polls the queues.
+  const int warp_lane = threadIdx.x + threadIdx.y * blockDim.x ;
+  // Pointer overlaid with two ints for the shuffles below. NOTE(review): assumes sizeof(pointer) == 2*sizeof(int) -- confirm.
+  union {
+    task_root_type * ptr ;
+    int              raw[2] ;
+  } task ;
+
+  // Loop until all queues are empty and no tasks in flight
+
+  do {
+
+    // Each team lead attempts to acquire either a thread team task
+    // or collection of single thread tasks for the team.
+
+    if ( 0 == warp_lane ) {
+      // Positive ready count: start from the 'end' sentinel; otherwise null, which ends the loop below.
+      task.ptr = 0 < *((volatile int *) & queue->m_ready_count) ? end : 0 ;
+
+      // Loop by priority and then type, stopping at the first pop that yields something other than 'end'
+      for ( int i = 0 ; i < Queue::NumQueue && end == task.ptr ; ++i ) {
+        for ( int j = 0 ; j < 2 && end == task.ptr ; ++j ) {
+          task.ptr = Queue::pop_task( & queue->m_ready[i][j] );
+        }
+      }
+
+#if 0
+printf("TaskQueue<Cuda>::driver(%d,%d) task(%lx)\n",threadIdx.z,blockIdx.x
+      , uintptr_t(task.ptr));
+#endif
+
+    }
+
+    // shuffle broadcast of lane 0's pointer, one int at a time
+
+    task.raw[0] = __shfl( task.raw[0] , 0 );
+    task.raw[1] = __shfl( task.raw[1] , 0 );
+
+    if ( 0 == task.ptr ) break ; // 0 == queue->m_ready_count
+    // When the pointer is still 'end' nothing was popped this pass: go around again.
+    if ( end != task.ptr ) {
+      if ( task_root_type::TaskTeam == task.ptr->m_task_type ) {
+        // Thread Team Task
+        (*task.ptr->m_apply)( task.ptr , & team_exec );
+      }
+      else if ( 0 == threadIdx.y ) {
+        // Single Thread Task
+        (*task.ptr->m_apply)( task.ptr , & single_exec );
+      }
+      // Only the lead thread reports the task back to the queue.
+      if ( 0 == warp_lane ) {
+        queue->complete( task.ptr ); 
+      }
+    }
+  } while(1);
+}
+
+namespace {
+// Kernel entry point: each launched thread calls the Cuda task queue driver.
+__global__ void cuda_task_queue_execute( TaskQueue< Kokkos::Cuda > * queue )
+{
+  TaskQueueSpecialization< Kokkos::Cuda >::driver( queue );
+}
+} // namespace
+
+void TaskQueueSpecialization< Kokkos::Cuda >::execute
+  ( TaskQueue< Kokkos::Cuda > * const queue )
+{
+  // Launch geometry: as many blocks as cuda_internal_multiprocessor_count() reports;
+  // each block is team_count teams (z) of WarpSize members (y) with vector length 1 (x).
+  const int team_count = 4 ;
+  const dim3 launch_grid( Kokkos::Impl::cuda_internal_multiprocessor_count() , 1 , 1 );
+  const dim3 launch_block( 1 , Kokkos::Impl::CudaTraits::WarpSize , team_count );
+  const int dynamic_shmem = 0 ;
+  const cudaStream_t default_stream = 0 ;
+
+  CUDA_SAFE_CALL( cudaDeviceSynchronize() ); // wait for earlier device work
+
+#if 0
+printf("cuda_task_queue_execute before\n");
+#endif
+
+  // The device stack size, in bytes, can be queried with:
+  //
+  //   size_t stack_size = 0 ;
+  //   CUDA_SAFE_CALL( cudaDeviceGetLimit( & stack_size , cudaLimitStackSize ) );
+  //
+  // and, when it is not large enough, set with:
+  //
+  //   CUDA_SAFE_CALL( cudaDeviceSetLimit( cudaLimitStackSize , stack_size ) );
+
+  cuda_task_queue_execute<<< launch_grid , launch_block , dynamic_shmem , default_stream >>>( queue );
+
+  CUDA_SAFE_CALL( cudaGetLastError() );      // surface an error from the launch
+  CUDA_SAFE_CALL( cudaDeviceSynchronize() ); // wait until the task kernel has finished
+
+#if 0
+printf("cuda_task_queue_execute after\n");
+#endif
+}
+
+}} /* namespace Kokkos::Impl */
+
+//----------------------------------------------------------------------------
+
+#endif /* #if defined( KOKKOS_HAVE_CUDA ) && defined( KOKKOS_ENABLE_TASKPOLICY ) */
+
+
diff --git a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Task.hpp b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Task.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..9d9347cc8d57c0c04a228fb0291c0f4e90b6243f
--- /dev/null
+++ b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Task.hpp
@@ -0,0 +1,519 @@
+/*
+//@HEADER
+// ************************************************************************
+// 
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+// 
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+// 
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact  H. Carter Edwards (hcedwar@sandia.gov)
+// 
+// ************************************************************************
+//@HEADER
+*/
+
+#ifndef KOKKOS_IMPL_CUDA_TASK_HPP
+#define KOKKOS_IMPL_CUDA_TASK_HPP
+
+#if defined( KOKKOS_ENABLE_TASKPOLICY )
+
+//----------------------------------------------------------------------------
+//----------------------------------------------------------------------------
+
+namespace Kokkos {
+namespace Impl {
+namespace {
+// Kernel that writes TaskType::apply, as evaluated in device code, into *ptr.
+template< typename TaskType >
+__global__ void set_cuda_task_base_apply_function_pointer
+  ( TaskBase<Kokkos::Cuda,void,void>::function_type * ptr )
+{
+  *ptr = TaskType::apply ;
+}
+} // namespace
+
+template<>
+class TaskQueueSpecialization< Kokkos::Cuda >
+{
+public:
+
+  using execution_space = Kokkos::Cuda ;
+  using memory_space    = Kokkos::CudaUVMSpace ;
+  using queue_type      = TaskQueue< execution_space > ;
+
+  // No-op in the Cuda specialization.
+  static void iff_single_thread_recursive_execute( queue_type * const ) {}
+
+  // Device-side task execution loop (declared here, defined elsewhere).
+  __device__ static void driver( queue_type * const );
+
+  // Host-side entry point (declared here, defined elsewhere).
+  static void execute( queue_type * const );
+
+  // Stores the 'apply' function of the TaskBase specialization for FunctorType
+  // into *ptr by launching a <<<1,1>>> kernel, synchronizing before and after.
+  template< typename FunctorType >
+  static void proc_set_apply( TaskBase<execution_space,void,void>::function_type * ptr )
+  {
+    typedef TaskBase< execution_space , typename FunctorType::value_type , FunctorType >
+      task_type ;
+
+    CUDA_SAFE_CALL( cudaDeviceSynchronize() );
+
+    set_cuda_task_base_apply_function_pointer< task_type ><<< 1 , 1 >>>( ptr );
+
+    CUDA_SAFE_CALL( cudaGetLastError() );
+    CUDA_SAFE_CALL( cudaDeviceSynchronize() );
+  }
+};
+
+extern template class TaskQueue< Kokkos::Cuda > ;
+
+//----------------------------------------------------------------------------
+/**\brief  Impl::TaskExec<Cuda> is the member_type handed to tasks that
+ *         run in a Cuda space (TaskPolicy<Cuda>::member_type).
+ *
+ *  Thread blocks used for tasking have the shape:
+ *    blockDim.x == vector length
+ *    blockDim.y == team size
+ *    blockDim.z == number of teams
+ *  subject to
+ *    blockDim.x * blockDim.y == WarpSize
+ *
+ *  A full Cuda warp runs every task, single thread or thread team alike.
+ *  For a single thread task only warp lane #0 makes the call; the other
+ *  lanes of that warp stay idle.
+ */
+template<>
+class TaskExec< Kokkos::Cuda >
+{
+private:
+
+  // Neither copyable nor movable.
+  TaskExec( TaskExec const & ) = delete ;
+  TaskExec( TaskExec && ) = delete ;
+  TaskExec & operator = ( TaskExec const & ) = delete ;
+  TaskExec & operator = ( TaskExec && ) = delete ;
+  friend class Kokkos::Impl::TaskQueue< Kokkos::Cuda > ;
+  friend class Kokkos::Impl::TaskQueueSpecialization< Kokkos::Cuda > ;
+
+  const int m_team_size ;
+
+  // Private: only the friend classes above can construct one, in device code.
+  __device__
+  TaskExec( int team_size_arg = blockDim.y ) : m_team_size( team_size_arg ) {}
+
+public:
+
+#if defined( __CUDA_ARCH__ )
+  // Device pass: the rank is threadIdx.y, the size is the constructor argument.
+  __device__ void team_barrier() { /* __threadfence_block(); */ }
+  __device__ int  team_rank() const { return threadIdx.y ; }
+  __device__ int  team_size() const { return m_team_size ; }
+#else
+  __host__ void team_barrier() {}
+  __host__ int  team_rank() const { return 0 ; }
+  __host__ int  team_size() const { return 0 ; }
+#endif
+};
+
+//----------------------------------------------------------------------------
+
+template<typename iType>
+struct TeamThreadRangeBoundariesStruct<iType, TaskExec< Kokkos::Cuda > >
+{
+  using index_type = iType ;
+
+  const iType start ;      // first index visited by the calling thread
+  const iType end ;        // exclusive upper bound of the range
+  const iType increment ;  // distance between indices visited by one thread
+  const TaskExec< Kokkos::Cuda > & thread ;
+
+#if defined( __CUDA_ARCH__ )
+
+  // Range [0,count): start at threadIdx.y and stride by blockDim.y.
+  __device__ inline
+  TeamThreadRangeBoundariesStruct
+    ( const TaskExec< Kokkos::Cuda > & member
+    , const iType & count )
+    : start( threadIdx.y )
+    , end( count )
+    , increment( blockDim.y )
+    , thread( member )
+    {}
+
+  // Range [begin,finish): start at begin + threadIdx.y and stride by blockDim.y.
+  __device__ inline
+  TeamThreadRangeBoundariesStruct( const TaskExec< Kokkos::Cuda > & member
+                                 , const iType & begin
+                                 , const iType & finish )
+    : start( begin + threadIdx.y )
+    , end( finish )
+    , increment( blockDim.y )
+    , thread( member )
+    {}
+
+#else
+
+  // Host pass: the constructors are only declared here.
+  TeamThreadRangeBoundariesStruct( const TaskExec< Kokkos::Cuda > & , const iType & );
+
+  TeamThreadRangeBoundariesStruct( const TaskExec< Kokkos::Cuda > &
+                                 , const iType &
+                                 , const iType & );
+
+#endif
+
+};
+
+//----------------------------------------------------------------------------
+
+template<typename iType>
+struct ThreadVectorRangeBoundariesStruct<iType, TaskExec< Kokkos::Cuda > >
+{
+  using index_type = iType ;
+
+  const iType start ;      // first index visited by the calling thread
+  const iType end ;        // exclusive upper bound of the range
+  const iType increment ;  // distance between indices visited by one thread
+  const TaskExec< Kokkos::Cuda > & thread ;
+
+#if defined( __CUDA_ARCH__ )
+
+  // Range [0,count): start at threadIdx.x and stride by blockDim.x.
+  __device__ inline
+  ThreadVectorRangeBoundariesStruct( const TaskExec< Kokkos::Cuda > & member , const iType & count )
+    : start( threadIdx.x )
+    , end( count )
+    , increment( blockDim.x )
+    , thread( member )
+    {}
+
+#else
+  // Host pass: the constructor is only declared here.
+  ThreadVectorRangeBoundariesStruct( const TaskExec< Kokkos::Cuda > & , const iType & );
+
+#endif
+
+};
+
+}} /* namespace Kokkos::Impl */
+
+//----------------------------------------------------------------------------
+
+namespace Kokkos {
+// Team-level range [0,count) for a task running with a Cuda TaskExec member.
+template<typename iType>
+KOKKOS_INLINE_FUNCTION
+Impl::TeamThreadRangeBoundariesStruct<iType,Impl::TaskExec< Kokkos::Cuda > >
+TeamThreadRange( const Impl::TaskExec< Kokkos::Cuda > & thread , const iType & count )
+{
+  typedef Impl::TeamThreadRangeBoundariesStruct<iType,Impl::TaskExec< Kokkos::Cuda > > range_type ;
+  return range_type( thread , count );
+}
+// Team-level range [start,end) for a task running with a Cuda TaskExec member.
+template<typename iType>
+KOKKOS_INLINE_FUNCTION
+Impl::TeamThreadRangeBoundariesStruct<iType,Impl::TaskExec< Kokkos::Cuda > >
+TeamThreadRange( const Impl::TaskExec< Kokkos::Cuda > & thread, const iType & start , const iType & end )
+{
+  return Impl::TeamThreadRangeBoundariesStruct< iType , Impl::TaskExec< Kokkos::Cuda > >( thread , start , end );
+}
+// Vector-level range [0,count) for a task running with a Cuda TaskExec member.
+template<typename iType>
+KOKKOS_INLINE_FUNCTION
+Impl::ThreadVectorRangeBoundariesStruct<iType,Impl::TaskExec< Kokkos::Cuda > >
+ThreadVectorRange( const Impl::TaskExec< Kokkos::Cuda > & thread , const iType & count )
+{
+  typedef Impl::ThreadVectorRangeBoundariesStruct<iType,Impl::TaskExec< Kokkos::Cuda > > range_type ;
+  return range_type( thread , count );
+}
+
+/** \brief  Inter-thread parallel_for: calls lambda(i) for the indices of the range.
+ *
+ * The range is mapped onto the threads of the calling thread team: each thread
+ * visits start, start+increment, ... while below end.  Requires C++11 support.
+ */
+template<typename iType, class Lambda>
+KOKKOS_INLINE_FUNCTION
+void parallel_for
+  ( const Impl::TeamThreadRangeBoundariesStruct<iType,Impl:: TaskExec< Kokkos::Cuda > >& loop_boundaries , const Lambda& lambda )
+{
+  iType i = loop_boundaries.start ;
+  while ( i < loop_boundaries.end ) {
+    lambda(i);
+    i += loop_boundaries.increment ;
+  }
+}
+
+// Reduction across corresponding lanes of the team members within a warp,
+// halving the shuffle distance down to 'stride'.  Assumes stride*team_size == warp_size.
+template< typename ValueType, class JoinType >
+KOKKOS_INLINE_FUNCTION
+void strided_shfl_warp_reduction
+  (const JoinType& join,
+   ValueType& val,
+   int team_size, int stride)
+{
+  const int width = team_size * stride ;
+  for ( int delta = width >> 1 ; stride <= delta ; delta >>= 1 ) {
+    join( val , Kokkos::shfl_down( val , delta , width ) );
+  }
+}
+
+// Multiple within-warp, non-strided reductions: the shuffle width is vec_length.
+template< typename ValueType, class JoinType >
+KOKKOS_INLINE_FUNCTION
+void multi_shfl_warp_reduction
+  (const JoinType& join, ValueType& val, int vec_length)
+{
+  int delta = vec_length >> 1 ;
+  while ( delta ) {
+    join( val , Kokkos::shfl_down( val , delta , vec_length ) );
+    delta >>= 1 ;
+  }
+}
+
+// Broadcast within a warp: forwards to Kokkos::shfl with the given
+// source lane and width and returns what it yields.
+template< class ValueType >
+KOKKOS_INLINE_FUNCTION
+ValueType shfl_warp_broadcast
+  ( ValueType& val , int src_lane , int width )
+{
+  ValueType received = Kokkos::shfl( val , src_lane , width );
+  return received ;
+}
+
+// All-reduce across the team members of a warp, separately for each vector lane.
+// Assumes vec_length*team_size == warp_size, where
+//   blockDim.x == vec_length == stride , threadIdx.x == position in vec
+//   blockDim.y == team_size            , threadIdx.y == member number
+// On return initialized_result holds the joined value on every participating thread.
+template< typename iType, class Lambda, typename ValueType, class JoinType >
+KOKKOS_INLINE_FUNCTION
+void parallel_reduce
+  (const Impl::TeamThreadRangeBoundariesStruct<iType,Impl::TaskExec< Kokkos::Cuda > >& loop_boundaries,
+   const Lambda & lambda,
+   const JoinType& join,
+   ValueType& initialized_result) {
+
+  // Accumulate this thread's contribution in a local copy, then store it back.
+  ValueType partial = initialized_result;
+  for( iType i = loop_boundaries.start; i < loop_boundaries.end; i+=loop_boundaries.increment) {
+    lambda(i,partial);
+  }
+  initialized_result = partial;
+
+  // Join the per-member values that share a vector position.
+  const int team_size = loop_boundaries.thread.team_size();
+  strided_shfl_warp_reduction<ValueType, JoinType>( join , initialized_result , team_size , blockDim.x );
+
+  // Every thread fetches the value held by lane threadIdx.x.
+  initialized_result = shfl_warp_broadcast<ValueType>( initialized_result, threadIdx.x, Impl::CudaTraits::WarpSize );
+}
+
+// All-reduce (sum) across the team members of a warp, separately for each vector lane.
+// No join() is supplied, so values are combined with operator+=.
+// Assumes vec_length*team_size == warp_size, where
+//   blockDim.x == vec_length == stride , threadIdx.x == position in vec
+//   blockDim.y == team_size            , threadIdx.y == member number
+// On return initialized_result holds the sum on every participating thread.
+template< typename iType, class Lambda, typename ValueType >
+KOKKOS_INLINE_FUNCTION
+void parallel_reduce
+  (const Impl::TeamThreadRangeBoundariesStruct<iType,Impl::TaskExec< Kokkos::Cuda > >& loop_boundaries,
+   const Lambda & lambda,
+   ValueType& initialized_result) {
+
+  // Accumulate this thread's contribution in a local copy, then store it back.
+  ValueType partial = initialized_result;
+  for( iType i = loop_boundaries.start; i < loop_boundaries.end; i+=loop_boundaries.increment) {
+    lambda(i,partial);
+  }
+  initialized_result = partial;
+
+  // Sum the per-member values that share a vector position.
+  const auto sum = [&] (ValueType& val1, const ValueType& val2) { val1 += val2; };
+  strided_shfl_warp_reduction( sum , initialized_result
+                             , loop_boundaries.thread.team_size() , blockDim.x );
+
+  // Every thread fetches the value held by lane threadIdx.x.
+  initialized_result = shfl_warp_broadcast<ValueType>( initialized_result, threadIdx.x, Impl::CudaTraits::WarpSize );
+}
+
+// All-reduce over the vector lanes of one team member within a warp.
+// Assumes vec_length*team_size == warp_size, where
+//   blockDim.x == vec_length == stride , threadIdx.x == position in vec
+//   blockDim.y == team_size            , threadIdx.y == member number
+// On return initialized_result holds the joined value on every lane of the vector.
+template< typename iType, class Lambda, typename ValueType, class JoinType >
+KOKKOS_INLINE_FUNCTION
+void parallel_reduce
+  (const Impl::ThreadVectorRangeBoundariesStruct<iType,Impl::TaskExec< Kokkos::Cuda > >& loop_boundaries,
+   const Lambda & lambda,
+   const JoinType& join,
+   ValueType& initialized_result) {
+
+  ValueType partial = initialized_result;
+  for( iType i = loop_boundaries.start; i < loop_boundaries.end; i+=loop_boundaries.increment) {
+    lambda(i,partial);
+  }
+  initialized_result = partial;
+
+  // Join within groups of blockDim.x lanes, then hand group lane 0's value to the whole group.
+  multi_shfl_warp_reduction<ValueType, JoinType>( join , initialized_result , blockDim.x );
+  initialized_result = shfl_warp_broadcast<ValueType>( initialized_result, 0, blockDim.x );
+}
+
+// All-reduce (sum) over the vector lanes of one team member within a warp.
+// No join() is supplied, so values are combined with operator+=.
+// Assumes vec_length*team_size == warp_size, where
+//   blockDim.x == vec_length == stride
+//   blockDim.y == team_size
+//   threadIdx.x == position in vec , threadIdx.y == member number
+// On return initialized_result holds the sum on every lane of the vector.
+template< typename iType, class Lambda, typename ValueType >
+KOKKOS_INLINE_FUNCTION
+void parallel_reduce
+  (const Impl::ThreadVectorRangeBoundariesStruct<iType,Impl::TaskExec< Kokkos::Cuda > >& loop_boundaries,
+   const Lambda & lambda,
+   ValueType& initialized_result) {
+
+  // Accumulate this thread's contribution in a local copy, then store it back.
+  ValueType partial = initialized_result;
+  for( iType i = loop_boundaries.start; i < loop_boundaries.end; i+=loop_boundaries.increment) {
+    lambda(i,partial);
+  }
+  initialized_result = partial;
+
+  // Sum within groups of blockDim.x lanes.
+  const auto sum =
+    [&] (ValueType& val1, const ValueType& val2) { val1 += val2; };
+  multi_shfl_warp_reduction( sum , initialized_result , blockDim.x );
+
+  // Hand group lane 0's value to every lane of the group.
+  initialized_result = shfl_warp_broadcast<ValueType>( initialized_result, 0, blockDim.x );
+}
+
+// Exclusive scan across corresponding vector lanes between team members within warp
+// assume vec_length*team_size == warp_size 
+// blockDim.x == vec_length == stride
+// blockDim.y == team_size
+// threadIdx.x == position in vec
+// threadIdx.y == member number
+template< typename ValueType, typename iType, class Lambda >
+KOKKOS_INLINE_FUNCTION
+void parallel_scan
+  (const Impl::TeamThreadRangeBoundariesStruct<iType,Impl::TaskExec< Kokkos::Cuda > >& loop_boundaries,
+   const Lambda & lambda) {
+  // lambda(i,val,false) supplies a contribution in val; lambda(i,val,true) then receives the exclusive prefix.
+  ValueType accum = 0 ;  // running total carried over from earlier loop iterations
+  ValueType val, y, local_total;
+  // NOTE(review): the shuffles sit inside a loop whose trip count differs between members when the range is not a multiple of blockDim.y -- confirm callers avoid that.
+  for( iType i = loop_boundaries.start; i < loop_boundaries.end; i+=loop_boundaries.increment) {
+    val = 0;
+    lambda(i,val,false);
+
+    // scan of 'val' over the team members (threadIdx.y): inclusive first, made exclusive below
+    // accum = accumulated, sum in total for this iteration
+
+    // INCLUSIVE scan
+    for( int offset = blockDim.x ; offset < Impl::CudaTraits::WarpSize ; offset <<= 1 ) {
+      y = Kokkos::shfl_up(val, offset, Impl::CudaTraits::WarpSize);
+      if(threadIdx.y*blockDim.x >= offset) { val += y; }
+    }
+
+    // pass this iteration's total (source lane threadIdx.x+WarpSize-blockDim.x) to all threads
+    local_total = shfl_warp_broadcast<ValueType>(val,
+                                            threadIdx.x+Impl::CudaTraits::WarpSize-blockDim.x,
+                                            Impl::CudaTraits::WarpSize);
+
+    // make EXCLUSIVE scan by shifting values over one
+    val = Kokkos::shfl_up(val, blockDim.x, Impl::CudaTraits::WarpSize);
+    if ( threadIdx.y == 0 ) { val = 0 ; }
+    // add the total of earlier iterations, report the prefix, then fold in this iteration
+    val += accum;
+    lambda(i,val,true);
+    accum += local_total;
+  }
+}
+
+// Exclusive scan within team member (vector) within warp
+// assume vec_length*team_size == warp_size 
+// blockDim.x == vec_length == stride
+// blockDim.y == team_size
+// threadIdx.x == position in vec, threadIdx.y == member number
+// NOTE(review): ValueType is the last template parameter and appears in no function argument, so it cannot be deduced -- confirm intended.
+template< typename iType, class Lambda, typename ValueType >
+KOKKOS_INLINE_FUNCTION
+void parallel_scan
+  (const Impl::ThreadVectorRangeBoundariesStruct<iType,Impl::TaskExec< Kokkos::Cuda > >& loop_boundaries,
+   const Lambda & lambda)
+{
+  ValueType accum = 0 ;  // running total carried over from earlier loop iterations
+  ValueType val, y, local_total;
+  // lambda(i,val,false) supplies a contribution in val; lambda(i,val,true) then receives the exclusive prefix.
+  for( iType i = loop_boundaries.start; i < loop_boundaries.end; i+=loop_boundaries.increment) {
+    val = 0;
+    lambda(i,val,false);
+
+    // scan of 'val' over the vector lanes (threadIdx.x): inclusive first, made exclusive below
+    // accum = accumulated, sum in total for this iteration
+
+    // INCLUSIVE scan
+    for( int offset = 1 ; offset < blockDim.x ; offset <<= 1 ) {
+      y = Kokkos::shfl_up(val, offset, blockDim.x);
+      if(threadIdx.x >= offset) { val += y; }
+    }
+
+    // pass this iteration's total (source lane blockDim.x-1 of the vector) to all threads
+    local_total = shfl_warp_broadcast<ValueType>(val, blockDim.x-1, blockDim.x);
+
+    // make EXCLUSIVE scan by shifting values over one
+    val = Kokkos::shfl_up(val, 1, blockDim.x);
+    if ( threadIdx.x == 0 ) { val = 0 ; }
+    // add the total of earlier iterations, report the prefix, then fold in this iteration
+    val += accum;
+    lambda(i,val,true);
+    accum += local_total;
+  }
+}
+
+} /* namespace Kokkos */
+
+//----------------------------------------------------------------------------
+//----------------------------------------------------------------------------
+
+#endif /* #if defined( KOKKOS_ENABLE_TASKPOLICY ) */
+#endif /* #ifndef KOKKOS_IMPL_CUDA_TASK_HPP */
+
diff --git a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_TaskPolicy.cpp b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_TaskPolicy.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..bb3cd2640d79ad980219861a6e4f0c233c0686bb
--- /dev/null
+++ b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_TaskPolicy.cpp
@@ -0,0 +1,932 @@
+/*
+//@HEADER
+// ************************************************************************
+// 
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+// 
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+// 
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact  H. Carter Edwards (hcedwar@sandia.gov)
+// 
+// ************************************************************************
+//@HEADER
+*/
+
+// Experimental unified task-data parallel manycore LDRD
+
+#include <stdio.h>
+#include <iostream>
+#include <sstream>
+#include <Kokkos_Core.hpp>
+#include <Cuda/Kokkos_Cuda_TaskPolicy.hpp>
+
+#if defined( KOKKOS_HAVE_CUDA ) && defined( KOKKOS_ENABLE_TASKPOLICY )
+
+// #define DETAILED_PRINT
+
+//----------------------------------------------------------------------------
+// Sentinel pointer values built from the all-ones address and the one just below it.
+#define QLOCK   reinterpret_cast<void*>( ~((uintptr_t)0) )      // marks a queue head as locked by a thread
+#define QDENIED reinterpret_cast<void*>( ~((uintptr_t)0) - 1 )  // driver: all queues empty, no running tasks
+
+namespace Kokkos {
+namespace Experimental {
+namespace Impl {
+
+void CudaTaskPolicyQueue::Destroy::destroy_shared_allocation()
+{
+  // The queue may only be destroyed when the ready count is zero and
+  // every team and serial list head is null.
+  const bool in_use =
+    m_policy->m_count_ready ||
+    m_policy->m_team[0]   || m_policy->m_team[1]   || m_policy->m_team[2] ||
+    m_policy->m_serial[0] || m_policy->m_serial[1] || m_policy->m_serial[2] ;
+
+  if ( in_use ) {
+    Kokkos::abort("CudaTaskPolicyQueue ERROR : Attempt to destroy non-empty queue" );
+  }
+
+  // Run the destructor in place, then fence the Cuda execution space.
+  m_policy->~CudaTaskPolicyQueue();
+
+  Kokkos::Cuda::fence();
+}
+
+// Empty destructor body: no explicit cleanup is performed here.
+CudaTaskPolicyQueue::~CudaTaskPolicyQueue()
+{
+}
+
+// Builds the queue: m_space is sized as arg_task_max_size * arg_task_max_count * 1.2
+// in CudaUVMSpace, all list heads start null, and the team size is 32 doubled until
+// it reaches min( arg_team_size , 512 ).
+CudaTaskPolicyQueue::CudaTaskPolicyQueue
+  ( const unsigned arg_task_max_count , const unsigned arg_task_max_size
+  , const unsigned arg_task_default_dependence_capacity , const unsigned arg_team_size
+  )
+  : m_space( Kokkos::CudaUVMSpace()
+             // Multiply in double: the unsigned product could wrap around before the 1.2 factor is applied.
+           , double(arg_task_max_size) * double(arg_task_max_count) * 1.2
+           , 16 /* log2(superblock size) */
+           )
+  , m_team { 0 , 0 , 0 }
+  , m_serial { 0 , 0 , 0 }
+  , m_team_size( 32 /* 1 warps */ )
+  , m_default_dependence_capacity( arg_task_default_dependence_capacity )
+  , m_count_ready(0)
+{
+  constexpr int max_team_size = 32 * 16 /* 16 warps */ ;
+
+  const int target_team_size = std::min( int(arg_team_size) , max_team_size );
+
+  while ( m_team_size < target_team_size ) { m_team_size *= 2 ; }
+}
+
+//-----------------------------------------------------------------------
+// Called by each block & thread
+
+__device__
+void Kokkos::Experimental::Impl::CudaTaskPolicyQueue::driver()
+{
+  task_root_type * const q_denied = reinterpret_cast<task_root_type*>(QDENIED);
+
+#define IS_TEAM_LEAD ( threadIdx.x == 0 && threadIdx.y == 0 )
+
+#ifdef DETAILED_PRINT
+if ( IS_TEAM_LEAD ) {
+  printf( "CudaTaskPolicyQueue::driver() begin on %d with count %d\n"
+        , blockIdx.x , m_count_ready );
+}
+#endif
+
+  // Each thread block must iterate this loop synchronously
+  // to insure team-execution of team-task
+
+  __shared__ task_root_type * team_task ;
+
+  __syncthreads();
+
+  do {
+
+    if ( IS_TEAM_LEAD ) {
+      if ( 0 == m_count_ready ) {
+        team_task = q_denied ; // All queues are empty and no running tasks
+      }
+      else {
+        team_task = 0 ;
+        for ( int i = 0 ; i < int(NPRIORITY) && 0 == team_task ; ++i ) {
+          if ( ( i < 2 /* regular queue */ )
+               || ( ! m_space.is_empty() /* waiting for memory */ ) ) {
+            team_task = pop_ready_task( & m_team[i] );
+          }
+        }
+      }
+    }
+
+    __syncthreads();
+
+#ifdef DETAILED_PRINT
+if ( IS_TEAM_LEAD && 0 != team_task ) {
+  printf( "CudaTaskPolicyQueue::driver() (%d) team_task(0x%lx)\n"
+        , blockIdx.x
+        , (unsigned long) team_task );
+}
+#endif
+
+    // team_task == q_denied if all queues are empty
+    // team_task == 0 if no team tasks available
+
+    if ( q_denied != team_task ) {
+      if ( 0 != team_task ) {
+
+        Kokkos::Impl::CudaTeamMember
+          member( kokkos_impl_cuda_shared_memory<void>()
+                , 16                      /* shared_begin */
+                , team_task->m_shmem_size /* shared size */
+                , 0                       /* scratch level 1 pointer */
+                , 0                       /* scratch level 1 size */
+                , 0                       /* league rank */
+                , 1                       /* league size */
+                );
+
+        (*team_task->m_team)( team_task , member );
+
+        // A __synthreads was called and if completed the
+        // functor was destroyed.
+
+        if ( IS_TEAM_LEAD ) {
+          complete_executed_task( team_task );
+        }
+      }
+      else {
+        // One thread of one warp performs this serial task
+        if ( threadIdx.x == 0 &&
+             0 == ( threadIdx.y % 32 ) ) {
+          task_root_type * task = 0 ;
+          for ( int i = 0 ; i < int(NPRIORITY) && 0 == task ; ++i ) {
+            if ( ( i < 2 /* regular queue */ )
+                 || ( ! m_space.is_empty() /* waiting for memory */ ) ) {
+              task = pop_ready_task( & m_serial[i] );
+            }
+          }
+
+#ifdef DETAILED_PRINT
+if ( 0 != task ) {
+  printf( "CudaTaskPolicyQueue::driver() (%2d)(%d) single task(0x%lx)\n"
+        , blockIdx.x
+        , threadIdx.y
+        , (unsigned long) task );
+}
+#endif
+
+          if ( task ) {
+            (*task->m_serial)( task );
+            complete_executed_task( task );
+          }
+        }
+
+        __syncthreads();
+      }
+    }
+  } while ( q_denied != team_task );
+
+#ifdef DETAILED_PRINT
+if ( IS_TEAM_LEAD ) {
+  printf( "CudaTaskPolicyQueue::driver() end on %d with count %d\n"
+        , blockIdx.x , m_count_ready );
+}
+#endif
+
+#undef IS_TEAM_LEAD
+}
+
+//-----------------------------------------------------------------------
+// Try to claim the task at the head of '*queue'.
+// Returns the claimed task, moved from the waiting to the executing state,
+// or 0 when the queue is empty, locked, or another thread won the race.
+__device__
+CudaTaskPolicyQueue::task_root_type *
+CudaTaskPolicyQueue::pop_ready_task( CudaTaskPolicyQueue::task_root_type * volatile * const queue )
+{
+  task_root_type * const q_lock = reinterpret_cast<task_root_type*>(QLOCK);
+  task_root_type * task = 0 ;
+  task_root_type * const task_claim = *queue ;
+
+  if ( ( q_lock != task_claim ) && ( 0 != task_claim ) ) {
+    // Queue is not locked and not null, try to claim head of queue.
+    // Is a race among threads to claim the queue.
+
+    if ( task_claim == atomic_compare_exchange(queue,task_claim,q_lock) ) {
+
+      // Acquired the task which must be in the waiting state.
+
+      const int claim_state =
+        atomic_compare_exchange( & task_claim->m_state
+                               , int(TASK_STATE_WAITING)
+                               , int(TASK_STATE_EXECUTING) );
+
+      task_root_type * lock_verify = 0 ;
+
+      if ( claim_state == int(TASK_STATE_WAITING) ) {
+
+        // Transitioned this task from waiting to executing
+        // Update the queue to the next entry and release the lock
+
+        task_root_type * const next =
+          *((task_root_type * volatile *) & task_claim->m_next );
+
+        *((task_root_type * volatile *) & task_claim->m_next ) = 0 ;
+
+        lock_verify = atomic_compare_exchange( queue , q_lock , next );
+      }
+
+      if ( ( claim_state != int(TASK_STATE_WAITING) ) |
+           ( q_lock != lock_verify ) ) {
+        // Report the claimed task: 'task' itself is still null at this point.
+        printf( "CudaTaskPolicyQueue::pop_ready_task(0x%lx) task(0x%lx) state(%d) ERROR %s\n"
+              , (unsigned long) queue
+               , (unsigned long) task_claim
+               , claim_state
+               , ( claim_state != int(TASK_STATE_WAITING)
+                 ? "NOT WAITING"
+                 : "UNLOCK" ) );
+        Kokkos::abort("CudaTaskPolicyQueue::pop_ready_task");
+      }
+
+      task = task_claim ;
+    }
+  }
+  return task ;
+}
+
+//-----------------------------------------------------------------------
+
+// Finish processing a task after its apply function has been called.
+//
+//   task : the task that was just executed.
+//
+// Attempts the state transition EXECUTING -> COMPLETE and acts on the
+// state that was observed before the attempt:
+//   - WAITING   : the task is rescheduled (respawn).
+//   - EXECUTING : dependences are cleared, the wait queue is closed by
+//                 swapping in QDENIED, one reference is released, and
+//                 every task that was in the wait queue is scheduled.
+//   - otherwise : an error is printed and Kokkos::abort is called.
+// In all non-error cases m_count_ready is decremented last.
+__device__
+void CudaTaskPolicyQueue::complete_executed_task(
+  CudaTaskPolicyQueue::task_root_type * task )
+{
+  task_root_type * const q_denied = reinterpret_cast<task_root_type*>(QDENIED);
+  
+
+#ifdef DETAILED_PRINT
+printf( "CudaTaskPolicyQueue::complete_executed_task(0x%lx) state(%d) (%d)(%d,%d)\n"
+      , (unsigned long) task
+      , task->m_state
+      , blockIdx.x
+      , threadIdx.x
+      , threadIdx.y
+      );
+#endif
+
+  // State is either executing or if respawned then waiting,
+  // try to transition from executing to complete.
+  // Reads the current value.
+  
+  const int state_old =
+    atomic_compare_exchange( & task->m_state
+                           , int(Kokkos::Experimental::TASK_STATE_EXECUTING)
+                           , int(Kokkos::Experimental::TASK_STATE_COMPLETE) );
+  
+  if ( int(Kokkos::Experimental::TASK_STATE_WAITING) == state_old ) {
+    /* Task requested a respawn so reschedule it */
+    schedule_task( task , false /* not initial spawn */ );
+  }
+  else if ( int(Kokkos::Experimental::TASK_STATE_EXECUTING) == state_old ) {
+    /* Task is complete */
+
+    // Clear dependences of this task before locking wait queue
+    
+    task->clear_dependence();
+    
+    // Stop other tasks from adding themselves to this task's wait queue.
+    // The wait queue is updated concurrently so guard with an atomic.
+    
+    task_root_type * wait_queue     = *((task_root_type * volatile *) & task->m_wait );
+    task_root_type * wait_queue_old = 0 ;
+    
+    // Retry until the compare-exchange sees the value it expected;
+    // afterwards 'wait_queue' holds the list that was replaced by q_denied.
+    do {
+      wait_queue_old = wait_queue ;
+      wait_queue     = atomic_compare_exchange( & task->m_wait , wait_queue_old , q_denied ); 
+    } while ( wait_queue_old != wait_queue );
+    
+    // The task has been removed from ready queue and
+    // execution is complete so decrement the reference count.
+    // The reference count was incremented by the initial spawning.
+    // The task may be deleted if this was the last reference.
+    // Note: assign() stores its second argument into '*lhs', so the
+    // local 'task' pointer is null after this call.
+
+    task_root_type::assign( & task , 0 );
+
+    // Pop waiting tasks and schedule them
+    while ( wait_queue ) {
+      task_root_type * const x = wait_queue ; wait_queue = x->m_next ; x->m_next = 0 ;
+      schedule_task( x , false /* not initial spawn */ );
+    }
+  }
+  else {
+    printf( "CudaTaskPolicyQueue::complete_executed_task(0x%lx) ERROR state_old(%d) dep_size(%d)\n"
+           , (unsigned long)( task )
+           , int(state_old)
+           , task->m_dep_size
+           );
+    Kokkos::abort("CudaTaskPolicyQueue::complete_executed_task" );
+  }
+  
+  // If the task was respawned it may have already been
+  // put in a ready queue and the count incremented.
+  // By decrementing the count last it will never go to zero
+  // with a ready or executing task.
+  
+  atomic_fetch_add( & m_count_ready , -1 );
+}
+
+// Count down this latch by 'k' and release its waiters when it reaches zero.
+//
+//   k : amount to subtract from the remaining count (m_dep_size);
+//       must be strictly positive.
+//
+// When the remaining count becomes exactly zero the latch transitions from
+// TASK_STATE_WAITING to TASK_STATE_COMPLETE, its wait queue is closed by
+// swapping in QDENIED, and every task that was waiting is scheduled.
+//
+// Aborts on a non-positive 'k', on a count that would become negative, or
+// when the latch was not in the waiting state at the moment of completion.
+__device__
+void TaskMember< Kokkos::Cuda , void , void >::latch_add( const int k )
+{
+  typedef TaskMember< Kokkos::Cuda , void , void >  task_root_type ;
+
+  task_root_type * const q_denied = reinterpret_cast<task_root_type*>(QDENIED);
+
+  const bool ok_input = 0 < k ;
+
+  // Only touch the counter for a valid input.
+  const int count = ok_input ? atomic_fetch_add( & m_dep_size , -k ) - k
+                             : k ;
+
+  const bool ok_count = 0 <= count ;
+
+  // Attempt the WAITING -> COMPLETE transition only when a valid input
+  // brought the count to exactly zero.  In particular k == 0 yields
+  // count == 0 above and must not be allowed to complete the latch.
+  const int state = ( ! ok_input || 0 != count ) ? TASK_STATE_WAITING :
+    atomic_compare_exchange( & m_state
+                           , TASK_STATE_WAITING
+                           , TASK_STATE_COMPLETE );
+
+  const bool ok_state = state == TASK_STATE_WAITING ;
+
+  if ( ! ok_input || ! ok_count || ! ok_state ) {
+    printf( "CudaTaskPolicyQueue::latch_add[0x%lx](%d) ERROR %s %d\n"
+          , (unsigned long) this
+          , k
+          , ( ! ok_input ? "Non-positive input" :
+            ( ! ok_count ? "Negative count" : "Bad State" ) )
+          , ( ! ok_input ? k :
+            ( ! ok_count ? count : state ) )
+          );
+    Kokkos::abort( "CudaTaskPolicyQueue::latch_add ERROR" );
+  }
+  else if ( 0 == count ) {
+    // Stop other tasks from adding themselves to this latch's wait queue.
+    // The wait queue is updated concurrently so guard with an atomic.
+
+    CudaTaskPolicyQueue & policy    = *m_policy ;
+    task_root_type * wait_queue     = *((task_root_type * volatile *) &m_wait);
+    task_root_type * wait_queue_old = 0 ;
+
+    // Retry until the compare-exchange sees the value it expected;
+    // afterwards 'wait_queue' holds the list that was replaced by q_denied.
+    do {
+      wait_queue_old = wait_queue ;
+      wait_queue     = atomic_compare_exchange( & m_wait , wait_queue_old , q_denied );
+    } while ( wait_queue_old != wait_queue );
+
+    // Pop waiting tasks and schedule them
+    while ( wait_queue ) {
+      task_root_type * const x = wait_queue ; wait_queue = x->m_next ; x->m_next = 0 ;
+      policy.schedule_task( x , false /* not initial spawn */ );
+    }
+  }
+}
+
+//----------------------------------------------------------------------------
+
+void CudaTaskPolicyQueue::reschedule_task(
+  CudaTaskPolicyQueue::task_root_type * const task )
+{
+  // A reschedule request moves the task from executing back to waiting.
+  // The compare-exchange yields the state observed before the attempt,
+  // so anything other than EXECUTING means the request was invalid.
+  const int expected_state = int(TASK_STATE_EXECUTING) ;
+
+  const int observed_state =
+    atomic_compare_exchange( & task->m_state
+                           , expected_state
+                           , int(TASK_STATE_WAITING) );
+
+  if ( observed_state == expected_state ) return ;
+
+  printf( "CudaTaskPolicyQueue::reschedule_task(0x%lx) ERROR state(%d)\n"
+        , (unsigned long) task
+        , observed_state
+        );
+  Kokkos::abort("CudaTaskPolicyQueue::reschedule" );
+}
+
+// Place a task either in the wait queue of one of its incomplete
+// dependences or, when every dependence's wait queue is closed, in the
+// task's ready queue.
+//
+//   task          : task to schedule; its state must be CONSTRUCTING
+//                   (transitioned to WAITING here) or already WAITING,
+//                   its m_wait must not be QDENIED and its m_next must be 0.
+//   initial_spawn : when true the task's reference count is incremented
+//                   for its residence in a queue.
+//
+// Aborts if the task does not satisfy the state requirements above.
+KOKKOS_FUNCTION
+void CudaTaskPolicyQueue::schedule_task(
+  CudaTaskPolicyQueue::task_root_type * const task ,
+  const bool initial_spawn )
+{
+  task_root_type * const q_lock = reinterpret_cast<task_root_type*>(QLOCK);
+  task_root_type * const q_denied = reinterpret_cast<task_root_type*>(QDENIED);
+
+  //----------------------------------------
+  // State is either constructing or already waiting.
+  // If constructing then transition to waiting.
+
+  {
+    const int old_state = atomic_compare_exchange( & task->m_state
+                                                 , int(TASK_STATE_CONSTRUCTING)
+                                                 , int(TASK_STATE_WAITING) );
+
+    // Head of linked list of tasks waiting on this task
+    task_root_type * const waitTask =
+      *((task_root_type * volatile const *) & task->m_wait );
+
+    // Member of linked list of tasks waiting on some other task
+    task_root_type * const next =
+      *((task_root_type * volatile const *) & task->m_next );
+
+    // An incomplete and non-executing task has:
+    //   task->m_state == TASK_STATE_CONSTRUCTING or TASK_STATE_WAITING
+    //   task->m_wait  != q_denied
+    //   task->m_next  == 0
+    //
+    if ( ( q_denied == waitTask ) ||
+         ( 0 != next ) ||
+         ( old_state != int(TASK_STATE_CONSTRUCTING) &&
+           old_state != int(TASK_STATE_WAITING) ) ) {
+      printf( "CudaTaskPolicyQueue::schedule_task(0x%lx) STATE ERROR: state(%d) wait(0x%lx) next(0x%lx)\n"
+            , (unsigned long) task
+            , old_state
+            , (unsigned long) waitTask
+            , (unsigned long) next );
+      Kokkos::abort("CudaTaskPolicyQueue::schedule" );
+    }
+  }
+
+  //----------------------------------------
+
+  if ( initial_spawn ) {
+    // The initial spawn of a task increments the reference count
+    // for the task's existence in either a waiting or ready queue
+    // until the task has completed.
+    // Completing the task's execution is the matching
+    // decrement of the reference count.
+    task_root_type::assign( 0 , task );
+  }
+
+  //----------------------------------------
+  // Insert this task into a dependence task that is not complete.
+  // Push on to that task's wait queue.
+
+  // Remains true until the task has been pushed onto some queue.
+  bool attempt_insert_in_queue = true ;
+
+  // Wait queue of the dependence currently being tried,
+  // or null when there is no (further) dependence to try.
+  task_root_type * volatile * queue =
+    task->m_dep_size ? & task->m_dep[0]->m_wait : (task_root_type **) 0 ;
+
+  // 'i' only advances when a dependence's wait queue is found closed;
+  // a failed compare-exchange retries the same dependence.
+  for ( int i = 0 ; attempt_insert_in_queue && ( 0 != queue ) ; ) {
+
+    task_root_type * const head_value_old = *queue ;
+
+    if ( q_denied == head_value_old ) {
+      // Wait queue is closed because task is complete,
+      // try again with the next dependence wait queue.
+      ++i ;
+      queue = i < task->m_dep_size ? & task->m_dep[i]->m_wait
+                                   : (task_root_type **) 0 ;
+    }
+    else {
+
+      // Wait queue is open and not denied.
+      // Have exclusive access to this task.
+      // Assign m_next assuming a successful insertion into the queue.
+      // Fence the memory assignment before attempting the CAS.
+
+      *((task_root_type * volatile *) & task->m_next ) = head_value_old ;
+
+      memory_fence();
+
+      // Attempt to insert this task into the queue.
+      // If fails then continue the attempt.
+
+      attempt_insert_in_queue =
+        head_value_old != atomic_compare_exchange(queue,head_value_old,task);
+    }
+  }
+
+  //----------------------------------------
+  // All dependences are complete, insert into the ready list
+
+  if ( attempt_insert_in_queue ) {
+
+    // Increment the count of ready tasks.
+    // Count will be decremented when task is complete.
+
+    atomic_fetch_add( & m_count_ready , 1 );
+
+    queue = task->m_queue ;
+
+    // Spin until the task has been pushed onto the head of the ready queue.
+    while ( attempt_insert_in_queue ) {
+
+      // A locked queue is being popped.
+
+      task_root_type * const head_value_old = *queue ;
+
+      if ( q_lock != head_value_old ) {
+        // The ready queue is not locked: try to push this task as the
+        // new head.  The CAS succeeds only if the head is still the
+        // value read above.
+
+        // Have exclusive access to this task,
+        // assign to head of queue, assuming successful insert
+        // Fence assignment before attempting insert.
+        *((task_root_type * volatile *) & task->m_next ) = head_value_old ;
+
+        memory_fence();
+
+        attempt_insert_in_queue =
+          head_value_old != atomic_compare_exchange(queue,head_value_old,task);
+      }
+    }
+  }
+}
+
+// Return a task's storage to this queue's memory space (m_space),
+// passing the allocation size recorded in the task's m_size_alloc.
+void CudaTaskPolicyQueue::deallocate_task
+  ( CudaTaskPolicyQueue::task_root_type * const task )
+{
+  m_space.deallocate( task , task->m_size_alloc );
+}
+
+// Allocate and default-initialize the storage for one task.
+//
+//   arg_sizeof_task  : size in bytes of the derived task object.
+//   arg_dep_capacity : number of dependence slots, or ~0u to request
+//                      the queue's default dependence capacity.
+//   arg_team_shmem   : value recorded in the task's m_shmem_size.
+//
+// The allocation holds the task object, padded up to a multiple of the
+// pointer size, followed by the array of dependence pointers.
+// Returns the new task, or 0 when the memory space returned null.
+// The caller is responsible for copy constructing the functor.
+KOKKOS_FUNCTION
+CudaTaskPolicyQueue::task_root_type *
+CudaTaskPolicyQueue::allocate_task
+  ( const unsigned arg_sizeof_task
+  , const unsigned arg_dep_capacity
+  , const unsigned arg_team_shmem
+  )
+{
+  // Pad the task object so the trailing pointer array is pointer aligned.
+  const size_t ptr_size  = sizeof(task_root_type*) ;
+  const size_t misalign  = arg_sizeof_task % ptr_size ;
+  const unsigned base_size =
+    arg_sizeof_task + ( misalign ? ptr_size - misalign : 0 );
+
+  const unsigned dep_capacity
+    = ~0u == arg_dep_capacity
+    ? m_default_dependence_capacity
+    : arg_dep_capacity ;
+
+  const unsigned size_alloc = base_size + ptr_size * dep_capacity ;
+
+  task_root_type * const task =
+    reinterpret_cast<task_root_type*>( m_space.allocate( size_alloc ) );
+
+  // Allocation failed: hand the null pointer back to the caller.
+  if ( task == 0 ) return task ;
+
+  // Construct the task's root in place, then record its bookkeeping.
+  new( (void*) task ) task_root_type();
+
+  task->m_policy       = this ;
+  task->m_size_alloc   = size_alloc ;
+  task->m_dep_capacity = dep_capacity ;
+  task->m_shmem_size   = arg_team_shmem ;
+
+  if ( dep_capacity ) {
+    // The dependence array lives immediately after the padded task object.
+    task_root_type ** const dep_array =
+      reinterpret_cast<task_root_type**>(
+      reinterpret_cast<unsigned char*>(task) + base_size );
+
+    task->m_dep = dep_array ;
+
+    for ( unsigned slot = 0 ; slot < dep_capacity ; ++slot ) {
+      dep_array[slot] = 0 ;
+    }
+  }
+
+  return task ;
+}
+
+//----------------------------------------------------------------------------
+
+// Record that task 'after' depends upon task 'before'.
+//
+// Does nothing when either pointer is null.  Otherwise 'after' must be in
+// the constructing or executing state, must have a free dependence slot,
+// and both tasks must belong to this queue; any violation prints an error
+// and aborts.
+void CudaTaskPolicyQueue::add_dependence
+  ( CudaTaskPolicyQueue::task_root_type * const after
+  , CudaTaskPolicyQueue::task_root_type * const before
+  )
+{
+  if ( ( after == 0 ) || ( before == 0 ) ) return ;
+
+  const int after_state = *((volatile const int *) & after->m_state );
+
+  const bool ok_state =
+    Kokkos::Experimental::TASK_STATE_CONSTRUCTING == after_state ||
+    Kokkos::Experimental::TASK_STATE_EXECUTING    == after_state ;
+
+  const bool ok_capacity = after->m_dep_size < after->m_dep_capacity ;
+
+  const bool ok_policy =
+    after->m_policy == this && before->m_policy == this ;
+
+  if ( ! ( ok_state && ok_capacity && ok_policy ) ) {
+
+    printf( "CudaTaskPolicyQueue::add_dependence( 0x%lx , 0x%lx ) ERROR %s\n"
+          , (unsigned long) after
+          , (unsigned long) before
+          , ( ! ok_state    ? "Task not constructing or executing" :
+            ( ! ok_capacity ? "Task Exceeded dependence capacity"
+                            : "Tasks from different policies" )) );
+
+    Kokkos::abort("CudaTaskPolicyQueue::add_dependence ERROR");
+  }
+  else {
+
+    // Grow the dependence count first, then store 'before' in the
+    // newly claimed last slot; assign() takes the reference on 'before'.
+    ++after->m_dep_size ;
+
+    task_root_type ** const last_slot =
+      after->m_dep + ( after->m_dep_size - 1 );
+
+    task_root_type::assign( last_slot , before );
+
+    memory_fence();
+  }
+}
+
+} /* namespace Impl */
+} /* namespace Experimental */
+} /* namespace Kokkos */
+
+//----------------------------------------------------------------------------
+//----------------------------------------------------------------------------
+
+namespace Kokkos {
+namespace Experimental {
+
+// Construct a Cuda task policy: allocate the queue object in CudaUVMSpace
+// through a tracked shared-allocation record and construct it in place.
+//
+//   arg_task_max_count                   : forwarded to the queue constructor.
+//   arg_task_max_size                    : application task size in bytes.
+//   arg_task_default_dependence_capacity : default dependence slots per task.
+//   arg_task_team_size                   : forwarded to the queue constructor.
+TaskPolicy< Kokkos::Cuda >::TaskPolicy
+  ( const unsigned arg_task_max_count
+  , const unsigned arg_task_max_size
+  , const unsigned arg_task_default_dependence_capacity
+  , const unsigned arg_task_team_size
+  )
+  : m_track()
+  , m_policy(0)
+{
+  typedef Impl::CudaTaskPolicyQueue queue_type ;
+
+  typedef Kokkos::Experimental::Impl::SharedAllocationRecord
+    < Kokkos::CudaUVMSpace , queue_type::Destroy > queue_record_type ;
+
+  // Tasks are allocated with the application's task size plus the
+  // task root and the array of default-capacity dependence pointers.
+  const size_t dependence_bytes =
+    sizeof(task_root_type*) * arg_task_default_dependence_capacity ;
+
+  const size_t full_task_size_estimate =
+    arg_task_max_size + sizeof(task_root_type) + dependence_bytes ;
+
+  // Allocate the queue data structure in UVM space.
+  queue_record_type * const queue_record =
+    queue_record_type::allocate( Kokkos::CudaUVMSpace()
+                               , "CudaUVM task queue"
+                               , sizeof(queue_type)
+                               );
+
+  m_policy = reinterpret_cast< queue_type * >( queue_record->data() );
+
+  // Construct the queue in the record's storage.
+  new( m_policy )
+    queue_type( arg_task_max_count
+              , full_task_size_estimate
+              , arg_task_default_dependence_capacity
+              , arg_task_team_size );
+
+  // Give the record's destroy functor the queue pointer, then hand the
+  // record to the tracker.
+  queue_record->m_destroy.m_policy = m_policy ;
+
+  m_track.assign_allocated_record_to_uninitialized( queue_record );
+}
+
+// Kernel entry point: every launched thread calls the queue's driver().
+//   queue : the task queue to drive; launched from wait() with the
+//           policy's m_policy pointer.
+__global__
+static void kokkos_cuda_task_policy_queue_driver
+  ( Kokkos::Experimental::Impl::CudaTaskPolicyQueue * queue )
+{
+  queue->driver();
+}
+
+// Run the task queue driver kernel for 'policy' and block the host until
+// the device has finished.
+//
+// The device is synchronized before the launch and again after it; launch
+// errors are checked with cudaGetLastError().
+void wait( Kokkos::Experimental::TaskPolicy< Kokkos::Cuda > & policy )
+{
+  // Launch shape: grid.x is cuda_internal_multiprocessor_count() and each
+  // block is ( 1 , team_size , 1 ), with no dynamic shared memory, on
+  // stream 0.
+  const dim3 launch_grid( Kokkos::Impl::cuda_internal_multiprocessor_count() , 1 , 1 );
+  const dim3 launch_block( 1 , policy.m_policy->m_team_size , 1 );
+
+  const int          shmem_bytes   = 0 ;
+  const cudaStream_t launch_stream = 0 ;
+
+#ifdef DETAILED_PRINT
+printf("kokkos_cuda_task_policy_queue_driver grid(%d,%d,%d) block(%d,%d,%d) shared(%d) policy(0x%lx)\n"
+      , launch_grid.x , launch_grid.y , launch_grid.z
+      , launch_block.x , launch_block.y , launch_block.z
+      , shmem_bytes
+      , (unsigned long)( policy.m_policy ) );
+fflush(stdout);
+#endif
+
+  // Finish all previously submitted device work before driving the queue.
+  CUDA_SAFE_CALL( cudaDeviceSynchronize() );
+
+  kokkos_cuda_task_policy_queue_driver
+    <<< launch_grid , launch_block , shmem_bytes , launch_stream >>>
+    ( policy.m_policy );
+
+  // Report launch failures, then wait for the driver kernel to finish.
+  CUDA_SAFE_CALL( cudaGetLastError() );
+
+  CUDA_SAFE_CALL( cudaDeviceSynchronize() );
+
+#ifdef DETAILED_PRINT
+printf("kokkos_cuda_task_policy_queue_driver end\n");
+fflush(stdout);
+#endif
+
+}
+
+} /* namespace Experimental */
+} /* namespace Kokkos */
+
+//----------------------------------------------------------------------------
+//----------------------------------------------------------------------------
+
+namespace Kokkos {
+namespace Experimental {
+namespace Impl {
+
+typedef TaskMember< Kokkos::Cuda , void , void > Task ;
+
+// Destructor of the task root; intentionally empty.
+__host__ __device__
+Task::~TaskMember()
+{
+}
+
+// Reference-counted pointer assignment:  *lhs_ptr = rhs.
+//
+//   lhs_ptr : address of the task pointer to overwrite, or 0 to only
+//             take a reference on 'rhs'.  The caller must have exclusive
+//             access to *lhs_ptr.
+//   rhs     : task to reference, or 0 to only release the old value.
+//
+// The reference count of 'rhs' is incremented and the count of the task
+// previously held in *lhs_ptr is decremented.  When that count reaches
+// zero the previous task is returned to its queue via deallocate_task().
+//
+// Aborts when the old and new tasks belong to different queues, when the
+// reference count would become negative, or when a task about to be
+// deleted is not in a deletable state.
+__host__ __device__
+void Task::assign( Task ** const lhs_ptr , Task * rhs )
+{
+  Task * const q_denied = reinterpret_cast<Task*>(QDENIED);
+
+  // Increment rhs reference count.
+  if ( rhs ) { atomic_fetch_add( & rhs->m_ref_count , 1 ); }
+
+  if ( 0 == lhs_ptr ) return ;
+
+  // Must have exclusive access to *lhs_ptr.
+  // Assign the pointer and retrieve the previous value.
+  // Cannot use atomic exchange since *lhs_ptr may be
+  // in Cuda register space.
+
+  Task * const old_lhs = *lhs_ptr ;
+
+  *lhs_ptr = rhs ;
+
+  if ( old_lhs && rhs && old_lhs->m_policy != rhs->m_policy ) {
+    Kokkos::abort( "Kokkos::Impl::TaskMember<Kokkos::Cuda>::assign ERROR different queues");
+  }
+
+  if ( old_lhs ) {
+
+    Kokkos::memory_fence();
+
+    // Decrement former lhs reference count.
+    // If reference count is zero task must be complete, then delete task.
+    // Task is ready for deletion when  wait == q_denied
+
+    int const count = atomic_fetch_add( & (old_lhs->m_ref_count) , -1 ) - 1 ;
+    int const state = old_lhs->m_state ;
+    Task * const wait = *((Task * const volatile *) & old_lhs->m_wait );
+
+    const bool ok_count = 0 <= count ;
+
+    // If count == 0 then will be deleting
+    // and must either be constructing or complete,
+    // must not be linked into any queue and must hold no dependences.
+    const bool ok_state = 0 < count ? true :
+      ( ( state == int(TASK_STATE_CONSTRUCTING) && wait == 0 ) ||
+        ( state == int(TASK_STATE_COMPLETE)     && wait == q_denied ) )
+      &&
+     old_lhs->m_next == 0 &&
+     old_lhs->m_dep_size == 0 ;
+
+    if ( ! ok_count || ! ok_state ) {
+
+      // m_wait is a pointer value: print it in hexadecimal like the task.
+      printf( "%s Kokkos::Impl::TaskManager<Kokkos::Cuda>::assign ERROR deleting task(0x%lx) m_ref_count(%d) m_state(%d) m_wait(0x%lx)\n"
+#if defined( KOKKOS_ACTIVE_EXECUTION_SPACE_CUDA )
+            , "CUDA "
+#else
+            , "HOST "
+#endif
+            , (unsigned long) old_lhs
+            , count
+            , state
+            , (unsigned long) wait );
+      Kokkos::abort( "Kokkos::Impl::TaskMember<Kokkos::Cuda>::assign ERROR deleting");
+    }
+
+    if ( count == 0 ) {
+      // When 'count == 0' this thread has exclusive access to 'old_lhs'
+
+#ifdef DETAILED_PRINT
+printf( "Task::assign(...) old_lhs(0x%lx) deallocate\n"
+      , (unsigned long) old_lhs
+      );
+#endif
+
+      old_lhs->m_policy->deallocate_task( old_lhs );
+    }
+  }
+}
+
+//----------------------------------------------------------------------------
+
+// Number of dependences currently recorded on this task (m_dep_size).
+__device__
+int Task::get_dependence() const
+{
+  return m_dep_size ;
+}
+
+// Return the i-th dependence of this task.
+//
+//   i : index of the dependence, 0 <= i < m_dep_size.
+//
+// Aborts when the task is not in the executing state, when 'i' is out of
+// range, or when the selected dependence slot is null.
+__device__
+Task * Task::get_dependence( int i ) const
+{
+  // Validate the index before touching the dependence array so that an
+  // invalid index never causes an out-of-bounds read.
+  const bool ok_index = 0 <= i && i < m_dep_size ;
+
+  Task * const t = ok_index ? ((Task*volatile*)m_dep)[i] : (Task*) 0 ;
+
+  if ( Kokkos::Experimental::TASK_STATE_EXECUTING != m_state || ! ok_index || 0 == t ) {
+
+    printf( "TaskMember< Cuda >::get_dependence ERROR : task[%lx]{ state(%d) dep_size(%d) dep[%d] = %lx }\n"
+          , (unsigned long) this
+          , m_state
+          , m_dep_size
+          , i
+          , (unsigned long) t
+          );
+
+    Kokkos::abort("TaskMember< Cuda >::get_dependence ERROR");
+  }
+
+  return t ;
+}
+
+//----------------------------------------------------------------------------
+
+// Release every dependence held by this task, from the last slot down to
+// the first, then publish a dependence count of zero.
+__device__ __host__
+void Task::clear_dependence()
+{
+  int remaining = m_dep_size ;
+
+  while ( 0 < remaining ) {
+    --remaining ;
+    // Assigning null drops this task's reference on the dependence.
+    assign( & m_dep[ remaining ] , 0 );
+  }
+
+  *((volatile int *) & m_dep_size ) = 0 ;
+
+  memory_fence();
+}
+
+//----------------------------------------------------------------------------
+
+
+//----------------------------------------------------------------------------
+
+} /* namespace Impl */
+} /* namespace Experimental */
+} /* namespace Kokkos */
+
+
+#endif  /* #if defined( KOKKOS_ENABLE_TASKPOLICY ) */
+
diff --git a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_TaskPolicy.hpp b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_TaskPolicy.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..e71512f0391b3e264341222b82918d9901080061
--- /dev/null
+++ b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_TaskPolicy.hpp
@@ -0,0 +1,833 @@
+/*
+//@HEADER
+// ************************************************************************
+// 
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+// 
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+// 
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact  H. Carter Edwards (hcedwar@sandia.gov)
+// 
+// ************************************************************************
+//@HEADER
+*/
+
+// Experimental unified task-data parallel manycore LDRD
+
+#ifndef KOKKOS_CUDA_TASKPOLICY_HPP
+#define KOKKOS_CUDA_TASKPOLICY_HPP
+
+#include <Kokkos_Core_fwd.hpp>
+#include <Kokkos_Cuda.hpp>
+#include <Kokkos_TaskPolicy.hpp>
+
+#if defined( KOKKOS_HAVE_CUDA ) && defined( KOKKOS_ENABLE_TASKPOLICY )
+
+//----------------------------------------------------------------------------
+
+namespace Kokkos {
+namespace Experimental {
+namespace Impl {
+
+struct CudaTaskPolicyQueue ;
+
+/** \brief  Base class for all Kokkos::Cuda tasks
+ *
+ *  Holds the scheduling state shared by every task: the owning queue,
+ *  the apply function pointers, the dependence array, the intrusive
+ *  wait/next links, the reference count and the task state.
+ */
+template<>
+class TaskMember< Kokkos::Cuda , void , void > {
+public:
+
+  template< class > friend class Kokkos::Experimental::TaskPolicy ;
+  friend struct CudaTaskPolicyQueue ;
+
+  typedef void (* function_single_type) ( TaskMember * );
+  typedef void (* function_team_type)   ( TaskMember * , Kokkos::Impl::CudaTeamMember & );
+
+private:
+
+  CudaTaskPolicyQueue   * m_policy ;
+  TaskMember * volatile * m_queue ;
+  function_team_type      m_team ;    ///< Apply function on CUDA
+  function_single_type    m_serial ;  ///< Apply function on CUDA
+  TaskMember **           m_dep ;     ///< Dependences
+  TaskMember *            m_wait ;    ///< Linked list of tasks waiting on this task
+  TaskMember *            m_next ;    ///< Linked list of tasks waiting on a different task
+  int                    m_dep_capacity ; ///< Capacity of dependences
+  int                    m_dep_size ;     ///< Actual count of dependences
+  int                    m_size_alloc ;
+  int                    m_shmem_size ;
+  int                    m_ref_count ;    ///< Reference count
+  int                    m_state ;        ///< State of the task
+
+
+  TaskMember( TaskMember && ) = delete ;
+  TaskMember( const TaskMember & ) = delete ;
+  TaskMember & operator = ( TaskMember && ) = delete ;
+  TaskMember & operator = ( const TaskMember & ) = delete ;
+
+protected:
+
+  // Initializers are listed in member declaration order, which is the
+  // order in which the members are actually initialized.
+  KOKKOS_INLINE_FUNCTION
+  TaskMember()
+    : m_policy(0)
+    , m_queue(0)
+    , m_team(0)
+    , m_serial(0)
+    , m_dep(0)
+    , m_wait(0)
+    , m_next(0)
+    , m_dep_capacity(0)
+    , m_dep_size(0)
+    , m_size_alloc(0)
+    , m_shmem_size(0)
+    , m_ref_count(0)
+    , m_state( TASK_STATE_CONSTRUCTING )
+    {}
+
+public:
+
+  KOKKOS_FUNCTION
+  ~TaskMember();
+
+  /** \brief  Current reference count, read through a volatile access. */
+  KOKKOS_INLINE_FUNCTION
+  int reference_count() const
+    { return *((volatile int *) & m_ref_count ); }
+
+  // Cannot use the function pointer to verify the type
+  // since the function pointer is not unique between
+  // Host and Cuda. Don't run verification for Cuda.
+  // Assume testing on Host-only back-end will catch such errors.
+
+  template< typename ResultType >
+  KOKKOS_INLINE_FUNCTION static
+  TaskMember * verify_type( TaskMember * t ) { return t ; }
+
+  //----------------------------------------
+  /*  Inheritance Requirements on task types:
+   *
+   *    class DerivedTaskType
+   *      : public TaskMember< Cuda , DerivedType::value_type , FunctorType >
+   *      { ... };
+   *
+   *    class TaskMember< Cuda , DerivedType::value_type , FunctorType >
+   *      : public TaskMember< Cuda , DerivedType::value_type , void >
+   *      , public Functor
+   *      { ... };
+   *
+   *  If value_type != void
+   *    class TaskMember< Cuda , value_type , void >
+   *      : public TaskMember< Cuda , void , void >
+   *
+   *  Allocate space for DerivedTaskType followed by TaskMember*[ dependence_capacity ]
+   *
+   */
+  //----------------------------------------
+  // If after the 'apply' the task's state is waiting 
+  // then it will be rescheduled and called again.
+  // Otherwise the functor must be destroyed.
+
+  /** \brief  Serial apply for a functor without a result value. */
+  template< class DerivedTaskType , class Tag >
+  __device__ static
+  void apply_single(
+    typename std::enable_if
+      <( std::is_same< Tag , void >::value &&
+        std::is_same< typename DerivedTaskType::result_type , void >::value
+       ), TaskMember * >::type t )
+    {
+      typedef typename DerivedTaskType::functor_type  functor_type ;
+
+      functor_type * const f =
+        static_cast< functor_type * >( static_cast< DerivedTaskType * >(t) );
+
+      f->apply();
+
+      // Still executing means the functor did not request a respawn.
+      if ( t->m_state == int(Kokkos::Experimental::TASK_STATE_EXECUTING) ) {
+        f->~functor_type();
+      }
+    }
+
+  /** \brief  Serial apply for a functor that writes a result value. */
+  template< class DerivedTaskType , class Tag >
+  __device__ static
+  void apply_single(
+    typename std::enable_if
+      <( std::is_same< Tag , void >::value &&
+        ! std::is_same< typename DerivedTaskType::result_type , void >::value
+       ), TaskMember * >::type t )
+    {
+      typedef typename DerivedTaskType::functor_type  functor_type ;
+
+      DerivedTaskType * const self = static_cast< DerivedTaskType * >(t);
+      functor_type    * const f    = static_cast< functor_type * >( self );
+
+      f->apply( self->m_result );
+
+      // Still executing means the functor did not request a respawn.
+      if ( t->m_state == int(Kokkos::Experimental::TASK_STATE_EXECUTING) ) {
+        f->~functor_type();
+      }
+    }
+
+  /** \brief  Store the device address of the matching apply_single. */
+  template< class DerivedTaskType , class Tag >
+  __device__
+  void set_apply_single()
+    {
+      m_serial = & TaskMember::template apply_single<DerivedTaskType,Tag> ;
+    }
+
+  //----------------------------------------
+
+  /** \brief  Team apply for a functor without a result value. */
+  template< class DerivedTaskType , class Tag >
+  __device__ static
+  void apply_team(
+    typename std::enable_if
+      <( std::is_same<Tag,void>::value &&
+         std::is_same<typename DerivedTaskType::result_type,void>::value
+       ), TaskMember * >::type t
+    , Kokkos::Impl::CudaTeamMember & member
+    )
+    {
+      typedef typename DerivedTaskType::functor_type functor_type ;
+
+      functor_type * const f =
+        static_cast< functor_type * >( static_cast< DerivedTaskType * >(t) );
+
+      f->apply( member );
+
+      __syncthreads(); // Wait for team to finish calling function
+
+      // Only thread (0,0) destroys the functor, and only when
+      // the task is still executing (no respawn was requested).
+      if ( threadIdx.x == 0 &&
+           threadIdx.y == 0 &&
+           t->m_state == int(Kokkos::Experimental::TASK_STATE_EXECUTING) ) {
+        f->~functor_type();
+      }
+    }
+
+  /** \brief  Team apply for a functor that writes a result value. */
+  template< class DerivedTaskType , class Tag >
+  __device__ static
+  void apply_team(
+    typename std::enable_if
+      <( std::is_same<Tag,void>::value &&
+         ! std::is_same<typename DerivedTaskType::result_type,void>::value
+       ), TaskMember * >::type t
+    , Kokkos::Impl::CudaTeamMember & member
+    )
+    {
+      typedef typename DerivedTaskType::functor_type  functor_type ;
+
+      DerivedTaskType * const self = static_cast< DerivedTaskType * >(t);
+      functor_type    * const f    = static_cast< functor_type * >( self );
+
+      f->apply( member , self->m_result );
+
+      __syncthreads(); // Wait for team to finish calling function
+
+      // Only thread (0,0) destroys the functor, and only when
+      // the task is still executing (no respawn was requested).
+      if ( threadIdx.x == 0 &&
+           threadIdx.y == 0 &&
+           t->m_state == int(Kokkos::Experimental::TASK_STATE_EXECUTING) ) {
+        f->~functor_type();
+      }
+    }
+
+  /** \brief  Store the device address of the matching apply_team. */
+  template< class DerivedTaskType , class Tag >
+  __device__
+  void set_apply_team()
+    {
+      m_team = & TaskMember::template apply_team<DerivedTaskType,Tag> ;
+    }
+
+  //----------------------------------------
+
+  KOKKOS_FUNCTION static
+  void assign( TaskMember ** const lhs , TaskMember * const rhs );
+
+  __device__
+  TaskMember * get_dependence( int i ) const ;
+
+  __device__
+  int get_dependence() const ;
+
+  KOKKOS_FUNCTION void clear_dependence();
+
+  __device__
+  void latch_add( const int k );
+
+  //----------------------------------------
+
+  // A task without a result value has nothing to construct.
+  KOKKOS_INLINE_FUNCTION static
+  void construct_result( TaskMember * const ) {}
+
+  typedef FutureValueTypeIsVoidError get_result_type ;
+
+  KOKKOS_INLINE_FUNCTION
+  get_result_type get() const { return get_result_type() ; }
+
+  KOKKOS_INLINE_FUNCTION
+  Kokkos::Experimental::TaskState get_state() const { return Kokkos::Experimental::TaskState( m_state ); }
+
+};
+
+/** \brief  Result-carrying layer of a Kokkos::Cuda task.
+ *
+ *  A Future< Kokkos::Cuda , ResultType > will cast
+ *          from  TaskMember< Kokkos::Cuda , void , void >
+ *          to    TaskMember< Kokkos::Cuda , ResultType , void >
+ *  to query the result.
+ */
+template< class ResultType >
+class TaskMember< Kokkos::Cuda , ResultType , void >
+  : public TaskMember< Kokkos::Cuda , void , void >
+{
+public:
+
+  using result_type     = ResultType ;
+  using get_result_type = const result_type & ;
+
+  result_type  m_result ;
+
+  // Default construction, copying and moving are all disabled.
+  TaskMember() = delete ;
+  TaskMember( TaskMember && ) = delete ;
+  TaskMember( const TaskMember & ) = delete ;
+  TaskMember & operator = ( TaskMember && ) = delete ;
+  TaskMember & operator = ( const TaskMember & ) = delete ;
+
+  /** \brief  Read-only access to the stored result. */
+  KOKKOS_INLINE_FUNCTION
+  get_result_type get() const { return m_result ; }
+
+  /** \brief  Value-initialize the result member of '*ptr' in place. */
+  KOKKOS_INLINE_FUNCTION static
+  void construct_result( TaskMember * const ptr )
+    {
+      void * const result_storage = (void*)(& ptr->m_result);
+
+      new( result_storage ) result_type();
+    }
+};
+
+/** \brief  Callback functions will cast
+ *          from  TaskMember< Kokkos::Cuda , void , void >
+ *          to    TaskMember< Kokkos::Cuda , ResultType , FunctorType >
+ *          to execute work functions.
+ */
+template< class ResultType , class FunctorType >
+class TaskMember< Kokkos::Cuda , ResultType , FunctorType >
+  : public TaskMember< Kokkos::Cuda , ResultType , void >
+  , public FunctorType
+{
+public:
+  typedef ResultType   result_type ;
+  typedef FunctorType  functor_type ;
+
+  // Not default-constructible, copyable, or movable; see copy_construct().
+  TaskMember() = delete ;
+  TaskMember( TaskMember && ) = delete ;
+  TaskMember( const TaskMember & ) = delete ;
+  TaskMember & operator = ( TaskMember && ) = delete ;
+  TaskMember & operator = ( const TaskMember & ) = delete ;
+
+  // Copy-construct the functor base, then the result member, in place at 'ptr'.
+  KOKKOS_INLINE_FUNCTION static
+  void copy_construct( TaskMember * const ptr , const functor_type & arg_functor )
+    {
+      typedef TaskMember< Kokkos::Cuda , ResultType , void > result_base ;
+      FunctorType * const functor_part = static_cast< FunctorType * >( ptr );
+      new( (void*) functor_part ) functor_type( arg_functor );
+      result_base::construct_result( static_cast< result_base * >( ptr ) );
+    }
+};
+
+//----------------------------------------------------------------------------
+
+namespace {
+
+// Device kernel: invoke the root TaskMember's set_apply_single on 'task'.
+template< class DerivedTaskType , class Tag >
+__global__ void cuda_set_apply_single( DerivedTaskType * task )
+{
+  typedef Kokkos::Experimental::Impl::TaskMember< Kokkos::Cuda , void , void >
+    root_type ;
+  root_type * const root = task ; // implicit derived-to-base conversion
+  root->template set_apply_single< DerivedTaskType , Tag >();
+}
+
+// Device kernel: invoke the root TaskMember's set_apply_team on 'task'.
+template< class DerivedTaskType , class Tag >
+__global__ void cuda_set_apply_team( DerivedTaskType * task )
+{
+  typedef Kokkos::Experimental::Impl::TaskMember< Kokkos::Cuda , void , void >
+    root_type ;
+  root_type * const root = task ; // implicit derived-to-base conversion
+  root->template set_apply_team< DerivedTaskType , Tag >();
+}
+
+} /* namespace */
+} /* namespace Impl */
+} /* namespace Experimental */
+} /* namespace Kokkos */
+
+//----------------------------------------------------------------------------
+//----------------------------------------------------------------------------
+
+namespace Kokkos {
+namespace Experimental {
+namespace Impl {
+
+struct CudaTaskPolicyQueue {
+  // Queue state that TaskPolicy< Kokkos::Cuda > reaches through its m_policy pointer.
+  enum { NPRIORITY = 3 }; // Length of each of the m_team and m_serial arrays
+
+  // Must use UVM so that tasks can be created in both
+  // Host and Cuda space.
+
+  typedef Kokkos::Experimental::MemoryPool< Kokkos::CudaUVMSpace >
+    memory_space ;
+
+  typedef Kokkos::Experimental::Impl::TaskMember< Kokkos::Cuda , void , void >
+    task_root_type ;
+
+  memory_space     m_space ;
+  task_root_type * m_team[ NPRIORITY ] ; // Team-task queues, one per priority index
+  task_root_type * m_serial[ NPRIORITY ]; // Serial-task queues, one per priority index
+  int              m_team_size ; // Team size handed to FunctorTeamShmemSize by the create_team functions
+  int              m_default_dependence_capacity ; // NOTE(review): presumably applied when a caller passes ~0u -- confirm
+  int volatile     m_count_ready ; ///< Ready plus executing tasks
+
+  // Execute tasks until all non-waiting tasks are complete
+  __device__
+  void driver();
+
+  __device__ static
+  task_root_type * pop_ready_task( task_root_type * volatile * const queue );
+
+  // When a task finishes executing.
+  __device__
+  void complete_executed_task( task_root_type * );
+
+  KOKKOS_FUNCTION void schedule_task( task_root_type * const 
+                                    , const bool initial_spawn = true );
+  KOKKOS_FUNCTION void reschedule_task( task_root_type * const );
+  KOKKOS_FUNCTION
+  void add_dependence( task_root_type * const after
+                     , task_root_type * const before );
+
+  // Default, copy, and move operations are deleted; only the constructor below is usable.
+  CudaTaskPolicyQueue() = delete ;
+  CudaTaskPolicyQueue( CudaTaskPolicyQueue && ) = delete ;
+  CudaTaskPolicyQueue( const CudaTaskPolicyQueue & ) = delete ;
+  CudaTaskPolicyQueue & operator = ( CudaTaskPolicyQueue && ) = delete ;
+  CudaTaskPolicyQueue & operator = ( const CudaTaskPolicyQueue & ) = delete ;
+
+
+  ~CudaTaskPolicyQueue();
+
+  // Construct only on the Host
+  CudaTaskPolicyQueue
+    ( const unsigned arg_task_max_count
+    , const unsigned arg_task_max_size
+    , const unsigned arg_task_default_dependence_capacity
+    , const unsigned arg_task_team_size
+    );
+
+  struct Destroy { // NOTE(review): looks like the clean-up hook for the shared-allocation tracker -- confirm
+    CudaTaskPolicyQueue * m_policy ;
+    void destroy_shared_allocation();
+  };
+
+  //----------------------------------------
+  /** \brief  Allocate and construct a task.
+   *
+   *  Allocate space for DerivedTaskType followed
+   *  by TaskMember*[ dependence_capacity ]
+   */
+  KOKKOS_FUNCTION
+  task_root_type *
+  allocate_task( const unsigned arg_sizeof_task
+               , const unsigned arg_dep_capacity
+               , const unsigned arg_team_shmem = 0 );
+
+  KOKKOS_FUNCTION void deallocate_task( task_root_type * const );
+};
+
+} /* namespace Impl */
+} /* namespace Experimental */
+} /* namespace Kokkos */
+
+//----------------------------------------------------------------------------
+//----------------------------------------------------------------------------
+
+namespace Kokkos {
+namespace Experimental {
+
+void wait( TaskPolicy< Kokkos::Cuda > & );
+
+template<>
+class TaskPolicy< Kokkos::Cuda >
+{
+public:
+
+  typedef Kokkos::Cuda                  execution_space ;
+  typedef TaskPolicy                    execution_policy ;
+  typedef Kokkos::Impl::CudaTeamMember  member_type ;
+
+private:
+
+  typedef Impl::TaskMember< Kokkos::Cuda , void , void >  task_root_type ;
+  typedef Kokkos::Experimental::MemoryPool< Kokkos::CudaUVMSpace > memory_space ;
+  typedef Kokkos::Experimental::Impl::SharedAllocationTracker track_type ;
+
+  track_type                   m_track ;
+  Impl::CudaTaskPolicyQueue  * m_policy ;
+
+  // functor pointer -> task object deriving from it -> root task (const overload)
+  template< class FunctorType > KOKKOS_INLINE_FUNCTION static
+  const task_root_type * get_task_root( const FunctorType * f )
+    {
+      typedef Impl::TaskMember< execution_space , typename FunctorType::value_type , FunctorType > derived_type ;
+      return static_cast< const derived_type * >( f ); // implicitly converted to the root base
+    }
+
+  // functor pointer -> task object deriving from it -> root task (non-const overload)
+  template< class FunctorType > KOKKOS_INLINE_FUNCTION static
+  task_root_type * get_task_root( FunctorType * f )
+    {
+      typedef Impl::TaskMember< execution_space , typename FunctorType::value_type , FunctorType > derived_type ;
+      return static_cast< derived_type * >( f ); // implicitly converted to the root base
+    }
+
+public:
+
+  TaskPolicy
+    ( const unsigned arg_task_max_count
+    , const unsigned arg_task_max_size
+    , const unsigned arg_task_default_dependence_capacity = 4
+    , const unsigned arg_task_team_size = 0 /* choose default */
+    );
+
+  KOKKOS_FUNCTION TaskPolicy() = default ;
+  KOKKOS_FUNCTION TaskPolicy( TaskPolicy && rhs ) = default ;
+  KOKKOS_FUNCTION TaskPolicy( const TaskPolicy & rhs ) = default ;
+  KOKKOS_FUNCTION TaskPolicy & operator = ( TaskPolicy && rhs ) = default ;
+  KOKKOS_FUNCTION TaskPolicy & operator = ( const TaskPolicy & rhs ) = default ;
+
+  KOKKOS_FUNCTION
+  int allocated_task_count() const { return 0 ; } // NOTE(review): always returns 0; no count is read here
+
+  //----------------------------------------
+  // Create serial-thread task
+  // Main process and tasks must use different functions
+  // to work around CUDA limitation where __host__ __device__
+  // functions are not allowed to invoke templated __global__ functions.
+
+  // Host-side creation of a single-thread task holding a copy of arg_functor.
+  // Returns a null Future when allocate_task() returns null.
+  template< class FunctorType >
+  Future< typename FunctorType::value_type , execution_space >
+  proc_create( const FunctorType & arg_functor
+             , const unsigned      arg_dep_capacity = ~0u ) const
+    {
+      typedef typename FunctorType::value_type  value_type ;
+      typedef Future< value_type , execution_space >  future_type ;
+
+      typedef Impl::TaskMember< execution_space , value_type , FunctorType >
+        task_type ;
+
+      task_root_type * const root =
+        m_policy->allocate_task( sizeof(task_type) , arg_dep_capacity );
+      task_type * const task = static_cast< task_type * >( root );
+
+      // Nothing to construct when the allocation failed.
+      if ( ! task ) { return future_type( task ); }
+
+      // The root part of the class has been constructed;
+      // construct the functor and result specific part.
+      task_type::copy_construct( task , arg_functor );
+
+      // The apply pointer has to be set by code running on the GPU,
+      // and this function runs on the host, so launch a one-thread kernel.
+      //
+      // Launching a kernel causes the task in UVM memory to be copied
+      // to the GPU; synchronize before and after the launch so the host
+      // and the device never access it concurrently.
+      CUDA_SAFE_CALL( cudaDeviceSynchronize() );
+
+      Impl::cuda_set_apply_single< task_type , void ><<< 1 , 1 >>>( task );
+
+      CUDA_SAFE_CALL( cudaGetLastError() );
+      CUDA_SAFE_CALL( cudaDeviceSynchronize() );
+
+      return future_type( task );
+    }
+
+  // Device-side creation of a single-thread task holding a copy of arg_functor.
+  // Returns a null Future when allocate_task() returns null.
+  template< class FunctorType >
+  __device__
+  Future< typename FunctorType::value_type , execution_space >
+  task_create( const FunctorType & arg_functor
+             , const unsigned      arg_dep_capacity = ~0u ) const
+    {
+      typedef typename FunctorType::value_type  value_type ;
+      typedef Future< value_type , execution_space >  future_type ;
+      typedef Impl::TaskMember< execution_space , value_type , FunctorType >
+        task_type ;
+
+      task_root_type * const root =
+        m_policy->allocate_task( sizeof(task_type) , arg_dep_capacity );
+      task_type * const task = static_cast< task_type * >( root );
+
+      // Nothing to construct when the allocation failed.
+      if ( ! task ) { return future_type( task ); }
+
+      // The root part of the class has been constructed;
+      // construct the functor and result specific part.
+      task_type::copy_construct( task , arg_functor );
+
+      // This is device code, so the apply pointer is set
+      // directly instead of through a kernel launch.
+      root->template set_apply_single< task_type , void >();
+
+      return future_type( task );
+    }
+
+  //----------------------------------------
+  // Create thread-team task
+  // Main process and tasks must use different functions
+  // to work around CUDA limitation where __host__ __device__
+  // functions are not allowed to invoke templated __global__ functions.
+
+  // Host-side creation of a thread-team task holding a copy of arg_functor.
+  // Returns a null Future when allocate_task() returns null.
+  template< class FunctorType >
+  Future< typename FunctorType::value_type , execution_space >
+  proc_create_team( const FunctorType & arg_functor
+                  , const unsigned      arg_dep_capacity = ~0u ) const
+    {
+      typedef typename FunctorType::value_type  value_type ;
+      typedef Future< value_type , execution_space >  future_type ;
+      typedef Kokkos::Impl::FunctorTeamShmemSize< FunctorType >  shmem_size_type ;
+      typedef Impl::TaskMember< execution_space , value_type , FunctorType >
+        task_type ;
+
+      // Size reported by FunctorTeamShmemSize for the policy's team size.
+      const unsigned team_shmem_size =
+        shmem_size_type::value( arg_functor , m_policy->m_team_size );
+
+      task_root_type * const root =
+        m_policy->allocate_task( sizeof(task_type) , arg_dep_capacity , team_shmem_size );
+      task_type * const task = static_cast< task_type * >( root );
+
+      // Nothing to construct when the allocation failed.
+      if ( ! task ) { return future_type( task ); }
+
+      // The root part of the class has been constructed;
+      // construct the functor and result specific part.
+      task_type::copy_construct( task , arg_functor );
+
+      // The apply pointer has to be set by code running on the GPU,
+      // and this function runs on the host, so launch a one-thread kernel.
+      //
+      // Launching a kernel causes the task in UVM memory to be copied
+      // to the GPU; synchronize before and after the launch so the host
+      // and the device never access it concurrently.
+      CUDA_SAFE_CALL( cudaDeviceSynchronize() );
+
+      Impl::cuda_set_apply_team< task_type , void ><<< 1 , 1 >>>( task );
+
+      CUDA_SAFE_CALL( cudaGetLastError() );
+      CUDA_SAFE_CALL( cudaDeviceSynchronize() );
+
+      return future_type( task );
+    }
+
+  // Device-side creation of a thread-team task holding a copy of arg_functor.
+  // Returns a null Future when allocate_task() returns null.
+  template< class FunctorType >
+  __device__
+  Future< typename FunctorType::value_type , execution_space >
+  task_create_team( const FunctorType & arg_functor
+                  , const unsigned      arg_dep_capacity = ~0u ) const
+    {
+      typedef typename FunctorType::value_type  value_type ;
+      typedef Future< value_type , execution_space >  future_type ;
+      typedef Kokkos::Impl::FunctorTeamShmemSize< FunctorType >  shmem_size_type ;
+      typedef Impl::TaskMember< execution_space , value_type , FunctorType >
+        task_type ;
+
+      // Size reported by FunctorTeamShmemSize for the policy's team size.
+      const unsigned team_shmem_size =
+        shmem_size_type::value( arg_functor , m_policy->m_team_size );
+
+      task_root_type * const root =
+        m_policy->allocate_task( sizeof(task_type) , arg_dep_capacity , team_shmem_size );
+      task_type * const task = static_cast< task_type * >( root );
+
+      // Nothing to construct when the allocation failed.
+      if ( ! task ) { return future_type( task ); }
+
+      // Root part is already constructed; build the functor and result part.
+      task_type::copy_construct( task , arg_functor );
+
+      // This is device code, so the apply pointer is set
+      // directly instead of through a kernel launch.
+      root->template set_apply_team< task_type , void >();
+
+      return future_type( task );
+    }
+
+  //----------------------------------------
+
+  // Create a latch task whose m_dep_size is used as the latch counter (set to N).
+  // If allocate_task() returns null, nothing is written and the Future is null.
+  Future< Latch , execution_space >
+  KOKKOS_INLINE_FUNCTION
+  create_latch( const int N ) const
+    {
+      task_root_type * const task = m_policy->allocate_task( sizeof(task_root_type) , 0 , 0 );
+      if ( task ) { task->m_dep_size = N ; task->m_state = TASK_STATE_WAITING ; }
+      return Future< Latch , execution_space >( task );
+    }
+
+  //----------------------------------------
+
+  template< class A1 , class A2 , class A3 , class A4 >
+  KOKKOS_INLINE_FUNCTION
+  void add_dependence( const Future<A1,A2> & after
+                     , const Future<A3,A4> & before
+                     , typename std::enable_if // overload enabled only when both futures use this execution_space
+                        < std::is_same< typename Future<A1,A2>::execution_space , execution_space >::value
+                          &&
+                          std::is_same< typename Future<A3,A4>::execution_space , execution_space >::value
+                        >::type * = 0
+                      ) const
+    { m_policy->add_dependence( after.m_task , before.m_task ); } // forward both underlying tasks to the queue
+
+  template< class FunctorType , class A3 , class A4 >
+  KOKKOS_INLINE_FUNCTION
+  void add_dependence( FunctorType * task_functor
+                     , const Future<A3,A4> & before
+                     , typename std::enable_if // overload enabled only when 'before' uses this execution_space
+                        < std::is_same< typename Future<A3,A4>::execution_space , execution_space >::value
+                        >::type * = 0
+                      ) const
+    { m_policy->add_dependence( get_task_root(task_functor) , before.m_task ); } // the functor's own task is the 'after' side
+
+
+  // Select a queue for the future's task, pass it to schedule_task(), and return the future.
+  template< class ValueType >
+  KOKKOS_INLINE_FUNCTION
+  const Future< ValueType , execution_space > &
+    spawn( const Future< ValueType , execution_space > & f
+         , const bool priority = false ) const
+      {
+        if ( ! f.m_task ) { return f ; } // null future: nothing to schedule
+        // Index 0 when 'priority' is set, else 1; team and serial tasks use separate arrays.
+        const int level = priority ? 0 : 1 ;
+        if ( f.m_task->m_team != 0 ) { f.m_task->m_queue = & m_policy->m_team[ level ] ; }
+        else                         { f.m_task->m_queue = & m_policy->m_serial[ level ] ; }
+        m_policy->schedule_task( f.m_task );
+        return f ;
+      }
+
+  // Select a queue for the task owning 'task_functor' and pass it to reschedule_task().
+  template< class FunctorType > KOKKOS_INLINE_FUNCTION
+  void respawn( FunctorType * task_functor
+              , const bool priority = false ) const
+    {
+      task_root_type * const root = get_task_root( task_functor );
+      const int level = priority ? 0 : 1 ; // index 0 when 'priority' is set, else 1
+      if ( root->m_team != 0 ) { root->m_queue = & m_policy->m_team[ level ] ; }
+      else                     { root->m_queue = & m_policy->m_serial[ level ] ; }
+      m_policy->reschedule_task( root );
+    }
+
+  // When a create method fails by returning a null Future
+  // the task that called the create method may respawn
+  // with a dependence on memory becoming available.
+  // This is a race as more than one task may be respawned
+  // with this need.
+
+  // Like respawn(), but always selects queue index 2 of the team or serial array.
+  template< class FunctorType > KOKKOS_INLINE_FUNCTION
+  void respawn_needing_memory( FunctorType * task_functor ) const
+    {
+      const int level = 2 ;
+      task_root_type * const root = get_task_root( task_functor );
+      if ( root->m_team != 0 ) { root->m_queue = & m_policy->m_team[ level ] ; }
+      else                     { root->m_queue = & m_policy->m_serial[ level ] ; }
+      m_policy->reschedule_task( root );
+    }
+
+  //----------------------------------------
+  // Functions for an executing task functor to query dependences,
+  // set new dependences, and respawn itself.
+
+  // Wrap the root task's get_dependence(i) result in a Future<void>.
+  template< class FunctorType >
+  KOKKOS_INLINE_FUNCTION
+  Future< void , execution_space >
+  get_dependence( const FunctorType * task_functor , int i ) const
+    {
+      const task_root_type * const root = get_task_root( task_functor );
+      return Future< void , execution_space >( root->get_dependence( i ) );
+    }
+
+  // Forward to the root task's zero-argument get_dependence().
+  template< class FunctorType > KOKKOS_INLINE_FUNCTION
+  int get_dependence( const FunctorType * task_functor ) const
+    { const task_root_type * const root = get_task_root( task_functor ); return root->get_dependence(); }
+
+  // Forward to the root task's clear_dependence().
+  template< class FunctorType > KOKKOS_INLINE_FUNCTION
+  void clear_dependence( FunctorType * task_functor ) const
+    { task_root_type * const root = get_task_root( task_functor ); root->clear_dependence(); }
+
+  //----------------------------------------
+
+  // Team member handle built with null shared-memory and scratch arguments,
+  // league rank 0, and league size 1.
+  __device__ static member_type member_single()
+    {
+      return member_type( 0   /* shared memory pointer */
+                        , 0   /* shared memory begin offset */
+                        , 0   /* shared memory end offset */
+                        , 0   /* scratch level_1 pointer */
+                        , 0   /* scratch level_1 size */
+                        , 0   /* league rank */
+                        , 1   /* league size */ );
+    }
+
+  friend void wait( TaskPolicy< Kokkos::Cuda > & );
+};
+
+} /* namespace Experimental */
+} /* namespace Kokkos */
+
+
+//----------------------------------------------------------------------------
+
+#endif /* #if defined( KOKKOS_HAVE_CUDA ) && defined( KOKKOS_ENABLE_TASKPOLICY ) */
+#endif /* #ifndef KOKKOS_CUDA_TASKPOLICY_HPP */
+
+
diff --git a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Vectorization.hpp b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Vectorization.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..0b8427cbe1e9664a41b6bb8b33b21320ad613d78
--- /dev/null
+++ b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Vectorization.hpp
@@ -0,0 +1,298 @@
+/*
+//@HEADER
+// ************************************************************************
+// 
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+// 
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+// 
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact  H. Carter Edwards (hcedwar@sandia.gov)
+// 
+// ************************************************************************
+//@HEADER
+*/
+#ifndef KOKKOS_CUDA_VECTORIZATION_HPP
+#define KOKKOS_CUDA_VECTORIZATION_HPP
+
+#include <Kokkos_Macros.hpp>
+
+/* only compile this file if CUDA is enabled for Kokkos */
+#ifdef KOKKOS_HAVE_CUDA
+
+#include <Kokkos_Cuda.hpp>
+
+namespace Kokkos {
+
+
+// Shuffle only makes sense on >= Kepler GPUs; it doesn't work on CPUs
+// or other GPUs.  We provide a generic definition (which is trivial
+// and doesn't do what it claims to do) because we don't actually use
+// this function unless we are on a suitable GPU, with a suitable
+// Scalar type.  (For example, in the mat-vec, the "ThreadsPerRow"
+// internal parameter depends both on the ExecutionSpace and the Scalar type,
+// and it controls whether shfl_down() gets called.)
+namespace Impl {
+
+  // Holds a Scalar as an array of float-sized words so it can be shuffled word by word.
+  // NOTE(review): n truncates, so sizeof(Scalar) is assumed to be a multiple of 4.
+  template< typename Scalar >
+  struct shfl_union {
+    enum {n = sizeof(Scalar)/4};
+    float fval[n];
+
+    // Reinterpret the stored words as a Scalar and return it by value.
+    KOKKOS_INLINE_FUNCTION
+    Scalar value() { return *(Scalar*) fval; }
+
+    // Copy value_ into fval one float-sized word at a time.
+    KOKKOS_INLINE_FUNCTION
+    void operator= (const Scalar& value_) {
+      const float* const words = (const float*) &value_;
+      for(int w = 0; w < n; ++w) { fval[w] = words[w]; }
+    }
+
+    // Non-const overload; performs the same word-wise copy.
+    KOKKOS_INLINE_FUNCTION
+    void operator= (Scalar& value_) {
+      operator= ( (const Scalar&) value_ );
+    }
+  };
+}
+
+#ifdef __CUDA_ARCH__
+  #if (__CUDA_ARCH__ >= 300)
+
+    // int: passed straight to the __shfl intrinsic.
+    KOKKOS_INLINE_FUNCTION
+    int shfl(const int &val, const int& srcLane, const int& width )
+    { return __shfl(val,srcLane,width); }
+
+    // float: passed straight to the __shfl intrinsic.
+    KOKKOS_INLINE_FUNCTION
+    float shfl(const float &val, const int& srcLane, const int& width )
+    { return __shfl(val,srcLane,width); }
+
+    // Any other 4-byte Scalar: shuffle its bits as a float.
+    template<typename Scalar>
+    KOKKOS_INLINE_FUNCTION
+    Scalar shfl(const Scalar &val, const int& srcLane, const typename Impl::enable_if< (sizeof(Scalar) == 4) , int >::type& width
+        ) {
+      Scalar local = val;
+      float word = __shfl(*reinterpret_cast<float*>(&local),srcLane,width);
+      return *reinterpret_cast<Scalar*>(&word);
+    }
+
+    // double: shuffle the low and high 32-bit halves separately, low first.
+    KOKKOS_INLINE_FUNCTION
+    double shfl(const double &val, const int& srcLane, const int& width) {
+      const int lo = __shfl(__double2loint(val),srcLane,width);
+      const int hi = __shfl(__double2hiint(val),srcLane,width);
+      return __hiloint2double(hi,lo);
+    }
+
+    // Any other 8-byte Scalar: read its bits as a double and shuffle both halves.
+    template<typename Scalar>
+    KOKKOS_INLINE_FUNCTION
+    Scalar shfl(const Scalar &val, const int& srcLane, const typename Impl::enable_if< (sizeof(Scalar) == 8) ,int>::type& width) {
+      const double bits = *reinterpret_cast<const double*>(&val);
+      const int lo = __shfl(__double2loint(bits),srcLane,width);
+      const int hi = __shfl(__double2hiint(bits),srcLane,width);
+      const double shuffled = __hiloint2double(hi,lo);
+      return *(reinterpret_cast<const Scalar*>(&shuffled));
+    }
+
+    // Scalars larger than 8 bytes: shuffle each word held by shfl_union.
+    template<typename Scalar>
+    KOKKOS_INLINE_FUNCTION
+    Scalar shfl(const Scalar &val, const int& srcLane, const typename Impl::enable_if< (sizeof(Scalar) > 8) ,int>::type& width) {
+      Impl::shfl_union<Scalar> src;
+      Impl::shfl_union<Scalar> dst;
+      src = val;
+      for(int w = 0; w < src.n; ++w) {
+        dst.fval[w] = __shfl(src.fval[w],srcLane,width);
+      }
+      return dst.value();
+    }
+
+    // int: passed straight to the __shfl_down intrinsic.
+    KOKKOS_INLINE_FUNCTION
+    int shfl_down(const int &val, const int& delta, const int& width)
+    { return __shfl_down(val,delta,width); }
+
+    // float: passed straight to the __shfl_down intrinsic.
+    KOKKOS_INLINE_FUNCTION
+    float shfl_down(const float &val, const int& delta, const int& width)
+    { return __shfl_down(val,delta,width); }
+
+    // Any other 4-byte Scalar: shuffle its bits as a float.
+    template<typename Scalar>
+    KOKKOS_INLINE_FUNCTION
+    Scalar shfl_down(const Scalar &val, const int& delta, const typename Impl::enable_if< (sizeof(Scalar) == 4) , int >::type & width) {
+      Scalar local = val;
+      float word = __shfl_down(*reinterpret_cast<float*>(&local),delta,width);
+      return *reinterpret_cast<Scalar*>(&word);
+    }
+
+    // double: shuffle the low and high 32-bit halves separately, low first.
+    KOKKOS_INLINE_FUNCTION
+    double shfl_down(const double &val, const int& delta, const int& width) {
+      const int lo = __shfl_down(__double2loint(val),delta,width);
+      const int hi = __shfl_down(__double2hiint(val),delta,width);
+      return __hiloint2double(hi,lo);
+    }
+
+    // Any other 8-byte Scalar: read its bits as a double and shuffle both halves.
+    template<typename Scalar>
+    KOKKOS_INLINE_FUNCTION
+    Scalar shfl_down(const Scalar &val, const int& delta, const typename Impl::enable_if< (sizeof(Scalar) == 8) , int >::type & width) {
+      const double bits = *reinterpret_cast<const double*>(&val);
+      const int lo = __shfl_down(__double2loint(bits),delta,width);
+      const int hi = __shfl_down(__double2hiint(bits),delta,width);
+      const double shuffled = __hiloint2double(hi,lo);
+      return *(reinterpret_cast<const Scalar*>(&shuffled));
+    }
+
+    // Scalars larger than 8 bytes: shuffle each word held by shfl_union.
+    template<typename Scalar>
+    KOKKOS_INLINE_FUNCTION
+    Scalar shfl_down(const Scalar &val, const int& delta, const typename Impl::enable_if< (sizeof(Scalar) > 8) , int >::type & width) {
+      Impl::shfl_union<Scalar> src;
+      Impl::shfl_union<Scalar> dst;
+      src = val;
+      for(int w = 0; w < src.n; ++w) {
+        dst.fval[w] = __shfl_down(src.fval[w],delta,width);
+      }
+      return dst.value();
+    }
+
+    // int: passed straight to the __shfl_up intrinsic.
+    KOKKOS_INLINE_FUNCTION
+    int shfl_up(const int &val, const int& delta, const int& width )
+    { return __shfl_up(val,delta,width); }
+
+    // float: passed straight to the __shfl_up intrinsic.
+    KOKKOS_INLINE_FUNCTION
+    float shfl_up(const float &val, const int& delta, const int& width )
+    { return __shfl_up(val,delta,width); }
+
+    // Any other 4-byte Scalar: shuffle its bits as a float.
+    template<typename Scalar>
+    KOKKOS_INLINE_FUNCTION
+    Scalar shfl_up(const Scalar &val, const int& delta, const typename Impl::enable_if< (sizeof(Scalar) == 4) , int >::type & width) {
+      Scalar local = val;
+      float word = __shfl_up(*reinterpret_cast<float*>(&local),delta,width);
+      return *reinterpret_cast<Scalar*>(&word);
+    }
+
+    // double: shuffle the low and high 32-bit halves separately, low first.
+    KOKKOS_INLINE_FUNCTION
+    double shfl_up(const double &val, const int& delta, const int& width ) {
+      const int lo = __shfl_up(__double2loint(val),delta,width);
+      const int hi = __shfl_up(__double2hiint(val),delta,width);
+      return __hiloint2double(hi,lo);
+    }
+
+    // Any other 8-byte Scalar: read its bits as a double and shuffle both halves.
+    template<typename Scalar>
+    KOKKOS_INLINE_FUNCTION
+    Scalar shfl_up(const Scalar &val, const int& delta, const typename Impl::enable_if< (sizeof(Scalar) == 8) , int >::type & width) {
+      const double bits = *reinterpret_cast<const double*>(&val);
+      const int lo = __shfl_up(__double2loint(bits),delta,width);
+      const int hi = __shfl_up(__double2hiint(bits),delta,width);
+      const double shuffled = __hiloint2double(hi,lo);
+      return *(reinterpret_cast<const Scalar*>(&shuffled));
+    }
+
+    // Scalars larger than 8 bytes: shuffle each word held by shfl_union.
+    template<typename Scalar>
+    KOKKOS_INLINE_FUNCTION
+    Scalar shfl_up(const Scalar &val, const int& delta, const typename Impl::enable_if< (sizeof(Scalar) > 8) , int >::type & width) {
+      Impl::shfl_union<Scalar> src;
+      Impl::shfl_union<Scalar> dst;
+      src = val;
+      for(int w = 0; w < src.n; ++w) {
+        dst.fval[w] = __shfl_up(src.fval[w],delta,width);
+      }
+      return dst.value();
+    }
+
+  #else
+    // __CUDA_ARCH__ < 300 fallback: abort when width > 1, otherwise return val unchanged.
+    template<typename Scalar> KOKKOS_INLINE_FUNCTION
+    Scalar shfl(const Scalar &val, const int& srcLane, const int& width) {
+      if(width > 1) { Kokkos::abort("Error: calling shfl from a device with CC<3.0."); }
+      return val;
+    }
+
+    // __CUDA_ARCH__ < 300 fallback: abort when width > 1, otherwise return val unchanged.
+    template<typename Scalar> KOKKOS_INLINE_FUNCTION
+    Scalar shfl_down(const Scalar &val, const int& delta, const int& width) {
+      if(width > 1) { Kokkos::abort("Error: calling shfl_down from a device with CC<3.0."); }
+      return val;
+    }
+
+    // __CUDA_ARCH__ < 300 fallback: abort when width > 1, otherwise return val unchanged.
+    template<typename Scalar> KOKKOS_INLINE_FUNCTION
+    Scalar shfl_up(const Scalar &val, const int& delta, const int& width) {
+      if(width > 1) Kokkos::abort("Error: calling shfl_up from a device with CC<3.0.");
+      return val;
+    }
+  #endif
+#else
+    template<typename Scalar>
+    inline
+    Scalar shfl(const Scalar &val, const int& srcLane, const int& width) {
+      if(width > 1) Kokkos::abort("Error: calling shfl from a device with CC<3.0.");
+      return val;
+    }
+
+    template<typename Scalar>
+    inline
+    Scalar shfl_down(const Scalar &val, const int& delta, const int& width) {
+      if(width > 1) Kokkos::abort("Error: calling shfl_down from a device with CC<3.0.");
+      return val;
+    }
+
+    template<typename Scalar>
+    inline
+    Scalar shfl_up(const Scalar &val, const int& delta, const int& width) {
+      if(width > 1) Kokkos::abort("Error: calling shfl_down from a device with CC<3.0.");
+      return val;
+    }
+#endif
+
+
+
+}
+
+#endif // KOKKOS_HAVE_CUDA
+#endif
diff --git a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_View.hpp b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_View.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..92f6fc1f5f89a75fe717d351af5395da8bf894a4
--- /dev/null
+++ b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_View.hpp
@@ -0,0 +1,93 @@
+/*
+//@HEADER
+// ************************************************************************
+// 
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+// 
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+// 
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact  H. Carter Edwards (hcedwar@sandia.gov)
+// 
+// ************************************************************************
+//@HEADER
+*/
+
+#ifndef KOKKOS_CUDA_VIEW_HPP
+#define KOKKOS_CUDA_VIEW_HPP
+
+#include <Kokkos_Macros.hpp>
+
+/* only compile this file if CUDA is enabled for Kokkos */
+#ifdef KOKKOS_HAVE_CUDA
+
+#include <cstring>
+
+#include <Kokkos_HostSpace.hpp>
+#include <Kokkos_CudaSpace.hpp>
+#include <impl/Kokkos_Shape.hpp>
+#include <Kokkos_View.hpp>
+
+//----------------------------------------------------------------------------
+//----------------------------------------------------------------------------
+
+namespace Kokkos {
+namespace Impl {
+
+// CudaSpace specialization: ignores every shape and index argument
+// and aborts with a fixed message.
+template<>
+struct AssertShapeBoundsAbort< CudaSpace >
+{
+  KOKKOS_INLINE_FUNCTION static
+  void apply( const size_t /* rank */
+            , const size_t /* n0 */ , const size_t /* n1 */ , const size_t /* n2 */ , const size_t /* n3 */
+            , const size_t /* n4 */ , const size_t /* n5 */ , const size_t /* n6 */ , const size_t /* n7 */
+
+            , const size_t /* arg_rank */
+            , const size_t /* i0 */ , const size_t /* i1 */ , const size_t /* i2 */ , const size_t /* i3 */
+            , const size_t /* i4 */ , const size_t /* i5 */ , const size_t /* i6 */ , const size_t /* i7 */
+            )
+    {
+      // All parameters are unnamed; none of them is read.
+      Kokkos::abort("Kokkos::View array bounds violation");
+    }
+};
+
+}
+}
+
+//----------------------------------------------------------------------------
+//----------------------------------------------------------------------------
+
+#endif // KOKKOS_HAVE_CUDA
+#endif /* #ifndef KOKKOS_CUDA_VIEW_HPP */
+
diff --git a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_abort.hpp b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_abort.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..deb955ccd4755d43a24469171f2689d8c2a87dae
--- /dev/null
+++ b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_abort.hpp
@@ -0,0 +1,119 @@
+/*
+//@HEADER
+// ************************************************************************
+// 
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+// 
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+// 
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact  H. Carter Edwards (hcedwar@sandia.gov)
+// 
+// ************************************************************************
+//@HEADER
+*/
+
+#ifndef KOKKOS_CUDA_ABORT_HPP
+#define KOKKOS_CUDA_ABORT_HPP
+
+//----------------------------------------------------------------------------
+//----------------------------------------------------------------------------
+#include "Kokkos_Macros.hpp"
+#if defined( __CUDACC__ ) && defined( __CUDA_ARCH__ ) && defined( KOKKOS_HAVE_CUDA )
+
+#include <cuda.h>
+
+// Stop the build unless CUDA_VERSION is defined and is at least 4010.
+#if ! defined( CUDA_VERSION ) || ( CUDA_VERSION < 4010 )
+#error "Cuda version 4.1 or greater required"
+#endif
+
+// Stop the build when the device target (__CUDA_ARCH__) is below 200.
+#if ( __CUDA_ARCH__ < 200 )
+#error "Cuda device capability 2.0 or greater required"
+#endif
+
+extern "C" {
+/*  Cuda runtime function, declared in <crt/device_runtime.h>
+ *  Requires capability 2.x or better.
+ *
+ *  Re-declared here so that Kokkos::Impl::cuda_abort can call it.
+ *  cuda_abort passes its message, an empty string for both 'file' and
+ *  'function', 0 for 'line', and sizeof(char) for 'charsize'.
+ */
+extern __device__ void __assertfail(
+  const void  *message,
+  const void  *file,
+  unsigned int line,
+  const void  *function,
+  size_t       charsize);
+}
+
+namespace Kokkos {
+namespace Impl {
+
+__device__ inline
+void cuda_abort( const char * const message )
+{
+#ifndef __APPLE__
+  const char empty[] = "" ;
+
+  __assertfail( (const void *) message ,
+                (const void *) empty ,
+                (unsigned int) 0 ,
+                (const void *) empty ,
+                sizeof(char) );
+#endif
+}
+
+} // namespace Impl
+} // namespace Kokkos
+
+#else
+
+namespace Kokkos {
+namespace Impl {
+
+/** \brief  No-op stand-in for cuda_abort.
+ *
+ *  Compiled in the #else branch, i.e. whenever __CUDACC__, __CUDA_ARCH__
+ *  and KOKKOS_HAVE_CUDA are not all defined.  The message is ignored.
+ */
+KOKKOS_INLINE_FUNCTION
+void cuda_abort( const char * const /* message */ )
+{
+}
+
+} // namespace Impl
+} // namespace Kokkos
+
+#endif /* #if defined( __CUDACC__ ) && defined( __CUDA_ARCH__ ) && defined( KOKKOS_HAVE_CUDA ) */
+
+//----------------------------------------------------------------------------
+//----------------------------------------------------------------------------
+
+#if defined( KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_CUDA )
+namespace Kokkos {
+
+/** \brief  Device-side Kokkos::abort: forwards \c message to
+ *          Kokkos::Impl::cuda_abort.
+ */
+__device__ inline
+void abort( const char * const message )
+{
+  Impl::cuda_abort( message );
+}
+
+} // namespace Kokkos
+#endif /* defined( KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_CUDA ) */
+
+//----------------------------------------------------------------------------
+//----------------------------------------------------------------------------
+
+#endif /* #ifndef KOKKOS_CUDA_ABORT_HPP */
+
diff --git a/lib/kokkos/core/src/KokkosExp_MDRangePolicy.hpp b/lib/kokkos/core/src/KokkosExp_MDRangePolicy.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..e813285fc739336dc61d105f2afd73b5064b20c3
--- /dev/null
+++ b/lib/kokkos/core/src/KokkosExp_MDRangePolicy.hpp
@@ -0,0 +1,611 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+//
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact  H. Carter Edwards (hcedwar@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#ifndef KOKKOS_CORE_EXP_MD_RANGE_POLICY_HPP
+#define KOKKOS_CORE_EXP_MD_RANGE_POLICY_HPP
+
+#include <Kokkos_ExecPolicy.hpp>
+#include <Kokkos_Parallel.hpp>
+#include <initializer_list>
+#include <stdexcept>
+
+// KOKKOS_MDRANGE_IVDEP gates the '#pragma ivdep' lines placed in front of the
+// innermost tile loops in this header.  It is defined only when both
+// KOKKOS_OPT_RANGE_AGGRESSIVE_VECTORIZATION and KOKKOS_HAVE_PRAGMA_IVDEP are
+// defined and __CUDA_ARCH__ is not.
+#if defined(KOKKOS_OPT_RANGE_AGGRESSIVE_VECTORIZATION) && defined(KOKKOS_HAVE_PRAGMA_IVDEP) && !defined(__CUDA_ARCH__)
+#define KOKKOS_MDRANGE_IVDEP
+#endif
+
+namespace Kokkos { namespace Experimental {
+
+// Iteration direction selector used as the OuterDir / InnerDir arguments of
+// Rank.  MDRangePolicy converts these values to int before comparing them.
+enum class Iterate
+{
+  Default, // Default for the device
+  Left,    // Left indices stride fastest
+  Right,   // Right indices stride fastest
+  Flat,    // Do not tile, only valid for inner direction
+};
+
+// Outer iteration direction that MDRangePolicy falls back to when the
+// pattern's outer direction is Iterate::Default or Iterate::Flat.
+// The primary template yields Iterate::Right for every execution space.
+template <typename Space>
+struct default_outer_direction
+{
+  typedef Iterate type;
+  static constexpr type value = Iterate::Right;
+};
+
+// Inner iteration direction that MDRangePolicy falls back to when the
+// pattern's inner direction is Iterate::Default.
+// The primary template yields Iterate::Right for every execution space.
+template <typename Space>
+struct default_inner_direction
+{
+  typedef Iterate type;
+  static constexpr type value = Iterate::Right;
+};
+
+
+// Iteration Pattern
+//
+// Compile-time description of a multi-dimensional iteration: the number of
+// dimensions N (only 2 and 3 pass the static_asserts) together with the
+// requested outer and inner iteration directions.
+template < unsigned N
+         , Iterate OuterDir = Iterate::Default
+         , Iterate InnerDir = Iterate::Default
+         >
+struct Rank
+{
+  // Accepted ranks: 2 and 3.
+  static_assert( N != 0u, "Kokkos Error: rank 0 undefined");
+  static_assert( N != 1u, "Kokkos Error: rank 1 is not a multi-dimensional range");
+  static_assert( N < 4u, "Kokkos Error: Unsupported rank...");
+
+  // The template arguments, re-exported as constants.
+  static constexpr int     rank            = N;
+  static constexpr Iterate outer_direction = OuterDir;
+  static constexpr Iterate inner_direction = InnerDir;
+
+  // Self-reference under the name MDRangePolicy looks up on its range policy.
+  typedef Rank iteration_pattern;
+};
+
+
+
+// multi-dimensional iteration pattern
+//
+// Describes a rank-dimensional rectangular index range that is split into
+// tiles.  Per dimension the policy stores the lower bound (m_offset), the
+// extent (m_dim), the tile extent (m_tile) and the number of tiles
+// (m_tile_dim).  m_num_tiles is the product of all m_tile_dim entries and is
+// the length of the 1-D range that md_parallel_for hands to
+// Kokkos::parallel_for.
+//
+// Every constructor throws std::invalid_argument when an initializer list
+// does not hold exactly `rank` entries.
+template <typename... Properties>
+struct MDRangePolicy
+{
+  using range_policy = RangePolicy<Properties...>;
+
+  static_assert( !std::is_same<range_policy,void>::value
+               , "Kokkos Error: MD iteration pattern not defined" );
+
+  using iteration_pattern   = typename range_policy::iteration_pattern;
+  using work_tag            = typename range_policy::work_tag;
+
+  static constexpr int rank = iteration_pattern::rank;
+
+  // Outer direction as int.  Default and Flat are both replaced by the
+  // execution space's default outer direction.
+  static constexpr int outer_direction = static_cast<int> (
+      (iteration_pattern::outer_direction != Iterate::Default && iteration_pattern::outer_direction != Iterate::Flat)
+    ? iteration_pattern::outer_direction
+    : default_outer_direction< typename range_policy::execution_space>::value );
+
+  // Inner direction as int.  Only Default is replaced; Flat is kept and
+  // means "no tiling".
+  static constexpr int inner_direction = static_cast<int> (
+      iteration_pattern::inner_direction != Iterate::Default
+    ? iteration_pattern::inner_direction
+    : default_inner_direction< typename range_policy::execution_space>::value ) ;
+
+
+  // Ugly ugly workaround intel 14 not handling scoped enum correctly
+  static constexpr int Flat = static_cast<int>( Iterate::Flat );
+  static constexpr int Right = static_cast<int>( Iterate::Right );
+
+
+  using size_type   = typename range_policy::index_type;
+  using index_type  = typename std::make_signed<size_type>::type;
+
+
+  // Range [0, upper_corner[i]) in every dimension, with the default tile
+  // extent (4, or 1 when the inner direction is Flat).
+  template <typename I>
+  MDRangePolicy( std::initializer_list<I> upper_corner )
+  {
+    static_assert( std::is_integral<I>::value, "Kokkos Error: corner defined with non-integral type" );
+
+    // initializer_list::size() cannot be used in a static_assert before
+    // C++14, so the list length is validated at run time instead.
+    require_rank( upper_corner, "Kokkos Error: upper_corner has incorrect rank" );
+
+    const auto u = upper_corner.begin();
+
+    m_num_tiles = 1;
+    for (int i=0; i<rank; ++i) {
+      m_offset[i] = static_cast<index_type>(0);
+      m_dim[i]    = static_cast<index_type>(u[i]);
+      m_tile[i]   = default_tile();
+      count_tiles(i);
+    }
+  }
+
+  // Range between two corners, with the default tile extent.  In each
+  // dimension the smaller coordinate becomes the offset and the absolute
+  // difference becomes the extent, so the corners may be given in either
+  // order.
+  template <typename IA, typename IB>
+  MDRangePolicy( std::initializer_list<IA> corner_a
+               , std::initializer_list<IB> corner_b
+               )
+  {
+    static_assert( std::is_integral<IA>::value, "Kokkos Error: corner A defined with non-integral type" );
+    static_assert( std::is_integral<IB>::value, "Kokkos Error: corner B defined with non-integral type" );
+
+    require_rank( corner_a, "Kokkos Error: corner_a has incorrect rank" );
+    require_rank( corner_b, "Kokkos Error: corner_b has incorrect rank" );
+
+    using A = typename std::make_signed<IA>::type;
+    using B = typename std::make_signed<IB>::type;
+
+    // Corner coordinates are compared as signed values.
+    const auto a = [=](int i) { return static_cast<A>(corner_a.begin()[i]); };
+    const auto b = [=](int i) { return static_cast<B>(corner_b.begin()[i]); };
+
+    m_num_tiles = 1;
+    for (int i=0; i<rank; ++i) {
+      m_offset[i] = static_cast<index_type>(a(i) <= b(i) ? a(i) : b(i));
+      m_dim[i]    = static_cast<index_type>(a(i) <= b(i) ? b(i) - a(i) : a(i) - b(i));
+      m_tile[i]   = default_tile();
+      count_tiles(i);
+    }
+  }
+
+  // Range between two corners with caller-supplied tile extents.  A tile
+  // entry that is not positive is replaced by 1.  Not available when the
+  // inner direction is Flat.
+  template <typename IA, typename IB, typename T>
+  MDRangePolicy( std::initializer_list<IA> corner_a
+               , std::initializer_list<IB> corner_b
+               , std::initializer_list<T> tile
+               )
+  {
+    static_assert( std::is_integral<IA>::value, "Kokkos Error: corner A defined with non-integral type" );
+    static_assert( std::is_integral<IB>::value, "Kokkos Error: corner B defined with non-integral type" );
+    static_assert( std::is_integral<T>::value, "Kokkos Error: tile defined with non-integral type" );
+    static_assert( inner_direction != Flat, "Kokkos Error: tiling not support with flat iteration" );
+
+    require_rank( corner_a, "Kokkos Error: corner_a has incorrect rank" );
+    require_rank( corner_b, "Kokkos Error: corner_b has incorrect rank" );
+    require_rank( tile,     "Kokkos Error: tile has incorrect rank" );
+
+    using A = typename std::make_signed<IA>::type;
+    using B = typename std::make_signed<IB>::type;
+
+    const auto a = [=](int i) { return static_cast<A>(corner_a.begin()[i]); };
+    const auto b = [=](int i) { return static_cast<B>(corner_b.begin()[i]); };
+    const auto t = tile.begin();
+
+    m_num_tiles = 1;
+    for (int i=0; i<rank; ++i) {
+      m_offset[i] = static_cast<index_type>(a(i) <= b(i) ? a(i) : b(i));
+      m_dim[i]    = static_cast<index_type>(a(i) <= b(i) ? b(i) - a(i) : a(i) - b(i));
+      m_tile[i]   = static_cast<int>(t[i] > (T)0 ? t[i] : (T)1 );
+      count_tiles(i);
+    }
+  }
+
+  index_type   m_offset[rank];    // lower bound per dimension
+  index_type   m_dim[rank];       // extent per dimension
+  int          m_tile[rank];      // tile extent per dimension
+  index_type   m_tile_dim[rank];  // number of tiles per dimension
+  size_type    m_num_tiles;       // product of tile dims
+
+private:
+
+  // Throws std::invalid_argument(message) unless `list` holds exactly
+  // `rank` entries; the constructors index each list from 0 to rank-1.
+  template <typename I>
+  static void require_rank( std::initializer_list<I> list
+                          , const char * const message
+                          )
+  {
+    using list_size_type = typename std::initializer_list<I>::size_type;
+
+    if ( list.size() != static_cast<list_size_type>(rank) ) {
+      throw std::invalid_argument( message );
+    }
+  }
+
+  // Tile extent used when the caller supplies none: 4, or 1 when the inner
+  // direction is Flat.
+  static int default_tile()
+  {
+    return inner_direction != Flat ? 4 : 1;
+  }
+
+  // Derives the tile count of dimension i (extent divided by tile extent,
+  // rounded up) and folds it into m_num_tiles.  m_dim[i] and m_tile[i] must
+  // already be set.
+  void count_tiles( const int i )
+  {
+    m_tile_dim[i] = (m_dim[i] + (m_tile[i] - 1)) / m_tile[i];
+    m_num_tiles *= m_tile_dim[i];
+  }
+};
+
+namespace Impl {
+
+// Serial, Threads, OpenMP
+// use enable_if to overload for Cuda
+//
+// Adapter that lets a 1-D Kokkos::parallel_for drive a multi-dimensional
+// range.  It stores a copy of the MDRange policy and of the user functor.
+// operator()(t) receives the flattened index t and, depending on the policy:
+//   - inner direction Flat: decodes t into a single (i0,i1[,i2]) index and
+//     calls the functor once;
+//   - otherwise: decodes t into a tile coordinate and calls the functor for
+//     every index of that tile, clamped to offset + extent per dimension.
+// When the policy's work_tag is not void, a default-constructed tag is passed
+// as the functor's first argument.
+//
+// The tile loops use index_type counters so that bounds wider than int are
+// neither truncated nor overflowed.
+template < typename MDRange, typename Functor, typename Enable = void >
+struct MDForFunctor
+{
+  using work_tag   = typename MDRange::work_tag;
+  using index_type = typename MDRange::index_type;
+  using size_type  = typename MDRange::size_type;
+
+  MDRange m_range;
+  Functor m_func;
+
+  // Constructors for every copy/move combination of (range, functor).
+  KOKKOS_INLINE_FUNCTION
+  MDForFunctor( MDRange const& range, Functor const& f )
+    : m_range(range)
+    , m_func( f )
+  {}
+
+  KOKKOS_INLINE_FUNCTION
+  MDForFunctor( MDRange const& range, Functor && f )
+    : m_range(range)
+    , m_func( std::forward<Functor>(f) )
+  {}
+
+  KOKKOS_INLINE_FUNCTION
+  MDForFunctor( MDRange && range, Functor const& f )
+    : m_range( std::forward<MDRange>(range) )
+    , m_func( f )
+  {}
+
+  KOKKOS_INLINE_FUNCTION
+  MDForFunctor( MDRange && range, Functor && f )
+    : m_range( std::forward<MDRange>(range) )
+    , m_func( std::forward<Functor>(f) )
+  {}
+
+
+  KOKKOS_INLINE_FUNCTION
+  MDForFunctor( MDForFunctor const& ) = default;
+
+  KOKKOS_INLINE_FUNCTION
+  MDForFunctor& operator=( MDForFunctor const& ) = default;
+
+  KOKKOS_INLINE_FUNCTION
+  MDForFunctor( MDForFunctor && ) = default;
+
+  KOKKOS_INLINE_FUNCTION
+  MDForFunctor& operator=( MDForFunctor && ) = default;
+
+  // Rank-2, Flat, No Tag
+  // t enumerates individual indices; Right makes dimension 1 vary fastest,
+  // anything else makes dimension 0 vary fastest.
+  template <typename Idx>
+  KOKKOS_FORCEINLINE_FUNCTION
+  typename std::enable_if<(  std::is_integral<Idx>::value
+                          && std::is_same<void, work_tag>::value
+                          && MDRange::rank == 2
+                          && MDRange::inner_direction == MDRange::Flat
+                          )>::type
+  operator()(Idx t) const
+  {
+    if (  MDRange::outer_direction == MDRange::Right ) {
+      m_func( m_range.m_offset[0] + ( t / m_range.m_dim[1] )
+            , m_range.m_offset[1] + ( t % m_range.m_dim[1] ) );
+    } else {
+      m_func( m_range.m_offset[0] + ( t % m_range.m_dim[0] )
+            , m_range.m_offset[1] + ( t / m_range.m_dim[0] ) );
+    }
+  }
+
+  // Rank-2, Flat, Tag
+  template <typename Idx>
+  KOKKOS_FORCEINLINE_FUNCTION
+  typename std::enable_if<(  std::is_integral<Idx>::value
+                          && !std::is_same<void, work_tag>::value
+                          && MDRange::rank == 2
+                          && MDRange::inner_direction == MDRange::Flat
+                          )>::type
+  operator()(Idx t) const
+  {
+    if (  MDRange::outer_direction == MDRange::Right ) {
+      m_func( work_tag{}, m_range.m_offset[0] + ( t / m_range.m_dim[1] )
+            , m_range.m_offset[1] + ( t % m_range.m_dim[1] ) );
+    } else {
+      m_func( work_tag{}, m_range.m_offset[0] + ( t % m_range.m_dim[0] )
+            , m_range.m_offset[1] + ( t / m_range.m_dim[0] ) );
+    }
+  }
+
+  // Rank-2, Not Flat, No Tag
+  // t enumerates tiles: (t0,t1) is the tile coordinate, [b,e) the index
+  // bounds of that tile in each dimension.
+  template <typename Idx>
+  KOKKOS_FORCEINLINE_FUNCTION
+  typename std::enable_if<(  std::is_integral<Idx>::value
+                          && std::is_same<void, work_tag>::value
+                          && MDRange::rank == 2
+                          && MDRange::inner_direction != MDRange::Flat
+                          )>::type
+  operator()(Idx t) const
+  {
+    index_type t0, t1;
+    if (  MDRange::outer_direction == MDRange::Right ) {
+      t0 = t / m_range.m_tile_dim[1];
+      t1 = t % m_range.m_tile_dim[1];
+    } else {
+      t0 = t % m_range.m_tile_dim[0];
+      t1 = t / m_range.m_tile_dim[0];
+    }
+
+    const index_type b0 = t0 * m_range.m_tile[0] + m_range.m_offset[0];
+    const index_type b1 = t1 * m_range.m_tile[1] + m_range.m_offset[1];
+
+    // Clamp the tile end to the end of the range (last tile may be partial).
+    const index_type e0 = b0 + m_range.m_tile[0] <= (m_range.m_dim[0] + m_range.m_offset[0] ) ? b0 + m_range.m_tile[0] : ( m_range.m_dim[0] + m_range.m_offset[0] );
+    const index_type e1 = b1 + m_range.m_tile[1] <= (m_range.m_dim[1] + m_range.m_offset[1] ) ? b1 + m_range.m_tile[1] : ( m_range.m_dim[1] + m_range.m_offset[1] );
+
+    if (  MDRange::inner_direction == MDRange::Right ) {
+      for (index_type i0=b0; i0<e0; ++i0) {
+      #if defined(KOKKOS_MDRANGE_IVDEP)
+      #pragma ivdep
+      #endif
+      for (index_type i1=b1; i1<e1; ++i1) {
+        m_func( i0, i1 );
+      }}
+    } else {
+      for (index_type i1=b1; i1<e1; ++i1) {
+      #if defined(KOKKOS_MDRANGE_IVDEP)
+      #pragma ivdep
+      #endif
+      for (index_type i0=b0; i0<e0; ++i0) {
+        m_func( i0, i1 );
+      }}
+    }
+  }
+
+  // Rank-2, Not Flat, Tag
+  template <typename Idx>
+  KOKKOS_FORCEINLINE_FUNCTION
+  typename std::enable_if<(  std::is_integral<Idx>::value
+                          && !std::is_same<void, work_tag>::value
+                          && MDRange::rank == 2
+                          && MDRange::inner_direction != MDRange::Flat
+                          )>::type
+  operator()(Idx t) const
+  {
+    work_tag tag;
+
+    index_type t0, t1;
+    if (  MDRange::outer_direction == MDRange::Right ) {
+      t0 = t / m_range.m_tile_dim[1];
+      t1 = t % m_range.m_tile_dim[1];
+    } else {
+      t0 = t % m_range.m_tile_dim[0];
+      t1 = t / m_range.m_tile_dim[0];
+    }
+
+    const index_type b0 = t0 * m_range.m_tile[0] + m_range.m_offset[0];
+    const index_type b1 = t1 * m_range.m_tile[1] + m_range.m_offset[1];
+
+    // Clamp the tile end to the end of the range (last tile may be partial).
+    const index_type e0 = b0 + m_range.m_tile[0] <= (m_range.m_dim[0] + m_range.m_offset[0] ) ? b0 + m_range.m_tile[0] : ( m_range.m_dim[0] + m_range.m_offset[0] );
+    const index_type e1 = b1 + m_range.m_tile[1] <= (m_range.m_dim[1] + m_range.m_offset[1] ) ? b1 + m_range.m_tile[1] : ( m_range.m_dim[1] + m_range.m_offset[1] );
+
+    if (  MDRange::inner_direction == MDRange::Right ) {
+      for (index_type i0=b0; i0<e0; ++i0) {
+      #if defined(KOKKOS_MDRANGE_IVDEP)
+      #pragma ivdep
+      #endif
+      for (index_type i1=b1; i1<e1; ++i1) {
+        m_func( tag, i0, i1 );
+      }}
+    } else {
+      for (index_type i1=b1; i1<e1; ++i1) {
+      #if defined(KOKKOS_MDRANGE_IVDEP)
+      #pragma ivdep
+      #endif
+      for (index_type i0=b0; i0<e0; ++i0) {
+        m_func( tag, i0, i1 );
+      }}
+    }
+  }
+
+  //---------------------------------------------------------------------------
+
+  // Rank-3, Flat, No Tag
+  // t enumerates individual indices; Right makes dimension 2 vary fastest,
+  // anything else makes dimension 0 vary fastest.
+  template <typename Idx>
+  KOKKOS_FORCEINLINE_FUNCTION
+  typename std::enable_if<(  std::is_integral<Idx>::value
+                          && std::is_same<void, work_tag>::value
+                          && MDRange::rank == 3
+                          && MDRange::inner_direction == MDRange::Flat
+                          )>::type
+  operator()(Idx t) const
+  {
+    if (  MDRange::outer_direction == MDRange::Right ) {
+    const int64_t tmp_prod = m_range.m_dim[1]*m_range.m_dim[2];
+    m_func( m_range.m_offset[0] + (  t / tmp_prod )
+          , m_range.m_offset[1] + ( (t % tmp_prod) / m_range.m_dim[2] )
+          , m_range.m_offset[2] + ( (t % tmp_prod) % m_range.m_dim[2] )
+          );
+    } else {
+    const int64_t tmp_prod = m_range.m_dim[0]*m_range.m_dim[1];
+    m_func( m_range.m_offset[0] + ( (t % tmp_prod) % m_range.m_dim[0] )
+          , m_range.m_offset[1] + ( (t % tmp_prod) / m_range.m_dim[0] )
+          , m_range.m_offset[2] + (  t / tmp_prod )
+          );
+    }
+  }
+
+  // Rank-3, Flat, Tag
+  template <typename Idx>
+  KOKKOS_FORCEINLINE_FUNCTION
+  typename std::enable_if<(  std::is_integral<Idx>::value
+                          && !std::is_same<void, work_tag>::value
+                          && MDRange::rank == 3
+                          && MDRange::inner_direction == MDRange::Flat
+                          )>::type
+  operator()(Idx t) const
+  {
+    if (  MDRange::outer_direction == MDRange::Right ) {
+      const int64_t tmp_prod = m_range.m_dim[1]*m_range.m_dim[2];
+      m_func( work_tag{}
+            , m_range.m_offset[0] + (  t / tmp_prod )
+            , m_range.m_offset[1] + ( (t % tmp_prod) / m_range.m_dim[2] )
+            , m_range.m_offset[2] + ( (t % tmp_prod) % m_range.m_dim[2] )
+            );
+    } else {
+      const int64_t tmp_prod = m_range.m_dim[0]*m_range.m_dim[1];
+      m_func( work_tag{}
+            , m_range.m_offset[0] + ( (t % tmp_prod) % m_range.m_dim[0] )
+            , m_range.m_offset[1] + ( (t % tmp_prod) / m_range.m_dim[0] )
+            , m_range.m_offset[2] + (  t / tmp_prod )
+            );
+    }
+  }
+
+  // Rank-3, Not Flat, No Tag
+  // t enumerates tiles: (t0,t1,t2) is the tile coordinate, [b,e) the index
+  // bounds of that tile in each dimension.
+  template <typename Idx>
+  KOKKOS_FORCEINLINE_FUNCTION
+  typename std::enable_if<(  std::is_integral<Idx>::value
+                          && std::is_same<void, work_tag>::value
+                          && MDRange::rank == 3
+                          && MDRange::inner_direction != MDRange::Flat
+                          )>::type
+  operator()(Idx t) const
+  {
+    index_type t0, t1, t2;
+    if (  MDRange::outer_direction == MDRange::Right ) {
+      const index_type tmp_prod = ( m_range.m_tile_dim[1]*m_range.m_tile_dim[2]);
+      t0 = t / tmp_prod;
+      t1 = ( t % tmp_prod ) / m_range.m_tile_dim[2];
+      t2 = ( t % tmp_prod ) % m_range.m_tile_dim[2];
+    } else {
+      const index_type tmp_prod = ( m_range.m_tile_dim[0]*m_range.m_tile_dim[1]);
+      t0 = ( t % tmp_prod ) % m_range.m_tile_dim[0];
+      t1 = ( t % tmp_prod ) / m_range.m_tile_dim[0];
+      t2 = t / tmp_prod;
+    }
+
+    const index_type b0 = t0 * m_range.m_tile[0] + m_range.m_offset[0];
+    const index_type b1 = t1 * m_range.m_tile[1] + m_range.m_offset[1];
+    const index_type b2 = t2 * m_range.m_tile[2] + m_range.m_offset[2];
+
+    // Clamp the tile end to the end of the range (last tile may be partial).
+    const index_type e0 = b0 + m_range.m_tile[0] <= (m_range.m_dim[0] + m_range.m_offset[0] ) ? b0 + m_range.m_tile[0] : ( m_range.m_dim[0] + m_range.m_offset[0] );
+    const index_type e1 = b1 + m_range.m_tile[1] <= (m_range.m_dim[1] + m_range.m_offset[1] ) ? b1 + m_range.m_tile[1] : ( m_range.m_dim[1] + m_range.m_offset[1] );
+    const index_type e2 = b2 + m_range.m_tile[2] <= (m_range.m_dim[2] + m_range.m_offset[2] ) ? b2 + m_range.m_tile[2] : ( m_range.m_dim[2] + m_range.m_offset[2] );
+
+    if (  MDRange::inner_direction == MDRange::Right ) {
+      for (index_type i0=b0; i0<e0; ++i0) {
+      for (index_type i1=b1; i1<e1; ++i1) {
+      #if defined(KOKKOS_MDRANGE_IVDEP)
+      #pragma ivdep
+      #endif
+      for (index_type i2=b2; i2<e2; ++i2) {
+        m_func( i0, i1, i2 );
+      }}}
+    } else {
+      for (index_type i2=b2; i2<e2; ++i2) {
+      for (index_type i1=b1; i1<e1; ++i1) {
+      #if defined(KOKKOS_MDRANGE_IVDEP)
+      #pragma ivdep
+      #endif
+      for (index_type i0=b0; i0<e0; ++i0) {
+        m_func( i0, i1, i2 );
+      }}}
+    }
+  }
+
+  // Rank-3, Not Flat, Tag
+  template <typename Idx>
+  KOKKOS_FORCEINLINE_FUNCTION
+  typename std::enable_if<(  std::is_integral<Idx>::value
+                          && !std::is_same<void, work_tag>::value
+                          && MDRange::rank == 3
+                          && MDRange::inner_direction != MDRange::Flat
+                          )>::type
+  operator()(Idx t) const
+  {
+    work_tag tag;
+
+    index_type t0, t1, t2;
+    if (  MDRange::outer_direction == MDRange::Right ) {
+      const index_type tmp_prod = ( m_range.m_tile_dim[1]*m_range.m_tile_dim[2]);
+      t0 = t / tmp_prod;
+      t1 = ( t % tmp_prod ) / m_range.m_tile_dim[2];
+      t2 = ( t % tmp_prod ) % m_range.m_tile_dim[2];
+    } else {
+      const index_type tmp_prod = ( m_range.m_tile_dim[0]*m_range.m_tile_dim[1]);
+      t0 = ( t % tmp_prod ) % m_range.m_tile_dim[0];
+      t1 = ( t % tmp_prod ) / m_range.m_tile_dim[0];
+      t2 = t / tmp_prod;
+    }
+
+    const index_type b0 = t0 * m_range.m_tile[0] + m_range.m_offset[0];
+    const index_type b1 = t1 * m_range.m_tile[1] + m_range.m_offset[1];
+    const index_type b2 = t2 * m_range.m_tile[2] + m_range.m_offset[2];
+
+    // Clamp the tile end to the end of the range (last tile may be partial).
+    const index_type e0 = b0 + m_range.m_tile[0] <= (m_range.m_dim[0] + m_range.m_offset[0] ) ? b0 + m_range.m_tile[0] : ( m_range.m_dim[0] + m_range.m_offset[0] );
+    const index_type e1 = b1 + m_range.m_tile[1] <= (m_range.m_dim[1] + m_range.m_offset[1] ) ? b1 + m_range.m_tile[1] : ( m_range.m_dim[1] + m_range.m_offset[1] );
+    const index_type e2 = b2 + m_range.m_tile[2] <= (m_range.m_dim[2] + m_range.m_offset[2] ) ? b2 + m_range.m_tile[2] : ( m_range.m_dim[2] + m_range.m_offset[2] );
+
+    if (  MDRange::inner_direction == MDRange::Right ) {
+      for (index_type i0=b0; i0<e0; ++i0) {
+      for (index_type i1=b1; i1<e1; ++i1) {
+      #if defined(KOKKOS_MDRANGE_IVDEP)
+      #pragma ivdep
+      #endif
+      for (index_type i2=b2; i2<e2; ++i2) {
+        m_func( tag, i0, i1, i2 );
+      }}}
+    } else {
+      for (index_type i2=b2; i2<e2; ++i2) {
+      for (index_type i1=b1; i1<e1; ++i1) {
+      #if defined(KOKKOS_MDRANGE_IVDEP)
+      #pragma ivdep
+      #endif
+      for (index_type i0=b0; i0<e0; ++i0) {
+        m_func( tag, i0, i1, i2 );
+      }}}
+    }
+  }
+};
+
+
+
+} // namespace Impl
+
+
+// Runs functor f over every index of the multi-dimensional range.
+//
+// The range and functor are wrapped in an Impl::MDForFunctor, which is then
+// dispatched through Kokkos::parallel_for over [0, range.m_num_tiles) with a
+// chunk size of 1.  str is forwarded to Kokkos::parallel_for unchanged.
+template <typename MDRange, typename Functor>
+void md_parallel_for( MDRange const& range
+                    , Functor const& f
+                    , const std::string& str = ""
+                    )
+{
+  typedef typename MDRange::range_policy        tile_policy;
+  typedef Impl::MDForFunctor<MDRange, Functor>  tile_functor;
+
+  tile_functor per_tile( range, f );
+
+  Kokkos::parallel_for( tile_policy(0, range.m_num_tiles).set_chunk_size(1)
+                      , per_tile
+                      , str
+                      );
+}
+
+// Label-first overload: same dispatch as md_parallel_for(range, f, str),
+// with the label str given as the first argument.
+//
+// The range and functor are wrapped in an Impl::MDForFunctor, which is then
+// dispatched through Kokkos::parallel_for over [0, range.m_num_tiles) with a
+// chunk size of 1.
+template <typename MDRange, typename Functor>
+void md_parallel_for( const std::string& str
+                    , MDRange const& range
+                    , Functor const& f
+                    )
+{
+  typedef typename MDRange::range_policy        tile_policy;
+  typedef Impl::MDForFunctor<MDRange, Functor>  tile_functor;
+
+  tile_functor per_tile( range, f );
+
+  Kokkos::parallel_for( tile_policy(0, range.m_num_tiles).set_chunk_size(1)
+                      , per_tile
+                      , str
+                      );
+}
+
+}} // namespace Kokkos::Experimental
+
+#endif //KOKKOS_CORE_EXP_MD_RANGE_POLICY_HPP
+
diff --git a/lib/kokkos/core/src/Kokkos_Array.hpp b/lib/kokkos/core/src/Kokkos_Array.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..038eda804fc5e3747e07ff3d8d64b2d5942271b8
--- /dev/null
+++ b/lib/kokkos/core/src/Kokkos_Array.hpp
@@ -0,0 +1,302 @@
+/*
+//@HEADER
+// ************************************************************************
+// 
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+// 
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+// 
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact  H. Carter Edwards (hcedwar@sandia.gov)
+// 
+// ************************************************************************
+//@HEADER
+*/
+
+#ifndef KOKKOS_ARRAY
+#define KOKKOS_ARRAY
+
+#include <type_traits>
+#include <algorithm>
+#include <limits>
+#include <cstddef>
+
+namespace Kokkos {
+
+/**\brief  Derived from the C++17 'std::array'.
+ *         Dropping the iterator interface.
+ */
+template< class T      = void
+        , size_t N     = ~size_t(0)
+        , class Proxy  = void
+        >
+struct Array {
+private:
+  T m_elem[N];   // in-object storage for the N elements
+public:
+  // Container-style member types.
+  using value_type      = T ;
+  using size_type       = size_t ;
+  using difference_type = ptrdiff_t ;
+  using pointer         = T * ;
+  using const_pointer   = typename std::add_const<T>::type * ;
+  using reference       = T & ;
+  using const_reference = typename std::add_const<T>::type & ;
+
+  KOKKOS_INLINE_FUNCTION static constexpr bool      empty(){ return false ; }
+  KOKKOS_INLINE_FUNCTION static constexpr size_type size() { return N ; }
+  // Element access without bounds checking; the index type must be integral.
+  template< typename iType >
+  KOKKOS_INLINE_FUNCTION
+  reference operator[]( const iType & idx )
+    {
+      static_assert( std::is_integral<iType>::value , "Must be integral argument" );
+      return m_elem[idx];
+    }
+  // Read-only element access, same contract as the overload above.
+  template< typename iType >
+  KOKKOS_INLINE_FUNCTION
+  const_reference operator[]( const iType & idx ) const
+    {
+      static_assert( std::is_integral<iType>::value , "Must be integral argument" );
+      return m_elem[idx];
+    }
+
+  KOKKOS_INLINE_FUNCTION const_pointer data() const { return & m_elem[0] ; }
+  KOKKOS_INLINE_FUNCTION pointer       data()       { return & m_elem[0] ; }
+
+  Array() = default ;
+  Array( const Array & ) = default ;
+  Array & operator = ( const Array & ) = default ;
+  ~Array() = default ;
+
+  // Move construction and move assignment are deliberately not declared:
+  // some supported compilers are not sufficiently C++11 compliant to
+  // default them.
+  // (i.e. no 'Array( Array && )' and no 'Array & operator = ( Array && )')
+};
+
+
+template< class T , class Proxy >
+struct Array<T,0,Proxy> {
+public:
+  // Zero-length specialization: no storage; every member type is const-qualified.
+  using value_type      = typename std::add_const<T>::type ;
+  using size_type       = size_t ;
+  using difference_type = ptrdiff_t ;
+  using pointer         = typename std::add_const<T>::type * ;
+  using const_pointer   = typename std::add_const<T>::type * ;
+  using reference       = typename std::add_const<T>::type & ;
+  using const_reference = typename std::add_const<T>::type & ;
+
+  KOKKOS_INLINE_FUNCTION static constexpr bool      empty() { return true ; }
+  KOKKOS_INLINE_FUNCTION static constexpr size_type size()  { return 0 ; }
+  // The index is ignored: a value-initialized element is returned by value.
+  template< typename iType >
+  KOKKOS_INLINE_FUNCTION
+  value_type operator[]( const iType & )
+    {
+      static_assert( std::is_integral<iType>::value , "Must be integer argument" );
+      return value_type();
+    }
+  // Const overload with the same behavior.
+  template< typename iType >
+  KOKKOS_INLINE_FUNCTION
+  value_type operator[]( const iType & ) const
+    {
+      static_assert( std::is_integral<iType>::value , "Must be integer argument" );
+      return value_type();
+    }
+  // There is no storage, so data() is always the null pointer.
+  KOKKOS_INLINE_FUNCTION const_pointer data() const { return nullptr ; }
+  KOKKOS_INLINE_FUNCTION pointer       data()       { return nullptr ; }
+
+  Array() = default ;
+  Array( const Array & ) = default ;
+  Array & operator = ( const Array & ) = default ;
+  ~Array() = default ;
+
+  // Move construction and move assignment are deliberately not declared:
+  // some supported compilers are not sufficiently C++11 compliant to
+  // default them.
+  // (i.e. no 'Array( Array && )' and no 'Array & operator = ( Array && )')
+};
+
+
+template<>
+struct Array<void,~size_t(0),void>   // what 'Array<>' names when all template defaults are used
+{
+  struct contiguous {};   // Proxy tag for the pointer + size specialization
+  struct strided {};      // Proxy tag for the pointer + size + stride specialization
+};
+
+template< class T >
+struct Array< T , ~size_t(0) , Array<>::contiguous >
+{
+private:
+  T *    m_elem ;   // first element of the referenced range (never freed by this class)
+  size_t m_size ;   // number of elements in the referenced range
+public:
+  // View over 'm_size' consecutive elements starting at 'm_elem'.
+  typedef T &                                 reference ;
+  typedef typename std::add_const<T>::type &  const_reference ;
+  typedef size_t                              size_type ;
+  typedef ptrdiff_t                           difference_type ;
+  typedef T                                   value_type ;
+  typedef T *                                 pointer ;
+  typedef typename std::add_const<T>::type *  const_pointer ;
+
+  KOKKOS_INLINE_FUNCTION constexpr size_type size()  const { return m_size ; }
+  KOKKOS_INLINE_FUNCTION constexpr bool      empty() const { return 0 == m_size ; }
+  // Element access without bounds checking; the index type must be integral.
+  template< typename iType >
+  KOKKOS_INLINE_FUNCTION
+  reference operator[]( const iType & i )
+    {
+      static_assert( std::is_integral<iType>::value , "Must be integral argument" );
+      return m_elem[i];
+    }
+
+  template< typename iType >
+  KOKKOS_INLINE_FUNCTION
+  const_reference operator[]( const iType & i ) const
+    {
+      static_assert( std::is_integral<iType>::value , "Must be integral argument" );
+      return m_elem[i];
+    }
+
+  KOKKOS_INLINE_FUNCTION pointer       data()       { return m_elem ; }
+  KOKKOS_INLINE_FUNCTION const_pointer data() const { return m_elem ; }
+
+  ~Array() = default ;
+  Array() = delete ;
+  Array( const Array & rhs ) = delete ;
+
+  // Some supported compilers are not sufficiently C++11 compliant
+  // for default move constructor and move assignment operator.
+  // Array( Array && rhs ) = default ;
+  // Array & operator = ( Array && rhs ) = delete ;
+  // Assignment copies elements, not the pointer: min(size(), rhs.size()) are written.
+  KOKKOS_INLINE_FUNCTION
+  Array & operator = ( const Array & rhs )
+    {
+      const size_t n = std::min( m_size , rhs.size() );
+      for ( size_t i = 0 ; i < n ; ++i ) m_elem[i] = rhs[i] ;
+      return *this ;
+    }
+  // Same element-wise copy from any other Array flavor with the same T.
+  template< size_t N , class P >
+  KOKKOS_INLINE_FUNCTION
+  Array & operator = ( const Array<T,N,P> & rhs )
+    {
+      const size_t n = std::min( m_size , rhs.size() );
+      for ( size_t i = 0 ; i < n ; ++i ) m_elem[i] = rhs[i] ;
+      return *this ;
+    }
+  // The third argument is accepted and ignored (presumably a stride, for parity with 'strided').
+  KOKKOS_INLINE_FUNCTION constexpr Array( pointer arg_ptr , size_type arg_size , size_type = 0 )
+    : m_elem(arg_ptr), m_size(arg_size) {}
+};
+
+template< class T >
+struct Array< T , ~size_t(0) , Array<>::strided >
+{
+private:
+  T *    m_elem ;     // element 0 of the referenced range (never freed by this class)
+  size_t m_size ;     // number of logical elements
+  size_t m_stride ;   // distance, in units of T, between consecutive logical elements
+public:
+  // View over 'm_size' elements; logical element i lives at m_elem[i*m_stride].
+  typedef T &                                 reference ;
+  typedef typename std::add_const<T>::type &  const_reference ;
+  typedef size_t                              size_type ;
+  typedef ptrdiff_t                           difference_type ;
+  typedef T                                   value_type ;
+  typedef T *                                 pointer ;
+  typedef typename std::add_const<T>::type *  const_pointer ;
+
+  KOKKOS_INLINE_FUNCTION constexpr size_type size()  const { return m_size ; }
+  KOKKOS_INLINE_FUNCTION constexpr bool      empty() const { return 0 == m_size ; }
+  // Element access without bounds checking; the index type must be integral.
+  template< typename iType >
+  KOKKOS_INLINE_FUNCTION
+  reference operator[]( const iType & i )
+    {
+      static_assert( std::is_integral<iType>::value , "Must be integral argument" );
+      return m_elem[i*m_stride];
+    }
+
+  template< typename iType >
+  KOKKOS_INLINE_FUNCTION
+  const_reference operator[]( const iType & i ) const
+    {
+      static_assert( std::is_integral<iType>::value , "Must be integral argument" );
+      return m_elem[i*m_stride];
+    }
+
+  KOKKOS_INLINE_FUNCTION pointer       data()       { return m_elem ; }
+  KOKKOS_INLINE_FUNCTION const_pointer data() const { return m_elem ; }
+
+  ~Array() = default ;
+  Array()  = delete ;
+  Array( const Array & ) = delete ;
+
+  // Assignment copies elements, not the pointer: min(size(), rhs.size()) are written.
+  // Some supported compilers are not sufficiently C++11 compliant
+  // for default move constructor and move assignment operator.
+  // Array( Array && rhs ) = default ;
+  // Array & operator = ( Array && rhs ) = delete ;
+  // Destination writes honor m_stride so they hit the same slots operator[] reads.
+  KOKKOS_INLINE_FUNCTION
+  Array & operator = ( const Array & rhs )
+    {
+      const size_t n = std::min( m_size , rhs.size() );
+      for ( size_t i = 0 ; i < n ; ++i ) m_elem[i*m_stride] = rhs[i] ;
+      return *this ;
+    }
+  // Same element-wise copy from any other Array flavor with the same T.
+  template< size_t N , class P >
+  KOKKOS_INLINE_FUNCTION
+  Array & operator = ( const Array<T,N,P> & rhs )
+    {
+      const size_t n = std::min( m_size , rhs.size() );
+      for ( size_t i = 0 ; i < n ; ++i ) m_elem[i*m_stride] = rhs[i] ;
+      return *this ;
+    }
+
+  KOKKOS_INLINE_FUNCTION constexpr Array( pointer arg_ptr , size_type arg_size , size_type arg_stride )
+    : m_elem(arg_ptr), m_size(arg_size), m_stride(arg_stride) {}
+};
+
+} // namespace Kokkos
+
+#endif /* #ifndef KOKKOS_ARRAY */
+
diff --git a/lib/kokkos/core/src/Kokkos_Atomic.hpp b/lib/kokkos/core/src/Kokkos_Atomic.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..6d37d69a63c8c837457fb2edba6a6d607103b6ad
--- /dev/null
+++ b/lib/kokkos/core/src/Kokkos_Atomic.hpp
@@ -0,0 +1,305 @@
+/*
+//@HEADER
+// ************************************************************************
+// 
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+// 
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+// 
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact  H. Carter Edwards (hcedwar@sandia.gov)
+// 
+// ************************************************************************
+//@HEADER
+*/
+
+/// \file Kokkos_Atomic.hpp
+/// \brief Atomic functions
+///
+/// This header file defines prototypes for the following atomic functions:
+///   - exchange
+///   - compare and exchange
+///   - add
+///
+/// Supported types include:
+///   - signed and unsigned 4 and 8 byte integers
+///   - float
+///   - double
+///
+/// They are implemented through GCC compatible intrinsics, OpenMP
+/// directives and native CUDA intrinsics.
+///
+/// Including this header file requires one of the following
+/// compilers:
+///   - NVCC (for CUDA device code only)
+///   - GCC (for host code only)
+///   - Intel (for host code only)
+///   - A compiler that supports OpenMP 3.1 (for host code only)
+
+#ifndef KOKKOS_ATOMIC_HPP
+#define KOKKOS_ATOMIC_HPP
+
+#include <Kokkos_Macros.hpp>
+#include <Kokkos_HostSpace.hpp>
+#include <impl/Kokkos_Traits.hpp>
+
+//----------------------------------------------------------------------------
+#if defined(_WIN32)
+#define KOKKOS_ATOMICS_USE_WINDOWS
+#else /* not _WIN32 */
+#if defined( __CUDA_ARCH__ ) && defined( KOKKOS_HAVE_CUDA )
+
+// Compiling NVIDIA device code, must use Cuda atomics:
+
+#define KOKKOS_ATOMICS_USE_CUDA
+
+#elif ! defined( KOKKOS_ATOMICS_USE_GCC ) && \
+      ! defined( KOKKOS_ATOMICS_USE_INTEL ) && \
+      ! defined( KOKKOS_ATOMICS_USE_OMP31 )
+
+// Compiling for non-Cuda atomic implementation has not been pre-selected.
+// Choose the best implementation for the detected compiler.
+// Preference: GCC, INTEL, OMP31
+
+#if defined( KOKKOS_COMPILER_GNU ) || \
+    defined( KOKKOS_COMPILER_CLANG ) || \
+    ( defined ( KOKKOS_COMPILER_NVCC ) && defined ( __GNUC__ ) )
+
+#define KOKKOS_ATOMICS_USE_GCC
+
+#elif defined( KOKKOS_COMPILER_INTEL ) || \
+      defined( KOKKOS_COMPILER_CRAYC )
+
+#define KOKKOS_ATOMICS_USE_INTEL
+
+#elif defined( _OPENMP ) && ( 201107 <= _OPENMP )
+// 201107 is the _OPENMP date value of the OpenMP 3.1 specification.
+#define KOKKOS_ATOMICS_USE_OMP31
+
+#else
+// No usable backend: stop the build instead of compiling without atomics.
+#error "KOKKOS_ATOMICS_USE : Unsupported compiler"
+
+#endif /* compiler-based selection */
+
+#endif /* Not pre-selected atomic implementation */
+#endif /* not _WIN32 */
+
+//----------------------------------------------------------------------------
+
+// Forward declaration of functions supporting arbitrary sized atomics
+// This is necessary since Kokkos_Atomic.hpp is internally included very early
+// through Kokkos_HostSpace.hpp as well as the allocation tracker.
+#ifdef KOKKOS_HAVE_CUDA
+namespace Kokkos {
+namespace Impl {
+/// \brief Acquire a lock for the address
+///
+/// This function tries to acquire the lock for the hash value derived
+/// from the provided ptr. If the lock is successfully acquired the
+/// function returns true. Otherwise it returns false.
+__device__ inline
+bool lock_address_cuda_space(void* ptr);
+
+/// \brief Release lock for the address
+///
+/// This function releases the lock for the hash value derived
+/// from the provided ptr. This function should only be called
+/// after previously successfully acquiring a lock with
+/// lock_address_cuda_space.
+__device__ inline
+void unlock_address_cuda_space(void* ptr);
+} // namespace Impl
+} // namespace Kokkos
+#endif /* KOKKOS_HAVE_CUDA */
+
+
+namespace Kokkos {   // declarations only; no definitions appear in this header
+template <typename T>
+KOKKOS_INLINE_FUNCTION
+void atomic_add(volatile T * const dest, const T src);
+
+// Atomic increment
+template<typename T>
+KOKKOS_INLINE_FUNCTION
+void atomic_increment(volatile T* a);
+// Atomic decrement
+template<typename T>
+KOKKOS_INLINE_FUNCTION
+void atomic_decrement(volatile T* a);
+} // namespace Kokkos
+
+namespace Kokkos {
+/// \brief Return, as a string literal, the name of the atomics backend macro in effect.
+/// The macros are tested in the order CUDA, GCC, INTEL, OMP31, WINDOWS; the first defined one wins.
+inline
+const char * atomic_query_version()
+{
+#if defined( KOKKOS_ATOMICS_USE_CUDA )
+  return "KOKKOS_ATOMICS_USE_CUDA" ;
+#elif defined( KOKKOS_ATOMICS_USE_GCC )
+  return "KOKKOS_ATOMICS_USE_GCC" ;
+#elif defined( KOKKOS_ATOMICS_USE_INTEL )
+  return "KOKKOS_ATOMICS_USE_INTEL" ;
+#elif defined( KOKKOS_ATOMICS_USE_OMP31 )
+  return "KOKKOS_ATOMICS_USE_OMP31" ;
+#elif defined( KOKKOS_ATOMICS_USE_WINDOWS )
+  return "KOKKOS_ATOMICS_USE_WINDOWS";
+#endif // NOTE(review): no return statement is compiled if none of the macros above is defined
+}
+
+} // namespace Kokkos
+
+#ifdef _WIN32
+#include "impl/Kokkos_Atomic_Windows.hpp"
+#else
+
+//----------------------------------------------------------------------------
+// Atomic Assembly
+//
+// Implements CAS128-bit in assembly
+
+#include "impl/Kokkos_Atomic_Assembly.hpp"
+
+//----------------------------------------------------------------------------
+// Atomic exchange
+//
+// template< typename T >
+// T atomic_exchange( volatile T* const dest , const T val )
+// { T tmp = *dest ; *dest = val ; return tmp ; }
+
+#include "impl/Kokkos_Atomic_Exchange.hpp"
+
+//----------------------------------------------------------------------------
+// Atomic compare-and-exchange
+//
+// template<class T>
+// bool atomic_compare_exchange_strong(volatile T* const dest, const T compare, const T val)
+// { bool equal = compare == *dest ; if ( equal ) { *dest = val ; } return equal ; }
+
+#include "impl/Kokkos_Atomic_Compare_Exchange_Strong.hpp"
+
+//----------------------------------------------------------------------------
+// Atomic fetch and add
+//
+// template<class T>
+// T atomic_fetch_add(volatile T* const dest, const T val)
+// { T tmp = *dest ; *dest += val ; return tmp ; }
+
+#include "impl/Kokkos_Atomic_Fetch_Add.hpp"
+
+//----------------------------------------------------------------------------
+// Atomic increment
+//
+// template<class T>
+// void atomic_increment(volatile T* const dest)
+// { ++(*dest); }
+
+#include "impl/Kokkos_Atomic_Increment.hpp"
+
+//----------------------------------------------------------------------------
+// Atomic Decrement
+//
+// template<class T>
+// void atomic_decrement(volatile T* const dest)
+// { --(*dest); }
+
+#include "impl/Kokkos_Atomic_Decrement.hpp"
+
+//----------------------------------------------------------------------------
+// Atomic fetch and sub
+//
+// template<class T>
+// T atomic_fetch_sub(volatile T* const dest, const T val)
+// { T tmp = *dest ; *dest -= val ; return tmp ; }
+
+#include "impl/Kokkos_Atomic_Fetch_Sub.hpp"
+
+//----------------------------------------------------------------------------
+// Atomic fetch and or
+//
+// template<class T>
+// T atomic_fetch_or(volatile T* const dest, const T val)
+// { T tmp = *dest ; *dest = tmp | val ; return tmp ; }
+
+#include "impl/Kokkos_Atomic_Fetch_Or.hpp"
+
+//----------------------------------------------------------------------------
+// Atomic fetch and and
+//
+// template<class T>
+// T atomic_fetch_and(volatile T* const dest, const T val)
+// { T tmp = *dest ; *dest = tmp & val ; return tmp ; }
+
+#include "impl/Kokkos_Atomic_Fetch_And.hpp"
+#endif /*Not _WIN32*/
+
+//----------------------------------------------------------------------------
+// Memory fence
+//
+// All loads and stores from this thread will be globally consistent before continuing
+//
+// void memory_fence() {...};
+#include "impl/Kokkos_Memory_Fence.hpp"
+
+//----------------------------------------------------------------------------
+// Provide volatile_load and safe_load
+//
+// T volatile_load(T const volatile * const ptr);
+//
+// T const& safe_load(T const * const ptr);
+// XEON PHI
+// T safe_load(T const * const ptr);
+
+#include "impl/Kokkos_Volatile_Load.hpp"
+
+#ifndef _WIN32
+#include "impl/Kokkos_Atomic_Generic.hpp"
+#endif
+//----------------------------------------------------------------------------
+// This atomic-style macro should be an inlined function, not a macro
+
+#if defined( KOKKOS_COMPILER_GNU ) && !defined(__PGIC__)
+  // __builtin_prefetch(addr, rw, locality): rw 0 = read / 1 = write; locality 0 = no temporal locality.
+  #define KOKKOS_NONTEMPORAL_PREFETCH_LOAD(addr) __builtin_prefetch(addr,0,0)
+  #define KOKKOS_NONTEMPORAL_PREFETCH_STORE(addr) __builtin_prefetch(addr,1,0)
+
+#else
+  // Other compilers: both macros expand to a no-op expression.
+  #define KOKKOS_NONTEMPORAL_PREFETCH_LOAD(addr) ((void)0)
+  #define KOKKOS_NONTEMPORAL_PREFETCH_STORE(addr) ((void)0)
+
+#endif
+
+//----------------------------------------------------------------------------
+
+#endif /* KOKKOS_ATOMIC_HPP */
+
diff --git a/lib/kokkos/core/src/Kokkos_Complex.hpp b/lib/kokkos/core/src/Kokkos_Complex.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..cdfa4429f08f241d86bd32c3020f1b20c9a5a90b
--- /dev/null
+++ b/lib/kokkos/core/src/Kokkos_Complex.hpp
@@ -0,0 +1,538 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+//
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact  H. Carter Edwards (hcedwar@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+#ifndef KOKKOS_COMPLEX_HPP
+#define KOKKOS_COMPLEX_HPP
+
+#include <Kokkos_Atomic.hpp>
+#include <complex>
+#include <iostream>
+
+namespace Kokkos {
+
+/// \class complex
+/// \brief Partial reimplementation of std::complex that works as the
+///   result of a Kokkos::parallel_reduce.
+/// \tparam RealType The type of the real and imaginary parts of the
+///   complex number.  As with std::complex, this is only defined for
+///   \c float, \c double, and <tt>long double</tt>.  The latter is
+///   currently forbidden in CUDA device kernels.
+template<class RealType>
+class complex {
+private:
+  RealType re_, im_;
+
+public:
+  //! The type of the real or imaginary parts of this complex number.
+  typedef RealType value_type;
+
+  //! Default constructor (initializes both real and imaginary parts to zero).
+  KOKKOS_INLINE_FUNCTION complex () :
+    re_ (0.0), im_ (0.0)
+  {}
+
+  //! Copy constructor.
+  KOKKOS_INLINE_FUNCTION complex (const complex<RealType>& src) :
+    re_ (src.re_), im_ (src.im_)
+  {}
+
+  //! Copy constructor from volatile.
+  KOKKOS_INLINE_FUNCTION complex (const volatile complex<RealType>& src) :
+    re_ (src.re_), im_ (src.im_)
+  {}
+
+  /// \brief Conversion constructor from std::complex.
+  ///
+  /// This constructor cannot be called in a CUDA device function,
+  /// because std::complex's methods and nonmember functions are not
+  /// marked as CUDA device functions.
+  template<class InputRealType>
+  complex (const std::complex<InputRealType>& src) :
+    re_ (std::real (src)), im_ (std::imag (src))
+  {}
+
+  /// \brief Conversion operator to std::complex.
+  ///
+  /// This operator cannot be called in a CUDA device function,
+  /// because std::complex's methods and nonmember functions are not
+  /// marked as CUDA device functions.
+  operator std::complex<RealType> () const {
+    return std::complex<RealType> (re_, im_);
+  }
+
+  /// \brief Constructor that takes just the real part, and sets the
+  ///   imaginary part to zero.
+  template<class InputRealType>
+  KOKKOS_INLINE_FUNCTION complex (const InputRealType& val) :
+    re_ (val), im_ (0.0)
+  {}
+
+  //! Constructor that takes the real and imaginary parts.
+  template<class RealType1, class RealType2>
+  KOKKOS_INLINE_FUNCTION complex (const RealType1& re, const RealType2& im) :
+    re_ (re), im_ (im)
+  {}
+
+  //! Assignment from a complex of any real type; src is read via its public accessors.
+  template<class InputRealType>
+  KOKKOS_INLINE_FUNCTION
+  complex<RealType>& operator= (const complex<InputRealType>& src) {
+    re_ = src.real ();
+    im_ = src.imag ();
+    return *this;
+  }
+
+  /// \brief Assignment operator, for volatile <tt>*this</tt> and
+  ///   nonvolatile input.
+  ///
+  /// \param src [in] Input; right-hand side of the assignment.
+  ///
+  /// This operator returns \c void instead of <tt>volatile
+  /// complex<RealType>& </tt>.  See Kokkos Issue #177 for the
+  /// explanation.  In practice, this means that you should not chain
+  /// assignments with volatile lvalues.
+  template<class InputRealType>
+  KOKKOS_INLINE_FUNCTION
+  void operator= (const complex<InputRealType>& src) volatile {
+    re_ = src.real ();
+    im_ = src.imag ();
+    // We deliberately do not return anything here.  See explanation
+    // in public documentation above.
+  }
+
+  //! Assignment operator, for volatile <tt>*this</tt> and volatile input.
+  template<class InputRealType>
+  KOKKOS_INLINE_FUNCTION
+  volatile complex<RealType>& operator= (const volatile complex<InputRealType>& src) volatile {
+    re_ = src.real ();
+    im_ = src.imag ();
+    return *this;
+  }
+
+  //! Assignment operator, for nonvolatile <tt>*this</tt> and volatile input.
+  template<class InputRealType>
+  KOKKOS_INLINE_FUNCTION
+  complex<RealType>& operator= (const volatile complex<InputRealType>& src) {
+    re_ = src.real ();
+    im_ = src.imag ();
+    return *this;
+  }
+
+  //! Assignment operator (from a real number).
+  template<class InputRealType>
+  KOKKOS_INLINE_FUNCTION
+  complex<RealType>& operator= (const InputRealType& val) {
+    re_ = val;
+    im_ = static_cast<RealType> (0.0);
+    return *this;
+  }
+
+  //! Assignment operator (from a real number).
+  template<class InputRealType>
+  KOKKOS_INLINE_FUNCTION
+  void operator= (const InputRealType& val) volatile {
+    re_ = val;
+    im_ = static_cast<RealType> (0.0);
+  }
+
+  /// \brief Assignment operator from std::complex.
+  ///
+  /// This operator cannot be called in a CUDA device function,
+  /// because std::complex's methods and nonmember functions are not
+  /// marked as CUDA device functions.
+  template<class InputRealType>
+  complex<RealType>& operator= (const std::complex<InputRealType>& src) {
+    re_ = std::real (src);
+    im_ = std::imag (src);
+    return *this;
+  }
+
+  //! The imaginary part of this complex number.
+  KOKKOS_INLINE_FUNCTION RealType& imag () {
+    return im_;
+  }
+
+  //! The real part of this complex number.
+  KOKKOS_INLINE_FUNCTION RealType& real () {
+    return re_;
+  }
+
+  //! The imaginary part of this complex number.
+  KOKKOS_INLINE_FUNCTION const RealType imag () const {
+    return im_;
+  }
+
+  //! The real part of this complex number.
+  KOKKOS_INLINE_FUNCTION const RealType real () const {
+    return re_;
+  }
+
+  //! The imaginary part of this complex number (volatile overload).
+  KOKKOS_INLINE_FUNCTION volatile RealType& imag () volatile {
+    return im_;
+  }
+
+  //! The real part of this complex number (volatile overload).
+  KOKKOS_INLINE_FUNCTION volatile RealType& real () volatile {
+    return re_;
+  }
+
+  //! The imaginary part of this complex number (volatile overload).
+  KOKKOS_INLINE_FUNCTION const RealType imag () const volatile {
+    return im_;
+  }
+
+  //! The real part of this complex number (volatile overload).
+  KOKKOS_INLINE_FUNCTION const RealType real () const volatile {
+    return re_;
+  }
+  //! In-place addition; overloads taking a RealType change only the real part.
+  KOKKOS_INLINE_FUNCTION
+  complex<RealType>& operator += (const complex<RealType>& src) {
+    re_ += src.re_;
+    im_ += src.im_;
+    return *this;
+  }
+
+  KOKKOS_INLINE_FUNCTION
+  void operator += (const volatile complex<RealType>& src) volatile {
+    re_ += src.re_;
+    im_ += src.im_;
+  }
+
+  KOKKOS_INLINE_FUNCTION
+  complex<RealType>& operator += (const RealType& src) {
+    re_ += src;
+    return *this;
+  }
+
+  KOKKOS_INLINE_FUNCTION
+  void operator += (const volatile RealType& src) volatile {
+    re_ += src;
+  }
+  //! In-place subtraction; the overload taking a RealType changes only the real part.
+  KOKKOS_INLINE_FUNCTION
+  complex<RealType>& operator -= (const complex<RealType>& src) {
+    re_ -= src.re_;
+    im_ -= src.im_;
+    return *this;
+  }
+
+  KOKKOS_INLINE_FUNCTION
+  complex<RealType>& operator -= (const RealType& src) {
+    re_ -= src;
+    return *this;
+  }
+  //! In-place multiplication; both products are computed before either member is overwritten.
+  KOKKOS_INLINE_FUNCTION
+  complex<RealType>& operator *= (const complex<RealType>& src) {
+    const RealType realPart = re_ * src.re_ - im_ * src.im_;
+    const RealType imagPart = re_ * src.im_ + im_ * src.re_;
+    re_ = realPart;
+    im_ = imagPart;
+    return *this;
+  }
+
+  KOKKOS_INLINE_FUNCTION
+  void operator *= (const volatile complex<RealType>& src) volatile {
+    const RealType realPart = re_ * src.re_ - im_ * src.im_;
+    const RealType imagPart = re_ * src.im_ + im_ * src.re_;
+    re_ = realPart;
+    im_ = imagPart;
+  }
+
+  KOKKOS_INLINE_FUNCTION
+  complex<RealType>& operator *= (const RealType& src) {
+    re_ *= src;
+    im_ *= src;
+    return *this;
+  }
+
+  KOKKOS_INLINE_FUNCTION
+  void operator *= (const volatile RealType& src) volatile {
+    re_ *= src;
+    im_ *= src;
+  }
+  //! In-place division by a complex number, with scaling to limit overflow.
+  KOKKOS_INLINE_FUNCTION
+  complex<RealType>& operator /= (const complex<RealType>& y) {
+    // Scale (by the "1-norm" of y) to avoid unwarranted overflow.
+    // If the real part is +/-Inf and the imaginary part is -/+Inf,
+    // this won't change the result.
+    const RealType s = ::fabs (y.real ()) + ::fabs (y.imag ());
+
+    // If s is 0, then y is zero, so x/y == real(x)/0 + i*imag(x)/0.
+    // In that case, the relation x/y == (x/s) / (y/s) doesn't hold,
+    // because y/s is NaN.
+    if (s == 0.0) {
+      this->re_ /= s;
+      this->im_ /= s;
+    }
+    else {
+      const complex<RealType> x_scaled (this->re_ / s, this->im_ / s);
+      const complex<RealType> y_conj_scaled (y.re_ / s, -(y.im_) / s);
+      const RealType y_scaled_abs = y_conj_scaled.re_ * y_conj_scaled.re_ +
+        y_conj_scaled.im_ * y_conj_scaled.im_; // abs(y) == abs(conj(y))
+      *this = x_scaled * y_conj_scaled;
+      *this /= y_scaled_abs;
+    }
+    return *this;
+  }
+
+  KOKKOS_INLINE_FUNCTION
+  complex<RealType>& operator /= (const RealType& src) {
+    re_ /= src;
+    im_ /= src;
+    return *this;
+  }
+};
+
+//! Binary + operator for complex: adds real and imaginary parts separately.
+template<class RealType>
+KOKKOS_INLINE_FUNCTION complex<RealType>
+operator + (const complex<RealType>& x, const complex<RealType>& y) {
+  const RealType re_sum = x.real () + y.real (), im_sum = x.imag () + y.imag ();
+  return complex<RealType> (re_sum, im_sum);
+}
+
+//! Unary + operator for complex: returns a copy of its operand.
+template<class RealType>
+KOKKOS_INLINE_FUNCTION complex<RealType>
+operator + (const complex<RealType>& x) {
+  // Identity operation; the by-value return type makes this a plain copy.
+  return complex<RealType> (x);
+}
+
+//! Binary - operator for complex: subtracts real and imaginary parts separately.
+template<class RealType>
+KOKKOS_INLINE_FUNCTION complex<RealType>
+operator - (const complex<RealType>& x, const complex<RealType>& y) {
+  const RealType re_diff = x.real () - y.real (), im_diff = x.imag () - y.imag ();
+  return complex<RealType> (re_diff, im_diff);
+}
+
+//! Unary minus: negates both the real and the imaginary part of \c x.
+template<class RealType>
+KOKKOS_INLINE_FUNCTION
+complex<RealType>
+operator - (const complex<RealType>& x) {
+  const RealType neg_re = -x.real ();
+  const RealType neg_im = -x.imag ();
+  return complex<RealType> (neg_re, neg_im);
+}
+
+//! Product of two complex numbers: (a + ib)(c + id) = (ac - bd) + i(ad + bc).
+template<class RealType>
+KOKKOS_INLINE_FUNCTION
+complex<RealType>
+operator * (const complex<RealType>& x, const complex<RealType>& y) {
+  const RealType re_prod = x.real () * y.real () - x.imag () * y.imag ();
+  const RealType im_prod = x.real () * y.imag () + x.imag () * y.real ();
+  return complex<RealType> (re_prod, im_prod);
+}
+
+/// \brief Product of a std::complex (left) and a Kokkos::complex (right).
+///
+/// This overload exists because GCC 4.7.2 (and perhaps other
+/// compilers) cannot deduce that std::complex times Kokkos::complex
+/// is possible by first converting the std::complex operand to
+/// Kokkos::complex.
+///
+/// It is not marked as a CUDA device function and so cannot be called
+/// from one: std::complex's methods and nonmember functions are not
+/// marked as CUDA device functions.
+template<class RealType>
+complex<RealType>
+operator * (const std::complex<RealType>& x, const complex<RealType>& y) {
+  const RealType re_prod = x.real () * y.real () - x.imag () * y.imag ();
+  const RealType im_prod = x.real () * y.imag () + x.imag () * y.real ();
+  return complex<RealType> (re_prod, im_prod);
+}
+
+/// \brief Product of a real scalar (left) and a complex number (right).
+///
+/// This overload exists because the compiler does not know that
+/// RealType and complex<RealType> commute under operator*.
+template<class RealType>
+KOKKOS_INLINE_FUNCTION
+complex<RealType>
+operator * (const RealType& x, const complex<RealType>& y) {
+  const RealType scaled_re = x * y.real ();
+  const RealType scaled_im = x * y.imag ();
+  return complex<RealType> (scaled_re, scaled_im);
+}
+
+
+//! Nonmember accessor: returns the imaginary part of \c x by value.
+template<class RealType>
+KOKKOS_INLINE_FUNCTION
+RealType imag (const complex<RealType>& x) {
+  const RealType im_part = x.imag ();
+  return im_part;
+}
+
+//! Nonmember accessor: returns the real part of \c x by value.
+template<class RealType>
+KOKKOS_INLINE_FUNCTION
+RealType real (const complex<RealType>& x) {
+  const RealType re_part = x.real ();
+  return re_part;
+}
+
+/// \brief Absolute value (magnitude) of a complex number.
+///
+/// Returns sqrt(real(x)^2 + imag(x)^2).  The result is computed with
+/// hypot rather than by squaring the components directly, so that a
+/// component whose square would overflow (or underflow) does not
+/// spoil a result that is itself representable.
+template<class RealType>
+KOKKOS_INLINE_FUNCTION
+RealType abs (const complex<RealType>& x) {
+  return ::hypot (real (x), imag (x));
+}
+
+//! Complex conjugate: same real part, negated imaginary part.
+template<class RealType>
+KOKKOS_INLINE_FUNCTION
+complex<RealType> conj (const complex<RealType>& x) {
+  const RealType re_part = real (x);
+  const RealType neg_im  = -imag (x);
+  return complex<RealType> (re_part, neg_im);
+}
+
+
+//! Quotient of a complex number and a real scalar: each component of
+//! \c x is divided by \c y.  The result has the component type of \c x.
+template<class RealType1, class RealType2>
+KOKKOS_INLINE_FUNCTION
+complex<RealType1>
+operator / (const complex<RealType1>& x, const RealType2& y) {
+  complex<RealType1> quotient (real (x) / y, imag (x) / y);
+  return quotient;
+}
+
+/// \brief Quotient of two complex numbers.
+///
+/// Both operands are divided by the "1-norm" of \c y, that is
+/// |real(y)| + |imag(y)|, before forming any products, so that the
+/// intermediate values do not overflow unnecessarily.
+template<class RealType>
+KOKKOS_INLINE_FUNCTION
+complex<RealType>
+operator / (const complex<RealType>& x, const complex<RealType>& y) {
+  const RealType one_norm = ::fabs (real (y)) + ::fabs (imag (y));
+
+  if (one_norm == 0.0) {
+    // y is zero, so x/y == real(x)/0 + i*imag(x)/0.  The scaled
+    // formula cannot be used here because y/one_norm would be NaN.
+    return complex<RealType> (real (x) / one_norm, imag (x) / one_norm);
+  }
+
+  // x/y == (x/s) * conj(y/s) / |y/s|^2, with s == one_norm.
+  const complex<RealType> numer (real (x) / one_norm, imag (x) / one_norm);
+  const complex<RealType> denom_conj (real (y) / one_norm, -imag (y) / one_norm);
+  // Squared magnitude of the scaled divisor; conjugation does not change it.
+  const RealType denom_sq = real (denom_conj) * real (denom_conj) +
+    imag (denom_conj) * imag (denom_conj);
+  complex<RealType> quotient = numer * denom_conj;
+  quotient /= denom_sq;
+  return quotient;
+}
+
+//! Two complex numbers are equal when both their real parts and
+//! their imaginary parts compare equal.
+template<class RealType>
+KOKKOS_INLINE_FUNCTION
+bool operator == (const complex<RealType>& x, const complex<RealType>& y) {
+  if (! (real (x) == real (y))) {
+    return false;
+  }
+  return imag (x) == imag (y);
+}
+
+//! Equality of a std::complex (left) and a Kokkos::complex (right):
+//! true when both the real and the imaginary parts compare equal.
+// NOTE(review): this is marked KOKKOS_INLINE_FUNCTION but calls
+// std::real / std::imag; confirm whether those are usable in device code.
+template<class RealType>
+KOKKOS_INLINE_FUNCTION
+bool operator == (const std::complex<RealType>& x, const complex<RealType>& y) {
+  if (! (std::real (x) == real (y))) {
+    return false;
+  }
+  return std::imag (x) == imag (y);
+}
+
+//! Equality of a complex number (left) and a real number (right):
+//! true when the real part equals \c y and the imaginary part is zero.
+template<class RealType1, class RealType2>
+KOKKOS_INLINE_FUNCTION
+bool operator == (const complex<RealType1>& x, const RealType2& y) {
+  if (! (real (x) == y)) {
+    return false;
+  }
+  return imag (x) == static_cast<RealType1> (0.0);
+}
+
+//! Equality of a real number (left) and a complex number (right):
+//! true when the real part of \c y equals \c x and the imaginary
+//! part of \c y is zero.
+template<class RealType>
+KOKKOS_INLINE_FUNCTION
+bool operator == (const RealType& x, const complex<RealType>& y) {
+  return real (y) == x && imag (y) == static_cast<RealType> (0.0);
+}
+
+//! Two complex numbers are unequal when their real parts or their
+//! imaginary parts differ.
+template<class RealType>
+KOKKOS_INLINE_FUNCTION
+bool operator != (const complex<RealType>& x, const complex<RealType>& y) {
+  if (real (x) != real (y)) {
+    return true;
+  }
+  return imag (x) != imag (y);
+}
+
+//! Inequality of a std::complex (left) and a Kokkos::complex (right):
+//! true when the real parts or the imaginary parts differ.
+// NOTE(review): this is marked KOKKOS_INLINE_FUNCTION but calls
+// std::real / std::imag; confirm whether those are usable in device code.
+template<class RealType>
+KOKKOS_INLINE_FUNCTION
+bool operator != (const std::complex<RealType>& x, const complex<RealType>& y) {
+  if (std::real (x) != real (y)) {
+    return true;
+  }
+  return std::imag (x) != imag (y);
+}
+
+//! Inequality of a complex number (left) and a real number (right):
+//! true when the real part differs from \c y or the imaginary part
+//! is nonzero.
+template<class RealType1, class RealType2>
+KOKKOS_INLINE_FUNCTION
+bool operator != (const complex<RealType1>& x, const RealType2& y) {
+  if (real (x) != y) {
+    return true;
+  }
+  return imag (x) != static_cast<RealType1> (0.0);
+}
+
+//! Inequality of a real number (left) and a complex number (right):
+//! true when the real part of \c y differs from \c x or the
+//! imaginary part of \c y is nonzero.
+template<class RealType>
+KOKKOS_INLINE_FUNCTION
+bool operator != (const RealType& x, const complex<RealType>& y) {
+  return real (y) != x || imag (y) != static_cast<RealType> (0.0);
+}
+
+//! Write \c x to an output stream, using the same text format as
+//! std::complex (the value is converted to std::complex and streamed).
+template<class RealType>
+std::ostream& operator << (std::ostream& os, const complex<RealType>& x) {
+  return os << std::complex<RealType> (Kokkos::real (x), Kokkos::imag (x));
+}
+
+/// \brief Read a complex number from an input stream.
+///
+/// The value is parsed as a std::complex, so the accepted text format
+/// is whatever std::complex's extraction operator accepts.  \c x is
+/// assigned only if the extraction succeeds; on failure \c x is left
+/// untouched and the stream's error state reports the problem.
+template<class RealType>
+std::istream& operator >> (std::istream& is, complex<RealType>& x) {
+  std::complex<RealType> x_std;
+  if (is >> x_std) {
+    x = x_std;
+  }
+  return is;
+}
+
+
+} // namespace Kokkos
+
+#endif // KOKKOS_COMPLEX_HPP
diff --git a/lib/kokkos/core/src/Kokkos_Concepts.hpp b/lib/kokkos/core/src/Kokkos_Concepts.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..82a342eec0bfba9e5420b86d41c586b22969712c
--- /dev/null
+++ b/lib/kokkos/core/src/Kokkos_Concepts.hpp
@@ -0,0 +1,78 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+//
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact  H. Carter Edwards (hcedwar@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#ifndef KOKKOS_CORE_CONCEPTS_HPP
+#define KOKKOS_CORE_CONCEPTS_HPP
+
+#include <type_traits>
+
+namespace Kokkos {
+// Schedules for execution policies.  Both are empty tag types; they
+// are the only two types accepted by Schedule<> in this header.
+struct Static {};
+struct Dynamic {};
+
+// Schedule wrapper type.  Wraps one of the schedule tags (Static or
+// Dynamic); any other template argument is rejected at compile time.
+template<class T>
+struct Schedule
+{
+  typedef Schedule<T> schedule_type ; // the wrapper itself
+  typedef T           type ;          // the wrapped schedule tag
+
+  static_assert( std::is_same<T,Static>::value || std::is_same<T,Dynamic>::value
+               , "Kokkos: Invalid Schedule<> type." );
+};
+
+// Specifies the iteration index type.  T must be an integral type;
+// anything else is rejected at compile time.
+template<typename T>
+struct IndexType
+{
+  typedef IndexType<T> index_type ; // the wrapper itself
+  typedef T            type ;       // the wrapped integral type
+
+  static_assert( std::is_integral<T>::value
+               , "Kokkos: Invalid IndexType<>." );
+};
+
+} // namespace Kokkos
+
+#endif // KOKKOS_CORE_CONCEPTS_HPP
+
diff --git a/lib/kokkos/core/src/Kokkos_Core.hpp b/lib/kokkos/core/src/Kokkos_Core.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..7cde4610ee8957c2eea7a9a2e05c8f2cbb9463f4
--- /dev/null
+++ b/lib/kokkos/core/src/Kokkos_Core.hpp
@@ -0,0 +1,174 @@
+/*
+//@HEADER
+// ************************************************************************
+// 
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+// 
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+// 
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact  H. Carter Edwards (hcedwar@sandia.gov)
+// 
+// ************************************************************************
+//@HEADER
+*/
+
+#ifndef KOKKOS_CORE_HPP
+#define KOKKOS_CORE_HPP
+
+//----------------------------------------------------------------------------
+// Include the execution space header files for the enabled execution spaces.
+
+#include <Kokkos_Core_fwd.hpp>
+
+#if defined( KOKKOS_HAVE_SERIAL )
+#include <Kokkos_Serial.hpp>
+#endif
+
+#if defined( KOKKOS_HAVE_OPENMP )
+#include <Kokkos_OpenMP.hpp>
+#endif
+
+#if defined( KOKKOS_HAVE_PTHREAD )
+#include <Kokkos_Threads.hpp>
+#endif
+
+#if defined( KOKKOS_HAVE_CUDA )
+#include <Kokkos_Cuda.hpp>
+#endif
+
+#include <Kokkos_MemoryPool.hpp>
+#include <Kokkos_Pair.hpp>
+#include <Kokkos_Array.hpp>
+#include <Kokkos_View.hpp>
+#include <Kokkos_Vectorization.hpp>
+#include <Kokkos_Atomic.hpp>
+#include <Kokkos_hwloc.hpp>
+
+#ifdef KOKKOS_HAVE_CXX11
+#include <Kokkos_Complex.hpp>
+#endif
+
+
+//----------------------------------------------------------------------------
+
+namespace Kokkos {
+
+/// \brief Arguments accepted by Kokkos::initialize.
+///
+/// Every field is set to -1 by the default constructor.
+// NOTE(review): -1 presumably means "not specified, let Kokkos
+// choose"; confirm against the implementation of initialize().
+struct InitArguments {
+  int num_threads;
+  int num_numa;
+  int device_id;
+
+  InitArguments()
+    : num_threads( -1 )
+    , num_numa( -1 )
+    , device_id( -1 )
+  {}
+};
+
+void initialize(int& narg, char* arg[]);
+
+void initialize(const InitArguments& args = InitArguments());
+
+/** \brief  Finalize the spaces that were initialized via Kokkos::initialize */
+void finalize();
+
+/** \brief  Finalize all known execution spaces */
+void finalize_all();
+
+void fence();
+
+} // namespace Kokkos
+
+//----------------------------------------------------------------------------
+//----------------------------------------------------------------------------
+
+namespace Kokkos {
+namespace Experimental {
+
+/* Allocate arg_alloc_size bytes from the memory space of Space and
+ * attach the label arg_alloc_label to the allocation.
+ * The allocation is tracked in Kokkos memory tracking system, so
+ * leaked memory can be identified.
+ */
+template< class Space = typename Kokkos::DefaultExecutionSpace::memory_space >
+inline
+void * kokkos_malloc( const std::string & arg_alloc_label
+                    , const size_t arg_alloc_size )
+{
+  using MemorySpace = typename Space::memory_space ;
+  using Record      = Impl::SharedAllocationRecord< MemorySpace > ;
+
+  return Record::allocate_tracked( MemorySpace() , arg_alloc_label , arg_alloc_size );
+}
+
+/* Allocate arg_alloc_size bytes of tracked memory from the memory
+ * space of Space.  The allocation is labeled "no-label".
+ */
+template< class Space = typename Kokkos::DefaultExecutionSpace::memory_space >
+inline
+void * kokkos_malloc( const size_t arg_alloc_size )
+{
+  using MemorySpace = typename Space::memory_space ;
+  using Record      = Impl::SharedAllocationRecord< MemorySpace > ;
+
+  return Record::allocate_tracked( MemorySpace() , "no-label" , arg_alloc_size );
+}
+
+/* Release a tracked allocation.  arg_alloc is handed to the
+ * allocation record type of Space's memory space for deallocation.
+ */
+// NOTE(review): presumably arg_alloc must come from kokkos_malloc or
+// kokkos_realloc with the same Space; confirm in SharedAllocationRecord.
+template< class Space = typename Kokkos::DefaultExecutionSpace::memory_space >
+inline
+void kokkos_free( void * arg_alloc )
+{
+  using MemorySpace = typename Space::memory_space ;
+  using Record      = Impl::SharedAllocationRecord< MemorySpace > ;
+
+  Record::deallocate_tracked( arg_alloc );
+}
+
+/* Resize a tracked allocation to arg_alloc_size bytes and return the
+ * pointer produced by the allocation record type of Space's memory
+ * space.
+ */
+// NOTE(review): whether the contents are preserved and whether the
+// returned pointer may differ from arg_alloc is decided by
+// SharedAllocationRecord::reallocate_tracked; confirm there.
+template< class Space = typename Kokkos::DefaultExecutionSpace::memory_space >
+inline
+void * kokkos_realloc( void * arg_alloc , const size_t arg_alloc_size )
+{
+  using MemorySpace = typename Space::memory_space ;
+  using Record      = Impl::SharedAllocationRecord< MemorySpace > ;
+
+  return Record::reallocate_tracked( arg_alloc , arg_alloc_size );
+}
+
+} // namespace Experimental
+} // namespace Kokkos
+
+
+namespace Kokkos {
+
+using Kokkos::Experimental::kokkos_malloc ;
+using Kokkos::Experimental::kokkos_realloc ;
+using Kokkos::Experimental::kokkos_free ;
+
+}
+
+//----------------------------------------------------------------------------
+//----------------------------------------------------------------------------
+
+#endif
+
diff --git a/lib/kokkos/core/src/Kokkos_Core_fwd.hpp b/lib/kokkos/core/src/Kokkos_Core_fwd.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..e9648b59b8f62c5cb4ea46c00ec1498c361cbdb4
--- /dev/null
+++ b/lib/kokkos/core/src/Kokkos_Core_fwd.hpp
@@ -0,0 +1,247 @@
+/*
+//@HEADER
+// ************************************************************************
+// 
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+// 
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+// 
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact  H. Carter Edwards (hcedwar@sandia.gov)
+// 
+// ************************************************************************
+//@HEADER
+*/
+
+#ifndef KOKKOS_CORE_FWD_HPP
+#define KOKKOS_CORE_FWD_HPP
+
+//----------------------------------------------------------------------------
+// Kokkos_Macros.hpp does introspection on configuration options
+// and compiler environment then sets a collection of #define macros.
+
+#include <Kokkos_Macros.hpp>
+
+//----------------------------------------------------------------------------
+// Have assumed a 64bit build (8byte pointers) throughout the code base.
+
+static_assert( sizeof(void*) == 8
+             , "Kokkos assumes 64-bit build; i.e., 8-byte pointers" );
+
+//----------------------------------------------------------------------------
+
+namespace Kokkos {
+
+/// \brief Type of the Kokkos::AUTO token.
+struct AUTO_t {
+  //! Calling the token with no arguments returns a reference to the
+  //! token itself.
+  KOKKOS_INLINE_FUNCTION
+  constexpr const AUTO_t & operator()() const { return *this ; }
+};
+
+// Declared in an unnamed namespace, so every translation unit that
+// includes this header gets its own internal-linkage AUTO object.
+namespace {
+/**\brief Token to indicate that a parameter's value is to be automatically selected */
+constexpr AUTO_t AUTO = Kokkos::AUTO_t();
+}
+
+struct InvalidType {};
+
+}
+
+//----------------------------------------------------------------------------
+//----------------------------------------------------------------------------
+// Forward declarations for class inter-relationships
+
+namespace Kokkos {
+
+class HostSpace ; ///< Memory space for main process and CPU execution spaces
+
+#ifdef KOKKOS_HAVE_HBWSPACE
+namespace Experimental {
+class HBWSpace ; ///< Memory space for hbw_malloc from memkind (e.g. for KNL processor)
+}
+#endif
+
+#if defined( KOKKOS_HAVE_SERIAL )
+class Serial ;    ///< Execution space main process on CPU
+#endif // defined( KOKKOS_HAVE_SERIAL )
+
+#if defined( KOKKOS_HAVE_PTHREAD )
+class Threads ;  ///< Execution space with pthreads back-end
+#endif
+
+#if defined( KOKKOS_HAVE_OPENMP )
+class OpenMP ; ///< OpenMP execution space
+#endif
+
+#if defined( KOKKOS_HAVE_CUDA )
+class CudaSpace ;            ///< Memory space on Cuda GPU
+class CudaUVMSpace ;         ///< Memory space on Cuda GPU with UVM
+class CudaHostPinnedSpace ;  ///< Memory space on Host accessible to Cuda GPU
+class Cuda ;                 ///< Execution space for Cuda GPU
+#endif
+
+template<class ExecutionSpace, class MemorySpace>
+struct Device;
+} // namespace Kokkos
+
+//----------------------------------------------------------------------------
+//----------------------------------------------------------------------------
+// Set the default execution space.
+
+/// Define Kokkos::DefaultExecutionSpace as per configuration option
+/// or chosen from the enabled execution spaces in the following order:
+/// Kokkos::Cuda, Kokkos::OpenMP, Kokkos::Threads, Kokkos::Serial
+
+namespace Kokkos {
+
+#if   defined ( KOKKOS_HAVE_DEFAULT_DEVICE_TYPE_CUDA )
+  typedef Cuda DefaultExecutionSpace ;
+#elif defined ( KOKKOS_HAVE_DEFAULT_DEVICE_TYPE_OPENMP )
+  typedef OpenMP DefaultExecutionSpace ;
+#elif defined ( KOKKOS_HAVE_DEFAULT_DEVICE_TYPE_THREADS )
+  typedef Threads DefaultExecutionSpace ;
+#elif defined ( KOKKOS_HAVE_DEFAULT_DEVICE_TYPE_SERIAL )
+  typedef Serial DefaultExecutionSpace ;
+#else
+#  error "At least one of the following execution spaces must be defined in order to use Kokkos: Kokkos::Cuda, Kokkos::OpenMP, Kokkos::Serial, or Kokkos::Threads."
+#endif
+
+#if defined ( KOKKOS_HAVE_DEFAULT_DEVICE_TYPE_OPENMP )
+  typedef OpenMP DefaultHostExecutionSpace ;
+#elif defined ( KOKKOS_HAVE_DEFAULT_DEVICE_TYPE_THREADS )
+  typedef Threads DefaultHostExecutionSpace ;
+#elif defined ( KOKKOS_HAVE_DEFAULT_DEVICE_TYPE_SERIAL )
+  typedef Serial DefaultHostExecutionSpace ;
+#elif defined ( KOKKOS_HAVE_OPENMP )
+  typedef OpenMP DefaultHostExecutionSpace ;
+#elif defined ( KOKKOS_HAVE_PTHREAD )
+  typedef Threads DefaultHostExecutionSpace ;
+#elif defined ( KOKKOS_HAVE_SERIAL )
+  typedef Serial DefaultHostExecutionSpace ;
+#else
+#  error "At least one of the following execution spaces must be defined in order to use Kokkos: Kokkos::OpenMP, Kokkos::Serial, or Kokkos::Threads."
+#endif
+
+} // namespace Kokkos
+
+//----------------------------------------------------------------------------
+//----------------------------------------------------------------------------
+// Detect the active execution space and define its memory space.
+// This is used to verify whether a running kernel can access
+// a given memory space.
+
+namespace Kokkos {
+namespace Impl {
+
+#if defined( KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_CUDA ) && defined (KOKKOS_HAVE_CUDA)
+typedef Kokkos::CudaSpace  ActiveExecutionMemorySpace ;
+#elif defined( KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST )
+typedef Kokkos::HostSpace  ActiveExecutionMemorySpace ;
+#else
+typedef void ActiveExecutionMemorySpace ;
+#endif
+
+/// \brief Compile-time answer to "can code executing in ActiveSpace
+///   access memory in MemorySpace?".
+///
+/// This primary template is the default case: \c value is 0 (no
+/// access) and no verify() functions are provided.
+template< class ActiveSpace , class MemorySpace >
+struct VerifyExecutionCanAccessMemorySpace {
+  enum {value = 0};
+};
+
+/// \brief Specialization for identical active and data spaces:
+///   access is always allowed.
+///
+/// \c value is 1 and both verify() overloads are empty.
+template< class Space >
+struct VerifyExecutionCanAccessMemorySpace< Space , Space >
+{
+  enum {value = 1};
+  KOKKOS_INLINE_FUNCTION static void verify(void) {}
+  KOKKOS_INLINE_FUNCTION static void verify(const void *) {}
+};
+
+} // namespace Impl
+} // namespace Kokkos
+
+// Calls verify( DATA_PTR ) on the VerifyExecutionCanAccessMemorySpace
+// specialization for the active execution memory space and DATA_SPACE.
+// If the selected specialization has no verify(), as is the case for
+// the primary template, the expansion does not compile.
+#define KOKKOS_RESTRICT_EXECUTION_TO_DATA( DATA_SPACE , DATA_PTR ) \
+  Kokkos::Impl::VerifyExecutionCanAccessMemorySpace< \
+    Kokkos::Impl::ActiveExecutionMemorySpace , DATA_SPACE >::verify( DATA_PTR )
+
+// Same check as the pointer-taking variant, but calls the
+// zero-argument verify() of the selected specialization.
+#define KOKKOS_RESTRICT_EXECUTION_TO_( DATA_SPACE ) \
+  Kokkos::Impl::VerifyExecutionCanAccessMemorySpace< \
+    Kokkos::Impl::ActiveExecutionMemorySpace , DATA_SPACE >::verify()
+
+//----------------------------------------------------------------------------
+//----------------------------------------------------------------------------
+
+namespace Kokkos {
+  void fence();
+}
+
+//----------------------------------------------------------------------------
+//----------------------------------------------------------------------------
+
+namespace Kokkos {
+namespace Impl {
+
+template< class Functor
+        , class Policy
+        , class EnableFunctor = void 
+	      , class EnablePolicy = void
+        >
+struct FunctorPolicyExecutionSpace;
+
+//----------------------------------------------------------------------------
+/// \class ParallelFor
+/// \brief Implementation of the ParallelFor operator that has a
+///   partial specialization for the device.
+///
+/// This is an implementation detail of parallel_for.  Users should
+/// skip this and go directly to the nonmember function parallel_for.
+template< class FunctorType , class ExecPolicy , class ExecutionSpace =
+          typename Impl::FunctorPolicyExecutionSpace< FunctorType , ExecPolicy >::execution_space 
+        > class ParallelFor ;
+
+/// \class ParallelReduce
+/// \brief Implementation detail of parallel_reduce.
+///
+/// This is an implementation detail of parallel_reduce.  Users should
+/// skip this and go directly to the nonmember function parallel_reduce.
+template< class FunctorType , class ExecPolicy , class ReducerType = InvalidType, class ExecutionSpace =
+          typename Impl::FunctorPolicyExecutionSpace< FunctorType , ExecPolicy >::execution_space 
+        > class ParallelReduce ;
+
+/// \class ParallelScan
+/// \brief Implementation detail of parallel_scan.
+///
+/// This is an implementation detail of parallel_scan.  Users should
+/// skip this and go directly to the documentation of the nonmember
+/// template function Kokkos::parallel_scan.
+///
+/// The ExecutionSpace parameter defaults to the execution space that
+/// FunctorPolicyExecutionSpace selects for the functor and policy.
+template< class FunctorType , class ExecPolicy , class ExecutionSpace =
+          typename Impl::FunctorPolicyExecutionSpace< FunctorType , ExecPolicy >::execution_space
+        > class ParallelScan ;
+
+}}
+#endif /* #ifndef KOKKOS_CORE_FWD_HPP */
+
diff --git a/lib/kokkos/core/src/Kokkos_Cuda.hpp b/lib/kokkos/core/src/Kokkos_Cuda.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..3130ee3198f35ec59dbeef7755cfffc11fda9346
--- /dev/null
+++ b/lib/kokkos/core/src/Kokkos_Cuda.hpp
@@ -0,0 +1,274 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+//
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact  H. Carter Edwards (hcedwar@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#ifndef KOKKOS_CUDA_HPP
+#define KOKKOS_CUDA_HPP
+
+#include <Kokkos_Core_fwd.hpp>
+
+// If CUDA execution space is enabled then use this header file.
+
+#if defined( KOKKOS_HAVE_CUDA )
+
+#include <iosfwd>
+#include <vector>
+
+#include <Kokkos_CudaSpace.hpp>
+
+#include <Kokkos_Parallel.hpp>
+#include <Kokkos_TaskPolicy.hpp>
+#include <Kokkos_Layout.hpp>
+#include <Kokkos_ScratchSpace.hpp>
+#include <Kokkos_MemoryTraits.hpp>
+#include <impl/Kokkos_Tags.hpp>
+
+#include <KokkosExp_MDRangePolicy.hpp>
+
+/*--------------------------------------------------------------------------*/
+
+namespace Kokkos {
+namespace Impl {
+class CudaExec ;
+} // namespace Impl
+} // namespace Kokkos
+
+/*--------------------------------------------------------------------------*/
+
+namespace Kokkos {
+
+/// \class Cuda
+/// \brief Kokkos Execution Space that uses CUDA to run on GPUs.
+///
+/// An "execution space" represents a parallel execution model.  It tells Kokkos
+/// how to parallelize the execution of kernels in a parallel_for or
+/// parallel_reduce.  For example, the Threads execution space uses Pthreads or
+/// C++11 threads on a CPU, the OpenMP execution space uses the OpenMP language
+/// extensions, and the Serial execution space executes "parallel" kernels
+/// sequentially.  The Cuda execution space uses NVIDIA's CUDA programming
+/// model to execute kernels in parallel on GPUs.
+///
+/// Each instance stores a CUDA stream and a device id, readable through
+/// cuda_stream() and cuda_device().
+class Cuda {
+public:
+  //! \name Type declarations that all Kokkos execution spaces must provide.
+  //@{
+
+  //! Tag this class as a kokkos execution space
+  typedef Cuda                  execution_space ;
+
+#if defined( KOKKOS_USE_CUDA_UVM )
+  //! This execution space's preferred memory space.
+  typedef CudaUVMSpace          memory_space ;
+#else
+  //! This execution space's preferred memory space.
+  typedef CudaSpace             memory_space ;
+#endif
+
+  //! This execution space preferred device_type
+  typedef Kokkos::Device<execution_space,memory_space> device_type;
+
+  //! The size_type best suited for this execution space.
+  typedef memory_space::size_type  size_type ;
+
+  //! This execution space's preferred array layout.
+  typedef LayoutLeft            array_layout ;
+
+  //! Scratch memory space type associated with this execution space.
+  typedef ScratchMemorySpace< Cuda >  scratch_memory_space ;
+
+  //@}
+  //--------------------------------------------------
+  //! \name Functions that all Kokkos devices must implement.
+  //@{
+
+  /// \brief True if and only if this method is being called in a
+  ///   thread-parallel function.
+  ///
+  /// The answer is fixed at compile time: nonzero when this code is
+  /// compiled with __CUDA_ARCH__ defined, zero otherwise.
+  KOKKOS_INLINE_FUNCTION static int in_parallel() {
+#if defined( __CUDA_ARCH__ )
+    return true;
+#else
+    return false;
+#endif
+  }
+
+  /** \brief  Set the device in a "sleep" state.
+   *
+   * This function sets the device in a "sleep" state in which it is
+   * not ready for work.  This may consume less resources than if the
+   * device were in an "awake" state, but it may also take time to
+   * bring the device from a sleep state to be ready for work.
+   *
+   * \return True if the device is in the "sleep" state, else false if
+   *   the device is actively working and could not enter the "sleep"
+   *   state.
+   */
+  static bool sleep();
+
+  /// \brief Wake the device from the 'sleep' state so it is ready for work.
+  ///
+  /// \return True if the device is in the "ready" state, else "false"
+  ///  if the device is actively working (which also means that it's
+  ///  awake).
+  static bool wake();
+
+  /// \brief Wait until all dispatched functors complete.
+  ///
+  /// The parallel_for or parallel_reduce dispatch of a functor may
+  /// return asynchronously, before the functor completes.  This
+  /// method does not return until all dispatched functors on this
+  /// device have completed.
+  static void fence();
+
+  //! Free any resources being consumed by the device.
+  static void finalize();
+
+  //! Has been initialized
+  static int is_initialized();
+
+  /** \brief  Return the maximum amount of concurrency.  */
+  static int concurrency();
+
+  //! Print configuration information to the given output stream.
+  static void print_configuration( std::ostream & , const bool detail = false );
+
+  //@}
+  //--------------------------------------------------
+  //! \name  Cuda space instances
+
+  //! The destructor is empty; it does not release the stream or device.
+  ~Cuda() {}
+  //! Default constructor, defined outside this header.
+  Cuda();
+  //! Construct from an instance id; explicit, so an int is never
+  //! implicitly converted to a Cuda object.
+  // NOTE(review): presumably instance_id selects one of the instances
+  // created by initialize(); confirm in the implementation.
+  explicit Cuda( const int instance_id );
+
+  // Copy and move use the compiler-generated member-wise versions.
+  Cuda( Cuda && ) = default ;
+  Cuda( const Cuda & ) = default ;
+  Cuda & operator = ( Cuda && ) = default ;
+  Cuda & operator = ( const Cuda & ) = default ;
+
+  //--------------------------------------------------------------------------
+  //! \name Device-specific functions
+  //@{
+
+  //! Wrapper around the id of the CUDA device to use; defaults to device 0.
+  struct SelectDevice {
+    int cuda_device_id ;
+    SelectDevice() : cuda_device_id(0) {}
+    explicit SelectDevice( int id ) : cuda_device_id( id ) {}
+  };
+
+  //! Initialize, telling the CUDA run-time library which device to use.
+  static void initialize( const SelectDevice = SelectDevice()
+                        , const size_t num_instances = 1 );
+
+  /// \brief Cuda device architecture of the selected device.
+  ///
+  /// This matches the __CUDA_ARCH__ specification.
+  static size_type device_arch();
+
+  //! Query device count.
+  static size_type detect_device_count();
+
+  /** \brief  Detect the available devices and their architecture
+   *          as defined by the __CUDA_ARCH__ specification.
+   */
+  static std::vector<unsigned> detect_device_arch();
+
+  //! The CUDA stream stored in this instance.
+  cudaStream_t cuda_stream() const { return m_stream ; }
+  //! The device id stored in this instance.
+  int          cuda_device() const { return m_device ; }
+
+  //@}
+  //--------------------------------------------------------------------------
+
+private:
+
+  cudaStream_t m_stream ; // returned by cuda_stream()
+  int          m_device ; // returned by cuda_device()
+};
+
+} // namespace Kokkos
+
+/*--------------------------------------------------------------------------*/
+/*--------------------------------------------------------------------------*/
+
+namespace Kokkos {
+namespace Impl {
+
+/// Active space CudaSpace, data in the Cuda scratch memory space:
+/// access is allowed, so \c value is true and both verify() overloads
+/// are empty.
+template<>
+struct VerifyExecutionCanAccessMemorySpace
+  < Kokkos::CudaSpace
+  , Kokkos::Cuda::scratch_memory_space
+  >
+{
+  enum { value = true };
+  KOKKOS_INLINE_FUNCTION static void verify( void ) { }
+  KOKKOS_INLINE_FUNCTION static void verify( const void * ) { }
+};
+
+/// Active space HostSpace, data in the Cuda scratch memory space:
+/// access is not allowed, so \c value is false and both verify()
+/// overloads call CudaSpace::access_error.
+// NOTE(review): unlike the CudaSpace specialization, these verify()
+// functions are plain inline rather than KOKKOS_INLINE_FUNCTION,
+// presumably because access_error is host-only; confirm in
+// Kokkos_CudaSpace.hpp.
+template<>
+struct VerifyExecutionCanAccessMemorySpace
+  < Kokkos::HostSpace
+  , Kokkos::Cuda::scratch_memory_space
+  >
+{
+  enum { value = false };
+  inline static void verify( void ) { CudaSpace::access_error(); }
+  inline static void verify( const void * p ) { CudaSpace::access_error(p); }
+};
+
+} // namespace Impl
+} // namespace Kokkos
+
+/*--------------------------------------------------------------------------*/
+/*--------------------------------------------------------------------------*/
+
+#include <Cuda/Kokkos_CudaExec.hpp>
+#include <Cuda/Kokkos_Cuda_View.hpp>
+
+#include <Cuda/KokkosExp_Cuda_View.hpp>
+
+#include <Cuda/Kokkos_Cuda_Parallel.hpp>
+#include <Cuda/Kokkos_Cuda_Task.hpp>
+
+//----------------------------------------------------------------------------
+
+#endif /* #if defined( KOKKOS_HAVE_CUDA ) */
+#endif /* #ifndef KOKKOS_CUDA_HPP */
+
+
+
diff --git a/lib/kokkos/core/src/Kokkos_CudaSpace.hpp b/lib/kokkos/core/src/Kokkos_CudaSpace.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..cd728895d0f02419d702ccb37ec9b048b08a6df8
--- /dev/null
+++ b/lib/kokkos/core/src/Kokkos_CudaSpace.hpp
@@ -0,0 +1,802 @@
+/*
+//@HEADER
+// ************************************************************************
+// 
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+// 
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+// 
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact  H. Carter Edwards (hcedwar@sandia.gov)
+// 
+// ************************************************************************
+//@HEADER
+*/
+
+#ifndef KOKKOS_CUDASPACE_HPP
+#define KOKKOS_CUDASPACE_HPP
+
+#include <Kokkos_Core_fwd.hpp>
+
+#if defined( KOKKOS_HAVE_CUDA )
+
+#include <iosfwd>
+#include <typeinfo>
+#include <string>
+
+#include <Kokkos_HostSpace.hpp>
+
+#include <Cuda/Kokkos_Cuda_abort.hpp>
+
+/*--------------------------------------------------------------------------*/
+
+namespace Kokkos {
+
+/** \brief  Cuda on-device memory management: the memory space whose execution_space is Kokkos::Cuda */
+
+class CudaSpace {
+public:
+
+  //! Tag this class as a kokkos memory space and name its execution space / device pairing
+  typedef CudaSpace             memory_space ;
+  typedef Kokkos::Cuda          execution_space ;
+  typedef Kokkos::Device<execution_space,memory_space> device_type;
+
+  typedef unsigned int          size_type ;
+
+  /*--------------------------------*/
+  // The default constructor is defined out of line; copy, move and destruction are compiler-generated.
+  CudaSpace();
+  CudaSpace( CudaSpace && rhs ) = default ;
+  CudaSpace( const CudaSpace & rhs ) = default ;
+  CudaSpace & operator = ( CudaSpace && rhs ) = default ;
+  CudaSpace & operator = ( const CudaSpace & rhs ) = default ;
+  ~CudaSpace() = default ;
+
+  /**\brief  Allocate untracked memory of size arg_alloc_size in the cuda space (size presumably in bytes - confirm) */
+  void * allocate( const size_t arg_alloc_size ) const ;
+
+  /**\brief  Deallocate untracked memory in the cuda space, given the allocation's pointer and size */
+  void deallocate( void * const arg_alloc_ptr
+                 , const size_t arg_alloc_size ) const ;
+
+  /*--------------------------------*/
+  /** \brief  Error reporting for HostSpace attempt to access CudaSpace; the pointer overload receives the pointer given to verify() */
+  static void access_error();
+  static void access_error( const void * const );
+
+private:
+
+  int  m_device ; ///< Which Cuda device
+
+  // friend class Kokkos::Experimental::Impl::SharedAllocationRecord< Kokkos::CudaSpace , void > ;
+};
+
+namespace Impl {
+/// \brief Initialize lock array for arbitrary size atomics.
+///
+/// Arbitrary atomics are implemented using a hash table of locks
+/// where the hash value is derived from the address of the
+/// object for which an atomic operation is performed.
+/// This function initializes the locks to zero (unset).
+void init_lock_arrays_cuda_space();
+
+/// \brief Retrieve the pointer to the lock array for arbitrary size atomics.
+///
+/// Arbitrary atomics are implemented using a hash table of locks
+/// where the hash value is derived from the address of the
+/// object for which an atomic operation is performed.
+/// This function retrieves the lock array pointer, allocating the array first if needed.
+/// NOTE(review): the `deallocate` flag presumably releases the array instead - confirm in the definition.
+int* atomic_lock_array_cuda_space_ptr(bool deallocate = false);
+
+/// \brief Retrieve the pointer to the lock array for team and thread private global memory.
+///
+/// Team and Thread private scratch allocations in
+/// global memory are acquired via locks.
+/// This function retrieves the lock array pointer, allocating the array first if needed.
+/// NOTE(review): the `deallocate` flag presumably releases the array instead - confirm in the definition.
+int* scratch_lock_array_cuda_space_ptr(bool deallocate = false);
+
+/// \brief Retrieve the pointer to the lock array for unique identifiers.
+///
+/// Unique identifiers in the range 0-Cuda::concurrency
+/// are provided via locks.
+/// This function retrieves the lock array pointer, allocating the array first if needed.
+/// NOTE(review): the `deallocate` flag presumably releases the array instead - confirm in the definition.
+int* threadid_lock_array_cuda_space_ptr(bool deallocate = false);
+} // namespace Impl
+} // namespace Kokkos
+
+/*--------------------------------------------------------------------------*/
+/*--------------------------------------------------------------------------*/
+
+namespace Kokkos {
+
+/** \brief  Cuda memory that is accessible to Host execution space
+ *          through Cuda's unified virtual memory (UVM) runtime; its execution_space is Cuda.
+ */
+class CudaUVMSpace {
+public:
+
+  //! Tag this class as a kokkos memory space and name its execution space / device pairing
+  typedef CudaUVMSpace          memory_space ;
+  typedef Cuda                  execution_space ;
+  typedef Kokkos::Device<execution_space,memory_space> device_type;
+  typedef unsigned int          size_type ;
+
+  /** \brief  Query whether UVM capability is available */
+  static bool available();
+
+  /*--------------------------------*/
+  // The default constructor is defined out of line; copy, move and destruction are compiler-generated.
+  CudaUVMSpace();
+  CudaUVMSpace( CudaUVMSpace && rhs ) = default ;
+  CudaUVMSpace( const CudaUVMSpace & rhs ) = default ;
+  CudaUVMSpace & operator = ( CudaUVMSpace && rhs ) = default ;
+  CudaUVMSpace & operator = ( const CudaUVMSpace & rhs ) = default ;
+  ~CudaUVMSpace() = default ;
+
+  /**\brief  Allocate untracked memory of size arg_alloc_size in the cuda UVM space (size presumably in bytes - confirm) */
+  void * allocate( const size_t arg_alloc_size ) const ;
+
+  /**\brief  Deallocate untracked memory in the cuda UVM space, given the allocation's pointer and size */
+  void deallocate( void * const arg_alloc_ptr
+                 , const size_t arg_alloc_size ) const ;
+
+  /*--------------------------------*/
+
+private:
+
+  int  m_device ; ///< Which Cuda device
+};
+
+} // namespace Kokkos
+
+/*--------------------------------------------------------------------------*/
+/*--------------------------------------------------------------------------*/
+
+namespace Kokkos {
+
+/** \brief  Host memory that is accessible to Cuda execution space
+ *          through Cuda's host-pinned memory allocation.
+ */
+class CudaHostPinnedSpace {
+public:
+
+  //! Tag this class as a kokkos memory space
+  /** \brief  Memory is in HostSpace so use the HostSpace::execution_space */
+  typedef HostSpace::execution_space  execution_space ;
+  typedef CudaHostPinnedSpace         memory_space ;
+  typedef Kokkos::Device<execution_space,memory_space> device_type;
+  typedef unsigned int                size_type ;
+
+  /*--------------------------------*/
+  // The default constructor is defined out of line; copy, move and destruction are compiler-generated.
+  CudaHostPinnedSpace();
+  CudaHostPinnedSpace( CudaHostPinnedSpace && rhs ) = default ;
+  CudaHostPinnedSpace( const CudaHostPinnedSpace & rhs ) = default ;
+  CudaHostPinnedSpace & operator = ( CudaHostPinnedSpace && rhs ) = default ;
+  CudaHostPinnedSpace & operator = ( const CudaHostPinnedSpace & rhs ) = default ;
+  ~CudaHostPinnedSpace() = default ;
+
+  /**\brief  Allocate untracked memory of size arg_alloc_size in the host-pinned space (size presumably in bytes - confirm) */
+  void * allocate( const size_t arg_alloc_size ) const ;
+
+  /**\brief  Deallocate untracked memory in the host-pinned space, given the allocation's pointer and size */
+  void deallocate( void * const arg_alloc_ptr
+                 , const size_t arg_alloc_size ) const ;
+
+  /*--------------------------------*/
+};
+
+} // namespace Kokkos
+
+/*--------------------------------------------------------------------------*/
+/*--------------------------------------------------------------------------*/
+
+namespace Kokkos {
+namespace Impl {
+/** Copy routine declared here and defined out of line; the exec-taking DeepCopy constructors below call it after exec.fence(). */
+void DeepCopyAsyncCuda( void * dst , const void * src , size_t n);
+/** DeepCopy for the < CudaSpace , CudaSpace > pair on the Cuda execution space; constructors are defined out of line. */
+template<> struct DeepCopy< CudaSpace , CudaSpace , Cuda>
+{
+  DeepCopy( void * dst , const void * src , size_t );
+  DeepCopy( const Cuda & , void * dst , const void * src , size_t );
+};
+/** DeepCopy for the < CudaSpace , HostSpace > pair on the Cuda execution space; constructors are defined out of line. */
+template<> struct DeepCopy< CudaSpace , HostSpace , Cuda >
+{
+  DeepCopy( void * dst , const void * src , size_t );
+  DeepCopy( const Cuda & , void * dst , const void * src , size_t );
+};
+/** DeepCopy for the < HostSpace , CudaSpace > pair on the Cuda execution space; constructors are defined out of line. */
+template<> struct DeepCopy< HostSpace , CudaSpace , Cuda >
+{
+  DeepCopy( void * dst , const void * src , size_t );
+  DeepCopy( const Cuda & , void * dst , const void * src , size_t );
+};
+/** < CudaSpace , CudaSpace > for any ExecutionSpace: forwards to the Cuda specialization; exec overload fences exec first. */
+template<class ExecutionSpace> struct DeepCopy< CudaSpace , CudaSpace , ExecutionSpace >
+{
+  inline
+  DeepCopy( void * dst , const void * src , size_t n )
+  { (void) DeepCopy< CudaSpace , CudaSpace , Cuda >( dst , src , n ); }
+
+  inline
+  DeepCopy( const ExecutionSpace& exec, void * dst , const void * src , size_t n )
+  {
+    exec.fence();
+    DeepCopyAsyncCuda (dst,src,n);
+  }
+};
+/** < CudaSpace , HostSpace > for any ExecutionSpace: forwards to the Cuda specialization; exec overload fences exec first. */
+template<class ExecutionSpace> struct DeepCopy< CudaSpace , HostSpace , ExecutionSpace >
+{
+  inline
+  DeepCopy( void * dst , const void * src , size_t n )
+  { (void) DeepCopy< CudaSpace , HostSpace , Cuda>( dst , src , n ); }
+
+  inline
+  DeepCopy( const ExecutionSpace& exec, void * dst , const void * src , size_t n )
+  {
+    exec.fence();
+    DeepCopyAsyncCuda (dst,src,n);
+  }
+};
+/** < HostSpace , CudaSpace > for any ExecutionSpace: forwards to the Cuda specialization; exec overload fences exec first. */
+template<class ExecutionSpace>
+struct DeepCopy< HostSpace , CudaSpace , ExecutionSpace >
+{
+  inline
+  DeepCopy( void * dst , const void * src , size_t n )
+  { (void) DeepCopy< HostSpace , CudaSpace , Cuda >( dst , src , n ); }
+
+  inline
+  DeepCopy( const ExecutionSpace& exec, void * dst , const void * src , size_t n )
+  {
+    exec.fence();
+    DeepCopyAsyncCuda (dst,src,n);
+  }
+};
+/** < CudaSpace , CudaUVMSpace >: UVM memory is handled as CudaSpace, so this forwards to < CudaSpace , CudaSpace , Cuda >. */
+template<class ExecutionSpace>
+struct DeepCopy< CudaSpace , CudaUVMSpace , ExecutionSpace >
+{
+  inline
+  DeepCopy( void * dst , const void * src , size_t n )
+  { (void) DeepCopy< CudaSpace , CudaSpace , Cuda >( dst , src , n ); }
+
+  inline
+  DeepCopy( const ExecutionSpace& exec, void * dst , const void * src , size_t n )
+  {
+    exec.fence();
+    DeepCopyAsyncCuda (dst,src,n);
+  }
+};
+/** < CudaSpace , CudaHostPinnedSpace >: pinned memory is handled as HostSpace, so this forwards to < CudaSpace , HostSpace , Cuda >. */
+template<class ExecutionSpace>
+struct DeepCopy< CudaSpace , CudaHostPinnedSpace , ExecutionSpace>
+{
+  inline
+  DeepCopy( void * dst , const void * src , size_t n )
+  { (void) DeepCopy< CudaSpace , HostSpace , Cuda >( dst , src , n ); }
+
+  inline
+  DeepCopy( const ExecutionSpace& exec, void * dst , const void * src , size_t n )
+  {
+    exec.fence();
+    DeepCopyAsyncCuda (dst,src,n);
+  }
+};
+
+/** < CudaUVMSpace , CudaSpace >: UVM memory is handled as CudaSpace, so this forwards to < CudaSpace , CudaSpace , Cuda >. */
+template<class ExecutionSpace>
+struct DeepCopy< CudaUVMSpace , CudaSpace , ExecutionSpace>
+{
+  inline
+  DeepCopy( void * dst , const void * src , size_t n )
+  { (void) DeepCopy< CudaSpace , CudaSpace , Cuda >( dst , src , n ); }
+
+  inline
+  DeepCopy( const ExecutionSpace& exec, void * dst , const void * src , size_t n )
+  {
+    exec.fence();
+    DeepCopyAsyncCuda (dst,src,n);
+  }
+};
+/** < CudaUVMSpace , CudaUVMSpace >: UVM memory is handled as CudaSpace, so this forwards to < CudaSpace , CudaSpace , Cuda >. */
+template<class ExecutionSpace>
+struct DeepCopy< CudaUVMSpace , CudaUVMSpace , ExecutionSpace>
+{
+  inline
+  DeepCopy( void * dst , const void * src , size_t n )
+  { (void) DeepCopy< CudaSpace , CudaSpace , Cuda >( dst , src , n ); }
+
+  inline
+  DeepCopy( const ExecutionSpace& exec, void * dst , const void * src , size_t n )
+  {
+    exec.fence();
+    DeepCopyAsyncCuda (dst,src,n);
+  }
+};
+/** < CudaUVMSpace , CudaHostPinnedSpace >: handled as CudaSpace / HostSpace, so this forwards to < CudaSpace , HostSpace , Cuda >. */
+template<class ExecutionSpace>
+struct DeepCopy< CudaUVMSpace , CudaHostPinnedSpace , ExecutionSpace>
+{
+  inline
+  DeepCopy( void * dst , const void * src , size_t n )
+  { (void) DeepCopy< CudaSpace , HostSpace , Cuda >( dst , src , n ); }
+
+  inline
+  DeepCopy( const ExecutionSpace& exec, void * dst , const void * src , size_t n )
+  {
+    exec.fence();
+    DeepCopyAsyncCuda (dst,src,n);
+  }
+};
+/** < CudaUVMSpace , HostSpace >: UVM memory is handled as CudaSpace, so this forwards to < CudaSpace , HostSpace , Cuda >. */
+template<class ExecutionSpace> struct DeepCopy< CudaUVMSpace , HostSpace , ExecutionSpace >
+{
+  inline
+  DeepCopy( void * dst , const void * src , size_t n )
+  { (void) DeepCopy< CudaSpace , HostSpace , Cuda >( dst , src , n ); }
+
+  inline
+  DeepCopy( const ExecutionSpace& exec, void * dst , const void * src , size_t n )
+  {
+    exec.fence();
+    DeepCopyAsyncCuda (dst,src,n);
+  }
+};
+
+/** < CudaHostPinnedSpace , CudaSpace >: pinned memory is handled as HostSpace, so this forwards to < HostSpace , CudaSpace , Cuda >. */
+template<class ExecutionSpace> struct DeepCopy< CudaHostPinnedSpace , CudaSpace , ExecutionSpace >
+{
+  inline
+  DeepCopy( void * dst , const void * src , size_t n )
+  { (void) DeepCopy< HostSpace , CudaSpace , Cuda >( dst , src , n ); }
+
+  inline
+  DeepCopy( const ExecutionSpace& exec, void * dst , const void * src , size_t n )
+  {
+    exec.fence();
+    DeepCopyAsyncCuda (dst,src,n);
+  }
+};
+/** < CudaHostPinnedSpace , CudaUVMSpace >: handled as HostSpace / CudaSpace, so this forwards to < HostSpace , CudaSpace , Cuda >. */
+template<class ExecutionSpace> struct DeepCopy< CudaHostPinnedSpace , CudaUVMSpace , ExecutionSpace >
+{
+  inline
+  DeepCopy( void * dst , const void * src , size_t n )
+  { (void) DeepCopy< HostSpace , CudaSpace , Cuda >( dst , src , n ); }
+
+  inline
+  DeepCopy( const ExecutionSpace& exec, void * dst , const void * src , size_t n )
+  {
+    exec.fence();
+    DeepCopyAsyncCuda (dst,src,n);
+  }
+};
+/** < CudaHostPinnedSpace , CudaHostPinnedSpace >: pinned memory is handled as HostSpace, so this forwards to < HostSpace , HostSpace , Cuda >. */
+template<class ExecutionSpace> struct DeepCopy< CudaHostPinnedSpace , CudaHostPinnedSpace , ExecutionSpace >
+{
+  inline
+  DeepCopy( void * dst , const void * src , size_t n )
+  { (void) DeepCopy< HostSpace , HostSpace , Cuda >( dst , src , n ); }
+
+  inline
+  DeepCopy( const ExecutionSpace& exec, void * dst , const void * src , size_t n )
+  {
+    exec.fence();
+    DeepCopyAsyncCuda (dst,src,n);
+  }
+};
+/** < CudaHostPinnedSpace , HostSpace >: pinned memory is handled as HostSpace, so this forwards to < HostSpace , HostSpace , Cuda >. */
+template<class ExecutionSpace> struct DeepCopy< CudaHostPinnedSpace , HostSpace , ExecutionSpace >
+{
+  inline
+  DeepCopy( void * dst , const void * src , size_t n )
+  { (void) DeepCopy< HostSpace , HostSpace , Cuda >( dst , src , n ); }
+
+  inline
+  DeepCopy( const ExecutionSpace& exec, void * dst , const void * src , size_t n )
+  {
+    exec.fence();
+    DeepCopyAsyncCuda (dst,src,n);
+  }
+};
+
+/** < HostSpace , CudaUVMSpace >: UVM memory is handled as CudaSpace, so this forwards to < HostSpace , CudaSpace , Cuda >. */
+template<class ExecutionSpace> struct DeepCopy< HostSpace , CudaUVMSpace , ExecutionSpace >
+{
+  inline
+  DeepCopy( void * dst , const void * src , size_t n )
+  { (void) DeepCopy< HostSpace , CudaSpace , Cuda >( dst , src , n ); }
+
+  inline
+  DeepCopy( const ExecutionSpace& exec, void * dst , const void * src , size_t n )
+  {
+    exec.fence();
+    DeepCopyAsyncCuda (dst,src,n);
+  }
+};
+/** < HostSpace , CudaHostPinnedSpace >: pinned memory is handled as HostSpace, so this forwards to < HostSpace , HostSpace , Cuda >. */
+template<class ExecutionSpace> struct DeepCopy< HostSpace , CudaHostPinnedSpace , ExecutionSpace >
+{
+  inline
+  DeepCopy( void * dst , const void * src , size_t n )
+  { (void) DeepCopy< HostSpace , HostSpace , Cuda >( dst , src , n ); }
+
+  inline
+  DeepCopy( const ExecutionSpace& exec, void * dst , const void * src , size_t n )
+  {
+    exec.fence();
+    DeepCopyAsyncCuda (dst,src,n);
+  }
+};
+
+} // namespace Impl
+} // namespace Kokkos
+
+//----------------------------------------------------------------------------
+//----------------------------------------------------------------------------
+
+namespace Kokkos {
+namespace Impl {
+
+/** Running in CudaSpace attempting to access HostSpace: error (value is false; both verify() overloads call Kokkos::abort) */
+template<>
+struct VerifyExecutionCanAccessMemorySpace< Kokkos::CudaSpace , Kokkos::HostSpace >
+{
+  enum { value = false };
+  KOKKOS_INLINE_FUNCTION static void verify( void )
+    { Kokkos::abort("Cuda code attempted to access HostSpace memory"); }
+
+  KOKKOS_INLINE_FUNCTION static void verify( const void * )
+    { Kokkos::abort("Cuda code attempted to access HostSpace memory"); }
+};
+
+/** Running in CudaSpace accessing CudaUVMSpace: ok (value is true; verify() does nothing) */
+template<>
+struct VerifyExecutionCanAccessMemorySpace< Kokkos::CudaSpace , Kokkos::CudaUVMSpace >
+{
+  enum { value = true };
+  KOKKOS_INLINE_FUNCTION static void verify( void ) { }
+  KOKKOS_INLINE_FUNCTION static void verify( const void * ) { }
+};
+
+/** Running in CudaSpace accessing CudaHostPinnedSpace: ok (value is true; verify() does nothing) */
+template<>
+struct VerifyExecutionCanAccessMemorySpace< Kokkos::CudaSpace , Kokkos::CudaHostPinnedSpace >
+{
+  enum { value = true };
+  KOKKOS_INLINE_FUNCTION static void verify( void ) { }
+  KOKKOS_INLINE_FUNCTION static void verify( const void * ) { }
+};
+
+/** Running in CudaSpace attempting to access an unknown space: error; enable_if excludes OtherSpace == CudaSpace from this fallback */
+template< class OtherSpace >
+struct VerifyExecutionCanAccessMemorySpace<
+  typename enable_if< ! is_same<Kokkos::CudaSpace,OtherSpace>::value , Kokkos::CudaSpace >::type ,
+  OtherSpace >
+{
+  enum { value = false };
+  KOKKOS_INLINE_FUNCTION static void verify( void )
+    { Kokkos::abort("Cuda code attempted to access unknown Space memory"); }
+
+  KOKKOS_INLINE_FUNCTION static void verify( const void * )
+    { Kokkos::abort("Cuda code attempted to access unknown Space memory"); }
+};
+
+//----------------------------------------------------------------------------
+/** Running in HostSpace attempting to access CudaSpace: error (value is false; verify() calls CudaSpace::access_error) */
+template<>
+struct VerifyExecutionCanAccessMemorySpace< Kokkos::HostSpace , Kokkos::CudaSpace >
+{
+  enum { value = false };
+  inline static void verify( void ) { CudaSpace::access_error(); }
+  inline static void verify( const void * p ) { CudaSpace::access_error(p); }
+};
+
+/** Running in HostSpace accessing CudaUVMSpace is OK (value is true; verify() does nothing) */
+template<>
+struct VerifyExecutionCanAccessMemorySpace< Kokkos::HostSpace , Kokkos::CudaUVMSpace >
+{
+  enum { value = true };
+  inline static void verify( void ) { }
+  inline static void verify( const void * ) { }
+};
+
+/** Running in HostSpace accessing CudaHostPinnedSpace is OK (value is true; verify() does nothing) */
+template<>
+struct VerifyExecutionCanAccessMemorySpace< Kokkos::HostSpace , Kokkos::CudaHostPinnedSpace >
+{
+  enum { value = true };
+  KOKKOS_INLINE_FUNCTION static void verify( void ) {}
+  KOKKOS_INLINE_FUNCTION static void verify( const void * ) {}
+};
+
+} // namespace Impl
+} // namespace Kokkos
+
+//----------------------------------------------------------------------------
+//----------------------------------------------------------------------------
+
+namespace Kokkos {
+namespace Experimental {
+namespace Impl {
+/** Allocation record for Kokkos::CudaSpace; extends the untyped record with a space instance and a cached texture object. */
+template<>
+class SharedAllocationRecord< Kokkos::CudaSpace , void >
+  : public SharedAllocationRecord< void , void >
+{
+private:
+  // Friend because the CudaUVMSpace record calls the private static attach_texture_object() declared below.
+  friend class SharedAllocationRecord< Kokkos::CudaUVMSpace , void > ;
+
+  typedef SharedAllocationRecord< void , void >  RecordBase ;
+  // Records are neither copy-constructible nor copy-assignable.
+  SharedAllocationRecord( const SharedAllocationRecord & ) = delete ;
+  SharedAllocationRecord & operator = ( const SharedAllocationRecord & ) = delete ;
+
+  static void deallocate( RecordBase * );
+  // Out-of-line helper whose result the templated attach_texture_object() below caches in m_tex_obj.
+  static ::cudaTextureObject_t
+  attach_texture_object( const unsigned sizeof_alias
+                       , void * const   alloc_ptr
+                       , const size_t   alloc_size ); 
+
+  static RecordBase s_root_record ;
+  // m_tex_obj is initialized to 0 and stays 0 until a texture object is requested for the first time.
+  ::cudaTextureObject_t   m_tex_obj ;
+  const Kokkos::CudaSpace m_space ;
+
+protected:
+
+  ~SharedAllocationRecord();
+  SharedAllocationRecord() : RecordBase(), m_tex_obj(0), m_space() {}
+  // arg_dealloc defaults to the private static deallocate() above.
+  SharedAllocationRecord( const Kokkos::CudaSpace        & arg_space
+                        , const std::string              & arg_label
+                        , const size_t                     arg_alloc_size
+                        , const RecordBase::function_type  arg_dealloc = & deallocate
+                        );
+
+public:
+
+  std::string get_label() const ;
+
+  static SharedAllocationRecord * allocate( const Kokkos::CudaSpace &  arg_space
+                                          , const std::string       &  arg_label
+                                          , const size_t               arg_alloc_size );
+
+  /**\brief  Allocate tracked memory of size arg_alloc_size in arg_space under the label arg_label */
+  static
+  void * allocate_tracked( const Kokkos::CudaSpace & arg_space
+                         , const std::string & arg_label
+                         , const size_t arg_alloc_size );
+
+  /**\brief  Reallocate tracked memory in the space to the size arg_alloc_size */
+  static
+  void * reallocate_tracked( void * const arg_alloc_ptr
+                           , const size_t arg_alloc_size );
+
+  /**\brief  Deallocate tracked memory in the space */
+  static
+  void deallocate_tracked( void * const arg_alloc_ptr );
+
+  static SharedAllocationRecord * get_record( void * arg_alloc_ptr );
+  /**\brief  Return this record's texture object, attaching it on the first call; AliasType must be int, ::int2 or ::int4 */
+  template< typename AliasType >
+  inline
+  ::cudaTextureObject_t attach_texture_object()
+    {
+      static_assert( ( std::is_same< AliasType , int >::value ||
+                       std::is_same< AliasType , ::int2 >::value ||
+                       std::is_same< AliasType , ::int4 >::value )
+                   , "Cuda texture fetch only supported for alias types of int, ::int2, or ::int4" );
+
+      if ( m_tex_obj == 0 ) {
+        m_tex_obj = attach_texture_object( sizeof(AliasType)
+                                         , (void*) RecordBase::m_alloc_ptr
+                                         , RecordBase::m_alloc_size );
+      }
+
+      return m_tex_obj ;
+    }
+  /**\brief  Offset of ptr from RecordBase::m_alloc_ptr in units of AliasType; NOTE: the pointer difference is narrowed to int */
+  template< typename AliasType >
+  inline
+  int attach_texture_object_offset( const AliasType * const ptr )
+    {
+      // Texture object is attached to the entire allocation range
+      return ptr - reinterpret_cast<AliasType*>( RecordBase::m_alloc_ptr );
+    }
+
+  static void print_records( std::ostream & , const Kokkos::CudaSpace & , bool detail = false );
+};
+
+/** Allocation record for Kokkos::CudaUVMSpace; extends the untyped record with a space instance and a cached texture object. */
+template<>
+class SharedAllocationRecord< Kokkos::CudaUVMSpace , void >
+  : public SharedAllocationRecord< void , void >
+{
+private:
+
+  typedef SharedAllocationRecord< void , void >  RecordBase ;
+  // Records are neither copy-constructible nor copy-assignable.
+  SharedAllocationRecord( const SharedAllocationRecord & ) = delete ;
+  SharedAllocationRecord & operator = ( const SharedAllocationRecord & ) = delete ;
+
+  static void deallocate( RecordBase * );
+
+  static RecordBase s_root_record ;
+  // m_tex_obj is initialized to 0 and stays 0 until a texture object is requested for the first time.
+  ::cudaTextureObject_t      m_tex_obj ;
+  const Kokkos::CudaUVMSpace m_space ;
+
+protected:
+
+  ~SharedAllocationRecord();
+  SharedAllocationRecord() : RecordBase(), m_tex_obj(0), m_space() {}
+  // arg_dealloc defaults to the private static deallocate() above.
+  SharedAllocationRecord( const Kokkos::CudaUVMSpace     & arg_space
+                        , const std::string              & arg_label
+                        , const size_t                     arg_alloc_size
+                        , const RecordBase::function_type  arg_dealloc = & deallocate
+                        );
+
+public:
+
+  std::string get_label() const ;
+
+  static SharedAllocationRecord * allocate( const Kokkos::CudaUVMSpace &  arg_space
+                                          , const std::string          &  arg_label
+                                          , const size_t                  arg_alloc_size
+                                          );
+
+  /**\brief  Allocate tracked memory of size arg_alloc_size in arg_space under the label arg_label */
+  static
+  void * allocate_tracked( const Kokkos::CudaUVMSpace & arg_space
+                         , const std::string & arg_label
+                         , const size_t arg_alloc_size );
+
+  /**\brief  Reallocate tracked memory in the space to the size arg_alloc_size */
+  static
+  void * reallocate_tracked( void * const arg_alloc_ptr
+                           , const size_t arg_alloc_size );
+
+  /**\brief  Deallocate tracked memory in the space */
+  static
+  void deallocate_tracked( void * const arg_alloc_ptr );
+
+  static SharedAllocationRecord * get_record( void * arg_alloc_ptr );
+
+  /**\brief  Return this record's texture object, attaching it on the first call via the CudaSpace record's static helper */
+  template< typename AliasType >
+  inline
+  ::cudaTextureObject_t attach_texture_object()
+    {
+      static_assert( ( std::is_same< AliasType , int >::value ||
+                       std::is_same< AliasType , ::int2 >::value ||
+                       std::is_same< AliasType , ::int4 >::value )
+                   , "Cuda texture fetch only supported for alias types of int, ::int2, or ::int4" );
+
+      if ( m_tex_obj == 0 ) {
+        m_tex_obj = SharedAllocationRecord< Kokkos::CudaSpace , void >::
+          attach_texture_object( sizeof(AliasType)
+                               , (void*) RecordBase::m_alloc_ptr
+                               , RecordBase::m_alloc_size );
+      }
+
+      return m_tex_obj ;
+    }
+  /**\brief  Offset of ptr from RecordBase::m_alloc_ptr in units of AliasType; NOTE: the pointer difference is narrowed to int */
+  template< typename AliasType >
+  inline
+  int attach_texture_object_offset( const AliasType * const ptr )
+    {
+      // Texture object is attached to the entire allocation range
+      return ptr - reinterpret_cast<AliasType*>( RecordBase::m_alloc_ptr );
+    }
+
+  static void print_records( std::ostream & , const Kokkos::CudaUVMSpace & , bool detail = false );
+};
+/** Allocation record for Kokkos::CudaHostPinnedSpace; extends the untyped record with a space instance (no texture object). */
+template<>
+class SharedAllocationRecord< Kokkos::CudaHostPinnedSpace , void >
+  : public SharedAllocationRecord< void , void >
+{
+private:
+
+  typedef SharedAllocationRecord< void , void >  RecordBase ;
+  // Records are neither copy-constructible nor copy-assignable.
+  SharedAllocationRecord( const SharedAllocationRecord & ) = delete ;
+  SharedAllocationRecord & operator = ( const SharedAllocationRecord & ) = delete ;
+
+  static void deallocate( RecordBase * );
+
+  static RecordBase s_root_record ;
+
+  const Kokkos::CudaHostPinnedSpace m_space ;
+
+protected:
+
+  ~SharedAllocationRecord();
+  SharedAllocationRecord() : RecordBase(), m_space() {}
+  // arg_dealloc defaults to the private static deallocate() above.
+  SharedAllocationRecord( const Kokkos::CudaHostPinnedSpace     & arg_space
+                        , const std::string              & arg_label
+                        , const size_t                     arg_alloc_size
+                        , const RecordBase::function_type  arg_dealloc = & deallocate
+                        );
+
+public:
+
+  std::string get_label() const ;
+
+  static SharedAllocationRecord * allocate( const Kokkos::CudaHostPinnedSpace &  arg_space
+                                          , const std::string          &  arg_label
+                                          , const size_t                  arg_alloc_size
+                                          );
+  /**\brief  Allocate tracked memory of size arg_alloc_size in arg_space under the label arg_label */
+  static
+  void * allocate_tracked( const Kokkos::CudaHostPinnedSpace & arg_space
+                         , const std::string & arg_label
+                         , const size_t arg_alloc_size );
+
+  /**\brief  Reallocate tracked memory in the space to the size arg_alloc_size */
+  static
+  void * reallocate_tracked( void * const arg_alloc_ptr
+                           , const size_t arg_alloc_size );
+
+  /**\brief  Deallocate tracked memory in the space */
+  static
+  void deallocate_tracked( void * const arg_alloc_ptr );
+
+
+  static SharedAllocationRecord * get_record( void * arg_alloc_ptr );
+
+  static void print_records( std::ostream & , const Kokkos::CudaHostPinnedSpace & , bool detail = false );
+};
+
+} // namespace Impl
+} // namespace Experimental
+} // namespace Kokkos
+
+//----------------------------------------------------------------------------
+//----------------------------------------------------------------------------
+
+#endif /* #if defined( KOKKOS_HAVE_CUDA ) */
+#endif /* #ifndef KOKKOS_CUDASPACE_HPP */
+
diff --git a/lib/kokkos/core/src/Kokkos_ExecPolicy.hpp b/lib/kokkos/core/src/Kokkos_ExecPolicy.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..5834fc04dbe43c78bd53b032db1e97ade5e34655
--- /dev/null
+++ b/lib/kokkos/core/src/Kokkos_ExecPolicy.hpp
@@ -0,0 +1,570 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+//
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact  H. Carter Edwards (hcedwar@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#ifndef KOKKOS_EXECPOLICY_HPP
+#define KOKKOS_EXECPOLICY_HPP
+
+#include <Kokkos_Core_fwd.hpp>
+#include <impl/Kokkos_Traits.hpp>
+#include <impl/Kokkos_StaticAssert.hpp>
+#include <impl/Kokkos_Error.hpp>
+#include <impl/Kokkos_Tags.hpp>
+#include <impl/Kokkos_AnalyzePolicy.hpp>
+#include <Kokkos_Concepts.hpp>
+#include <iostream>
+//----------------------------------------------------------------------------
+
+namespace Kokkos {
+
+/** \brief  Execution policy for work over a range of an integral type.
+ *
+ * Valid template argument options:
+ *
+ *  With a specified execution space:
+ *    < ExecSpace , WorkTag , { IntConst | IntType } >
+ *    < ExecSpace , WorkTag , void >
+ *    < ExecSpace , { IntConst | IntType } , void >
+ *    < ExecSpace , void , void >
+ *
+ *  With the default execution space:
+ *    < WorkTag , { IntConst | IntType } , void >
+ *    < WorkTag , void , void >
+ *    < { IntConst | IntType } , void , void >
+ *    < void , void , void >
+ *
+ *  IntType  is a fundamental integral type
+ *  IntConst is an Impl::integral_constant< IntType , Blocking >
+ *
+ *  Blocking is the granularity of partitioning the range among threads.
+ */
+template<class ... Properties>
+class RangePolicy
+  : public Impl::PolicyTraits<Properties ... >
+{
+private:
+
+  typedef Impl::PolicyTraits<Properties ... > traits;
+
+  typename traits::execution_space m_space ;
+  typename traits::index_type  m_begin ;
+  typename traits::index_type  m_end ;
+  typename traits::index_type  m_granularity ;       // chunk size, reported by chunk_size()
+  typename traits::index_type  m_granularity_mask ;  // m_granularity - 1 after set_chunk_size()/set_auto_chunk_size()
+public:
+
+  //! Tag this class as an execution policy
+  typedef RangePolicy execution_policy;
+  typedef typename traits::index_type member_type ;
+
+  KOKKOS_INLINE_FUNCTION const typename traits::execution_space & space() const { return m_space ; }
+  KOKKOS_INLINE_FUNCTION member_type begin() const { return m_begin ; }
+  KOKKOS_INLINE_FUNCTION member_type end()   const { return m_end ; }
+
+
+  //TODO: find a better workaround for Clangs weird instantiation order
+  // This thing is here because of an instantiation error, where the RangePolicy is inserted into FunctorValue Traits, which
+  // tries decltype on the operator. It tries to do this even though the first argument of parallel for clearly doesn't match.
+  void operator()(const int&) const {}
+
+  RangePolicy(const RangePolicy&) = default;
+  RangePolicy(RangePolicy&&) = default;
+  // Empty range. Granularity members are zeroed so chunk_size() and WorkRange never read indeterminate values.
+  inline RangePolicy() : m_space(), m_begin(0), m_end(0), m_granularity(0), m_granularity_mask(0) {}
+
+  /** \brief  Total range; if work_begin is not less than work_end the range collapses to begin == end == 0 */
+  inline
+  RangePolicy( const typename traits::execution_space & work_space
+             , const member_type work_begin
+             , const member_type work_end
+             )
+    : m_space( work_space )
+    , m_begin( work_begin < work_end ? work_begin : 0 )
+    , m_end(   work_begin < work_end ? work_end : 0 )
+    , m_granularity(0)
+    , m_granularity_mask(0)
+    {
+      set_auto_chunk_size();
+    }
+
+  /** \brief  Total range, using a default-constructed execution space instance */
+  inline
+  RangePolicy( const member_type work_begin
+             , const member_type work_end
+             )
+    : RangePolicy( typename traits::execution_space()
+                 , work_begin , work_end )
+    {}
+
+  public:
+
+     /** \brief return chunk_size */
+     inline member_type chunk_size() const {
+       return m_granularity;
+     }
+
+     /** \brief return a copy of this policy with chunk_size set to the given value (this policy is unchanged) */
+     inline RangePolicy set_chunk_size(int chunk_size_) const {
+       RangePolicy p = *this;
+       p.m_granularity = chunk_size_;
+       p.m_granularity_mask = p.m_granularity - 1;
+       return p;
+     }
+
+  private:
+     /** \brief finalize chunk_size if it was set to AUTO*/
+     inline void set_auto_chunk_size() {
+
+       typename traits::index_type concurrency = traits::execution_space::concurrency();
+       if( concurrency==0 ) concurrency=1;
+
+       if(m_granularity > 0) {
+         if(!Impl::is_integral_power_of_two( m_granularity ))
+           Kokkos::abort("RangePolicy blocking granularity must be power of two" );
+       }
+
+       // Double from 1 until chunk*100*concurrency covers the range; if that gives < 128, redo with factor 40, capped at 128.
+       member_type new_chunk_size = 1;
+       while(new_chunk_size*100*concurrency < m_end-m_begin)
+         new_chunk_size *= 2;
+       if(new_chunk_size < 128) {
+         new_chunk_size = 1;
+         while( (new_chunk_size*40*concurrency < m_end-m_begin ) && (new_chunk_size<128) )
+           new_chunk_size*=2;
+       }
+       m_granularity = new_chunk_size;
+       m_granularity_mask = m_granularity - 1;
+     }
+
+  public:
+  /** \brief  Subrange for a partition's rank and size.
+   *
+   *  Typically used to partition a range over a group of threads.
+   */
+  struct WorkRange {
+    typedef typename RangePolicy::work_tag     work_tag ;
+    typedef typename RangePolicy::member_type  member_type ;
+
+    KOKKOS_INLINE_FUNCTION member_type begin() const { return m_begin ; }
+    KOKKOS_INLINE_FUNCTION member_type end()   const { return m_end ; }
+
+    /** \brief  Subrange for a partition's rank and size.
+     *
+     *  Typically used to partition a range over a group of threads.
+     *  A part_size of zero leaves the subrange empty (begin == end == 0).
+     */
+    KOKKOS_INLINE_FUNCTION
+    WorkRange( const RangePolicy & range
+             , const int part_rank
+             , const int part_size
+             )
+      : m_begin(0), m_end(0)
+      {
+        if ( part_size ) {
+
+          // Split evenly among partitions, then round up to the granularity.
+          const member_type work_part =
+            ( ( ( ( range.end() - range.begin() ) + ( part_size - 1 ) ) / part_size )
+              + range.m_granularity_mask ) & ~member_type(range.m_granularity_mask);
+
+          m_begin = range.begin() + work_part * part_rank ;
+          m_end   = m_begin       + work_part ;
+
+          if ( range.end() < m_begin ) m_begin = range.end() ;
+          if ( range.end() < m_end )   m_end   = range.end() ;
+        }
+      }
+  private:
+     member_type m_begin ;
+     member_type m_end ;
+     WorkRange();
+     WorkRange & operator = ( const WorkRange & );
+
+  };
+};
+
+
+} // namespace Kokkos
+
+//----------------------------------------------------------------------------
+//----------------------------------------------------------------------------
+
+namespace Kokkos {
+
+namespace Impl {
+
+
+template< class ExecSpace, class ... Properties>
+class TeamPolicyInternal: public Impl::PolicyTraits<Properties ... > {
+private:
+  typedef Impl::PolicyTraits<Properties ... > traits;
+  // Only declarations appear below; no member of this primary template is defined in this header.
+public:
+
+  //----------------------------------------
+  /** \brief  Query maximum team size for a given functor.
+   *
+   *  This size takes into account execution space concurrency limitations and
+   *  scratch memory space limitations for reductions, team reduce/scan, and
+   *  team shared memory.
+   */
+  template< class FunctorType >
+  static int team_size_max( const FunctorType & );
+
+  /** \brief  Query recommended team size for a given functor.
+   *
+   *  This size takes into account execution space concurrency limitations and
+   *  scratch memory space limitations for reductions, team reduce/scan, and
+   *  team shared memory.
+   */
+  template< class FunctorType >
+  static int team_size_recommended( const FunctorType & );
+
+  template< class FunctorType >
+  static int team_size_recommended( const FunctorType & , const int&);
+  //----------------------------------------
+  /** \brief  Construct policy with the given instance of the execution space */
+  TeamPolicyInternal( const typename traits::execution_space & , int league_size_request , int team_size_request , int vector_length_request = 1 );
+
+  TeamPolicyInternal( const typename traits::execution_space & , int league_size_request , const Kokkos::AUTO_t & , int vector_length_request = 1 );
+
+  /** \brief  Construct policy with the default instance of the execution space */
+  TeamPolicyInternal( int league_size_request , int team_size_request , int vector_length_request = 1 );
+
+  TeamPolicyInternal( int league_size_request , const Kokkos::AUTO_t & , int vector_length_request = 1 );
+
+/*  TeamPolicyInternal( int league_size_request , int team_size_request );
+
+  TeamPolicyInternal( int league_size_request , const Kokkos::AUTO_t & );*/
+
+  /** \brief  The actual league size (number of teams) of the policy.
+   *
+   *  This may be smaller than the requested league size due to limitations
+   *  of the execution space.
+   */
+  KOKKOS_INLINE_FUNCTION int league_size() const ;
+
+  /** \brief  The actual team size (number of threads per team) of the policy.
+   *
+   *  This may be smaller than the requested team size due to limitations
+   *  of the execution space.
+   */
+  KOKKOS_INLINE_FUNCTION int team_size() const ;
+  /** \brief  chunk_size() reports the chunk size; set_chunk_size() is const and returns a policy by value. */
+  inline typename traits::index_type chunk_size() const ;
+
+  inline TeamPolicyInternal set_chunk_size(int chunk_size) const ;
+
+  /** \brief  Parallel execution of a functor calls the functor once with
+   *          each member of the execution policy.
+   */
+  struct member_type {
+
+    /** \brief  Handle to the currently executing team shared scratch memory */
+    KOKKOS_INLINE_FUNCTION
+    typename traits::execution_space::scratch_memory_space team_shmem() const ;
+
+    /** \brief  Rank of this team within the league of teams */
+    KOKKOS_INLINE_FUNCTION int league_rank() const ;
+
+    /** \brief  Number of teams in the league */
+    KOKKOS_INLINE_FUNCTION int league_size() const ;
+
+    /** \brief  Rank of this thread within this team */
+    KOKKOS_INLINE_FUNCTION int team_rank() const ;
+
+    /** \brief  Number of threads in this team */
+    KOKKOS_INLINE_FUNCTION int team_size() const ;
+
+    /** \brief  Barrier among the threads of this team */
+    KOKKOS_INLINE_FUNCTION void team_barrier() const ;
+
+    /** \brief  Intra-team reduction. Returns join of all values of the team members. */
+    template< class JoinOp >
+    KOKKOS_INLINE_FUNCTION
+    typename JoinOp::value_type team_reduce( const typename JoinOp::value_type
+                                           , const JoinOp & ) const ;
+
+    /** \brief  Intra-team exclusive prefix sum with team_rank() ordering.
+     *
+     *  The highest rank thread can compute the reduction total as
+     *    reduction_total = dev.team_scan( value ) + value ;
+     */
+    template< typename Type >
+    KOKKOS_INLINE_FUNCTION Type team_scan( const Type & value ) const ;
+
+    /** \brief  Intra-team exclusive prefix sum with team_rank() ordering
+     *          with intra-team non-deterministic ordering accumulation.
+     *
+     *  The global inter-team accumulation value will, at the end of the
+     *  league's parallel execution, be the scan's total.
+     *  Parallel execution ordering of the league's teams is non-deterministic.
+     *  As such the base value for each team's scan operation is similarly
+     *  non-deterministic.
+     */
+    template< typename Type >
+    KOKKOS_INLINE_FUNCTION Type team_scan( const Type & value , Type * const global_accum ) const ;
+  };
+};
+}
+
+namespace Impl {
+  struct PerTeamValue {    // int wrapper; TeamPolicy::set_scratch_size takes it as its per_team argument
+    int value;
+    PerTeamValue(int arg);
+  };
+
+  struct PerThreadValue {  // int wrapper; TeamPolicy::set_scratch_size takes it as its per_thread argument
+    int value;
+    PerThreadValue(int arg);
+  };
+}
+// Functions returning the wrappers above; they are only declared in this header.
+Impl::PerTeamValue PerTeam(const int& arg);
+Impl::PerThreadValue PerThread(const int& arg);
+
+
+/** \brief  Execution policy for parallel work over a league of teams of threads.
+ *
+ *  The work functor is called for each thread of each team such that
+ *  the team's member threads are guaranteed to be concurrent.
+ *
+ *  The team's threads have access to team shared scratch memory and
+ *  team collective operations.
+ *
+ *  If the WorkTag is non-void then the first calling argument of the
+ *  work functor's parentheses operator is 'const WorkTag &'.
+ *  This allows a functor to have multiple work member functions.
+ *
+ *  Order of template arguments does not matter, since the implementation
+ *  uses variadic templates. Each and any of the template arguments can
+ *  be omitted.
+ *
+ *  Possible Template arguments and their default values:
+ *    ExecutionSpace (DefaultExecutionSpace): where to execute code. Must be enabled.
+ *    WorkTag (none): Tag which is used as the first argument for the functor operator.
+ *    Schedule<Type> (Schedule<Static>): Scheduling Policy (Dynamic, or Static).
+ *    IndexType<Type> (IndexType<ExecutionSpace::size_type>): Integer Index type used to iterate over the Index space.
+ */
+template< class ... Properties>
+class TeamPolicy: public
+  Impl::TeamPolicyInternal<
+     typename Impl::PolicyTraits<Properties ... >::execution_space,
+     Properties ...> {
+  typedef Impl::TeamPolicyInternal<
+       typename Impl::PolicyTraits<Properties ... >::execution_space,
+       Properties ...> internal_policy;
+
+  typedef Impl::PolicyTraits<Properties ... > traits;
+
+public:
+  typedef TeamPolicy execution_policy;
+
+  TeamPolicy& operator = (const TeamPolicy&) = default;
+
+  /** \brief  Construct policy with the given instance of the execution space (the instance is forwarded to the base policy) */
+  TeamPolicy( const typename traits::execution_space & arg_space , int league_size_request , int team_size_request , int vector_length_request = 1 )
+    : internal_policy(arg_space,league_size_request,team_size_request, vector_length_request) {}
+
+  TeamPolicy( const typename traits::execution_space & arg_space , int league_size_request , const Kokkos::AUTO_t & , int vector_length_request = 1 )
+    : internal_policy(arg_space,league_size_request,Kokkos::AUTO(), vector_length_request) {}
+
+  /** \brief  Construct policy with the default instance of the execution space */
+  TeamPolicy( int league_size_request , int team_size_request , int vector_length_request = 1 )
+    : internal_policy(league_size_request,team_size_request, vector_length_request) {}
+
+  TeamPolicy( int league_size_request , const Kokkos::AUTO_t & , int vector_length_request = 1 )
+    : internal_policy(league_size_request,Kokkos::AUTO(), vector_length_request) {}
+
+/*  TeamPolicy( int league_size_request , int team_size_request  )
+    : internal_policy(league_size_request,team_size_request) {}
+
+  TeamPolicy( int league_size_request , const Kokkos::AUTO_t &  )
+    : internal_policy(league_size_request,Kokkos::AUTO()) {}*/
+
+private:
+  TeamPolicy(const internal_policy& p):internal_policy(p) {}  // wraps a base-policy value for the setters below
+public:
+  /** \brief  Each setter below is const and returns a new TeamPolicy built from the base policy's result. */
+  inline TeamPolicy set_chunk_size(int chunk) const {
+    return TeamPolicy(internal_policy::set_chunk_size(chunk));
+  }
+
+  inline TeamPolicy set_scratch_size(const int& level, const Impl::PerTeamValue& per_team) const {
+    return TeamPolicy(internal_policy::set_scratch_size(level,per_team));
+  }
+  inline TeamPolicy set_scratch_size(const int& level, const Impl::PerThreadValue& per_thread) const {
+    return TeamPolicy(internal_policy::set_scratch_size(level,per_thread));
+  }
+  inline TeamPolicy set_scratch_size(const int& level, const Impl::PerTeamValue& per_team, const Impl::PerThreadValue& per_thread) const {
+    return TeamPolicy(internal_policy::set_scratch_size(level, per_team, per_thread));
+  }
+  inline TeamPolicy set_scratch_size(const int& level, const Impl::PerThreadValue& per_thread, const Impl::PerTeamValue& per_team) const {
+    return TeamPolicy(internal_policy::set_scratch_size(level, per_team, per_thread));  // forwarded in (per_team, per_thread) order
+  }
+
+};
+
+} // namespace Kokkos
+
+namespace Kokkos {
+
+namespace Impl {
+
+template<typename iType, class TeamMemberType>
+struct TeamThreadRangeBoundariesStruct {
+private:
+
+  // Lower boundary for thread my_rank: lo plus my_rank times ( hi - lo + nthreads - 1 ) / nthreads.
+  KOKKOS_INLINE_FUNCTION static
+  iType ibegin( const iType & lo
+              , const iType & hi
+              , const iType & my_rank
+              , const iType & nthreads )
+    {
+      return lo + ( ( hi - lo + nthreads - 1 ) / nthreads ) * my_rank ;
+    }
+
+  // Upper boundary for thread my_rank: the same formula evaluated at my_rank + 1, never larger than hi.
+  KOKKOS_INLINE_FUNCTION static
+  iType iend( const iType & lo
+            , const iType & hi
+            , const iType & my_rank
+            , const iType & nthreads )
+    {
+      const iType unclamped = lo + ( ( hi - lo + nthreads - 1 ) / nthreads ) * ( my_rank + 1 );
+      if ( unclamped < hi ) { return unclamped ; }
+      return hi ;
+    }
+public:
+
+  typedef iType index_type;
+  const iType start;             // ibegin() evaluated for the constructing thread
+  const iType end;               // iend() evaluated for the constructing thread
+  enum {increment = 1};
+  const TeamMemberType& thread;  // reference to the constructor's team member argument
+
+  /** \brief  Boundaries for the range that begins at 0 and is bounded by arg_end */
+  KOKKOS_INLINE_FUNCTION
+  TeamThreadRangeBoundariesStruct( const TeamMemberType& arg_thread
+                                , const iType& arg_end )
+    : start( ibegin( 0 , arg_end , arg_thread.team_rank() , arg_thread.team_size() ) )
+    , end(   iend(   0 , arg_end , arg_thread.team_rank() , arg_thread.team_size() ) )
+    , thread( arg_thread )
+    {}
+
+  /** \brief  Boundaries for the range that begins at arg_begin and is bounded by arg_end */
+  KOKKOS_INLINE_FUNCTION
+  TeamThreadRangeBoundariesStruct( const TeamMemberType& arg_thread
+                                , const iType& arg_begin
+                                , const iType& arg_end )
+    : start( ibegin( arg_begin , arg_end , arg_thread.team_rank() , arg_thread.team_size() ) )
+    , end(   iend(   arg_begin , arg_end , arg_thread.team_rank() , arg_thread.team_size() ) )
+    , thread( arg_thread )
+    {}
+};
+
+  template<typename iType, class TeamMemberType>
+  struct ThreadVectorRangeBoundariesStruct {
+    typedef iType index_type;
+    enum {start = 0};
+    const iType end;
+    enum {increment = 1};
+
+    // The team member argument is accepted but not read; only the count is stored, as `end`.
+    KOKKOS_INLINE_FUNCTION
+    ThreadVectorRangeBoundariesStruct (const TeamMemberType& /* thread */, const iType& arg_count)
+      : end( arg_count ) {}
+  };
+
+  template<class TeamMemberType>
+  struct ThreadSingleStruct {
+    const TeamMemberType& team_member;  // bound to the constructor argument; no copy is made
+    KOKKOS_INLINE_FUNCTION
+    ThreadSingleStruct( const TeamMemberType& arg_member ) : team_member( arg_member ) {}
+  };
+
+  template<class TeamMemberType>
+  struct VectorSingleStruct {
+    const TeamMemberType& team_member;  // bound to the constructor argument; no copy is made
+    KOKKOS_INLINE_FUNCTION
+    VectorSingleStruct( const TeamMemberType& arg_member ) : team_member( arg_member ) {}
+  };
+} // namespace Impl
+
+/** \brief  Execution policy for parallel work over the threads within a team.
+ *
+ *  The range is split over all threads in a team. The Mapping scheme depends on the architecture.
+ *  This policy is used together with a parallel pattern as a nested layer within a kernel launched
+ *  with the TeamPolicy. This variant expects a single count. So the range is [0,count).
+ */
+template<typename iType, class TeamMemberType>
+KOKKOS_INLINE_FUNCTION
+Impl::TeamThreadRangeBoundariesStruct<iType,TeamMemberType> TeamThreadRange(const TeamMemberType&, const iType& count);
+
+/** \brief  Execution policy for parallel work over the threads within a team.
+ *
+ *  The range is split over all threads in a team. The Mapping scheme depends on the architecture.
+ *  This policy is used together with a parallel pattern as a nested layer within a kernel launched
+ *  with the TeamPolicy. This variant expects a begin and end. So the range is [begin,end).
+ */
+template<typename iType, class TeamMemberType>
+KOKKOS_INLINE_FUNCTION
+Impl::TeamThreadRangeBoundariesStruct<iType,TeamMemberType> TeamThreadRange(const TeamMemberType&, const iType& begin, const iType& end);
+
+/** \brief  Execution policy for a vector parallel loop.
+ *
+ *  The range is split over all vector lanes in a thread. The Mapping scheme depends on the architecture.
+ *  This policy is used together with a parallel pattern as a nested layer within a kernel launched
+ *  with the TeamPolicy. This variant expects a single count. So the range is [0,count).
+ */
+template<typename iType, class TeamMemberType>
+KOKKOS_INLINE_FUNCTION
+Impl::ThreadVectorRangeBoundariesStruct<iType,TeamMemberType> ThreadVectorRange(const TeamMemberType&, const iType& count);
+
+} // namespace Kokkos
+
+
+#endif /* #define KOKKOS_EXECPOLICY_HPP */
+
+//----------------------------------------------------------------------------
+//----------------------------------------------------------------------------
+
diff --git a/lib/kokkos/core/src/Kokkos_HBWSpace.hpp b/lib/kokkos/core/src/Kokkos_HBWSpace.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..e02689b0f96f370448061cb90bd80a3492d32c35
--- /dev/null
+++ b/lib/kokkos/core/src/Kokkos_HBWSpace.hpp
@@ -0,0 +1,312 @@
+/*
+//@HEADER
+// ************************************************************************
+// 
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+// 
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+// 
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact  H. Carter Edwards (hcedwar@sandia.gov)
+// 
+// ************************************************************************
+//@HEADER
+*/
+
+#ifndef KOKKOS_HBWSPACE_HPP
+#define KOKKOS_HBWSPACE_HPP
+
+
+#include <Kokkos_HostSpace.hpp>
+#include <impl/Kokkos_HBWAllocators.hpp>
+
+/*--------------------------------------------------------------------------*/
+#ifdef KOKKOS_HAVE_HBWSPACE
+
+namespace Kokkos {
+namespace Experimental {
+namespace Impl {
+
+/// \brief Initialize lock array for arbitrary size atomics.
+///
+/// Arbitrary atomics are implemented using a hash table of locks
+/// where the hash value is derived from the address of the
+/// object for which an atomic operation is performed.
+/// This function initializes the locks to zero (unset).
+void init_lock_array_hbw_space();
+
+/// \brief Acquire a lock for the address
+///
+/// This function tries to acquire the lock for the hash value derived
+/// from the provided ptr. If the lock is successfully acquired the
+/// function returns true. Otherwise it returns false.
+bool lock_address_hbw_space(void* ptr);
+
+/// \brief Release lock for the address
+///
+/// This function releases the lock for the hash value derived
+/// from the provided ptr. This function should only be called
+/// after previously successfully acquiring a lock with
+/// lock_address_hbw_space.
+void unlock_address_hbw_space(void* ptr);
+
+} // namespace Impl
+} // namespace Experimental
+} // namespace Kokkos
+
+namespace Kokkos {
+namespace Experimental {
+
+/// \class HBWSpace
+/// \brief Memory management for host memory.
+///
+/// HBWSpace is a memory space that governs host memory.  "Host"
+/// memory means the usual CPU-accessible memory.
+class HBWSpace {
+public:
+
+  //! Tag this class as a kokkos memory space
+  typedef HBWSpace  memory_space ;
+  typedef size_t     size_type ;
+
+  /// \typedef execution_space
+  /// \brief Default execution space for this memory space.
+  ///
+  /// Every memory space has a default execution space.  This is
+  /// useful for things like initializing a View (which happens in
+  /// parallel using the View's default execution space).
+#if defined( KOKKOS_HAVE_DEFAULT_DEVICE_TYPE_OPENMP )
+  typedef Kokkos::OpenMP   execution_space ;
+#elif defined( KOKKOS_HAVE_DEFAULT_DEVICE_TYPE_THREADS )
+  typedef Kokkos::Threads  execution_space ;
+#elif defined( KOKKOS_HAVE_OPENMP )
+  typedef Kokkos::OpenMP   execution_space ;
+#elif defined( KOKKOS_HAVE_PTHREAD )
+  typedef Kokkos::Threads  execution_space ;
+#elif defined( KOKKOS_HAVE_SERIAL )
+  typedef Kokkos::Serial   execution_space ;
+#else
+#  error "At least one of the following host execution spaces must be defined: Kokkos::OpenMP, Kokkos::Serial, or Kokkos::Threads.  You might be seeing this message if you disabled the Kokkos::Serial device explicitly using the Kokkos_ENABLE_Serial:BOOL=OFF CMake option, but did not enable any of the other host execution space devices."
+#endif
+
+  //! This memory space's preferred device_type
+  typedef Kokkos::Device<execution_space,memory_space> device_type;
+
+  /*--------------------------------*/
+  /* Functions unique to the HBWSpace */
+  static int in_parallel();
+
+  static void register_in_parallel( int (*)() );
+
+  /*--------------------------------*/
+
+  /**\brief  Default memory space instance */
+  HBWSpace();
+  HBWSpace( const HBWSpace & rhs ) = default ;
+  HBWSpace & operator = ( const HBWSpace & ) = default ;
+  ~HBWSpace() = default ;
+
+  /**\brief  Non-default memory space instance to choose allocation mechanism, if available */
+
+  enum AllocationMechanism { STD_MALLOC , POSIX_MEMALIGN , POSIX_MMAP , INTEL_MM_ALLOC };
+
+  explicit
+  HBWSpace( const AllocationMechanism & );
+
+  /**\brief  Allocate untracked memory in the space */
+  void * allocate( const size_t arg_alloc_size ) const ;
+
+  /**\brief  Deallocate untracked memory in the space */
+  void deallocate( void * const arg_alloc_ptr 
+                 , const size_t arg_alloc_size ) const ;
+
+private:
+
+  AllocationMechanism  m_alloc_mech ;  // NOTE(review): presumably set by the constructors; their definitions are not in this header
+
+  friend class Kokkos::Experimental::Impl::SharedAllocationRecord< Kokkos::Experimental::HBWSpace , void > ;
+};
+
+} // namespace Experimental
+} // namespace Kokkos
+
+//----------------------------------------------------------------------------
+//----------------------------------------------------------------------------
+
+namespace Kokkos {
+namespace Experimental {
+namespace Impl {
+
+template<>
+class SharedAllocationRecord< Kokkos::Experimental::HBWSpace , void >
+  : public SharedAllocationRecord< void , void >
+{
+private:
+
+  friend Kokkos::Experimental::HBWSpace ;
+
+  typedef SharedAllocationRecord< void , void >  RecordBase ;
+
+  SharedAllocationRecord( const SharedAllocationRecord & ) = delete ;
+  SharedAllocationRecord & operator = ( const SharedAllocationRecord & ) = delete ;
+
+  static void deallocate( RecordBase * );
+
+  /**\brief  Root record for tracked allocations from this HBWSpace instance */
+  static RecordBase s_root_record ;
+
+  const Kokkos::Experimental::HBWSpace m_space ;
+
+protected:
+
+  ~SharedAllocationRecord();
+  SharedAllocationRecord() = default ;
+
+  SharedAllocationRecord( const Kokkos::Experimental::HBWSpace        & arg_space
+                        , const std::string              & arg_label
+                        , const size_t                     arg_alloc_size
+                        , const RecordBase::function_type  arg_dealloc = & deallocate
+                        );
+
+public:
+
+  /**\brief  Label read from the object returned by RecordBase::head() */
+  inline std::string get_label() const
+    {
+      return std::string( RecordBase::head()->m_label );
+    }
+
+  /**\brief  New record when compiled for the host execution memory space, null pointer otherwise */
+  KOKKOS_INLINE_FUNCTION static
+  SharedAllocationRecord * allocate( const Kokkos::Experimental::HBWSpace &  arg_space
+                                   , const std::string       &  arg_label
+                                   , const size_t               arg_alloc_size )
+    {
+#if ! defined( KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST )
+      return static_cast< SharedAllocationRecord * >( 0 );
+#else
+      return new SharedAllocationRecord( arg_space , arg_label , arg_alloc_size );
+#endif
+    }
+
+  /**\brief  Allocate tracked memory in the space */
+  static
+  void * allocate_tracked( const Kokkos::Experimental::HBWSpace & arg_space
+                         , const std::string & arg_label
+                         , const size_t arg_alloc_size );
+
+  /**\brief  Reallocate tracked memory in the space */
+  static
+  void * reallocate_tracked( void * const arg_alloc_ptr
+                           , const size_t arg_alloc_size );
+
+  /**\brief  Deallocate tracked memory in the space */
+  static
+  void deallocate_tracked( void * const arg_alloc_ptr );
+
+
+  static SharedAllocationRecord * get_record( void * arg_alloc_ptr );
+
+  static void print_records( std::ostream & , const Kokkos::Experimental::HBWSpace & , bool detail = false );
+};
+
+} // namespace Impl
+} // namespace Experimental
+} // namespace Kokkos
+
+//----------------------------------------------------------------------------
+//----------------------------------------------------------------------------
+
+namespace Kokkos {
+namespace Impl {
+
+
+// Copy between two HBWSpace allocations using memcpy.
+template<class ExecutionSpace>
+struct DeepCopy<Experimental::HBWSpace,Experimental::HBWSpace,ExecutionSpace> {
+  // Copies nbytes bytes from src_ptr to dst_ptr.
+  DeepCopy( void * dst_ptr , const void * src_ptr , size_t nbytes )
+    { memcpy( dst_ptr , src_ptr , nbytes ); }
+  // Calls arg_exec.fence() first, then performs the same copy.
+  DeepCopy( const ExecutionSpace& arg_exec, void * dst_ptr , const void * src_ptr , size_t nbytes )
+    { arg_exec.fence(); memcpy( dst_ptr , src_ptr , nbytes ); }
+};
+
+// Copy selected by the < HostSpace , HBWSpace > argument pair, using memcpy.
+template<class ExecutionSpace>
+struct DeepCopy<HostSpace,Experimental::HBWSpace,ExecutionSpace> {
+  // Copies nbytes bytes from src_ptr to dst_ptr.
+  DeepCopy( void * dst_ptr , const void * src_ptr , size_t nbytes )
+    { memcpy( dst_ptr , src_ptr , nbytes ); }
+  // Calls arg_exec.fence() first, then performs the same copy.
+  DeepCopy( const ExecutionSpace& arg_exec, void * dst_ptr , const void * src_ptr , size_t nbytes )
+    { arg_exec.fence(); memcpy( dst_ptr , src_ptr , nbytes ); }
+};
+
+// Copy selected by the < HBWSpace , HostSpace > argument pair, using memcpy.
+template<class ExecutionSpace>
+struct DeepCopy<Experimental::HBWSpace,HostSpace,ExecutionSpace> {
+  // Copies nbytes bytes from src_ptr to dst_ptr.
+  DeepCopy( void * dst_ptr , const void * src_ptr , size_t nbytes )
+    { memcpy( dst_ptr , src_ptr , nbytes ); }
+  // Calls arg_exec.fence() first, then performs the same copy.
+  DeepCopy( const ExecutionSpace& arg_exec, void * dst_ptr , const void * src_ptr , size_t nbytes )
+    { arg_exec.fence(); memcpy( dst_ptr , src_ptr , nbytes ); }
+};
+
+} // namespace Impl
+} // namespace Kokkos
+
+namespace Kokkos {
+namespace Impl {
+
+template<>
+struct VerifyExecutionCanAccessMemorySpace< Kokkos::HostSpace , Kokkos::Experimental::HBWSpace >
+{
+  enum { value = true };  // access is reported as permitted for this pair
+  static inline void verify() {}                // intentionally empty
+  static inline void verify( const void * ) {}  // intentionally empty
+};
+
+template<>
+struct VerifyExecutionCanAccessMemorySpace< Kokkos::Experimental::HBWSpace , Kokkos::HostSpace >
+{
+  enum { value = true };  // access is reported as permitted for this pair
+  static inline void verify() {}                // intentionally empty
+  static inline void verify( const void * ) {}  // intentionally empty
+};
+
+} // namespace Impl
+} // namespace Kokkos
+
+#endif
+#endif /* #define KOKKOS_HBWSPACE_HPP */
+
diff --git a/lib/kokkos/core/src/Kokkos_HostSpace.hpp b/lib/kokkos/core/src/Kokkos_HostSpace.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..5fe686559a07d63cb4a07bf821203672c1336699
--- /dev/null
+++ b/lib/kokkos/core/src/Kokkos_HostSpace.hpp
@@ -0,0 +1,275 @@
+/*
+//@HEADER
+// ************************************************************************
+// 
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+// 
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+// 
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact  H. Carter Edwards (hcedwar@sandia.gov)
+// 
+// ************************************************************************
+//@HEADER
+*/
+
+#ifndef KOKKOS_HOSTSPACE_HPP
+#define KOKKOS_HOSTSPACE_HPP
+
+#include <cstring>
+#include <string>
+#include <iosfwd>
+#include <typeinfo>
+
+#include <Kokkos_Core_fwd.hpp>
+#include <Kokkos_MemoryTraits.hpp>
+
+#include <impl/Kokkos_Traits.hpp>
+#include <impl/Kokkos_Error.hpp>
+
+#include <impl/KokkosExp_SharedAlloc.hpp>
+
+/*--------------------------------------------------------------------------*/
+
+namespace Kokkos {
+namespace Impl {
+
+/// \brief Initialize the lock array used for arbitrarily sized atomics.
+///
+/// Atomics on arbitrarily sized objects are implemented using a hash
+/// table of locks, where the hash value is derived from the address of
+/// the object on which the atomic operation is performed.
+/// This function initializes the locks to zero (unset).
+void init_lock_array_host_space();
+
+/// \brief Acquire a lock for the address
+///
+/// This function tries to acquire the lock for the hash value derived
+/// from the provided ptr. If the lock is successfully acquired the
+/// function returns true. Otherwise it returns false.
+bool lock_address_host_space(void* ptr);
+
+/// \brief Release lock for the address
+///
+/// This function releases the lock for the hash value derived
+/// from the provided ptr. This function should only be called
+/// after previously successfully acquiring a lock with
+/// lock_address_host_space.
+void unlock_address_host_space(void* ptr);
+
+} // namespace Impl
+} // namespace Kokkos
+
+namespace Kokkos {
+
+/// \class HostSpace
+/// \brief Memory management for host memory.
+///
+/// HostSpace is a memory space that governs host memory.  "Host"
+/// memory means the usual CPU-accessible memory.
+class HostSpace {
+public:
+
+  //! Tag this class as a kokkos memory space
+  typedef HostSpace  memory_space ;
+  typedef size_t     size_type ;
+
+  /// \typedef execution_space
+  /// \brief Default execution space for this memory space.
+  ///
+  /// Useful for things like initializing a View (which happens in parallel
+  /// using the View's default execution space).  Selection order below:
+  /// default-device OpenMP, default-device Threads, OpenMP, Threads, Serial.
+#if defined( KOKKOS_HAVE_DEFAULT_DEVICE_TYPE_OPENMP )
+  typedef Kokkos::OpenMP   execution_space ;
+#elif defined( KOKKOS_HAVE_DEFAULT_DEVICE_TYPE_THREADS )
+  typedef Kokkos::Threads  execution_space ;
+#elif defined( KOKKOS_HAVE_OPENMP )
+  typedef Kokkos::OpenMP   execution_space ;
+#elif defined( KOKKOS_HAVE_PTHREAD )
+  typedef Kokkos::Threads  execution_space ;
+#elif defined( KOKKOS_HAVE_SERIAL )
+  typedef Kokkos::Serial   execution_space ;
+#else
+#  error "At least one of the following host execution spaces must be defined: Kokkos::OpenMP, Kokkos::Serial, or Kokkos::Threads.  You might be seeing this message if you disabled the Kokkos::Serial device explicitly using the Kokkos_ENABLE_Serial:BOOL=OFF CMake option, but did not enable any of the other host execution space devices."
+#endif
+
+  //! The preferred device_type of this memory space: its execution_space paired with itself
+  typedef Kokkos::Device<execution_space,memory_space> device_type;
+
+  /*--------------------------------*/
+  /* Functions unique to the HostSpace */
+  static int in_parallel();
+
+  static void register_in_parallel( int (*)() );
+
+  /*--------------------------------*/
+
+  /**\brief  Default memory space instance */
+  HostSpace();
+  HostSpace( HostSpace && rhs ) = default ;
+  HostSpace( const HostSpace & rhs ) = default ;
+  HostSpace & operator = ( HostSpace && ) = default ;
+  HostSpace & operator = ( const HostSpace & ) = default ;
+  ~HostSpace() = default ;
+
+  /**\brief  Allocation mechanisms that a non-default instance may name */
+
+  enum AllocationMechanism { STD_MALLOC , POSIX_MEMALIGN , POSIX_MMAP , INTEL_MM_ALLOC };
+  /**\brief  Non-default memory space instance to choose allocation mechanism, if available */
+  explicit
+  HostSpace( const AllocationMechanism & );
+
+  /**\brief  Allocate untracked memory in the space */
+  void * allocate( const size_t arg_alloc_size ) const ;
+
+  /**\brief  Deallocate untracked memory in the space */
+  void deallocate( void * const arg_alloc_ptr 
+                 , const size_t arg_alloc_size ) const ;
+
+private:
+  // Allocation mechanism held by this instance.
+  AllocationMechanism  m_alloc_mech ;
+
+  friend class Kokkos::Experimental::Impl::SharedAllocationRecord< Kokkos::HostSpace , void > ;
+};
+
+} // namespace Kokkos
+
+//----------------------------------------------------------------------------
+//----------------------------------------------------------------------------
+
+namespace Kokkos {
+namespace Experimental {
+namespace Impl {
+/// \brief Specialization of SharedAllocationRecord for Kokkos::HostSpace.
+template<>
+class SharedAllocationRecord< Kokkos::HostSpace , void >
+  : public SharedAllocationRecord< void , void >
+{
+private:
+
+  friend Kokkos::HostSpace ;
+
+  typedef SharedAllocationRecord< void , void >  RecordBase ;
+  // Records are not copyable.
+  SharedAllocationRecord( const SharedAllocationRecord & ) = delete ;
+  SharedAllocationRecord & operator = ( const SharedAllocationRecord & ) = delete ;
+  // Default value of the arg_dealloc argument of the protected constructor below.
+  static void deallocate( RecordBase * );
+
+  /**\brief  Root record for tracked allocations from this HostSpace instance */
+  static RecordBase s_root_record ;
+
+  const Kokkos::HostSpace m_space ;
+
+protected:
+
+  ~SharedAllocationRecord();
+  SharedAllocationRecord() = default ;
+
+  SharedAllocationRecord( const Kokkos::HostSpace        & arg_space
+                        , const std::string              & arg_label
+                        , const size_t                     arg_alloc_size
+                        , const RecordBase::function_type  arg_dealloc = & deallocate
+                        );
+
+public:
+  /**\brief  Return a std::string built from RecordBase::head()->m_label */
+  inline
+  std::string get_label() const
+    {
+      return std::string( RecordBase::head()->m_label );
+    }
+  /**\brief  Return a new record when KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST is defined, else a null pointer */
+  KOKKOS_INLINE_FUNCTION static
+  SharedAllocationRecord * allocate( const Kokkos::HostSpace &  arg_space
+                                   , const std::string       &  arg_label
+                                   , const size_t               arg_alloc_size
+                                   )
+    {
+#if defined( KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST )
+      return new SharedAllocationRecord( arg_space , arg_label , arg_alloc_size );
+#else
+      return (SharedAllocationRecord *) 0 ;
+#endif
+    }
+
+  /**\brief  Allocate tracked memory in the space */
+  static
+  void * allocate_tracked( const Kokkos::HostSpace & arg_space
+                         , const std::string & arg_label
+                         , const size_t arg_alloc_size );
+
+  /**\brief  Reallocate tracked memory in the space */
+  static
+  void * reallocate_tracked( void * const arg_alloc_ptr
+                           , const size_t arg_alloc_size );
+
+  /**\brief  Deallocate tracked memory in the space */
+  static
+  void deallocate_tracked( void * const arg_alloc_ptr );
+
+
+  static SharedAllocationRecord * get_record( void * arg_alloc_ptr );
+
+  static void print_records( std::ostream & , const Kokkos::HostSpace & , bool detail = false );
+};
+
+} // namespace Impl
+} // namespace Experimental
+} // namespace Kokkos
+
+//----------------------------------------------------------------------------
+//----------------------------------------------------------------------------
+
+namespace Kokkos {
+namespace Impl {
+// Declaration of the DeepCopy template; ExecutionSpace defaults to DstSpace::execution_space.
+template< class DstSpace, class SrcSpace, class ExecutionSpace = typename DstSpace::execution_space> struct DeepCopy ;
+
+// HostSpace -> HostSpace: both constructors copy byte_count bytes with memcpy.
+template< class ExecutionSpace >
+struct DeepCopy< HostSpace , HostSpace , ExecutionSpace >
+{
+  DeepCopy( void * dst_ptr , const void * src_ptr , size_t byte_count )
+    { memcpy( dst_ptr , src_ptr , byte_count ); }
+  // This overload first fences the supplied execution space instance.
+  DeepCopy( const ExecutionSpace & space , void * dst_ptr , const void * src_ptr , size_t byte_count )
+    { space.fence(); memcpy( dst_ptr , src_ptr , byte_count ); }
+};
+
+} // namespace Impl
+} // namespace Kokkos
+
+
+#endif /* #define KOKKOS_HOSTSPACE_HPP */
+
diff --git a/lib/kokkos/core/src/Kokkos_Layout.hpp b/lib/kokkos/core/src/Kokkos_Layout.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..c77c33703bdd76161b20c2e5ae59b96c03c4550e
--- /dev/null
+++ b/lib/kokkos/core/src/Kokkos_Layout.hpp
@@ -0,0 +1,233 @@
+/*
+//@HEADER
+// ************************************************************************
+// 
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+// 
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+// 
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact  H. Carter Edwards (hcedwar@sandia.gov)
+// 
+// ************************************************************************
+//@HEADER
+*/
+
+/// \file Kokkos_Layout.hpp
+/// \brief Declaration of various \c MemoryLayout options.
+
+#ifndef KOKKOS_LAYOUT_HPP
+#define KOKKOS_LAYOUT_HPP
+
+#include <stddef.h>
+#include <impl/Kokkos_Traits.hpp>
+#include <impl/Kokkos_Tags.hpp>
+
+namespace Kokkos {
+//! Number of entries in the dimension (and stride) arrays of the layout structs in this header.
+enum { ARRAY_LAYOUT_MAX_RANK = 8 };
+
+//----------------------------------------------------------------------------
+/// \struct LayoutLeft
+/// \brief Memory layout tag indicating left-to-right (Fortran scheme)
+///   striding of multi-indices.
+///
+/// This is an example of a \c MemoryLayout template parameter of
+/// View.  The memory layout describes how View maps from a
+/// multi-index (i0, i1, ..., ik) to a memory location.  
+///
+/// "Layout left" indicates a mapping where the leftmost index i0
+/// refers to contiguous access, and strides increase for dimensions
+/// going right from there (i1, i2, ...).  This layout imitates how
+/// Fortran stores multi-dimensional arrays.  For the special case of
+/// a two-dimensional array, "layout left" is also called "column
+/// major."
+struct LayoutLeft {
+  //! Self-referential tag marking this type as a Kokkos array layout
+  typedef LayoutLeft array_layout ;
+  //! Values given to the constructor, in order; unspecified entries are zero
+  size_t dimension[ ARRAY_LAYOUT_MAX_RANK ];
+
+  //! Accepts up to eight values, all optional, stored in order into dimension[]
+  KOKKOS_INLINE_FUNCTION constexpr
+  LayoutLeft( size_t n0 = 0 , size_t n1 = 0 , size_t n2 = 0 , size_t n3 = 0 ,
+              size_t n4 = 0 , size_t n5 = 0 , size_t n6 = 0 , size_t n7 = 0 )
+    : dimension { n0 , n1 , n2 , n3 , n4 , n5 , n6 , n7 } {}
+
+  LayoutLeft( const LayoutLeft & ) = default ;
+  LayoutLeft( LayoutLeft && ) = default ;
+  LayoutLeft & operator = ( const LayoutLeft & ) = default ;
+  LayoutLeft & operator = ( LayoutLeft && ) = default ;
+};
+
+//----------------------------------------------------------------------------
+/// \struct LayoutRight
+/// \brief Memory layout tag indicating right-to-left (C or
+///   lexicographical scheme) striding of multi-indices.
+///
+/// This is an example of a \c MemoryLayout template parameter of
+/// View.  The memory layout describes how View maps from a
+/// multi-index (i0, i1, ..., ik) to a memory location.  
+///
+/// "Right layout" indicates a mapping where the rightmost index ik
+/// refers to contiguous access, and strides increase for dimensions
+/// going left from there.  This layout imitates how C stores
+/// multi-dimensional arrays.  For the special case of a
+/// two-dimensional array, "layout right" is also called "row major."
+struct LayoutRight {
+  //! Self-referential tag marking this type as a Kokkos array layout
+  typedef LayoutRight array_layout ;
+  //! Values given to the constructor, in order; unspecified entries are zero
+  size_t dimension[ ARRAY_LAYOUT_MAX_RANK ];
+
+  //! Accepts up to eight values, all optional, stored in order into dimension[]
+  KOKKOS_INLINE_FUNCTION constexpr
+  LayoutRight( size_t n0 = 0 , size_t n1 = 0 , size_t n2 = 0 , size_t n3 = 0 ,
+               size_t n4 = 0 , size_t n5 = 0 , size_t n6 = 0 , size_t n7 = 0 )
+    : dimension { n0 , n1 , n2 , n3 , n4 , n5 , n6 , n7 } {}
+
+  LayoutRight( const LayoutRight & ) = default ;
+  LayoutRight( LayoutRight && ) = default ;
+  LayoutRight & operator = ( const LayoutRight & ) = default ;
+  LayoutRight & operator = ( LayoutRight && ) = default ;
+};
+
+//----------------------------------------------------------------------------
+/// \struct LayoutStride
+/// \brief  Memory layout tag indicating arbitrarily strided
+///         multi-index mapping into contiguous memory.
+struct LayoutStride {
+  //! Tag this class as a kokkos array layout
+  typedef LayoutStride array_layout ;
+
+  size_t dimension[ ARRAY_LAYOUT_MAX_RANK ] ;
+  size_t stride[ ARRAY_LAYOUT_MAX_RANK ] ;
+
+  /** \brief  Compute strides from ordered dimensions.
+   *
+   *  Values of order uniquely form the set [0..rank)
+   *  and specify ordering of the dimensions.
+   *  Order = {0,1,2,...} is LayoutLeft
+   *  Order = {...,2,1,0} is LayoutRight
+   *  Only the first rank entries of order and dimen are read.  If rank is
+   *  outside [0..ARRAY_LAYOUT_MAX_RANK], or order is not such a permutation,
+   *  the returned layout has every dimension and stride equal to zero.
+   */
+  template< typename iTypeOrder , typename iTypeDimen >
+  KOKKOS_INLINE_FUNCTION static
+  LayoutStride order_dimensions( int const rank
+                               , iTypeOrder const * const order
+                               , iTypeDimen const * const dimen )
+    {
+      LayoutStride tmp ; // the all-defaults constructor zero-fills dimension[] and stride[]
+      if ( rank < 0 || ARRAY_LAYOUT_MAX_RANK < rank ) return tmp ;
+      // Verify valid rank order: each value in [0..rank) clears its own bit.
+      int check_input = int( 1 << rank ) - 1 ;
+      for ( int r = 0 ; r < rank ; ++r ) {
+        const int k = int( order[r] );
+        if ( 0 <= k && k < rank ) check_input &= ~int( 1 << k );
+      }
+      if ( 0 == check_input ) {
+        size_t n = 1 ;
+        for ( int r = 0 ; r < rank ; ++r ) {
+          tmp.stride[ order[r] ] = n ;
+          n *= ( dimen[order[r]] );
+          tmp.dimension[r] = dimen[r];
+        }
+      }
+      return tmp ;
+    }
+
+  KOKKOS_INLINE_FUNCTION constexpr
+  LayoutStride( size_t N0 = 0 , size_t S0 = 0
+              , size_t N1 = 0 , size_t S1 = 0
+              , size_t N2 = 0 , size_t S2 = 0
+              , size_t N3 = 0 , size_t S3 = 0
+              , size_t N4 = 0 , size_t S4 = 0
+              , size_t N5 = 0 , size_t S5 = 0
+              , size_t N6 = 0 , size_t S6 = 0
+              , size_t N7 = 0 , size_t S7 = 0 )
+    : dimension { N0 , N1 , N2 , N3 , N4 , N5 , N6 , N7 }
+    , stride    { S0 , S1 , S2 , S3 , S4 , S5 , S6 , S7 } {}
+};
+
+//----------------------------------------------------------------------------
+/// \struct LayoutTileLeft
+/// \brief Memory layout tag indicating left-to-right (Fortran scheme)
+///   striding of multi-indices by tiles.
+///
+/// This is an example of a \c MemoryLayout template parameter of
+/// View.  The memory layout describes how View maps from a
+/// multi-index (i0, i1, ..., ik) to a memory location.  
+///
+/// "Tiled layout" indicates a mapping to contiguously stored
+/// <tt>ArgN0</tt> by <tt>ArgN1</tt> tiles for the rightmost two
+/// dimensions.  Indices are LayoutLeft within each tile, and the
+/// tiles themselves are arranged using LayoutLeft.  Note that the
+/// dimensions <tt>ArgN0</tt> and <tt>ArgN1</tt> of the tiles must be
+/// compile-time constants.  This speeds up index calculations.  If
+/// both tile dimensions are powers of two, Kokkos can optimize
+/// further.
+template < unsigned ArgN0 , unsigned ArgN1
+         , bool IsPowerOfTwo = ( Impl::is_integral_power_of_two(ArgN0) &&
+                                 Impl::is_integral_power_of_two(ArgN1) ) >
+struct LayoutTileLeft {
+
+  // Both tile extents must be powers of two; anything else fails to compile.
+  static_assert( Impl::is_integral_power_of_two(ArgN0) && Impl::is_integral_power_of_two(ArgN1)
+               , "LayoutTileLeft must be given power-of-two tile dimensions" );
+
+  //! Tag this class as a kokkos array layout
+  typedef LayoutTileLeft<ArgN0,ArgN1,IsPowerOfTwo> array_layout ;
+
+  //! Compile-time tile extents, re-exported as enumerators
+  enum { N0 = ArgN0 };
+  enum { N1 = ArgN1 };
+
+  //! Values given to the constructor, in order; unspecified entries are zero
+  size_t dimension[ ARRAY_LAYOUT_MAX_RANK ] ;
+
+  KOKKOS_INLINE_FUNCTION constexpr
+  LayoutTileLeft( size_t n0 = 0 , size_t n1 = 0 , size_t n2 = 0 , size_t n3 = 0 ,
+                  size_t n4 = 0 , size_t n5 = 0 , size_t n6 = 0 , size_t n7 = 0 )
+    : dimension { n0 , n1 , n2 , n3 , n4 , n5 , n6 , n7 } {}
+
+  //! Copy and move operations are the compiler-generated ones
+  LayoutTileLeft( const LayoutTileLeft & ) = default ;
+  LayoutTileLeft( LayoutTileLeft && ) = default ;
+  LayoutTileLeft & operator = ( const LayoutTileLeft & ) = default ;
+  LayoutTileLeft & operator = ( LayoutTileLeft && ) = default ;
+};
+
+} // namespace Kokkos
+
+#endif // #ifndef KOKKOS_LAYOUT_HPP
+
diff --git a/lib/kokkos/core/src/Kokkos_Macros.hpp b/lib/kokkos/core/src/Kokkos_Macros.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..7d1e59af5e473db94a5ed6361bb3d6ee7b9b47e6
--- /dev/null
+++ b/lib/kokkos/core/src/Kokkos_Macros.hpp
@@ -0,0 +1,470 @@
+/*
+//@HEADER
+// ************************************************************************
+// 
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+// 
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+// 
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact  H. Carter Edwards (hcedwar@sandia.gov)
+// 
+// ************************************************************************
+//@HEADER
+*/
+
+#ifndef KOKKOS_MACROS_HPP
+#define KOKKOS_MACROS_HPP
+
+//----------------------------------------------------------------------------
+/** Pick up configure/build options via #define macros:
+ *
+ *  KOKKOS_HAVE_CUDA                Kokkos::Cuda execution and memory spaces
+ *  KOKKOS_HAVE_PTHREAD             Kokkos::Threads execution space
+ *  KOKKOS_HAVE_QTHREAD             Kokkos::Qthread execution space
+ *  KOKKOS_HAVE_OPENMP              Kokkos::OpenMP  execution space
+ *  KOKKOS_HAVE_HWLOC               HWLOC library is available
+ *  KOKKOS_ENABLE_DEBUG_BOUNDS_CHECK    insert array bounds checks, is expensive!
+ *  KOKKOS_HAVE_CXX11               enable C++11 features
+ *
+ *  KOKKOS_HAVE_MPI                 negotiate MPI/execution space interactions
+ *
+ *  KOKKOS_USE_CUDA_UVM             Use CUDA UVM for Cuda memory space
+ */
+
+#if ! defined( KOKKOS_DONT_INCLUDE_CORE_CONFIG_H )
+#include <KokkosCore_config.h>
+#endif /* #if ! defined( KOKKOS_DONT_INCLUDE_CORE_CONFIG_H ) */
+
+//----------------------------------------------------------------------------
+/** Pick up compiler specific #define macros:
+ *
+ *  Macros for known compilers evaluate to an integral version value
+ *
+ *  KOKKOS_COMPILER_NVCC
+ *  KOKKOS_COMPILER_GNU
+ *  KOKKOS_COMPILER_INTEL
+ *  KOKKOS_COMPILER_IBM
+ *  KOKKOS_COMPILER_CRAYC
+ *  KOKKOS_COMPILER_APPLECC
+ *  KOKKOS_COMPILER_CLANG
+ *  KOKKOS_COMPILER_PGI
+ *
+ *  Macros for which compiler extension to use for atomics on intrinsic types
+ *
+ *  KOKKOS_ATOMICS_USE_CUDA
+ *  KOKKOS_ATOMICS_USE_GNU
+ *  KOKKOS_ATOMICS_USE_INTEL
+ *  KOKKOS_ATOMICS_USE_OPENMP31
+ *
+ *  A suite of 'KOKKOS_HAVE_PRAGMA_...' are defined for internal use.
+ *
+ *  Macros for marking functions to run in an execution space:
+ *
+ *  KOKKOS_FUNCTION
+ *  KOKKOS_INLINE_FUNCTION        request compiler to inline
+ *  KOKKOS_FORCEINLINE_FUNCTION   force compiler to inline, use with care!
+ */
+
+//----------------------------------------------------------------------------
+
+#if defined( KOKKOS_HAVE_CUDA ) && defined( __CUDACC__ )
+
+/*  Compiling with a CUDA compiler.
+ *
+ *  Include <cuda.h> to pick up the CUDA_VERSION macro defined as:
+ *    CUDA_VERSION = ( MAJOR_VERSION * 1000 ) + ( MINOR_VERSION * 10 )
+ *
+ *  When generating device code the __CUDA_ARCH__ macro is defined as:
+ *    __CUDA_ARCH__ = ( MAJOR_CAPABILITY * 100 ) + ( MINOR_CAPABILITY * 10 )
+ */
+
+#include <cuda_runtime.h>
+#include <cuda.h>
+
+#if ! defined( CUDA_VERSION )
+#error "#include <cuda.h> did not define CUDA_VERSION"
+#endif
+
+#if ( CUDA_VERSION < 6050 )
+// CUDA supports (unofficially) C++11 in device code starting with
+// version 6.5. This includes the auto type and lambdas used entirely
+// within device code.
+#error "Cuda version 6.5 or greater required"
+#endif
+
+#if defined( __CUDA_ARCH__ ) && ( __CUDA_ARCH__ < 300 )
+/*  Compiling with CUDA compiler for device code. */
+#error "Cuda device capability >= 3.0 is required"
+#endif
+/*  Optional host-to-device lambda dispatch, requested via KOKKOS_CUDA_USE_LAMBDA. */
+#ifdef KOKKOS_CUDA_USE_LAMBDA
+#if ( CUDA_VERSION < 7000 )
+// CUDA supports C++11 lambdas generated in host code to be given
+// to the device starting with version 7.5. But the release candidate (7.5.6)
+// still identifies as 7.0, hence the test against 7000.
+#error "Cuda version 7.5 or greater required for host-to-device Lambda support"
+#endif
+#if ( CUDA_VERSION < 8000 )
+#define KOKKOS_LAMBDA [=]__device__
+#else
+#define KOKKOS_LAMBDA [=]__host__ __device__
+#endif
+#define KOKKOS_HAVE_CXX11_DISPATCH_LAMBDA 1
+#endif /* #ifdef KOKKOS_CUDA_USE_LAMBDA */
+#endif /* #if defined( KOKKOS_HAVE_CUDA ) && defined( __CUDACC__ ) */
+
+
+#if defined(KOKKOS_HAVE_CXX11_DISPATCH_LAMBDA)
+   // Cuda version 8.0 still needs the functor wrapper, so the version test below stays commented out
+   #if (KOKKOS_HAVE_CXX11_DISPATCH_LAMBDA /* && (CUDA_VERSION < 8000) */ )
+      #define KOKKOS_IMPL_NEED_FUNCTOR_WRAPPER
+   #endif
+#endif /* #if defined(KOKKOS_HAVE_CXX11_DISPATCH_LAMBDA) */
+
+/*--------------------------------------------------------------------------*/
+/* Language info: C++, CUDA, OPENMP */
+
+#if defined( __CUDA_ARCH__ ) && defined( KOKKOS_HAVE_CUDA )
+  // Compiling Cuda code to 'ptx'
+  // The function-marking macros carry both the __device__ and __host__ qualifiers.
+  #define KOKKOS_FORCEINLINE_FUNCTION  __device__  __host__  __forceinline__
+  #define KOKKOS_INLINE_FUNCTION       __device__  __host__  inline
+  #define KOKKOS_FUNCTION              __device__  __host__
+
+#endif /* #if defined( __CUDA_ARCH__ ) && defined( KOKKOS_HAVE_CUDA ) */
+
+#if defined( _OPENMP )
+
+  /*  Compiling with OpenMP.
+   *  The value of _OPENMP is an integer value YYYYMM
+   *  where YYYY and MM are the year and month designation
+   *  of the supported OpenMP API version.
+   */
+  /*  No macros are defined in this section. */
+#endif /* #if defined( _OPENMP ) */
+
+/*--------------------------------------------------------------------------*/
+/* Mapping compiler built-ins to KOKKOS_COMPILER_*** macros */
+
+#if defined( __NVCC__ )
+  // NVIDIA compiler is being used.
+  // Code is parsed and separated into host and device code.
+  // Host code is compiled again with another compiler.
+  // Device code is compiled to 'ptx'.
+  #define KOKKOS_COMPILER_NVCC __NVCC__
+
+#else
+#if defined( KOKKOS_HAVE_CXX11 ) && ! defined( KOKKOS_HAVE_CXX11_DISPATCH_LAMBDA )
+    // This branch is only reached when NVCC is not the compiler.
+    // CUDA (including version 6.5) does not support giving lambdas as
+    // arguments to global functions, so under NVCC this macro is not set here.
+    #define KOKKOS_HAVE_CXX11_DISPATCH_LAMBDA 1
+  #endif
+#endif /* #if defined( __NVCC__ ) */
+
+#if ! defined( KOKKOS_LAMBDA ) && defined( KOKKOS_HAVE_CXX11 )
+  #define KOKKOS_LAMBDA [=]
+#endif /* by-value capture unless KOKKOS_LAMBDA is already defined */
+
+#if ! defined( __CUDA_ARCH__ ) /* Not compiling Cuda code to 'ptx'. */
+
+/* Intel compiler for host code */
+
+#if defined( __INTEL_COMPILER )
+  #define KOKKOS_COMPILER_INTEL __INTEL_COMPILER
+#elif defined( __ICC )
+  // Old define
+  #define KOKKOS_COMPILER_INTEL __ICC
+#elif defined( __ECC )
+  // Very old define
+  #define KOKKOS_COMPILER_INTEL __ECC
+#endif
+
+/* CRAY compiler for host code */
+#if defined( _CRAYC )
+  #define KOKKOS_COMPILER_CRAYC _CRAYC
+#endif
+/* IBM compiler for host code */
+#if defined( __IBMCPP__ )
+  // IBM C++
+  #define KOKKOS_COMPILER_IBM __IBMCPP__
+#elif defined( __IBMC__ )
+  #define KOKKOS_COMPILER_IBM __IBMC__
+#endif
+
+#if defined( __APPLE_CC__ )
+  #define KOKKOS_COMPILER_APPLECC __APPLE_CC__
+#endif
+/* Computed versions are parenthesized so they expand safely next to any operator. */
+#if defined (__clang__) && !defined (KOKKOS_COMPILER_INTEL)
+  #define KOKKOS_COMPILER_CLANG (__clang_major__*100+__clang_minor__*10+__clang_patchlevel__)
+#endif
+/* GNU is reported only when neither Clang nor Intel was detected. */
+#if ! defined( __clang__ ) && ! defined( KOKKOS_COMPILER_INTEL ) &&defined( __GNUC__ )
+  #define KOKKOS_COMPILER_GNU (__GNUC__*100+__GNUC_MINOR__*10+__GNUC_PATCHLEVEL__)
+  #if ( 472 > KOKKOS_COMPILER_GNU )
+    #error "Compiling with GCC version earlier than 4.7.2 is not supported."
+  #endif
+#endif
+/* PGI is reported only when __GNUC__ is not defined. */
+#if defined( __PGIC__ ) && ! defined( __GNUC__ )
+  #define KOKKOS_COMPILER_PGI (__PGIC__*100+__PGIC_MINOR__*10+__PGIC_PATCHLEVEL__)
+  #if ( 1540 > KOKKOS_COMPILER_PGI )
+    #error "Compiling with PGI version earlier than 15.4 is not supported."
+  #endif
+#endif
+
+#endif /* #if ! defined( __CUDA_ARCH__ ) */
+
+/*--------------------------------------------------------------------------*/
+/*--------------------------------------------------------------------------*/
+/* Intel compiler macros */
+
+#if defined( KOKKOS_COMPILER_INTEL )
+  // Pragma flags set for the Intel compiler.
+  #define KOKKOS_HAVE_PRAGMA_UNROLL 1
+  #define KOKKOS_HAVE_PRAGMA_IVDEP 1
+  #define KOKKOS_HAVE_PRAGMA_LOOPCOUNT 1
+  #define KOKKOS_HAVE_PRAGMA_VECTOR 1
+  #define KOKKOS_HAVE_PRAGMA_SIMD 1
+  // Version gate: a value below 1300 is an error; 1300 up to 1399 only warns.
+  #if ( 1400 > KOKKOS_COMPILER_INTEL )
+    #if ( 1300 > KOKKOS_COMPILER_INTEL )
+      #error "Compiling with Intel version earlier than 13.0 is not supported. Official minimal version is 14.0."
+    #else
+      #warning "Compiling with Intel version 13.x probably works but is not officially supported. Official minimal version is 14.0."
+    #endif
+  #endif
+  #if ( 1200 <= KOKKOS_COMPILER_INTEL ) && ! defined( KOKKOS_ENABLE_ASM ) && ! defined( _WIN32 )
+    #define KOKKOS_ENABLE_ASM 1
+  #endif
+  // Force-inline spelling: the always_inline attribute, or plain inline when _WIN32 is defined.
+  #if ( 1200 <= KOKKOS_COMPILER_INTEL ) && ! defined( KOKKOS_FORCEINLINE_FUNCTION )
+    #if !defined (_WIN32)
+      #define KOKKOS_FORCEINLINE_FUNCTION  inline __attribute__((always_inline))
+    #else
+      #define KOKKOS_FORCEINLINE_FUNCTION inline
+    #endif
+  #endif
+
+  #if defined( __MIC__ )
+    // Compiling for Xeon Phi; nothing is defined for this case here.
+  #endif
+
+#endif /* #if defined( KOKKOS_COMPILER_INTEL ) */
+
+/*--------------------------------------------------------------------------*/
+/* Cray compiler macros */
+
+#if defined( KOKKOS_COMPILER_CRAYC )
+  // No Cray-specific macros are defined in this section.
+
+#endif /* #if defined( KOKKOS_COMPILER_CRAYC ) */
+
+/*--------------------------------------------------------------------------*/
+/* IBM Compiler macros */
+
+#if defined( KOKKOS_COMPILER_IBM )
+  // Only the unroll pragma flag is set; the commented-out macros below stay undefined.
+  #define KOKKOS_HAVE_PRAGMA_UNROLL 1
+  //#define KOKKOS_HAVE_PRAGMA_IVDEP 1
+  //#define KOKKOS_HAVE_PRAGMA_LOOPCOUNT 1
+  //#define KOKKOS_HAVE_PRAGMA_VECTOR 1
+  //#define KOKKOS_HAVE_PRAGMA_SIMD 1
+
+#endif /* #if defined( KOKKOS_COMPILER_IBM ) */
+
+/*--------------------------------------------------------------------------*/
+/* CLANG compiler macros */
+
+#if defined( KOKKOS_COMPILER_CLANG )
+  // No pragma flag is set for Clang; the commented-out macros below stay undefined.
+  //#define KOKKOS_HAVE_PRAGMA_UNROLL 1
+  //#define KOKKOS_HAVE_PRAGMA_IVDEP 1
+  //#define KOKKOS_HAVE_PRAGMA_LOOPCOUNT 1
+  //#define KOKKOS_HAVE_PRAGMA_VECTOR 1
+  //#define KOKKOS_HAVE_PRAGMA_SIMD 1
+  // Force-inline spelling, used unless the macro is already defined.
+  #if ! defined( KOKKOS_FORCEINLINE_FUNCTION )
+    #define KOKKOS_FORCEINLINE_FUNCTION  inline __attribute__((always_inline))
+  #endif
+
+#endif /* #if defined( KOKKOS_COMPILER_CLANG ) */
+
+/*--------------------------------------------------------------------------*/
+/* GNU Compiler macros */
+
+#if defined( KOKKOS_COMPILER_GNU )
+  // No pragma flag is set for GNU; the commented-out macros below stay undefined.
+  //#define KOKKOS_HAVE_PRAGMA_UNROLL 1
+  //#define KOKKOS_HAVE_PRAGMA_IVDEP 1
+  //#define KOKKOS_HAVE_PRAGMA_LOOPCOUNT 1
+  //#define KOKKOS_HAVE_PRAGMA_VECTOR 1
+  //#define KOKKOS_HAVE_PRAGMA_SIMD 1
+  // Force-inline spelling, used unless the macro is already defined.
+  #if ! defined( KOKKOS_FORCEINLINE_FUNCTION )
+    #define KOKKOS_FORCEINLINE_FUNCTION inline __attribute__((always_inline))
+  #endif
+  // KOKKOS_ENABLE_ASM is set unless already defined, a PowerPC macro is defined, or __PGIC__ is defined.
+  #if ! defined( KOKKOS_ENABLE_ASM ) && \
+      ! ( defined( __powerpc) || \
+          defined(__powerpc__) || \
+          defined(__powerpc64__) || \
+          defined(__POWERPC__) || \
+          defined(__ppc__) || \
+          defined(__ppc64__) || \
+          defined(__PGIC__) )
+    #define KOKKOS_ENABLE_ASM 1
+  #endif
+
+#endif /* #if defined( KOKKOS_COMPILER_GNU ) */
+
+/*--------------------------------------------------------------------------*/
+
+#if defined( KOKKOS_COMPILER_PGI )
+  /* PGI compiler macros: the unroll, ivdep and vector pragma flags are set. */
+  #define KOKKOS_HAVE_PRAGMA_UNROLL 1
+  #define KOKKOS_HAVE_PRAGMA_IVDEP 1
+  //#define KOKKOS_HAVE_PRAGMA_LOOPCOUNT 1
+  #define KOKKOS_HAVE_PRAGMA_VECTOR 1
+  //#define KOKKOS_HAVE_PRAGMA_SIMD 1
+
+#endif /* #if defined( KOKKOS_COMPILER_PGI ) */
+
+/*--------------------------------------------------------------------------*/
+
+#if defined( KOKKOS_COMPILER_NVCC )
+
+  #if defined(__CUDA_ARCH__ )
+    #define KOKKOS_HAVE_PRAGMA_UNROLL 1
+  #endif
+
+#endif
+
+//----------------------------------------------------------------------------
+/** Define function-marking macros if the compiler-specific macros are undefined: */
+
+#if ! defined( KOKKOS_FORCEINLINE_FUNCTION )
+#define KOKKOS_FORCEINLINE_FUNCTION  inline
+#endif
+
+#if ! defined( KOKKOS_INLINE_FUNCTION )
+#define KOKKOS_INLINE_FUNCTION  inline
+#endif
+
+#if ! defined( KOKKOS_FUNCTION )
+#define KOKKOS_FUNCTION /**/
+#endif
+
+//----------------------------------------------------------------------------
+/** Define macro for 16-byte alignment (GNU-style attribute syntax): */
+#if ! defined(KOKKOS_ALIGN_16)
+#define KOKKOS_ALIGN_16 __attribute__((aligned(16)))
+#endif
+
+//----------------------------------------------------------------------------
+/** Determine the default execution space for parallel dispatch.
+ *  At most one default execution space may be specified; more is an error.
+ */
+
+#if 1 < ( ( defined ( KOKKOS_HAVE_DEFAULT_DEVICE_TYPE_CUDA ) ? 1 : 0 ) + \
+          ( defined ( KOKKOS_HAVE_DEFAULT_DEVICE_TYPE_OPENMP ) ? 1 : 0 ) + \
+          ( defined ( KOKKOS_HAVE_DEFAULT_DEVICE_TYPE_THREADS ) ? 1 : 0 ) + \
+          ( defined ( KOKKOS_HAVE_DEFAULT_DEVICE_TYPE_SERIAL ) ? 1 : 0 ) )
+
+#error "More than one KOKKOS_HAVE_DEFAULT_DEVICE_TYPE_* specified" ;
+
+#endif
+
+/** If a default is not specified then choose from the enabled execution spaces.
+ *  Priority: CUDA, OPENMP, THREADS, SERIAL.  An explicitly specified default is kept.
+ */
+#if   defined ( KOKKOS_HAVE_DEFAULT_DEVICE_TYPE_CUDA )
+#elif defined ( KOKKOS_HAVE_DEFAULT_DEVICE_TYPE_OPENMP )
+#elif defined ( KOKKOS_HAVE_DEFAULT_DEVICE_TYPE_THREADS )
+#elif defined ( KOKKOS_HAVE_DEFAULT_DEVICE_TYPE_SERIAL )
+#elif defined ( KOKKOS_HAVE_CUDA )
+#define KOKKOS_HAVE_DEFAULT_DEVICE_TYPE_CUDA
+#elif defined ( KOKKOS_HAVE_OPENMP )
+#define KOKKOS_HAVE_DEFAULT_DEVICE_TYPE_OPENMP
+#elif defined ( KOKKOS_HAVE_PTHREAD )
+#define KOKKOS_HAVE_DEFAULT_DEVICE_TYPE_THREADS
+#else
+#define KOKKOS_HAVE_DEFAULT_DEVICE_TYPE_SERIAL
+#endif
+
+//----------------------------------------------------------------------------
+/** Determine for what space the code is being compiled: */
+/* CUDA requires __CUDACC__, __CUDA_ARCH__ and KOKKOS_HAVE_CUDA all defined; otherwise HOST. */
+#if defined( __CUDACC__ ) && defined( __CUDA_ARCH__ ) && defined (KOKKOS_HAVE_CUDA)
+#define KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_CUDA
+#else
+#define KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST
+#endif
+
+//----------------------------------------------------------------------------
+//----------------------------------------------------------------------------
+/* Define KOKKOS_POSIX_MEMALIGN_AVAILABLE only if the feature-test macros and the opt-in macro allow it. */
+#if ( defined( _POSIX_C_SOURCE ) && _POSIX_C_SOURCE >= 200112L ) || \
+    ( defined( _XOPEN_SOURCE )   && _XOPEN_SOURCE   >= 600 )
+#if defined(KOKKOS_ENABLE_PERFORMANCE_POSIX_MEMALIGN)
+#define KOKKOS_POSIX_MEMALIGN_AVAILABLE 1
+#endif
+#endif
+
+//----------------------------------------------------------------------------
+//----------------------------------------------------------------------------
+
+/** Enable profiling by default (a pre-existing definition is left alone). */
+
+#ifndef KOKKOS_ENABLE_PROFILING
+#define KOKKOS_ENABLE_PROFILING 1
+#endif
+
+//----------------------------------------------------------------------------
+//----------------------------------------------------------------------------
+/* Transitional macros for switching between the old and new View
+ * are no longer supported.
+ */
+
+#if defined( KOKKOS_USING_DEPRECATED_VIEW )
+#error "Kokkos deprecated View has been removed"
+#endif
+/* Both macros below are now defined unconditionally. */
+#define KOKKOS_USING_EXP_VIEW 1
+#define KOKKOS_USING_EXPERIMENTAL_VIEW
+
+//----------------------------------------------------------------------------
+//----------------------------------------------------------------------------
+
+#endif /* #ifndef KOKKOS_MACROS_HPP */
+
diff --git a/lib/kokkos/core/src/Kokkos_MemoryPool.hpp b/lib/kokkos/core/src/Kokkos_MemoryPool.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..d843f7c9a1442f9ce1a268c04bf6395f28ed94c7
--- /dev/null
+++ b/lib/kokkos/core/src/Kokkos_MemoryPool.hpp
@@ -0,0 +1,1523 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+//
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact  H. Carter Edwards (hcedwar@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#ifndef KOKKOS_MEMORYPOOL_HPP
+#define KOKKOS_MEMORYPOOL_HPP
+
+#include <Kokkos_Core_fwd.hpp>
+#include <Kokkos_Parallel.hpp>
+#include <Kokkos_Atomic.hpp>
+#include <impl/Kokkos_BitOps.hpp>
+#include <impl/Kokkos_Error.hpp>
+#include <impl/KokkosExp_SharedAlloc.hpp>
+
+#include <limits>
+#include <algorithm>
+#include <chrono>
+
+// How should errors be handled?  In general, production code should return a
+// value indicating failure so the user can decide how the error is handled.
+// While experimental, code can abort instead.  If KOKKOS_MEMPOOL_PRINTERR is
+// defined, the code will abort with an error message.  Otherwise, the code will
+// return with a value indicating failure when possible, or do nothing instead.
+//#define KOKKOS_MEMPOOL_PRINTERR
+
+//#define KOKKOS_MEMPOOL_PRINT_INFO
+//#define KOKKOS_MEMPOOL_PRINT_CONSTRUCTOR_INFO
+//#define KOKKOS_MEMPOOL_PRINT_BLOCKSIZE_INFO
+//#define KOKKOS_MEMPOOL_PRINT_SUPERBLOCK_INFO
+//#define KOKKOS_MEMPOOL_PRINT_ACTIVE_SUPERBLOCKS
+//#define KOKKOS_MEMPOOL_PRINT_PAGE_INFO
+//#define KOKKOS_MEMPOOL_PRINT_INDIVIDUAL_PAGE_INFO
+
+// A superblock is considered full when this fraction of its pages are full.
+#define KOKKOS_MEMPOOL_SB_FULL_FRACTION 0.80
+
+// A page is considered full when this fraction of its blocks are full.
+#define KOKKOS_MEMPOOL_PAGE_FULL_FRACTION 0.875  // 28 / 32
+
+//----------------------------------------------------------------------------
+
+namespace Kokkos {
+namespace Experimental {
+
+namespace MempoolImpl {
+
+template < typename T, typename ExecutionSpace >
+struct initialize_array {
+  // Execution space the fill is dispatched to, and its index type.
+  using execution_space = ExecutionSpace;
+  using size_type       = typename ExecutionSpace::size_type;
+  T *  m_data;   // First element of the array being filled.
+  T    m_value;  // Value stored into every element.
+
+  // Fill 'count' elements starting at 'first' with 'fill', then fence.
+  initialize_array( T * first, size_t count, T fill )
+    : m_data( first ), m_value( fill )
+  {
+    Kokkos::parallel_for( count, *this );
+    execution_space::fence();
+  }
+  KOKKOS_INLINE_FUNCTION
+  void operator()( size_type idx ) const { m_data[idx] = m_value; }
+};
+
+template <typename Bitset>
+struct bitset_count
+{
+  using execution_space = typename Bitset::execution_space;
+  using size_type       = typename execution_space::size_type;
+  using value_type      = typename Bitset::size_type;
+  using word_type       = typename Bitset::word_type;
+  word_type *   m_words;   // Words whose set bits are counted.
+  value_type &  m_result;  // Caller-owned destination of the total.
+
+  // Reduce over the first 'word_total' words of 'words' into 'total'.
+  bitset_count( word_type * words, value_type word_total, value_type & total )
+    : m_words( words ), m_result( total )
+  {
+    parallel_reduce( word_total, *this, m_result );
+  }
+
+  // Reduction identity.
+  KOKKOS_INLINE_FUNCTION
+  void init( value_type & partial ) const { partial = 0; }
+
+  // Combine two partial counts.
+  KOKKOS_INLINE_FUNCTION
+  void join( volatile value_type & into, volatile value_type const & from ) const
+  { into += from; }
+
+  // Add the number of set bits in word 'w' to the running count.
+  KOKKOS_INLINE_FUNCTION
+  void operator()( size_type w, value_type & running ) const
+  { running += Kokkos::Impl::bit_count( m_words[w] ); }
+};
+
+template < typename Device >
+class Bitset {  // Bitset over externally supplied words; single-bit updates use atomic fetch-or / fetch-and.
+public:
+  typedef typename Device::execution_space  execution_space;
+  typedef typename Device::memory_space     memory_space;
+  typedef unsigned                          word_type;
+  typedef unsigned                          size_type;
+  // Copies raw bytes from host memory into memory_space (used by set()).
+  typedef Kokkos::Impl::DeepCopy< memory_space, Kokkos::HostSpace > raw_deep_copy;
+
+  // Define some constants.
+  enum {
+    // Size of bitset word.  Should be 32.
+    WORD_SIZE    = sizeof(word_type) * CHAR_BIT,
+    LG_WORD_SIZE = Kokkos::Impl::integral_power_of_two( WORD_SIZE ),
+    WORD_MASK    = WORD_SIZE - 1
+  };
+
+private:
+  word_type *  m_words;           // Word storage handed to init(); not allocated by this class.
+  size_type    m_size;            // Number of bits in the set.
+  size_type    m_num_words;       // Number of words needed to hold m_size bits.
+  word_type    m_last_word_mask;  // Valid bits of the last word; 0 if m_size is a multiple of WORD_SIZE.
+
+public:
+  ~Bitset() = default;
+  Bitset() = default;
+  Bitset( Bitset && ) = default;
+  Bitset( const Bitset & ) = default;
+  Bitset & operator = ( Bitset && ) = default;
+  Bitset & operator = ( const Bitset & ) = default;
+  // Attach the bitset to the storage at w, sized for s bits, and clear every bit.
+  void init( void * w, size_type s )
+  {
+    // Assumption: The size of the memory pointed to by w is a multiple of
+    //             sizeof(word_type).
+
+    m_words = reinterpret_cast<word_type*>( w );
+    m_size = s;
+    m_num_words = ( s + WORD_SIZE - 1 ) >> LG_WORD_SIZE;
+    m_last_word_mask = m_size & WORD_MASK ? ( word_type(1) << ( m_size & WORD_MASK ) ) - 1 : 0;
+
+    reset();
+  }
+  // Number of bits in the set.
+  size_type size() const { return m_size; }
+  // Number of set bits, computed by a parallel reduction over all words.
+  size_type count() const
+  {
+    size_type val;
+    bitset_count< Bitset > bc( m_words, m_num_words, val );
+    return val;
+  }
+  // Set every bit; bits beyond size() in the last word are left clear.
+  void set()
+  {
+    // Set all the bits.
+    initialize_array< word_type, execution_space > ia( m_words, m_num_words, ~word_type(0) );
+
+    if ( m_last_word_mask ) {
+      // Clear the unused bits in the last block.
+      raw_deep_copy( m_words + ( m_num_words - 1 ), &m_last_word_mask, sizeof(word_type) );
+    }
+  }
+  // Clear every bit.
+  void reset()
+  {
+    initialize_array< word_type, execution_space > ia( m_words, m_num_words, word_type(0) );
+  }
+  // Return whether bit i is set, using a volatile load of its word.
+  KOKKOS_FORCEINLINE_FUNCTION
+  bool test( size_type i ) const
+  {
+    size_type word_pos = i >> LG_WORD_SIZE;
+    word_type word = volatile_load( &m_words[ word_pos ] );
+    word_type mask = word_type(1) << ( i & WORD_MASK );
+
+    return word & mask;
+  }
+  // Atomically set bit i.  Returns true only if the bit was clear before the call.
+  KOKKOS_FORCEINLINE_FUNCTION
+  bool set( size_type i ) const
+  {
+    size_type word_pos = i >> LG_WORD_SIZE;
+    word_type mask = word_type(1) << ( i & WORD_MASK );
+
+    return !( atomic_fetch_or( &m_words[ word_pos ], mask ) & mask );
+  }
+  // Atomically clear bit i.  Returns true only if the bit was set before the call.
+  KOKKOS_FORCEINLINE_FUNCTION
+  bool reset( size_type i ) const
+  {
+    size_type word_pos = i >> LG_WORD_SIZE;
+    word_type mask = word_type(1) << ( i & WORD_MASK );
+
+    return atomic_fetch_and( &m_words[ word_pos ], ~mask ) & mask;
+  }
+  // Atomically clear bit i.  Returns (bit was set, value of its word before the clear).
+  KOKKOS_FORCEINLINE_FUNCTION
+  Kokkos::pair< bool, word_type >
+  fetch_word_reset( size_type i ) const
+  {
+    size_type word_pos = i >> LG_WORD_SIZE;
+    word_type mask = word_type(1) << ( i & WORD_MASK );
+
+    Kokkos::pair<bool, word_type> result;
+    result.second = atomic_fetch_and( &m_words[ word_pos ], ~mask );
+    result.first = result.second & mask;
+
+    return result;
+  }
+  // Atomically set any clear bit in the word holding bit i.  Returns (true, bit index) or (false, i).
+  KOKKOS_FORCEINLINE_FUNCTION
+  Kokkos::pair< bool, size_type >
+  set_any_in_word( size_type i, word_type & prev_val ) const
+  {
+    prev_val = 0;  // Stays 0 on failure; on success it receives the word value before the set.
+
+    size_type word_pos = i >> LG_WORD_SIZE;
+    word_type word = volatile_load( &m_words[ word_pos ] );
+
+    // Loop until there are no more unset bits in the word.
+    while ( ~word ) {
+      // Find the first unset bit in the word.
+      size_type bit = Kokkos::Impl::bit_scan_forward( ~word );
+
+      // Try to set the bit.
+			word_type mask = word_type(1) << bit;
+      word = atomic_fetch_or( &m_words[ word_pos ], mask );
+
+      if ( !( word & mask ) ) {
+        // Successfully set the bit.
+        prev_val = word;
+
+        return Kokkos::pair<bool, size_type>( true, ( word_pos << LG_WORD_SIZE ) + bit );
+      }
+    }
+
+    // Didn't find a free bit in this word.
+    return Kokkos::pair<bool, size_type>( false, i );
+  }
+  // Same as the overload without word_mask, but only bits selected by word_mask are candidates.
+  KOKKOS_FORCEINLINE_FUNCTION
+  Kokkos::pair< bool, size_type >
+  set_any_in_word( size_type i, word_type & prev_val, word_type word_mask ) const
+  {
+    prev_val = 0;  // Stays 0 on failure; on success it receives the word value before the set.
+
+    size_type word_pos = i >> LG_WORD_SIZE;
+    word_type word = volatile_load( &m_words[ word_pos ] );
+    word = ( ~word ) & word_mask;  // From here on, 'word' holds the unset candidate bits.
+
+    // Loop until there are no more unset bits in the word.
+    while ( word ) {
+      // Find the first unset bit in the word.
+      size_type bit = Kokkos::Impl::bit_scan_forward( word );
+
+      // Try to set the bit.
+			word_type mask = word_type(1) << bit;
+      word = atomic_fetch_or( &m_words[ word_pos ], mask );
+
+      if ( !( word & mask ) ) {
+        // Successfully set the bit.
+        prev_val = word;
+
+        return Kokkos::pair<bool, size_type>( true, ( word_pos << LG_WORD_SIZE ) + bit );
+      }
+      // Another update set the bit first; recompute the candidates from the fetched value.
+      word = ( ~word ) & word_mask;
+    }
+
+    // Didn't find a free bit in this word.
+    return Kokkos::pair<bool, size_type>( false, i );
+  }
+  // Atomically clear any set bit in the word holding bit i.  Returns (true, bit index) or (false, i).
+  KOKKOS_FORCEINLINE_FUNCTION
+  Kokkos::pair< bool, size_type >
+  reset_any_in_word( size_type i, word_type & prev_val ) const
+  {
+    prev_val = 0;  // Stays 0 on failure; on success it receives the word value before the clear.
+
+    size_type word_pos = i >> LG_WORD_SIZE;
+    word_type word = volatile_load( &m_words[ word_pos ] );
+
+    // Loop until there are no more set bits in the word.
+    while ( word ) {
+      // Find the first set bit in the word.
+      size_type bit = Kokkos::Impl::bit_scan_forward( word );
+
+      // Try to reset the bit.
+			word_type mask = word_type(1) << bit;
+      word = atomic_fetch_and( &m_words[ word_pos ], ~mask );
+
+      if ( word & mask ) {
+        // Successfully reset the bit.
+        prev_val = word;
+
+        return Kokkos::pair<bool, size_type>( true, ( word_pos << LG_WORD_SIZE ) + bit );
+      }
+    }
+
+    // Didn't find a set bit in this word.
+    return Kokkos::pair<bool, size_type>( false, i );
+  }
+  // Same as the overload without word_mask, but only bits selected by word_mask are candidates.
+  KOKKOS_FORCEINLINE_FUNCTION
+  Kokkos::pair< bool, size_type >
+  reset_any_in_word( size_type i, word_type & prev_val, word_type word_mask ) const
+  {
+    prev_val = 0;  // Stays 0 on failure; on success it receives the word value before the clear.
+
+    size_type word_pos = i >> LG_WORD_SIZE;
+    word_type word = volatile_load( &m_words[ word_pos ] );
+    word = word & word_mask;  // From here on, 'word' holds the set candidate bits.
+
+    // Loop until there are no more set bits in the word.
+    while ( word ) {
+      // Find the first set bit in the word.
+      size_type bit = Kokkos::Impl::bit_scan_forward( word );
+
+      // Try to reset the bit.
+			word_type mask = word_type(1) << bit;
+      word = atomic_fetch_and( &m_words[ word_pos ], ~mask );
+
+      if ( word & mask ) {
+        // Successfully reset the bit.
+        prev_val = word;
+
+        return Kokkos::pair<bool, size_type>( true, ( word_pos << LG_WORD_SIZE ) + bit );
+      }
+      // Another update cleared the bit first; recompute the candidates from the fetched value.
+      word = word & word_mask;
+    }
+
+    // Didn't find a set bit in this word.
+    return Kokkos::pair<bool, size_type>( false, i );
+  }
+};
+
+template < typename UInt32View, typename BSHeaderView, typename SBHeaderView,
+           typename MempoolBitset >
+struct create_histogram {
+  using execution_space = typename UInt32View::execution_space;
+  using size_type       = typename execution_space::size_type;
+  using value_type      = Kokkos::pair< double, uint32_t >;
+
+  size_t         m_start;              // Index of the first superblock examined.
+  UInt32View     m_page_histogram;     // Page counts, indexed by allocated blocks in a page.
+  BSHeaderView   m_blocksize_info;     // Per-block-size header data.
+  SBHeaderView   m_sb_header;          // Per-superblock header data.
+  MempoolBitset  m_sb_blocks;          // Allocation bit for every block.
+  size_t         m_lg_max_sb_blocks;   // Shift from a superblock index to its first bit.
+  uint32_t       m_lg_min_block_size;  // Subtracted from lg block size to index m_blocksize_info.
+  uint32_t       m_blocks_per_page;    // Number of blocks (bits) examined per page.
+  value_type &   m_result;             // Caller-owned reduction result.
+
+  // Reduce over superblocks [first_sb, last_sb) into 'result', then fence.
+  create_histogram( size_t first_sb, size_t last_sb, UInt32View histogram,
+                    BSHeaderView blocksize_info, SBHeaderView sb_header,
+                    MempoolBitset sb_blocks, size_t lg_max_sb_blocks,
+                    uint32_t lg_min_block_size, uint32_t blocks_per_page,
+                    value_type & result )
+    : m_start( first_sb ), m_page_histogram( histogram ),
+      m_blocksize_info( blocksize_info ), m_sb_header( sb_header ),
+      m_sb_blocks( sb_blocks ), m_lg_max_sb_blocks( lg_max_sb_blocks ),
+      m_lg_min_block_size( lg_min_block_size ),
+      m_blocks_per_page( blocks_per_page ), m_result( result )
+  {
+    Kokkos::parallel_reduce( last_sb - first_sb, *this, m_result );
+    execution_space::fence();
+  }
+
+  // Reduction identity: zero fill-fraction sum and zero block total.
+  KOKKOS_INLINE_FUNCTION
+  void init( value_type & acc ) const
+  { acc.first = 0.0;  acc.second = 0; }
+
+  // Combine two partial results component-wise.
+  KOKKOS_INLINE_FUNCTION
+  void join( volatile value_type & into, volatile value_type const & from ) const
+  {
+    into.first  += from.first;
+    into.second += from.second;
+  }
+
+  // Add superblock (m_start + i)'s page counts, fill fraction and block count.
+  KOKKOS_INLINE_FUNCTION
+  void operator()( size_type i, value_type & r ) const
+  {
+    const size_type sb_id         = i + m_start;
+    const uint32_t  lg_block_size = m_sb_header(sb_id).m_lg_block_size;
+
+    // Block size 0 marks an empty superblock; it contributes nothing.
+    if ( lg_block_size == 0 ) return;
+
+    const uint32_t size_id       = lg_block_size - m_lg_min_block_size;
+    const uint32_t blocks_per_sb = m_blocksize_info[size_id].m_blocks_per_sb;
+    const uint32_t pages_per_sb  = m_blocksize_info[size_id].m_pages_per_sb;
+    uint32_t       sb_allocated  = 0;
+
+    for ( uint32_t page = 0; page < pages_per_sb; ++page ) {
+      const unsigned first_bit = ( sb_id << m_lg_max_sb_blocks ) + page * m_blocks_per_page;
+      const unsigned last_bit  = first_bit + m_blocks_per_page;
+      uint32_t page_allocated  = 0;
+
+      for ( unsigned bit = first_bit; bit < last_bit; ++bit ) {
+        page_allocated += m_sb_blocks.test( bit );
+      }
+      sb_allocated += page_allocated;
+      atomic_fetch_add( &m_page_histogram(page_allocated), 1 );
+    }
+
+    r.first  += double(sb_allocated) / blocks_per_sb;
+    r.second += blocks_per_sb;
+  }
+};
+
+#ifdef KOKKOS_MEMPOOL_PRINT_SUPERBLOCK_INFO
+// Functor that records the allocated block count of each non-empty superblock.
+template < typename UInt32View, typename SBHeaderView, typename MempoolBitset >
+struct count_allocated_blocks {
+  using execution_space = typename UInt32View::execution_space;
+  using size_type       = typename execution_space::size_type;
+
+  UInt32View     m_num_allocated_blocks;  // Output: allocated blocks per superblock.
+  SBHeaderView   m_sb_header;             // Per-superblock header data.
+  MempoolBitset  m_sb_blocks;             // Allocation bit for every block.
+  size_t         m_sb_size;               // Superblock size; shifted by the lg block size.
+  size_t         m_lg_max_sb_blocks;      // Shift from a superblock index to its first bit.
+
+  // Run over 'sb_total' superblocks and fence before returning.
+  count_allocated_blocks( size_t sb_total, UInt32View counts, SBHeaderView headers,
+                          MempoolBitset blocks, size_t sb_size, size_t lg_max_blocks )
+    : m_num_allocated_blocks( counts ), m_sb_header( headers ),
+      m_sb_blocks( blocks ), m_sb_size( sb_size ), m_lg_max_sb_blocks( lg_max_blocks )
+  {
+    Kokkos::parallel_for( sb_total, *this );
+    execution_space::fence();
+  }
+
+  KOKKOS_INLINE_FUNCTION
+  void operator()( size_type i ) const
+  {
+    const uint32_t lg_block_size = m_sb_header(i).m_lg_block_size;
+
+    // Block size 0 marks an empty superblock; its entry is left untouched.
+    if ( lg_block_size == 0 ) return;
+
+    // lg_block_size is known to be non-zero here, so shift directly.
+    const uint32_t blocks_per_sb = m_sb_size >> lg_block_size;
+    const unsigned first_bit     = i << m_lg_max_sb_blocks;
+    const unsigned last_bit      = first_bit + blocks_per_sb;
+
+    uint32_t allocated = 0;
+    for ( unsigned bit = first_bit; bit < last_bit; ++bit ) {
+      allocated += m_sb_blocks.test( bit );
+    }
+    m_num_allocated_blocks(i) = allocated;
+  }
+};
+#endif
+
+}
+
+/// \class MemoryPool
+/// \brief Bitset based memory manager for pools of same-sized chunks of memory.
+/// \tparam Device Kokkos device that gives the execution and memory space the
+///                allocator will be used in.
+///
+/// MemoryPool is a memory space that can be on host or device.  It provides a
+/// pool memory allocator for fast allocation of same-sized chunks of memory.
+/// The memory is only accessible on the host / device this allocator is
+/// associated with.
+///
+/// This allocator is based on ideas from the following GPU allocators:
+///   Halloc (https://github.com/canonizer/halloc).
+///   ScatterAlloc (https://github.com/ComputationalRadiationPhysics/scatteralloc)
+template < typename Device >
+class MemoryPool {
+private:
+  // The allocator uses superblocks.  A superblock is divided into pages, and a
+  // page is divided into blocks.  A block is the chunk of memory that is given
+  // out by the allocator.  A page always has a number of blocks equal to the
+  // size of the word used by the bitset.  Thus, the pagesize can vary between
+  // superblocks as it is based on the block size of the superblock.  The
+  // allocator supports all powers of 2 from MIN_BLOCK_SIZE to the size of a
+  // superblock as block sizes.
+
+  // Superblocks are divided into 4 categories:
+  //   1. empty    - is completely empty; there are no active allocations
+  //   2. partfull - partially full; there are some active allocations
+  //   3. full     - full enough with active allocations that new allocations
+  //                 will likely fail
+  //   4. active   - is currently the active superblock for a block size
+  //
+  // An inactive superblock is one that is empty, partfull, or full.
+  //
+  // New allocations occur only from an active superblock.  If a superblock is
+  // made inactive after an allocation request is made to it but before the
+  // allocation request is fulfilled, the allocation will still be attempted
+  // from that superblock.  Deallocations can occur to partfull, full, or
+  // active superblocks.  Superblocks move between categories as allocations
+  // and deallocations happen.  Superblocks all start empty.
+  //
+  // Here are the possible moves between categories:
+  //   empty    -> active    During allocation, there is no active superblock
+  //                         or the active superblock is full.
+  //   active   -> full      During allocation, the full threshold of the
+  //                         superblock is reached when increasing the fill
+  //                         level.
+  //   full     -> partfull  During deallocation, the full threshold of the
+  //                         superblock is crossed when decreasing the fill
+  //                         level.
+  //   partfull -> empty     Deallocation of the last allocated block of an
+  //                         inactive superblock.
+  //   partfull -> active    During allocation, the active superblock is full.
+  //
+  // When a new active superblock is needed, partfull superblocks of the same
+  // block size are chosen over empty superblocks.
+  //
+  // The empty and partfull superblocks are tracked using bitsets that represent
+  // the superblocks in those respective categories.  Empty superblocks use a
+  // single bitset, while partfull superblocks use a bitset per block size
+  // (contained sequentially in a single bitset).  Active superblocks are
+  // tracked by the active superblocks array.  Full superblocks aren't tracked
+  // at all.
+
+  typedef typename Device::execution_space    execution_space;
+  typedef typename Device::memory_space       backend_memory_space;
+  typedef Device                              device_type;
+  typedef MempoolImpl::Bitset< device_type >  MempoolBitset;
+
+  // Define some constants.
+  enum {
+    MIN_BLOCK_SIZE     = 64,
+    LG_MIN_BLOCK_SIZE  = Kokkos::Impl::integral_power_of_two( MIN_BLOCK_SIZE ),
+    MAX_BLOCK_SIZES    = 31 - LG_MIN_BLOCK_SIZE + 1,
+
+    // Size of bitset word.
+    BLOCKS_PER_PAGE    = MempoolBitset::WORD_SIZE,
+    LG_BLOCKS_PER_PAGE = MempoolBitset::LG_WORD_SIZE,
+
+    INVALID_SUPERBLOCK = ~uint32_t(0),
+    SUPERBLOCK_LOCK    = ~uint32_t(0) - 1,
+
+    MAX_TRIES          = 32             // Cap on the number of pages searched
+                                        // before an allocation returns empty.
+  };
+
+public:
+  // Stores information about each superblock.
+  struct SuperblockHeader {
+    uint32_t  m_full_pages;
+    uint32_t  m_empty_pages;
+    uint32_t  m_lg_block_size;
+    uint32_t  m_is_active;
+
+    KOKKOS_FUNCTION
+    SuperblockHeader() :
+      m_full_pages(0), m_empty_pages(0), m_lg_block_size(0), m_is_active(false) {}
+  };
+
+  // Stores information about each block size.
+  struct BlockSizeHeader {
+    uint32_t  m_blocks_per_sb;
+    uint32_t  m_pages_per_sb;
+    uint32_t  m_sb_full_level;
+    uint32_t  m_page_full_level;
+
+    KOKKOS_FUNCTION
+    BlockSizeHeader() :
+      m_blocks_per_sb(0), m_pages_per_sb(0), m_sb_full_level(0), m_page_full_level(0) {}
+  };
+
+private:
+  typedef Impl::SharedAllocationTracker            Tracker;
+  typedef View< uint32_t *, device_type >          UInt32View;
+  typedef View< SuperblockHeader *, device_type >  SBHeaderView;
+
+  // The letters 'sb' used in any variable name mean superblock.
+
+  size_t           m_lg_sb_size;        // Log2 of superblock size.
+  size_t           m_sb_size;           // Superblock size.
+  size_t           m_lg_max_sb_blocks;  // Log2 of the number of blocks of the
+                                        // minimum block size in a superblock.
+  size_t           m_num_sb;            // Number of superblocks.
+  size_t           m_ceil_num_sb;       // Number of superblocks rounded up to the smallest
+                                        // multiple of the bitset word size.  Used by
+                                        // bitsets representing superblock categories to
+                                        // ensure different block sizes never share a word
+                                        // in the bitset.
+  size_t           m_num_block_size;    // Number of block sizes supported.
+  size_t           m_data_size;         // Amount of memory available to the allocator.
+  size_t           m_sb_blocks_size;    // Amount of memory for free / empty blocks bitset.
+  size_t           m_empty_sb_size;     // Amount of memory for empty superblocks bitset.
+  size_t           m_partfull_sb_size;  // Amount of memory for partfull superblocks bitset.
+  size_t           m_total_size;        // Total amount of memory allocated.
+  char *           m_data;              // Beginning device memory location used for
+                                        // superblocks.
+  UInt32View       m_active;            // Active superblocks IDs.
+  SBHeaderView     m_sb_header;         // Header info for superblocks.
+  MempoolBitset    m_sb_blocks;         // Bitsets representing free / allocated status
+                                        // of blocks in superblocks.
+  MempoolBitset    m_empty_sb;          // Bitset representing empty superblocks.
+  MempoolBitset    m_partfull_sb;       // Bitsets representing partially full superblocks.
+  Tracker          m_track;             // Tracker for superblock memory.
+  BlockSizeHeader  m_blocksize_info[MAX_BLOCK_SIZES];  // Header info for block sizes.
+
+  // There were several methods tried for storing the block size header info: in a View,
+  // in a View of const data, and in a RandomAccess View.  All of these were slower than
+  // storing it in a static array that is a member variable to the class.  In the latter
+  // case, the block size info gets copied into the constant memory on the GPU along with
+  // the class when it is copied there for executing a parallel loop.  Instead of storing
+  // the values, computing the values every time they were needed was also tried.  This
+  // method was slightly slower than storing them in the static array.
+
+public:
+  //! Tag this class as a kokkos memory space
+  typedef MemoryPool  memory_space;
+
+  ~MemoryPool() = default;
+  MemoryPool() = default;
+  MemoryPool( MemoryPool && ) = default;
+  MemoryPool( const MemoryPool & ) = default;
+  MemoryPool & operator = ( MemoryPool && ) = default;
+  MemoryPool & operator = ( const MemoryPool & ) = default;
+
+  /// \brief Initializes the memory pool.
+  /// \param memspace The memory space from which the memory pool will allocate memory.
+  /// \param total_size The requested memory amount controlled by the allocator.  The
+  ///                   actual amount is rounded up to the smallest multiple of the
+  ///                   superblock size >= the requested size.
+  /// \param log2_superblock_size Log2 of the size of superblocks used by the allocator.
+  ///                             In most use cases, the default value should work.
+  inline
+  MemoryPool( const backend_memory_space & memspace,
+              size_t total_size, size_t log2_superblock_size = 20 )
+    : m_lg_sb_size( log2_superblock_size ),
+      m_sb_size( size_t(1) << m_lg_sb_size ),
+      m_lg_max_sb_blocks( m_lg_sb_size - LG_MIN_BLOCK_SIZE ),
+      m_num_sb( ( total_size + m_sb_size - 1 ) >> m_lg_sb_size ),
+      m_ceil_num_sb( ( ( m_num_sb + BLOCKS_PER_PAGE - 1 ) >> LG_BLOCKS_PER_PAGE ) <<
+                     LG_BLOCKS_PER_PAGE ),
+      m_num_block_size( m_lg_sb_size - LG_MIN_BLOCK_SIZE + 1 ),
+      m_data_size( m_num_sb * m_sb_size ),
+      m_sb_blocks_size( ( m_num_sb << m_lg_max_sb_blocks ) / CHAR_BIT ),
+      m_empty_sb_size( m_ceil_num_sb / CHAR_BIT ),
+      m_partfull_sb_size( m_ceil_num_sb * m_num_block_size / CHAR_BIT ),
+      m_total_size( m_data_size +  m_sb_blocks_size + m_empty_sb_size + m_partfull_sb_size ),
+      m_data(0),
+      m_active( "Active superblocks" ),
+      m_sb_header( "Superblock headers" ),
+      m_track()
+  {
+    // Assumption.  The minimum block size must be a power of 2.
+    static_assert( Kokkos::Impl::is_integral_power_of_two( MIN_BLOCK_SIZE ), "" );
+
+    // Assumption.  Require a superblock be large enough so it takes at least 1
+    // whole bitset word to represent it using the minimum blocksize.
+    if ( m_sb_size < MIN_BLOCK_SIZE * BLOCKS_PER_PAGE ) {
+      printf( "\n** MemoryPool::MemoryPool() Superblock size must be >= %u **\n",
+              MIN_BLOCK_SIZE * BLOCKS_PER_PAGE );
+#ifdef KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST
+      fflush( stdout );
+#endif
+      Kokkos::abort( "" );
+    }
+
+    // Assumption.  A superblock's size can be at most 2^31.  Verify this.
+    if ( m_lg_sb_size > 31 ) {
+      printf( "\n** MemoryPool::MemoryPool() Superblock size must be < %u **\n",
+              ( uint32_t(1) << 31 ) );
+#ifdef KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST
+      fflush( stdout );
+#endif
+      Kokkos::abort( "" );
+    }
+
+    // Assumption.  The Bitset only uses unsigned for size types which limits
+    // the amount of memory the allocator can manage.  Verify the memory size
+    // is below this limit.
+    if ( m_data_size > size_t(MIN_BLOCK_SIZE) * std::numeric_limits<unsigned>::max() ) {
+      printf( "\n** MemoryPool::MemoryPool() Allocator can only manage %lu bytes of memory; requested %lu **\n",
+              size_t(MIN_BLOCK_SIZE) * std::numeric_limits<unsigned>::max(), total_size );
+#ifdef KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST
+      fflush( stdout );
+#endif
+      Kokkos::abort( "" );
+    }
+
+    // Allocate memory for Views.  This is done here instead of at construction
+    // so that the runtime checks can be performed before allocating memory.
+    resize(m_active, m_num_block_size );
+    resize(m_sb_header, m_num_sb );
+
+    // Allocate superblock memory.
+    typedef Impl::SharedAllocationRecord< backend_memory_space, void >  SharedRecord;
+    SharedRecord * rec =
+      SharedRecord::allocate( memspace, "mempool", m_total_size );
+
+    m_track.assign_allocated_record_to_uninitialized( rec );
+    m_data = reinterpret_cast<char *>( rec->data() );
+
+    // Set and initialize the free / empty block bitset memory.
+    m_sb_blocks.init( m_data + m_data_size, m_num_sb << m_lg_max_sb_blocks );
+
+    // Set and initialize the empty superblock block bitset memory.
+    m_empty_sb.init( m_data + m_data_size + m_sb_blocks_size, m_num_sb );
+
+    // Start with all superblocks in the empty category.
+    m_empty_sb.set();
+
+    // Set and initialize the partfull superblock block bitset memory.
+    m_partfull_sb.init( m_data + m_data_size + m_sb_blocks_size + m_empty_sb_size,
+                        m_ceil_num_sb * m_num_block_size );
+
+    // Initialize all active superblocks to be invalid.
+    typename UInt32View::HostMirror host_active = create_mirror_view(m_active);
+    for (size_t i = 0; i < m_num_block_size; ++i) host_active(i) = INVALID_SUPERBLOCK;
+
+    deep_copy(m_active, host_active);
+
+    // Initialize the blocksize info.
+    for ( size_t i = 0; i < m_num_block_size; ++i ) {
+      uint32_t lg_block_size = i + LG_MIN_BLOCK_SIZE;
+      uint32_t blocks_per_sb = m_sb_size >> lg_block_size;
+      uint32_t pages_per_sb = ( blocks_per_sb + BLOCKS_PER_PAGE - 1 ) >> LG_BLOCKS_PER_PAGE;
+
+      m_blocksize_info[i].m_blocks_per_sb = blocks_per_sb;
+      m_blocksize_info[i].m_pages_per_sb = pages_per_sb;
+
+      // Set the full level for the superblock.
+      m_blocksize_info[i].m_sb_full_level =
+        static_cast<uint32_t>( pages_per_sb * KOKKOS_MEMPOOL_SB_FULL_FRACTION );
+
+      if ( m_blocksize_info[i].m_sb_full_level == 0 ) {
+        m_blocksize_info[i].m_sb_full_level = 1;
+      }
+
+      // Set the full level for the page.
+      uint32_t blocks_per_page =
+        blocks_per_sb < BLOCKS_PER_PAGE ? blocks_per_sb : BLOCKS_PER_PAGE;
+
+      m_blocksize_info[i].m_page_full_level =
+        static_cast<uint32_t>( blocks_per_page * KOKKOS_MEMPOOL_PAGE_FULL_FRACTION );
+
+      if ( m_blocksize_info[i].m_page_full_level == 0 ) {
+        m_blocksize_info[i].m_page_full_level = 1;
+      }
+    }
+
+#ifdef KOKKOS_MEMPOOL_PRINT_CONSTRUCTOR_INFO
+    printf( "\n" );
+    printf( "      m_lg_sb_size: %12lu\n", m_lg_sb_size );
+    printf( "         m_sb_size: %12lu\n", m_sb_size );
+    printf( "   m_max_sb_blocks: %12lu\n", size_t(1) << m_lg_max_sb_blocks );
+    printf( "m_lg_max_sb_blocks: %12lu\n", m_lg_max_sb_blocks );
+    printf( "          m_num_sb: %12lu\n", m_num_sb );
+    printf( "     m_ceil_num_sb: %12lu\n", m_ceil_num_sb );
+    printf( "  m_num_block_size: %12lu\n", m_num_block_size );
+    printf( "        data bytes: %12lu\n", m_data_size );
+    printf( "   sb_blocks bytes: %12lu\n", m_sb_blocks_size );
+    printf( "    empty_sb bytes: %12lu\n", m_empty_sb_size );
+    printf( " partfull_sb bytes: %12lu\n", m_partfull_sb_size );
+    printf( "       total bytes: %12lu\n", m_total_size );
+    printf( "   m_empty_sb size: %12u\n", m_empty_sb.size() );
+    printf( "m_partfull_sb size: %12u\n", m_partfull_sb.size() );
+    printf( "\n" );
+    fflush( stdout );
+#endif
+
+#ifdef KOKKOS_MEMPOOL_PRINT_BLOCKSIZE_INFO
+    // Print the blocksize info for all the block sizes.
+    printf( "SIZE    BLOCKS_PER_SB    PAGES_PER_SB    SB_FULL_LEVEL    PAGE_FULL_LEVEL\n" );
+    for ( size_t i = 0; i < m_num_block_size; ++i ) {
+      printf( "%4zu    %13u    %12u    %13u    %15u\n", i + LG_MIN_BLOCK_SIZE,
+              m_blocksize_info[i].m_blocks_per_sb, m_blocksize_info[i].m_pages_per_sb,
+              m_blocksize_info[i].m_sb_full_level, m_blocksize_info[i].m_page_full_level );
+    }
+    printf( "\n" );
+#endif
+  }
+
+  /// \brief Size in bytes of the power-of-two block used to serve a request of alloc_size bytes.
+  KOKKOS_INLINE_FUNCTION
+  size_t allocate_block_size( const size_t alloc_size ) const
+  { const auto lg_size = get_block_size_index( alloc_size ) + LG_MIN_BLOCK_SIZE; return size_t(1) << lg_size; }
+
+  /// \brief Allocate a block of memory from the pool.
+  /// \param alloc_size Size of the requested allocation in number of bytes.
+  ///
+  /// \return Pointer to the reserved block on success; 0 (NULL) if alloc_size is
+  ///         larger than the superblock size or no superblock with space is found.
+  KOKKOS_FUNCTION
+  void * allocate( size_t alloc_size ) const
+  {
+    void * p = 0;
+
+    // Only support allocations up to the superblock size.  Just return 0
+    // (failed allocation) for any size above this.
+    if (alloc_size <= m_sb_size )
+    {
+      int block_size_id = get_block_size_index( alloc_size );
+      uint32_t blocks_per_sb = m_blocksize_info[block_size_id].m_blocks_per_sb;
+      uint32_t pages_per_sb = m_blocksize_info[block_size_id].m_pages_per_sb;
+      unsigned word_size = blocks_per_sb > 32 ? 32 : blocks_per_sb;  // At most 32 block bits per search.
+      unsigned word_mask = ( uint64_t(1) << word_size ) - 1;  // Low word_size bits set; 64-bit shift keeps word_size == 32 defined.
+
+      uint32_t sb_id = volatile_load( &m_active(block_size_id) );
+
+      // If the active superblock entry holds SUPERBLOCK_LOCK, spin until it is released.
+      while ( sb_id == SUPERBLOCK_LOCK ) {
+        sb_id = volatile_load( &m_active(block_size_id) );
+      }
+
+      bool allocation_done = false;
+
+      while (!allocation_done) {
+        bool need_new_sb = false;
+
+        if (sb_id != INVALID_SUPERBLOCK) {
+          // Use the value from the clock register as the hash value.
+          uint64_t hash_val = get_clock_register();
+
+          // Get the starting position for this superblock's bits in the bitset.
+          uint32_t pos_base = sb_id << m_lg_max_sb_blocks;
+
+          // Mask the hash value to choose a page in the superblock (the mask acts as a
+          // modulus assuming pages_per_sb is a power of 2).  Start at that page's first block.
+          uint32_t pos_rel = uint32_t( hash_val & ( pages_per_sb - 1 ) ) << LG_BLOCKS_PER_PAGE;
+
+          // Get the absolute starting position for this superblock's bits in the bitset.
+          uint32_t pos = pos_base + pos_rel;
+
+          // Keep track of the number of pages searched.  Pages in the superblock are
+          // searched linearly from the starting page.  All pages in the superblock are
+          // searched until either a location is found, or no page had a free location.
+          uint32_t pages_searched = 0;
+
+          bool search_done = false;
+
+          while (!search_done) {
+            bool success;
+            unsigned prev_val;
+
+            Kokkos::tie( success, pos ) =
+              m_sb_blocks.set_any_in_word( pos, prev_val, word_mask );
+
+            if ( !success ) {
+              if ( ++pages_searched >= pages_per_sb ) {
+                // Searched all the pages in this superblock.  Look for a new superblock.
+                //
+                // The previous method tried limiting the number of pages searched, but
+                // that caused a huge performance issue in CUDA where the outer loop
+                // executed massive numbers of times.  Threads weren't able to find a
+                // free location when the superblock wasn't full and were able to execute
+                // the outer loop many times before the superblock was switched for a new
+                // one.  Switching to an exhaustive search eliminated this possibility and
+                // didn't slow anything down for the tests.
+                need_new_sb = true;
+                search_done = true;
+              }
+              else {
+                // Move to the next page, wrapping around to the first page when the new
+                // search position would go past this superblock's bits.
+                pos += BLOCKS_PER_PAGE;
+                pos = ( pos < pos_base + blocks_per_sb ) ? pos : pos_base;
+              }
+            }
+            else {
+              // Reserved a memory location to allocate.
+              search_done = true;
+              allocation_done = true;
+
+              uint32_t lg_block_size = block_size_id + LG_MIN_BLOCK_SIZE;
+
+              p = m_data + ( size_t(sb_id) << m_lg_sb_size ) +
+                  ( ( pos - pos_base ) << lg_block_size );  // superblock start + block offset
+
+              uint32_t used_bits = Kokkos::Impl::bit_count( prev_val );  // bits set before this reservation
+
+              if ( used_bits == 0 ) {
+                // This page was empty.  Decrement the number of empty pages for
+                // the superblock.
+                atomic_fetch_sub( &m_sb_header(sb_id).m_empty_pages, 1 );
+              }
+              else if ( used_bits == m_blocksize_info[block_size_id].m_page_full_level - 1 )
+              {
+                // This allocation brought the page up to its full level.  Increment
+                // the number of full pages for the superblock.
+                uint32_t full_pages = atomic_fetch_add( &m_sb_header(sb_id).m_full_pages, 1 );
+
+                // This allocation made the superblock full, so a new one needs to be found.
+                if ( full_pages == m_blocksize_info[block_size_id].m_sb_full_level - 1 ) {
+                  need_new_sb = true;
+                }
+              }
+            }
+          }
+        }
+        else {
+          // No active superblock has been set for this block size yet, i.e. this is
+          // the first allocation for this block size.  A superblock needs to be set
+          // as the active one.  If this point is reached any other time, it is an error.
+          need_new_sb = true;
+        }
+
+        if ( need_new_sb ) {
+          uint32_t new_sb_id = find_superblock( block_size_id, sb_id );
+
+          if ( new_sb_id == sb_id ) {  // Same id back: no replacement superblock was obtained, so stop.
+            allocation_done = true;
+#ifdef KOKKOS_MEMPOOL_PRINT_INFO
+            printf( "** No superblocks available. **\n" );
+#ifdef KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST
+            fflush( stdout );
+#endif
+#endif
+          }
+          else {
+            sb_id = new_sb_id;
+          }
+        }
+      }
+    }
+#ifdef KOKKOS_MEMPOOL_PRINT_INFO
+    else {
+      printf( "** Requested allocation size (%zu) larger than superblock size (%lu). **\n",
+              alloc_size, m_sb_size);
+#ifdef KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST
+      fflush( stdout );
+#endif
+    }
+#endif
+
+    return p;
+  }
+
+  /// \brief Release allocated memory back to the pool.  Addresses outside the pool's
+  ///        data region and blocks whose bit is already clear are ignored.
+  /// \param alloc_ptr Pointer to chunk of memory previously allocated by the allocator.
+  /// \param alloc_size Size of the allocated memory in number of bytes.
+  KOKKOS_FUNCTION
+  void deallocate( void * alloc_ptr, size_t alloc_size ) const
+  {
+    char * ap = static_cast<char *>( alloc_ptr );
+
+    // Only deallocate memory controlled by this pool.
+    if ( ap >= m_data && ap + alloc_size <= m_data + m_data_size ) {
+      // Get the superblock for the address.  This can be calculated by math on
+      // the address since the superblocks are stored contiguously in one memory
+      // chunk.
+      uint32_t sb_id = ( ap - m_data ) >> m_lg_sb_size;
+
+      // Get the starting position for this superblock's bits in the bitset.
+      uint32_t pos_base = sb_id << m_lg_max_sb_blocks;
+
+      // Get the relative position for this memory location's bit in the bitset.
+      // The block size comes from the superblock header, not from alloc_size.
+      uint32_t offset = ( ap - m_data ) - ( size_t(sb_id) << m_lg_sb_size );
+      uint32_t lg_block_size = m_sb_header(sb_id).m_lg_block_size;
+      uint32_t block_size_id = lg_block_size - LG_MIN_BLOCK_SIZE;
+      uint32_t pos_rel = offset >> lg_block_size;
+
+      bool success;
+      unsigned prev_val;
+
+      Kokkos::tie( success, prev_val ) = m_sb_blocks.fetch_word_reset( pos_base + pos_rel );
+
+      // If the memory location was previously deallocated, do nothing.
+      if ( success ) {
+        // Number of bits that were set in the page's word before this reset.
+        uint32_t page_fill_level = Kokkos::Impl::bit_count( prev_val );
+
+        if ( page_fill_level == 1 ) {
+          // This page is now empty.  Increment the number of empty pages for the
+          // superblock.
+          uint32_t empty_pages = atomic_fetch_add( &m_sb_header(sb_id).m_empty_pages, 1 );
+
+          if ( !volatile_load( &m_sb_header(sb_id).m_is_active ) &&
+               empty_pages == m_blocksize_info[block_size_id].m_pages_per_sb - 1 )
+          {
+            // This deallocation caused the superblock to be empty.  Change the
+            // superblock category from partially full to empty.
+            unsigned pos = block_size_id * m_ceil_num_sb + sb_id;
+
+            if ( m_partfull_sb.reset( pos ) ) {
+              // Reset the empty pages and block size for the superblock.
+              volatile_store( &m_sb_header(sb_id).m_empty_pages, uint32_t(0) );
+              volatile_store( &m_sb_header(sb_id).m_lg_block_size, uint32_t(0) );
+
+              // Fence between clearing the header and marking the superblock empty.
+              memory_fence();
+              m_empty_sb.set( sb_id );
+            }
+          }
+        }
+        else if ( page_fill_level == m_blocksize_info[block_size_id].m_page_full_level ) {
+          // This page is no longer full.  Decrement the number of full pages for
+          // the superblock.
+          uint32_t full_pages = atomic_fetch_sub( &m_sb_header(sb_id).m_full_pages, 1 );
+
+          if ( !volatile_load( &m_sb_header(sb_id).m_is_active ) &&
+               full_pages == m_blocksize_info[block_size_id].m_sb_full_level )
+          {
+            // This deallocation caused the number of full pages to decrease below
+            // the full threshold.  Change the superblock category from full to
+            // partially full.
+            unsigned pos = block_size_id * m_ceil_num_sb + sb_id;
+            m_partfull_sb.set( pos );
+          }
+        }
+      }
+    }
+#ifdef KOKKOS_MEMPOOL_PRINTERR
+    else {
+      // %llx requires an unsigned long long argument; uint64_t may be unsigned long.
+      printf( "\n** MemoryPool::deallocate() ADDRESS_OUT_OF_RANGE(0x%llx) **\n",
+              static_cast<unsigned long long>( reinterpret_cast<uint64_t>( alloc_ptr ) ) );
+#ifdef KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST
+      fflush( stdout );
+#endif
+    }
+#endif
+  }
+
+  /// \brief Tests whether the pool has run out of memory it can hand out.
+  KOKKOS_INLINE_FUNCTION
+  bool is_empty() const
+  {
+    // "Empty" refers to available memory: the pool is empty when every
+    // superblock has reached the full level of its block size.
+
+    // Walk the superblock headers and stop at the first one with room left.
+    for ( size_t sb = 0; sb < m_num_sb; ++sb ) {
+      const uint32_t lg_bs = m_sb_header(sb).m_lg_block_size;
+
+      // A superblock has room when its block size is 0 (only the case while it
+      // is empty) or when it has fewer full pages than its full level.  The
+      // || short-circuits, so the block size id is formed only for lg_bs != 0.
+      const bool has_room =
+        lg_bs == 0 ||
+        volatile_load( &m_sb_header(sb).m_full_pages ) <
+          m_blocksize_info[lg_bs - LG_MIN_BLOCK_SIZE].m_sb_full_level;
+      if ( has_room ) return false;
+    }
+
+    // Every superblock was full.
+    return true;
+  }
+
+  // The following functions are used for debugging.
+
+  /// \brief Print a usage summary of the pool (blocks, pages, superblocks) to stdout.
+  ///
+  /// Not marked as a device function.  Extra tables are printed when the matching
+  /// KOKKOS_MEMPOOL_PRINT_* macros are defined.
+  void print_status() const
+  {
+    printf( "\n" );
+
+#ifdef KOKKOS_MEMPOOL_PRINT_SUPERBLOCK_INFO
+    typename SBHeaderView::HostMirror host_sb_header = create_mirror_view(m_sb_header);
+    deep_copy( host_sb_header, m_sb_header );
+
+    UInt32View num_allocated_blocks( "Allocated Blocks", m_num_sb );
+
+    // Count the number of allocated blocks per superblock.
+    {
+      MempoolImpl::count_allocated_blocks< UInt32View, SBHeaderView, MempoolBitset >
+        mch( m_num_sb, num_allocated_blocks, m_sb_header,
+             m_sb_blocks, m_sb_size, m_lg_max_sb_blocks );
+    }
+
+    typename UInt32View::HostMirror host_num_allocated_blocks =
+      create_mirror_view(num_allocated_blocks);
+    deep_copy( host_num_allocated_blocks, num_allocated_blocks );
+
+    // Print header info of all superblocks.
+    printf( "SB_ID    SIZE    ACTIVE    EMPTY_PAGES    FULL_PAGES    USED_BLOCKS\n" );
+    for ( size_t i = 0; i < m_num_sb; ++i ) {
+      printf( "%5zu    %4u    %6d    %11u    %10u     %10u\n", i,
+              host_sb_header(i).m_lg_block_size, host_sb_header(i).m_is_active,
+              host_sb_header(i).m_empty_pages, host_sb_header(i).m_full_pages,
+              host_num_allocated_blocks(i) );
+    }
+
+    printf( "\n" );
+#endif
+
+    // One histogram bin per possible number of used blocks in a page (0 through 32).
+    UInt32View page_histogram( "Page Histogram", 33 );
+
+    // Get a View version of the blocksize info.
+    typedef View< BlockSizeHeader *, device_type >  BSHeaderView;
+    BSHeaderView blocksize_info( "BlockSize Headers", MAX_BLOCK_SIZES );
+
+    Kokkos::Impl::DeepCopy< backend_memory_space, Kokkos::HostSpace >
+      dc( blocksize_info.ptr_on_device(), m_blocksize_info,
+          sizeof(BlockSizeHeader) * m_num_block_size );
+
+    Kokkos::pair< double, uint32_t > result = Kokkos::pair< double, uint32_t >( 0.0, 0 );
+
+    // Create the page histogram.
+    {
+      MempoolImpl::create_histogram< UInt32View, BSHeaderView, SBHeaderView, MempoolBitset >
+        mch( 0, m_num_sb, page_histogram, blocksize_info, m_sb_header, m_sb_blocks,
+             m_lg_max_sb_blocks, LG_MIN_BLOCK_SIZE, BLOCKS_PER_PAGE, result );
+    }
+
+    typename UInt32View::HostMirror host_page_histogram = create_mirror_view(page_histogram);
+    deep_copy( host_page_histogram, page_histogram );
+
+    // Find the used and total pages and blocks.
+    uint32_t used_pages = 0;
+    uint32_t used_blocks = 0;
+    for ( uint32_t i = 1; i < 33; ++i ) {
+      used_pages += host_page_histogram(i);
+      used_blocks += i * host_page_histogram(i);
+    }
+    uint32_t total_pages = used_pages + host_page_histogram(0);
+
+    unsigned num_empty_sb = m_empty_sb.count();
+    unsigned num_non_empty_sb = m_num_sb - num_empty_sb;
+    unsigned num_partfull_sb = m_partfull_sb.count();
+
+    // Every ratio below falls back to 0.0 when its denominator is zero.
+    uint32_t total_blocks = result.second;
+    double ave_sb_full = num_non_empty_sb == 0 ? 0.0 : result.first / num_non_empty_sb;
+    double percent_used_sb = m_num_sb == 0 ? 0.0 : double( m_num_sb - num_empty_sb ) / m_num_sb;
+    double percent_used_pages = total_pages == 0 ? 0.0 : double(used_pages) / total_pages;
+    double percent_used_blocks = total_blocks == 0 ? 0.0 : double(used_blocks) / total_blocks;
+
+    // Count active superblocks.
+    typename UInt32View::HostMirror host_active = create_mirror_view(m_active);
+    deep_copy(host_active, m_active);
+
+    unsigned num_active_sb = 0;
+    for ( size_t i = 0; i < m_num_block_size; ++i ) {
+      num_active_sb += host_active(i) != INVALID_SUPERBLOCK;
+    }
+
+#ifdef KOKKOS_MEMPOOL_PRINT_ACTIVE_SUPERBLOCKS
+    // Print active superblocks.
+    printf( "BS_ID      SB_ID\n" );
+    for ( size_t i = 0; i < m_num_block_size; ++i ) {
+      uint32_t sb_id = host_active(i);
+
+      if ( sb_id == INVALID_SUPERBLOCK ) {
+        printf( "%5zu          I\n", i );
+      }
+      else if ( sb_id == SUPERBLOCK_LOCK ) {
+        printf( "%5zu          L\n", i );
+      }
+      else {
+        printf( "%5zu    %7u\n", i, sb_id );
+      }
+    }
+    printf( "\n" );
+    fflush( stdout );
+#endif
+
+#ifdef KOKKOS_MEMPOOL_PRINT_PAGE_INFO
+    // Print the summary page histogram.
+    printf( "USED_BLOCKS    PAGE_COUNT\n" );
+    for ( uint32_t i = 0; i < 33; ++i ) {
+      printf( "%10u    %10u\n", i, host_page_histogram[i] );
+    }
+    printf( "\n" );
+#endif
+
+#ifdef KOKKOS_MEMPOOL_PRINT_INDIVIDUAL_PAGE_INFO
+    // Print the page histogram for a few individual superblocks.
+//    const uint32_t num_sb_id = 2;
+//    uint32_t sb_id[num_sb_id] = { 0, 10 };
+    const uint32_t num_sb_id = 1;
+    uint32_t sb_id[num_sb_id] = { 0 };
+
+    for ( uint32_t i = 0; i < num_sb_id; ++i ) {
+      deep_copy( page_histogram, 0 );
+
+      {
+        MempoolImpl::create_histogram< UInt32View, BSHeaderView, SBHeaderView, MempoolBitset >
+          mch( sb_id[i], sb_id[i] + 1, page_histogram, blocksize_info, m_sb_header,
+               m_sb_blocks, m_lg_max_sb_blocks, LG_MIN_BLOCK_SIZE, BLOCKS_PER_PAGE, result );
+      }
+
+      deep_copy( host_page_histogram, page_histogram );
+
+      printf( "SB_ID    USED_BLOCKS    PAGE_COUNT\n" );
+      for ( uint32_t j = 0; j < 33; ++j ) {
+        printf( "%5u    %10u    %10u\n", sb_id[i], j, host_page_histogram[j] );
+      }
+      printf( "\n" );
+    }
+
+/*
+    // Print the blocks used for each page of a few individual superblocks.
+    for ( uint32_t i = 0; i < num_sb_id; ++i ) {
+      uint32_t lg_block_size = host_sb_header(sb_id[i]).m_lg_block_size;
+      if ( lg_block_size != 0 ) {
+        printf( "SB_ID    BLOCK ID    USED_BLOCKS\n" );
+
+        uint32_t block_size_id = lg_block_size - LG_MIN_BLOCK_SIZE;
+        uint32_t pages_per_sb = m_blocksize_info[block_size_id].m_pages_per_sb;
+
+        for ( uint32_t j = 0; j < pages_per_sb; ++j ) {
+          unsigned start_pos = ( sb_id[i] << m_lg_max_sb_blocks ) + j * BLOCKS_PER_PAGE;
+          unsigned end_pos = start_pos + BLOCKS_PER_PAGE;
+          uint32_t num_allocated_blocks = 0;
+
+          for ( unsigned k = start_pos; k < end_pos; ++k ) {
+            num_allocated_blocks += m_sb_blocks.test( k );
+          }
+
+          printf( "%5u    %8u    %11u\n", sb_id[i], j, num_allocated_blocks );
+        }
+
+        printf( "\n" );
+      }
+    }
+*/
+#endif
+
+    printf( "   Used blocks: %10u / %10u = %10.6lf\n", used_blocks, total_blocks,
+           percent_used_blocks );
+    printf( "    Used pages: %10u / %10u = %10.6lf\n", used_pages, total_pages,
+           percent_used_pages );
+    printf( "       Used SB: %10zu / %10zu = %10.6lf\n", m_num_sb - num_empty_sb, m_num_sb,
+           percent_used_sb );
+    printf( "     Active SB: %10u\n", num_active_sb );
+    printf( "      Empty SB: %10u\n", num_empty_sb );
+    printf( "   Partfull SB: %10u\n", num_partfull_sb );
+    printf( "       Full SB: %10zu\n",
+           m_num_sb - num_active_sb - num_empty_sb - num_partfull_sb );
+    printf( "Ave. SB Full %%: %10.6lf\n", ave_sb_full );
+    printf( "\n" );
+    fflush( stdout );
+  }
+
+  /// \brief Smallest block size, in bytes, that the pool hands out.
+  KOKKOS_INLINE_FUNCTION size_t get_min_block_size() const { return size_t( MIN_BLOCK_SIZE ); }
+  /// \brief Bytes of block storage in the pool; the bookkeeping bitsets are not counted.
+  size_t get_mem_size() const { return size_t( m_data_size ); }
+
+private:
+  /// \brief Returns the index into the active array for the given size.
+  ///
+  /// The index is ceil( log2(size) ), i.e. log2 of the smallest power of two >= size,
+  /// raised to at least LG_MIN_BLOCK_SIZE and then reduced by LG_MIN_BLOCK_SIZE.
+  KOKKOS_FORCEINLINE_FUNCTION
+  int get_block_size_index( const size_t size ) const
+  {
+    // A superblock is limited to 2^31 bytes, so the size fits in a 32 bit
+    // unsigned and the narrowing cast below is safe.
+
+    // Position of the most significant nonzero bit.
+    const uint32_t top_bit =
+      Kokkos::Impl::bit_scan_reverse( static_cast<unsigned>( size ) );
+
+    // For an integral power of 2 that position already is ceil( log2(size) );
+    // for any other size it has to be rounded up by one.
+    const uint32_t ceil_lg2 = Kokkos::Impl::is_integral_power_of_two( size ) ? top_bit : top_bit + 1;
+
+    // Sizes above MIN_BLOCK_SIZE map to their distance from LG_MIN_BLOCK_SIZE,
+    // so that MIN_BLOCK_SIZE itself corresponds to index 0.
+    if ( ceil_lg2 > LG_MIN_BLOCK_SIZE ) return ceil_lg2 - LG_MIN_BLOCK_SIZE;
+
+    // Everything at or below the minimum block size shares index 0.
+    return 0;
+  }
+
+  /// \brief Finds a superblock with free space to become a new active superblock.
+  ///
+  /// Called when the active superblock for block_size_id has to be replaced (it is
+  /// full, has no free location, or has not been set yet).  The caller that wins the
+  /// compare-exchange on the active entry holds SUPERBLOCK_LOCK while it searches,
+  /// first among the partially full superblocks of this block size and then among
+  /// the empty superblocks.  Every other caller spins until the lock is released.
+  /// \return The id stored in the active entry after the call; this equals old_sb
+  ///         when the lock holder found no replacement.
+  KOKKOS_FUNCTION
+  uint32_t find_superblock( int block_size_id, uint32_t old_sb ) const
+  {
+    // Try to grab the lock on the active entry for this block size.
+    uint32_t lock_sb =
+      Kokkos::atomic_compare_exchange( &m_active(block_size_id), old_sb, SUPERBLOCK_LOCK );
+
+    // Initialize the new superblock to be the previous one so the previous
+    // superblock is returned if a new superblock can't be found.
+    uint32_t new_sb = lock_sb;
+
+    if ( lock_sb == old_sb ) {
+      // This thread has the lock.
+
+      // 1. Look for a partially filled superblock that is of the right block
+      //    size.
+
+      size_t max_tries = m_ceil_num_sb >> LG_BLOCKS_PER_PAGE;  // Steps of BLOCKS_PER_PAGE bits covering m_ceil_num_sb bits.
+      size_t tries = 0;
+      bool search_done = false;
+
+      // Set the starting search position to the beginning of this block
+      // size's section of the partially full bitset.
+      unsigned pos = block_size_id * m_ceil_num_sb;
+
+      while (!search_done) {
+        bool success = false;
+        unsigned prev_val;
+
+        Kokkos::tie( success, pos ) = m_partfull_sb.reset_any_in_word( pos, prev_val );
+
+        if ( !success ) {
+          if ( ++tries >= max_tries ) {
+            // Exceeded number of words for this block size's bitset.
+            search_done = true;
+          }
+          else {
+            pos += BLOCKS_PER_PAGE;
+          }
+        }
+        else {
+          // Found a superblock.  Its id is the offset within this block size's section.
+          search_done = true;
+          new_sb = pos - block_size_id * m_ceil_num_sb;
+
+          // Assertions:
+          //   1. A different superblock than the current should be found.
+#ifdef KOKKOS_MEMPOOL_PRINTERR
+          if ( new_sb == lock_sb ) {
+            printf( "\n** MemoryPool::find_superblock() FOUND_SAME_SUPERBLOCK: %u **\n",
+                    new_sb);
+#ifdef KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST
+            fflush( stdout );
+#endif
+            Kokkos::abort( "" );
+          }
+#endif
+
+          // Mark the new superblock as active.
+          volatile_store( &m_sb_header(new_sb).m_is_active, uint32_t(true) );
+
+          // If there was a previous active superblock, mark it as not active.
+          // It is now in the full category and as such isn't tracked.
+          if ( lock_sb != INVALID_SUPERBLOCK ) {
+            volatile_store( &m_sb_header(lock_sb).m_is_active, uint32_t(false) );
+          }
+
+          memory_fence();
+        }
+      }
+
+      // 2. Look for an empty superblock (only if step 1 found nothing).
+      if ( new_sb == lock_sb ) {
+        tries = 0;
+        search_done = false;
+
+        // Set the starting search position to the beginning of the empty
+        // superblock bitset, which is shared by all block sizes.
+        pos = 0;
+
+        while (!search_done) {
+          bool success = false;
+          unsigned prev_val;
+
+          Kokkos::tie( success, pos ) = m_empty_sb.reset_any_in_word( pos, prev_val );
+
+          if ( !success ) {
+            if ( ++tries >= max_tries ) {
+              // Exceeded number of words for the empty superblock bitset.
+              search_done = true;
+            }
+            else {
+              pos += BLOCKS_PER_PAGE;
+            }
+          }
+          else {
+            // Found a superblock.  The bit position is the superblock id.
+            search_done = true;
+            new_sb = pos;
+
+            // Assertions:
+            //   1. A different superblock than the current should be found.
+#ifdef KOKKOS_MEMPOOL_PRINTERR
+            if ( new_sb == lock_sb ) {
+              printf( "\n** MemoryPool::find_superblock() FOUND_SAME_SUPERBLOCK: %u **\n",
+                      new_sb);
+#ifdef KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST
+              fflush( stdout );
+#endif
+              Kokkos::abort( "" );
+            }
+#endif
+
+            // Set the empty pages, block size, and active status for the
+            // superblock.
+            volatile_store( &m_sb_header(new_sb).m_empty_pages,
+                            m_blocksize_info[block_size_id].m_pages_per_sb );
+            volatile_store( &m_sb_header(new_sb).m_lg_block_size,
+                            block_size_id + LG_MIN_BLOCK_SIZE );
+            volatile_store( &m_sb_header(new_sb).m_is_active, uint32_t(true) );
+
+            // If there was a previous active superblock, mark it as not active.
+            // It is now in the full category and as such isn't tracked.
+            if ( lock_sb != INVALID_SUPERBLOCK ) {
+              volatile_store( &m_sb_header(lock_sb).m_is_active, uint32_t(false) );
+            }
+
+            memory_fence();
+          }
+        }
+      }
+
+      // Write the new active superblock (or the old one if none was found) to release the lock.
+      atomic_exchange( &m_active(block_size_id), new_sb );
+    }
+    else {
+      // Either another thread has the lock and is switching the active superblock for
+      // this block size or another thread has already changed the active superblock
+      // since this thread read its value.  Keep reading the active superblock until
+      // it isn't locked to get the new active superblock.
+      do {
+        new_sb = volatile_load( &m_active(block_size_id) );
+      } while ( new_sb == SUPERBLOCK_LOCK );
+
+      // Assertions:
+      //   1. An invalid superblock should never be found here.
+      //   2. If the new superblock is the same as the previous superblock, the
+      //      allocator is empty.
+#ifdef KOKKOS_MEMPOOL_PRINTERR
+      if ( new_sb == INVALID_SUPERBLOCK ) {
+        printf( "\n** MemoryPool::find_superblock() FOUND_INACTIVE_SUPERBLOCK **\n" );
+#ifdef KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST
+        fflush( stdout );
+#endif
+        Kokkos::abort( "" );
+      }
+#endif
+    }
+
+    return new_sb;
+  }
+
+  /// Reads a 64-bit value from a high-resolution clock source.
+  KOKKOS_FORCEINLINE_FUNCTION
+  uint64_t get_clock_register(void) const
+  {
+#if defined( __CUDA_ARCH__ )
+    // CUDA device code: value returned by clock64().
+    return clock64();
+#elif defined( __i386__ ) || defined( __x86_64 )
+    // x86: rdtsc delivers the low 32 bits in eax and the high 32 bits in edx.
+    unsigned lo, hi;
+    __asm__ volatile("rdtsc" : "=a" (lo), "=d" (hi));
+    return ( static_cast<uint64_t>( hi ) << 32 ) | static_cast<uint64_t>( lo );
+#else
+    // Elsewhere: tick count of std::chrono's high resolution clock since its epoch.
+    return std::chrono::high_resolution_clock::now().time_since_epoch().count();
+#endif
+  }
+};
+
+} // namespace Experimental
+} // namespace Kokkos
+// Remove the KOKKOS_MEMPOOL_* macros at the end of this header (presumably defined earlier in the file).
+#ifdef KOKKOS_MEMPOOL_PRINTERR
+#undef KOKKOS_MEMPOOL_PRINTERR
+#endif
+
+#ifdef KOKKOS_MEMPOOL_PRINT_INFO
+#undef KOKKOS_MEMPOOL_PRINT_INFO
+#endif
+
+#ifdef KOKKOS_MEMPOOL_PRINT_BLOCKSIZE_INFO
+#undef KOKKOS_MEMPOOL_PRINT_BLOCKSIZE_INFO
+#endif
+
+#ifdef KOKKOS_MEMPOOL_PRINT_SUPERBLOCK_INFO
+#undef KOKKOS_MEMPOOL_PRINT_SUPERBLOCK_INFO
+#endif
+
+#ifdef KOKKOS_MEMPOOL_PRINT_PAGE_INFO
+#undef KOKKOS_MEMPOOL_PRINT_PAGE_INFO
+#endif
+
+#ifdef KOKKOS_MEMPOOL_PRINT_INDIVIDUAL_PAGE_INFO
+#undef KOKKOS_MEMPOOL_PRINT_INDIVIDUAL_PAGE_INFO
+#endif
+// The two fraction macros are removed without an #ifdef guard.
+#undef KOKKOS_MEMPOOL_SB_FULL_FRACTION
+#undef KOKKOS_MEMPOOL_PAGE_FULL_FRACTION
+
+#endif // KOKKOS_MEMORYPOOL_HPP
diff --git a/lib/kokkos/core/src/Kokkos_MemoryTraits.hpp b/lib/kokkos/core/src/Kokkos_MemoryTraits.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..5ee1f16fec854fc0ee45e39c488095fdee73ed4f
--- /dev/null
+++ b/lib/kokkos/core/src/Kokkos_MemoryTraits.hpp
@@ -0,0 +1,116 @@
+/*
+//@HEADER
+// ************************************************************************
+// 
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+// 
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+// 
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact  H. Carter Edwards (hcedwar@sandia.gov)
+// 
+// ************************************************************************
+//@HEADER
+*/
+
+#ifndef KOKKOS_MEMORYTRAITS_HPP
+#define KOKKOS_MEMORYTRAITS_HPP
+
+#include <impl/Kokkos_Traits.hpp>
+#include <impl/Kokkos_Tags.hpp>
+
+//----------------------------------------------------------------------------
+
+namespace Kokkos {
+
+/** \brief  Bit flags naming the memory access traits of a View (an extension point).
+ *
+ *  The flags are meant to be independent of one another; should any
+ *  dependency arise, the MemoryTraits template is where it must be enforced.
+ *
+ *  Each flag occupies its own bit.  Zero, the default for a View, means
+ *  that none of these traits is present.
+ */
+enum MemoryTraitsFlags {
+  Unmanaged    = 0x01,
+  RandomAccess = 0x02,
+  Atomic       = 0x04
+};
+
+template < unsigned T >
+struct MemoryTraits {
+  //! Self-typedef that tags this class as a Kokkos memory traits type.
+  typedef MemoryTraits< T > memory_traits ;
+
+  // Each member is nonzero exactly when the matching Kokkos:: flag bit is set in T.
+  enum { Unmanaged    = T & static_cast<unsigned>( Kokkos::Unmanaged ) };
+  enum { RandomAccess = T & static_cast<unsigned>( Kokkos::RandomAccess ) };
+  enum { Atomic       = T & static_cast<unsigned>( Kokkos::Atomic ) };
+};
+
+} // namespace Kokkos
+
+//----------------------------------------------------------------------------
+
+namespace Kokkos {
+// Named shorthands for MemoryTraits instantiated with specific flag combinations.
+typedef MemoryTraits< 0 >                        MemoryManaged ;
+typedef MemoryTraits< Unmanaged >                MemoryUnmanaged ;
+typedef MemoryTraits< Unmanaged | RandomAccess > MemoryRandomAccess ;
+
+} // namespace Kokkos
+
+//----------------------------------------------------------------------------
+
+namespace Kokkos {
+namespace Impl {
+
+/** \brief Memory alignment settings
+ *
+ *  Sets the global value for memory alignment, which must be a power of two.
+ *  Enables compatibility of views from different devices with static stride.
+ *  Define KOKKOS_MEMORY_ALIGNMENT (e.g. via a compiler flag) to override 128.
+ */
+enum { MEMORY_ALIGNMENT =
+#if defined( KOKKOS_MEMORY_ALIGNMENT )
+    ( 1 << Kokkos::Impl::integral_power_of_two( KOKKOS_MEMORY_ALIGNMENT ) )
+#else
+    ( 1 << Kokkos::Impl::integral_power_of_two( 128 ) )
+#endif
+  , MEMORY_ALIGNMENT_THRESHOLD = 4 
+  };
+
+
+} //namespace Impl
+} // namespace Kokkos
+
+#endif /* #ifndef KOKKOS_MEMORYTRAITS_HPP */
+
diff --git a/lib/kokkos/core/src/Kokkos_OpenMP.hpp b/lib/kokkos/core/src/Kokkos_OpenMP.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..7be4f8245f98ea464d8a27313c13c7aa35be4e46
--- /dev/null
+++ b/lib/kokkos/core/src/Kokkos_OpenMP.hpp
@@ -0,0 +1,189 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+//
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact  H. Carter Edwards (hcedwar@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#ifndef KOKKOS_OPENMP_HPP
+#define KOKKOS_OPENMP_HPP
+
+#include <Kokkos_Core_fwd.hpp>
+
+#if defined( KOKKOS_HAVE_OPENMP ) && defined( _OPENMP )
+
+#include <omp.h>
+
+#include <cstddef>
+#include <iosfwd>
+#include <Kokkos_HostSpace.hpp>
+#ifdef KOKKOS_HAVE_HBWSPACE
+#include <Kokkos_HBWSpace.hpp>
+#endif
+#include <Kokkos_ScratchSpace.hpp>
+#include <Kokkos_Parallel.hpp>
+#include <Kokkos_TaskPolicy.hpp>
+#include <Kokkos_Layout.hpp>
+#include <impl/Kokkos_Tags.hpp>
+
+#include <KokkosExp_MDRangePolicy.hpp>
+/*--------------------------------------------------------------------------*/
+
+namespace Kokkos {
+
+/// \class OpenMP
+/// \brief Kokkos device for multicore processors in the host memory space.
+class OpenMP {
+public:
+  //------------------------------------
+  //! \name Type declarations that all Kokkos devices must provide.
+  //@{
+
+  //! Tag this class as a Kokkos execution space
+  typedef OpenMP                execution_space ;
+  #ifdef KOKKOS_HAVE_HBWSPACE
+  typedef Experimental::HBWSpace memory_space ;
+  #else
+  typedef HostSpace             memory_space ;
+  #endif
+  //! This execution space's preferred device_type
+  typedef Kokkos::Device<execution_space,memory_space> device_type;
+
+  typedef LayoutRight           array_layout ;
+  typedef memory_space::size_type  size_type ;
+
+  typedef ScratchMemorySpace< OpenMP > scratch_memory_space ;
+
+  //@}
+  //------------------------------------
+  //! \name Functions that all Kokkos execution spaces must implement.
+  //@{
+  /** \brief  Whether the caller is inside an active parallel region, as reported by omp_in_parallel(). */
+  inline static bool in_parallel() { return omp_in_parallel(); }
+
+  /** \brief  Set the device in a "sleep" state. A noop for OpenMP.  */
+  static bool sleep();
+
+  /** \brief Wake the device from the 'sleep' state. A noop for OpenMP. */
+  static bool wake();
+
+  /** \brief Wait until all dispatched functors complete. A noop for OpenMP. */
+  static void fence() {}
+
+  /// \brief Print configuration information to the given output stream.
+  static void print_configuration( std::ostream & , const bool detail = false );
+
+  /// \brief Free any resources being consumed by the device.
+  static void finalize();
+
+  /** \brief  Initialize the device.
+   *
+   *  1) If the hardware locality library is enabled and OpenMP has not
+   *     already bound threads then bind OpenMP threads to maximize
+   *     core utilization and group for memory hierarchy locality.
+   *
+   *  2) Allocate a HostThread for each OpenMP thread to hold its
+   *     topology and fan in/out data.
+   */
+  static void initialize( unsigned thread_count = 0 ,
+                          unsigned use_numa_count = 0 ,
+                          unsigned use_cores_per_numa = 0 );
+
+  static int is_initialized();
+
+  /** \brief  Return the maximum amount of concurrency.  */
+  static int concurrency();
+
+  //@}
+  //------------------------------------
+  /** \brief  This execution space has a topological thread pool which can be queried.
+   *
+   *  All threads within a pool have a common memory space for which they are cache coherent.
+   *    depth = 0  gives the number of threads in the whole pool.
+   *    depth = 1  gives the number of threads in a NUMA region, typically sharing L3 cache.
+   *    depth = 2  gives the number of threads at the finest granularity, typically sharing L1 cache.
+   */
+  inline static int thread_pool_size( int depth = 0 );
+
+  /** \brief  The rank of the executing thread in this thread pool */
+  KOKKOS_INLINE_FUNCTION static int thread_pool_rank();
+
+  //------------------------------------
+  // The two functions below forward to thread_pool_size(0) and thread_pool_rank().
+  inline static unsigned max_hardware_threads() { return thread_pool_size(0); }
+
+  KOKKOS_INLINE_FUNCTION static
+  unsigned hardware_thread_id() { return thread_pool_rank(); }
+};
+
+} // namespace Kokkos
+
+/*--------------------------------------------------------------------------*/
+/*--------------------------------------------------------------------------*/
+
+namespace Kokkos {
+namespace Impl {
+// OpenMP memory space paired with OpenMP scratch space: value is true, both verify() overloads do nothing.
+template<>
+struct VerifyExecutionCanAccessMemorySpace
+  < Kokkos::OpenMP::memory_space
+  , Kokkos::OpenMP::scratch_memory_space
+  >
+{
+  enum { value = true };
+  static inline void verify() {}
+  static inline void verify( const void * ) {}
+};
+
+} // namespace Impl
+} // namespace Kokkos
+
+/*--------------------------------------------------------------------------*/
+/*--------------------------------------------------------------------------*/
+
+#include <OpenMP/Kokkos_OpenMPexec.hpp>
+#include <OpenMP/Kokkos_OpenMP_Parallel.hpp>
+#include <OpenMP/Kokkos_OpenMP_Task.hpp>
+
+/*--------------------------------------------------------------------------*/
+
+#endif /* #if defined( KOKKOS_HAVE_OPENMP ) && defined( _OPENMP ) */
+#endif /* #ifndef KOKKOS_OPENMP_HPP */
+
+
diff --git a/lib/kokkos/core/src/Kokkos_Pair.hpp b/lib/kokkos/core/src/Kokkos_Pair.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..83436826f4aded7131802662327d6b80c5b5c785
--- /dev/null
+++ b/lib/kokkos/core/src/Kokkos_Pair.hpp
@@ -0,0 +1,530 @@
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+//
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact  H. Carter Edwards (hcedwar@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+
+/// \file Kokkos_Pair.hpp
+/// \brief Declaration and definition of Kokkos::pair.
+///
+/// This header file declares and defines Kokkos::pair and its related
+/// nonmember functions.
+
+#ifndef KOKKOS_PAIR_HPP
+#define KOKKOS_PAIR_HPP
+
+#include <Kokkos_Macros.hpp>
+#include <utility>
+
+namespace Kokkos {
+/// \struct pair
+/// \brief Replacement for std::pair that works on CUDA devices.
+///
+/// The instance methods of std::pair, including its constructors, are
+/// not marked as <tt>__device__</tt> functions.  Thus, they cannot be
+/// called on a CUDA device, such as an NVIDIA GPU.  This struct
+/// implements the same interface as std::pair, but can be used on a
+/// CUDA device as well as on the host.
+template <class T1, class T2>
+struct pair
+{
+  //! The first template parameter of this class.
+  typedef T1 first_type;
+  //! The second template parameter of this class.
+  typedef T2 second_type;
+
+  //! The first element of the pair.
+  first_type  first;
+  //! The second element of the pair.
+  second_type second;
+
+  /// \brief Default constructor.
+  ///
+  /// This value-initializes both elements.  For class types it won't
+  /// compile if the default constructors of T1 and T2 are not defined
+  /// and public.
+  KOKKOS_FORCEINLINE_FUNCTION
+  pair()
+    : first(), second()
+  {}
+
+  /// \brief Constructor that takes both elements of the pair.
+  ///
+  /// This calls the copy constructors of T1 and T2.  It won't compile
+  /// if those copy constructors are not defined and public.
+  KOKKOS_FORCEINLINE_FUNCTION
+  pair(first_type const& f, second_type const& s)
+    : first(f), second(s)
+  {}
+
+  /// \brief Converting constructor from a pair<U,V>.
+  ///
+  /// This initializes first from p.first and second from p.second.  It
+  /// won't compile unless T1 and T2 can be constructed from U and V.
+  template <class U, class V>
+  KOKKOS_FORCEINLINE_FUNCTION
+  pair( const pair<U,V> &p)
+    : first(p.first), second(p.second)
+  {}
+
+  /// \brief Converting constructor from a volatile pair<U,V>.
+  ///
+  /// Same as the constructor above, except that the elements are read
+  /// from a volatile-qualified source pair.
+  template <class U, class V>
+  KOKKOS_FORCEINLINE_FUNCTION
+  pair( const volatile pair<U,V> &p)
+    : first(p.first), second(p.second)
+  {}
+
+  /// \brief Assignment operator from a pair<U,V>.
+  ///
+  /// This assigns p.first to first and p.second to second.  It won't
+  /// compile unless T1 and T2 can be assigned from U and V.
+  template <class U, class V>
+  KOKKOS_FORCEINLINE_FUNCTION
+  pair<T1, T2> & operator=(const pair<U,V> &p)
+  {
+    first = p.first;
+    second = p.second;
+    return *this;
+  }
+
+
+  /// \brief Assignment operator, for volatile <tt>*this</tt>.
+  ///
+  /// \param p [in] Input; right-hand side of the assignment.
+  ///
+  /// This calls the assignment operators of T1 and T2.  It will not
+  /// compile if the assignment operators are not defined and public.
+  ///
+  /// This operator returns \c void instead of <tt>volatile pair<T1,
+  /// T2>& </tt>.  See Kokkos Issue #177 for the explanation.  In
+  /// practice, this means that you should not chain assignments with
+  /// volatile lvalues.
+  template <class U, class V>
+  KOKKOS_FORCEINLINE_FUNCTION
+  void operator=(const volatile pair<U,V> &p) volatile
+  {
+    first = p.first;
+    second = p.second;
+    // We deliberately do not return anything here.  See explanation
+    // in public documentation above.
+  }
+
+  // Converting constructor from std::pair<U,V>; unlike its neighbors it carries no KOKKOS_FORCEINLINE_FUNCTION annotation.
+  template <class U, class V>
+  pair( const std::pair<U,V> &p)
+    : first(p.first), second(p.second)
+  {}
+
+  /// \brief Return the std::pair version of this object.
+  ///
+  /// This is <i>not</i> a device function; you may not call it on a
+  /// CUDA device.  It is meant to be called on the host, if the user
+  /// wants an std::pair instead of a Kokkos::pair.
+  ///
+  /// \note This is not a conversion operator, since defining a
+  ///   conversion operator made the relational operators have
+  ///   ambiguous definitions.
+  std::pair<T1,T2> to_std_pair() const
+  { return std::make_pair(first,second); }
+};
+
+template <class T1, class T2>
+struct pair<T1&, T2&>
+{
+  //! Type of the first member: a reference to T1.
+  typedef T1& first_type;
+  //! Type of the second member: a reference to T2.
+  typedef T2& second_type;
+
+  //! Reference to the first element of the pair.
+  first_type  first;
+  //! Reference to the second element of the pair.
+  second_type second;
+
+  /// \brief Constructor that binds both references.
+  ///
+  /// first is bound to f and second to s.  No copy of the referenced
+  /// objects is made.
+  KOKKOS_FORCEINLINE_FUNCTION
+  pair(first_type f, second_type s)
+    : first(f), second(s)
+  {}
+
+  /// \brief Converting constructor from a pair<U,V>.
+  ///
+  /// first and second are bound to p.first and p.second.  It won't
+  /// compile unless T1& and T2& can bind to those members of p.
+  template <class U, class V>
+  KOKKOS_FORCEINLINE_FUNCTION
+  pair( const pair<U,V> &p)
+    : first(p.first), second(p.second)
+  {}
+
+  // Converting constructor from std::pair<U,V>; carries no KOKKOS_FORCEINLINE_FUNCTION annotation.
+  template <class U, class V>
+  pair( const std::pair<U,V> &p)
+    : first(p.first), second(p.second)
+  {}
+
+  /// \brief Assignment operator.
+  ///
+  /// This assigns p.first and p.second to the objects that first and
+  /// second refer to; the references themselves are not rebound.
+  template <class U, class V>
+  KOKKOS_FORCEINLINE_FUNCTION
+  pair<first_type, second_type> & operator=(const pair<U,V> &p)
+  {
+    first = p.first;
+    second = p.second;
+    return *this;
+  }
+
+  /// \brief Return a std::pair<T1,T2> holding copies of the referenced values.
+  ///
+  /// This is <i>not</i> a device function; you may not call it on a
+  /// CUDA device.  It is meant to be called on the host, if the user
+  /// wants an std::pair instead of a Kokkos::pair.
+  ///
+  /// \note This is not a conversion operator, since defining a
+  ///   conversion operator made the relational operators have
+  ///   ambiguous definitions.
+  std::pair<T1,T2> to_std_pair() const
+  { return std::make_pair(first,second); }
+};
+
+template <class T1, class T2>
+struct pair<T1, T2&>
+{
+  //! Type of the first member: T1, stored by value.
+  typedef T1  first_type;
+  //! Type of the second member: a reference to T2.
+  typedef T2& second_type;
+
+  //! The first element of the pair.
+  first_type  first;
+  //! Reference to the second element of the pair.
+  second_type second;
+
+  /// \brief Constructor that takes both elements of the pair.
+  ///
+  /// first is copy-constructed from f; second is bound to s, so no
+  /// copy of the object s refers to is made.
+  KOKKOS_FORCEINLINE_FUNCTION
+  pair(first_type const& f, second_type s)
+    : first(f), second(s)
+  {}
+
+  /// \brief Converting constructor from a pair<U,V>.
+  ///
+  /// first is constructed from p.first; second is bound to p.second.
+  /// It won't compile unless both initializations are well-formed.
+  template <class U, class V>
+  KOKKOS_FORCEINLINE_FUNCTION
+  pair( const pair<U,V> &p)
+    : first(p.first), second(p.second)
+  {}
+
+  // Converting constructor from std::pair<U,V>; carries no KOKKOS_FORCEINLINE_FUNCTION annotation.
+  template <class U, class V>
+  pair( const std::pair<U,V> &p)
+    : first(p.first), second(p.second)
+  {}
+
+  /// \brief Assignment operator.
+  ///
+  /// This assigns p.first to first, and p.second to the object that
+  /// second refers to; the reference itself is not rebound.
+  template <class U, class V>
+  KOKKOS_FORCEINLINE_FUNCTION
+  pair<first_type, second_type> & operator=(const pair<U,V> &p)
+  {
+    first = p.first;
+    second = p.second;
+    return *this;
+  }
+
+  /// \brief Return a std::pair<T1,T2> holding copies of both values.
+  ///
+  /// This is <i>not</i> a device function; you may not call it on a
+  /// CUDA device.  It is meant to be called on the host, if the user
+  /// wants an std::pair instead of a Kokkos::pair.
+  ///
+  /// \note This is not a conversion operator, since defining a
+  ///   conversion operator made the relational operators have
+  ///   ambiguous definitions.
+  std::pair<T1,T2> to_std_pair() const
+  { return std::make_pair(first,second); }
+};
+
+template <class T1, class T2>
+struct pair<T1&, T2>
+{
+  //! Type of the first member: a reference to T1.
+  typedef T1&  first_type;
+  //! Type of the second member: T2, stored by value.
+  typedef T2 second_type;
+
+  //! Reference to the first element of the pair.
+  first_type  first;
+  //! The second element of the pair.
+  second_type second;
+
+  /// \brief Constructor that takes both elements of the pair.
+  ///
+  /// first is bound to f, so no copy of the object f refers to is
+  /// made; second is copy-constructed from s.
+  KOKKOS_FORCEINLINE_FUNCTION
+  pair(first_type f, second_type const& s)
+    : first(f), second(s)
+  {}
+
+  /// \brief Converting constructor from a pair<U,V>.
+  ///
+  /// first is bound to p.first; second is constructed from p.second.
+  /// It won't compile unless both initializations are well-formed.
+  template <class U, class V>
+  KOKKOS_FORCEINLINE_FUNCTION
+  pair( const pair<U,V> &p)
+    : first(p.first), second(p.second)
+  {}
+
+  // Converting constructor from std::pair<U,V>; carries no KOKKOS_FORCEINLINE_FUNCTION annotation.
+  template <class U, class V>
+  pair( const std::pair<U,V> &p)
+    : first(p.first), second(p.second)
+  {}
+
+  /// \brief Assignment operator.
+  ///
+  /// This assigns p.first to the object that first refers to (the
+  /// reference itself is not rebound), and p.second to second.
+  template <class U, class V>
+  KOKKOS_FORCEINLINE_FUNCTION
+  pair<first_type, second_type> & operator=(const pair<U,V> &p)
+  {
+    first = p.first;
+    second = p.second;
+    return *this;
+  }
+
+  /// \brief Return a std::pair<T1,T2> holding copies of both values.
+  ///
+  /// This is <i>not</i> a device function; you may not call it on a
+  /// CUDA device.  It is meant to be called on the host, if the user
+  /// wants an std::pair instead of a Kokkos::pair.
+  ///
+  /// \note This is not a conversion operator, since defining a
+  ///   conversion operator made the relational operators have
+  ///   ambiguous definitions.
+  std::pair<T1,T2> to_std_pair() const
+  { return std::make_pair(first,second); }
+};
+
+//! Equality operator for Kokkos::pair: true when both \c first and \c second compare equal.
+template <class T1, class T2>
+KOKKOS_FORCEINLINE_FUNCTION
+bool operator== (const pair<T1,T2>& lhs, const pair<T1,T2>& rhs)
+{ return lhs.first==rhs.first && lhs.second==rhs.second; }
+
+//! Inequality operator for Kokkos::pair; the negation of operator==.
+template <class T1, class T2>
+KOKKOS_FORCEINLINE_FUNCTION
+bool operator!= (const pair<T1,T2>& lhs, const pair<T1,T2>& rhs)
+{ return !(lhs==rhs); }
+
+//! Less-than operator for Kokkos::pair: lexicographic order, using only operator< of the elements.
+template <class T1, class T2>
+KOKKOS_FORCEINLINE_FUNCTION
+bool operator<  (const pair<T1,T2>& lhs, const pair<T1,T2>& rhs)
+{ return lhs.first<rhs.first || (!(rhs.first<lhs.first) && lhs.second<rhs.second); }
+
+//! Less-than-or-equal-to operator for Kokkos::pair; defined as !(rhs<lhs).
+template <class T1, class T2>
+KOKKOS_FORCEINLINE_FUNCTION
+bool operator<= (const pair<T1,T2>& lhs, const pair<T1,T2>& rhs)
+{ return !(rhs<lhs); }
+
+//! Greater-than operator for Kokkos::pair; defined as rhs<lhs.
+template <class T1, class T2>
+KOKKOS_FORCEINLINE_FUNCTION
+bool operator>  (const pair<T1,T2>& lhs, const pair<T1,T2>& rhs)
+{ return rhs<lhs; }
+
+//! Greater-than-or-equal-to operator for Kokkos::pair; defined as !(lhs<rhs).
+template <class T1, class T2>
+KOKKOS_FORCEINLINE_FUNCTION
+bool operator>= (const pair<T1,T2>& lhs, const pair<T1,T2>& rhs)
+{ return !(lhs<rhs); }
+
+/// \brief Construct a Kokkos::pair from two values.
+///
+/// Free-function counterpart of the two-argument constructor, in the
+/// manner of std::make_pair; T1 and T2 are deduced from the arguments.
+template <class T1,class T2>
+KOKKOS_FORCEINLINE_FUNCTION
+pair<T1,T2> make_pair (T1 x, T2 y)
+{ return pair<T1,T2>( x, y ); }
+
+/// \brief Build a pair<T1&,T2&> whose members refer to \c x and \c y.
+///
+/// This is the Kokkos counterpart of std::tie (C++11).  Assigning a
+/// pair to the result writes through both references, so two variables
+/// can be set at once from a function that returns a pair.  Example
+/// (<tt>__device__</tt> and <tt>__host__</tt> attributes left out):
+/// \code
+/// // The function to call.
+/// // First result: operation count.
+/// // Second result: whether every operation succeeded.
+/// Kokkos::pair<int, bool> someFunction ();
+///
+/// // A caller that uses Kokkos::tie.
+/// int myFunction () {
+///   int count = 0;
+///   bool success = false;
+///
+///   // Writes to count and success in one statement.
+///   Kokkos::tie (count, success) = someFunction ();
+///
+///   if (! success) {
+///     // ... an operation failed;
+///     //     handle the error ...
+///   }
+///   return count;
+/// }
+/// \endcode
+///
+/// Without tie(), the assignment above would be spelled out as:
+/// \code
+///   Kokkos::pair<int, bool> result = someFunction ();
+///   count = result.first;
+///   success = result.second;
+/// \endcode
+///
+/// The tie() form is shorter and needs no named temporary pair.
+/// Note that the returned pair stores only references, so the
+/// objects passed as \c x and \c y must remain valid for as long as
+/// that pair is used.
+template <class T1,class T2>
+KOKKOS_FORCEINLINE_FUNCTION
+pair<T1 &,T2 &> tie (T1 & x, T2 & y)
+{ return pair<T1 &,T2 &>( x, y ); }
+
+//
+// Specialization of Kokkos::pair for a \c void second argument.  This
+// is not actually a "pair"; it only contains one element, the first.
+//
+template <class T1>
+struct pair<T1,void>
+{
+  typedef T1 first_type;
+  typedef void second_type;
+  //! The only stored element; \c second below is an enumerator fixed at 0, not a data member.
+  first_type  first;
+  enum { second = 0 };
+  //! Default constructor; value-initializes \c first.
+  KOKKOS_FORCEINLINE_FUNCTION
+  pair()
+    : first()
+  {}
+  //! Construct from the single element (not declared explicit, so it also converts implicitly).
+  KOKKOS_FORCEINLINE_FUNCTION
+  pair(const first_type & f)
+    : first(f)
+  {}
+  //! Two-argument form; the unnamed int argument is ignored.
+  KOKKOS_FORCEINLINE_FUNCTION
+  pair(const first_type & f, int)
+    : first(f)
+  {}
+  //! Converting constructor from a pair<U,void>; initializes \c first from p.first.
+  template <class U>
+  KOKKOS_FORCEINLINE_FUNCTION
+  pair( const pair<U,void> &p)
+    : first(p.first)
+  {}
+  //! Assignment from a pair<U,void>; assigns p.first to \c first.
+  template <class U>
+  KOKKOS_FORCEINLINE_FUNCTION
+  pair<T1, void> & operator=(const pair<U,void> &p)
+  {
+    first = p.first;
+    return *this;
+  }
+};
+
+// Relational operator overloads for Kokkos::pair<T1,void>.  Only
+// \c first is compared; operator!= is derived from operator==, and
+// operator<=, operator> and operator>= are derived from operator<.
+
+template <class T1>
+KOKKOS_FORCEINLINE_FUNCTION
+bool operator== (const pair<T1,void>& lhs, const pair<T1,void>& rhs)
+{ return lhs.first==rhs.first; }
+
+template <class T1>
+KOKKOS_FORCEINLINE_FUNCTION
+bool operator!= (const pair<T1,void>& lhs, const pair<T1,void>& rhs)
+{ return !(lhs==rhs); }
+
+template <class T1>
+KOKKOS_FORCEINLINE_FUNCTION
+bool operator<  (const pair<T1,void>& lhs, const pair<T1,void>& rhs)
+{ return lhs.first<rhs.first; }
+
+template <class T1>
+KOKKOS_FORCEINLINE_FUNCTION
+bool operator<= (const pair<T1,void>& lhs, const pair<T1,void>& rhs)
+{ return !(rhs<lhs); }
+
+template <class T1>
+KOKKOS_FORCEINLINE_FUNCTION
+bool operator>  (const pair<T1,void>& lhs, const pair<T1,void>& rhs)
+{ return rhs<lhs; }
+
+template <class T1>
+KOKKOS_FORCEINLINE_FUNCTION
+bool operator>= (const pair<T1,void>& lhs, const pair<T1,void>& rhs)
+{ return !(lhs<rhs); }
+
+} // namespace Kokkos
+
+
+#endif //KOKKOS_PAIR_HPP
diff --git a/lib/kokkos/core/src/Kokkos_Parallel.hpp b/lib/kokkos/core/src/Kokkos_Parallel.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..588dc90af38c14c691c39fc88d22efaba51f6be4
--- /dev/null
+++ b/lib/kokkos/core/src/Kokkos_Parallel.hpp
@@ -0,0 +1,527 @@
+/*
+//@HEADER
+// ************************************************************************
+// 
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+// 
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+// 
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact  H. Carter Edwards (hcedwar@sandia.gov)
+// 
+// ************************************************************************
+//@HEADER
+*/
+
+/// \file Kokkos_Parallel.hpp
+/// \brief Declaration of parallel operators
+
+#ifndef KOKKOS_PARALLEL_HPP
+#define KOKKOS_PARALLEL_HPP
+
+#include <cstddef>
+#include <Kokkos_Core_fwd.hpp>
+#include <Kokkos_View.hpp>
+#include <Kokkos_ExecPolicy.hpp>
+
+#if (KOKKOS_ENABLE_PROFILING)
+#include <impl/Kokkos_Profiling_Interface.hpp>
+#include <typeinfo>
+#endif
+
+#include <impl/Kokkos_Tags.hpp>
+#include <impl/Kokkos_Traits.hpp>
+#include <impl/Kokkos_FunctorAdapter.hpp>
+
+#ifdef KOKKOS_HAVE_DEBUG
+#include<iostream>
+#endif
+
+//----------------------------------------------------------------------------
+//----------------------------------------------------------------------------
+
+namespace Kokkos {
+namespace Impl {
+
+//----------------------------------------------------------------------------
+/** \brief  Given a Functor and Execution Policy query an execution space.
+ *
+ *  if       the Policy has an execution space use that
+ *  else if  the Functor has an execution_space use that
+ *  else if  the Functor has a device_type use that for backward compatibility
+ *  else     use the default
+ */
+template< class Functor
+        , class Policy
+        , class EnableFunctor
+        , class EnablePolicy
+        > // NOTE(review): no defaults for the Enable parameters are visible here; two-argument uses in this file presumably rely on a forward declaration -- confirm
+struct FunctorPolicyExecutionSpace {
+  typedef Kokkos::DefaultExecutionSpace execution_space ; // primary template: fall back to the default space
+};
+
+template< class Functor , class Policy > // requires Functor::device_type and Policy::execution_space to name types
+struct FunctorPolicyExecutionSpace
+  < Functor , Policy
+  , typename enable_if_type< typename Functor::device_type     >::type
+  , typename enable_if_type< typename Policy ::execution_space >::type
+  >
+{
+  typedef typename Policy ::execution_space execution_space ; // the policy's space is exported
+};
+
+template< class Functor , class Policy > // requires Functor::execution_space and Policy::execution_space to name types
+struct FunctorPolicyExecutionSpace
+  < Functor , Policy
+  , typename enable_if_type< typename Functor::execution_space >::type
+  , typename enable_if_type< typename Policy ::execution_space >::type
+  >
+{
+  typedef typename Policy ::execution_space execution_space ; // the policy's space is exported
+};
+
+template< class Functor , class Policy , class EnableFunctor > // requires only Policy::execution_space; the functor slot is unconstrained
+struct FunctorPolicyExecutionSpace
+  < Functor , Policy
+  , EnableFunctor
+  , typename enable_if_type< typename Policy::execution_space >::type
+  >
+{
+  typedef typename Policy ::execution_space execution_space ; // the policy's space is exported
+};
+
+template< class Functor , class Policy , class EnablePolicy > // requires only Functor::device_type; the policy slot is unconstrained
+struct FunctorPolicyExecutionSpace
+  < Functor , Policy
+  , typename enable_if_type< typename Functor::device_type >::type
+  , EnablePolicy
+  >
+{
+  typedef typename Functor::device_type execution_space ; // the functor's device_type is exported as the space
+};
+
+template< class Functor , class Policy , class EnablePolicy > // requires only Functor::execution_space; the policy slot is unconstrained
+struct FunctorPolicyExecutionSpace
+  < Functor , Policy
+  , typename enable_if_type< typename Functor::execution_space >::type
+  , EnablePolicy
+  >
+{
+  typedef typename Functor::execution_space execution_space ; // the functor's space is exported
+};
+
+} // namespace Impl
+} // namespace Kokkos
+
+//----------------------------------------------------------------------------
+//----------------------------------------------------------------------------
+
+namespace Kokkos {
+
+/** \brief Execute \c functor in parallel according to the execution \c policy.
+ *
+ * A "functor" is a class containing the function to execute in parallel,
+ * data needed for that execution, and an optional \c execution_space
+ * typedef.  Here is an example functor for parallel_for:
+ *
+ * \code
+ *  class FunctorType {
+ *  public:
+ *    typedef  ...  execution_space ;
+ *    void operator() ( WorkType iwork ) const ;
+ *  };
+ * \endcode
+ *
+ * In the above example, \c WorkType is any integer type for which a
+ * valid conversion from \c size_t to \c WorkType exists.  Its
+ * <tt>operator()</tt> method defines the operation to parallelize,
+ * over the range of integer indices <tt>iwork=[0,work_count-1]</tt>.
+ * This compares to a single iteration \c iwork of a \c for loop.
+ * If \c execution_space is not defined DefaultExecutionSpace will be used.
+ */
+template< class ExecPolicy , class FunctorType >
+inline
+void parallel_for( const ExecPolicy  & policy
+                 , const FunctorType & functor
+                 , const std::string& str = ""
+                 , typename Impl::enable_if< ! Impl::is_integral< ExecPolicy >::value >::type * = 0
+                 )
+{
+#if (KOKKOS_ENABLE_PROFILING)
+  uint64_t kpID = 0; // passed by address to beginParallelFor, by value to endParallelFor
+  if ( Kokkos::Profiling::profileLibraryLoaded() ) {
+    Kokkos::Profiling::beginParallelFor( str.empty() ? typeid(FunctorType).name() : str , 0 , &kpID );
+  }
+#endif
+  // The closure is built between the tracking claim/disable and release/enable calls.
+  Kokkos::Impl::shared_allocation_tracking_claim_and_disable();
+  Impl::ParallelFor< FunctorType , ExecPolicy > closure( functor , policy );
+  Kokkos::Impl::shared_allocation_tracking_release_and_enable();
+
+  closure.execute();
+
+#if (KOKKOS_ENABLE_PROFILING)
+  if ( Kokkos::Profiling::profileLibraryLoaded() ) {
+    Kokkos::Profiling::endParallelFor( kpID );
+  }
+#endif
+}
+
+template< class FunctorType >
+inline
+void parallel_for( const size_t        work_count
+                 , const FunctorType & functor
+                 , const std::string& str = ""
+                 )
+{
+  // No policy object was supplied: derive the execution space from the
+  // functor and run over a RangePolicy built as policy(0,work_count).
+  typedef typename Impl::FunctorPolicyExecutionSpace< FunctorType , void >::execution_space execution_space ;
+  typedef RangePolicy< execution_space > policy ;
+
+#if (KOKKOS_ENABLE_PROFILING)
+  uint64_t kpID = 0; // passed by address to beginParallelFor, by value to endParallelFor
+  if ( Kokkos::Profiling::profileLibraryLoaded() ) {
+    Kokkos::Profiling::beginParallelFor( str.empty() ? typeid(FunctorType).name() : str , 0 , &kpID );
+  }
+#endif
+  // The closure is built between the tracking claim/disable and release/enable calls.
+  Kokkos::Impl::shared_allocation_tracking_claim_and_disable();
+  Impl::ParallelFor< FunctorType , policy > closure( functor , policy(0,work_count) );
+  Kokkos::Impl::shared_allocation_tracking_release_and_enable();
+
+  closure.execute();
+
+#if (KOKKOS_ENABLE_PROFILING)
+  if ( Kokkos::Profiling::profileLibraryLoaded() ) {
+    Kokkos::Profiling::endParallelFor( kpID );
+  }
+#endif
+}
+
+template< class ExecPolicy , class FunctorType >
+inline
+void parallel_for( const std::string & str
+                 , const ExecPolicy  & policy
+                 , const FunctorType & functor )
+{
+#if KOKKOS_ENABLE_DEBUG_PRINT_KERNEL_NAMES
+  Kokkos::fence();
+  std::cout << "KOKKOS_DEBUG Start parallel_for kernel: " << str << std::endl;
+#endif
+  // Label-first convenience form: forward to the label-last overload.
+  parallel_for( policy , functor , str );
+
+#if KOKKOS_ENABLE_DEBUG_PRINT_KERNEL_NAMES
+  Kokkos::fence();
+  std::cout << "KOKKOS_DEBUG End   parallel_for kernel: " << str << std::endl;
+#endif
+  static_cast<void>( str );
+}
+
+}
+
+#include <Kokkos_Parallel_Reduce.hpp>
+//----------------------------------------------------------------------------
+//----------------------------------------------------------------------------
+
+namespace Kokkos {
+
+/// \fn parallel_scan
+/// \tparam ExecutionPolicy The execution policy type.
+/// \tparam FunctorType     The scan functor type.
+///
+/// \param policy  [in] The execution policy.
+/// \param functor [in] The scan functor.
+///
+/// This function implements a parallel scan pattern.  The scan can
+/// be either inclusive or exclusive, depending on how you implement
+/// the scan functor.
+///
+/// A scan functor looks almost exactly like a reduce functor, except
+/// that its operator() takes a third \c bool argument, \c final_pass,
+/// which indicates whether this is the last pass of the scan
+/// operation.  We will show below how to use the \c final_pass
+/// argument to control whether the scan is inclusive or exclusive.
+///
+/// Here is the minimum required interface of a scan functor for a POD
+/// (plain old data) value type \c PodType.  That is, the result is a
+/// View of zero or more PodType.  It is also possible for the result
+/// to be an array of (same-sized) arrays of PodType, but we do not
+/// show the required interface for that here.
+/// \code
+/// template< class ExecPolicy , class FunctorType >
+/// class ScanFunctor {
+/// public:
+///   // The Kokkos device type
+///   typedef ... execution_space;
+///   // Type of an entry of the array containing the result;
+///   // also the type of each of the entries combined using
+///   // operator() or join().
+///   typedef PodType value_type;
+///
+///   void operator () (const ExecPolicy::member_type & i, value_type& update, const bool final_pass) const;
+///   void init (value_type& update) const;
+///   void join (volatile value_type& update, volatile const value_type& input) const;
+/// };
+/// \endcode
+///
+/// Here is an example of a functor which computes an inclusive plus-scan
+/// of an array of \c int, in place.  If given an array [1, 2, 3, 4], this
+/// scan will overwrite that array with [1, 3, 6, 10].
+///
+/// \code
+/// template<class SpaceType>
+/// class InclScanFunctor {
+/// public:
+///   typedef SpaceType execution_space;
+///   typedef int value_type;
+///   typedef typename SpaceType::size_type size_type;
+///
+///   InclScanFunctor( Kokkos::View<value_type*, execution_space> x
+///                  , Kokkos::View<value_type*, execution_space> y ) : m_x(x), m_y(y) {}
+///
+///   void operator () (const size_type i, value_type& update, const bool final_pass) const {
+///     update += m_x(i);
+///     if (final_pass) {
+///       m_y(i) = update;
+///     }
+///   }
+///   void init (value_type& update) const {
+///     update = 0;
+///   }
+///   void join (volatile value_type& update, volatile const value_type& input) const {
+///     update += input;
+///   }
+///
+/// private:
+///   Kokkos::View<value_type*, execution_space> m_x;
+///   Kokkos::View<value_type*, execution_space> m_y;
+/// };
+/// \endcode
+///
+/// Here is an example of a functor which computes an <i>exclusive</i>
+/// scan of an array of \c int, in place.  In operator(), note both
+/// that the final_pass test and the update have switched places, and
+/// the use of a temporary.  If given an array [1, 2, 3, 4], this scan
+/// will overwrite that array with [0, 1, 3, 6].
+///
+/// \code
+/// template<class SpaceType>
+/// class ExclScanFunctor {
+/// public:
+///   typedef SpaceType execution_space;
+///   typedef int value_type;
+///   typedef typename SpaceType::size_type size_type;
+///
+///   ExclScanFunctor (Kokkos::View<value_type*, execution_space> x) : x_ (x) {}
+///
+///   void operator () (const size_type i, value_type& update, const bool final_pass) const {
+///     const value_type x_i = x_(i);
+///     if (final_pass) {
+///       x_(i) = update;
+///     }
+///     update += x_i;
+///   }
+///   void init (value_type& update) const {
+///     update = 0;
+///   }
+///   void join (volatile value_type& update, volatile const value_type& input) const {
+///     update += input;
+///   }
+///
+/// private:
+///   Kokkos::View<value_type*, execution_space> x_;
+/// };
+/// \endcode
+///
+/// Here is an example of a functor which builds on the above
+/// exclusive scan example, to compute an offsets array from a
+/// population count array, in place.  We assume that the pop count
+/// array has an extra entry at the end to store the final count.  If
+/// given an array [1, 2, 3, 4, 0], this scan will overwrite that
+/// array with [0, 1, 3, 6, 10].
+///
+/// \code
+/// template<class SpaceType>
+/// class OffsetScanFunctor {
+/// public:
+///   typedef SpaceType execution_space;
+///   typedef int value_type;
+///   typedef typename SpaceType::size_type size_type;
+///
+///   // last_index_ is the last valid index (zero-based) of x.
+///   // If x has length zero, then last_index_ won't be used anyway.
+///   OffsetScanFunctor( Kokkos::View<value_type*, execution_space> x
+///                    , Kokkos::View<value_type*, execution_space> y )
+///      : m_x(x), m_y(y), last_index_ (x.dimension_0 () == 0 ? 0 : x.dimension_0 () - 1)
+///   {}
+///
+///   void operator () (const size_type i, int& update, const bool final_pass) const {
+///     if (final_pass) {
+///       m_y(i) = update;
+///     }
+///     update += m_x(i);
+///     // The last entry of m_y gets the final sum.
+///     if (final_pass && i == last_index_) {
+///       m_y(i+1) = update;
+///     }
+///   }
+///   void init (value_type& update) const {
+///     update = 0;
+///   }
+///   void join (volatile value_type& update, volatile const value_type& input) const {
+///     update += input;
+///   }
+///
+/// private:
+///   Kokkos::View<value_type*, execution_space> m_x;
+///   Kokkos::View<value_type*, execution_space> m_y;
+///   const size_type last_index_;
+/// };
+/// \endcode
+///
+template< class ExecutionPolicy , class FunctorType >
+inline
+void parallel_scan( const ExecutionPolicy & policy
+                  , const FunctorType     & functor
+                  , const std::string& str = ""
+                  , typename Impl::enable_if< ! Impl::is_integral< ExecutionPolicy >::value >::type * = 0
+                  )
+{
+#if (KOKKOS_ENABLE_PROFILING)
+  uint64_t kpID = 0; // passed by address to beginParallelScan, by value to endParallelScan
+  if ( Kokkos::Profiling::profileLibraryLoaded() ) {
+    Kokkos::Profiling::beginParallelScan( str.empty() ? typeid(FunctorType).name() : str , 0 , &kpID );
+  }
+#endif
+  // The closure is built between the tracking claim/disable and release/enable calls.
+  Kokkos::Impl::shared_allocation_tracking_claim_and_disable();
+  Impl::ParallelScan< FunctorType , ExecutionPolicy > closure( functor , policy );
+  Kokkos::Impl::shared_allocation_tracking_release_and_enable();
+
+  closure.execute();
+
+#if (KOKKOS_ENABLE_PROFILING)
+  if ( Kokkos::Profiling::profileLibraryLoaded() ) {
+    Kokkos::Profiling::endParallelScan( kpID );
+  }
+#endif
+
+}
+
+template< class FunctorType >
+inline
+void parallel_scan( const size_t        work_count
+                  , const FunctorType & functor
+                  , const std::string& str = "" )
+{
+  // No policy object was supplied: derive the execution space from the
+  // functor and run over a RangePolicy built as policy(0,work_count).
+  typedef typename Kokkos::Impl::FunctorPolicyExecutionSpace< FunctorType , void >::execution_space execution_space ;
+
+  typedef Kokkos::RangePolicy< execution_space > policy ;
+
+#if (KOKKOS_ENABLE_PROFILING)
+  uint64_t kpID = 0; // passed by address to beginParallelScan, by value to endParallelScan
+  if ( Kokkos::Profiling::profileLibraryLoaded() ) {
+    Kokkos::Profiling::beginParallelScan( str.empty() ? typeid(FunctorType).name() : str , 0 , &kpID );
+  }
+#endif
+  // The closure is built between the tracking claim/disable and release/enable calls.
+  Kokkos::Impl::shared_allocation_tracking_claim_and_disable();
+  Impl::ParallelScan< FunctorType , policy > closure( functor , policy(0,work_count) );
+  Kokkos::Impl::shared_allocation_tracking_release_and_enable();
+
+  closure.execute();
+
+#if (KOKKOS_ENABLE_PROFILING)
+  if ( Kokkos::Profiling::profileLibraryLoaded() ) {
+    Kokkos::Profiling::endParallelScan( kpID );
+  }
+#endif
+
+}
+
+template< class ExecutionPolicy , class FunctorType >
+inline
+void parallel_scan( const std::string& str
+                  , const ExecutionPolicy & policy
+                  , const FunctorType     & functor)
+{
+#if KOKKOS_ENABLE_DEBUG_PRINT_KERNEL_NAMES
+  Kokkos::fence();
+  std::cout << "KOKKOS_DEBUG Start parallel_scan kernel: " << str << std::endl;
+#endif
+  // Label-first convenience form: forward to the label-last overload.
+  parallel_scan( policy , functor , str );
+
+#if KOKKOS_ENABLE_DEBUG_PRINT_KERNEL_NAMES
+  Kokkos::fence();
+  std::cout << "KOKKOS_DEBUG End   parallel_scan kernel: " << str << std::endl;
+#endif
+  static_cast<void>( str );
+}
+
+} // namespace Kokkos
+
+//----------------------------------------------------------------------------
+//----------------------------------------------------------------------------
+
+namespace Kokkos {
+namespace Impl {
+
+template< class FunctorType , class Enable = void > // primary template: used when neither specialization below applies
+struct FunctorTeamShmemSize
+{
+  KOKKOS_INLINE_FUNCTION static size_t value( const FunctorType & , int ) { return 0 ; } // both arguments ignored; always 0
+};
+
+template< class FunctorType > // considered only when &FunctorType::team_shmem_size is a well-formed expression
+struct FunctorTeamShmemSize< FunctorType , typename Impl::enable_if< 0 < sizeof( & FunctorType::team_shmem_size ) >::type >
+{
+  static inline size_t value( const FunctorType & f , int team_size ) { return f.team_shmem_size( team_size ) ; } // forwards to the functor
+};
+
+template< class FunctorType > // considered only when &FunctorType::shmem_size is a well-formed expression
+struct FunctorTeamShmemSize< FunctorType , typename Impl::enable_if< 0 < sizeof( & FunctorType::shmem_size ) >::type >
+{
+  static inline size_t value( const FunctorType & f , int team_size ) { return f.shmem_size( team_size ) ; } // forwards to the functor
+};
+
+} // namespace Impl
+} // namespace Kokkos
+
+//----------------------------------------------------------------------------
+//----------------------------------------------------------------------------
+
+#endif /* KOKKOS_PARALLEL_HPP */
+
diff --git a/lib/kokkos/core/src/Kokkos_Parallel_Reduce.hpp b/lib/kokkos/core/src/Kokkos_Parallel_Reduce.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..695bc79a1ab900405a160843d8777651dc63cb22
--- /dev/null
+++ b/lib/kokkos/core/src/Kokkos_Parallel_Reduce.hpp
@@ -0,0 +1,1240 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+//
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact  H. Carter Edwards (hcedwar@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+
+namespace Kokkos {
+
+
+template<class T, class Enable = void>
+struct is_reducer_type {
+  enum { value = 0 }; // primary template: T is not treated as a reducer
+};
+
+// Specialization chosen when T has a nested reducer_type that is T itself.
+template<class T>
+struct is_reducer_type<T,typename std::enable_if<
+                       std::is_same<T,typename T::reducer_type>::value
+                      >::type> {
+  enum { value = 1 }; // T::reducer_type is the same type as T
+};
+
+namespace Experimental {
+
+
+template<class Scalar,class Space = HostSpace>
+struct Sum {
+public:
+  // Required: reducer tag and the scalar type being reduced.
+  using reducer_type = Sum;
+  using value_type   = Scalar;
+
+  using result_view_type = Kokkos::View<value_type, Space, Kokkos::MemoryTraits<Kokkos::Unmanaged> >;
+
+  value_type init_value;  // value that init() assigns
+
+private:
+  result_view_type result;  // returned by result_view()
+
+  // Default init_value: 0 for arithmetic types, value_type() otherwise.
+  template<class ValueType, bool is_arithmetic = std::is_arithmetic<ValueType>::value >
+  struct InitWrapper;
+
+  template<class ValueType >
+  struct InitWrapper<ValueType,true> {
+    static ValueType value() { return static_cast<value_type>(0); }
+  };
+
+  template<class ValueType >
+  struct InitWrapper<ValueType,false> {
+    static ValueType value() { return value_type(); }
+  };
+
+public:
+  // Construct from a reference or an existing view, using the default init_value.
+  Sum(value_type& result_)
+    : init_value(InitWrapper<value_type>::value()), result(&result_) {}
+  Sum(const result_view_type& result_)
+    : init_value(InitWrapper<value_type>::value()), result(result_) {}
+
+  // Same, but with a caller-supplied init_value.
+  Sum(value_type& result_, const value_type& init_value_)
+    : init_value(init_value_), result(&result_) {}
+  Sum(const result_view_type& result_, const value_type& init_value_)
+    : init_value(init_value_), result(result_) {}
+
+  // Required: fold src into dest by addition.
+  KOKKOS_INLINE_FUNCTION
+  void join(value_type& dest, const value_type& src) const {
+    dest += src;
+  }
+
+  KOKKOS_INLINE_FUNCTION
+  void join(volatile value_type& dest, const volatile value_type& src) const {
+    dest += src;
+  }
+
+  // Optional: seed a partial result with init_value.
+  KOKKOS_INLINE_FUNCTION
+  void init(value_type& val) const {
+    val = init_value;
+  }
+
+  // Hand back the view the reducer was constructed with.
+  result_view_type result_view() const {
+    return result;
+  }
+};
+
+template<class Scalar,class Space = HostSpace>
+struct Prod {
+public:
+  // Required: reducer tag and the scalar type being reduced.
+  using reducer_type = Prod;
+  using value_type   = Scalar;
+
+  using result_view_type = Kokkos::View<value_type, Space, Kokkos::MemoryTraits<Kokkos::Unmanaged> >;
+
+  value_type init_value;  // value that init() assigns
+
+private:
+  result_view_type result;  // returned by result_view()
+
+  // Default init_value: 1 for arithmetic types, value_type() otherwise.
+  template<class ValueType, bool is_arithmetic = std::is_arithmetic<ValueType>::value >
+  struct InitWrapper;
+
+  template<class ValueType >
+  struct InitWrapper<ValueType,true> {
+    static ValueType value() { return static_cast<value_type>(1); }
+  };
+
+  template<class ValueType >
+  struct InitWrapper<ValueType,false> {
+    static ValueType value() { return value_type(); }
+  };
+
+public:
+  // Construct from a reference or an existing view, using the default init_value.
+  Prod(value_type& result_)
+    : init_value(InitWrapper<value_type>::value()), result(&result_) {}
+  Prod(const result_view_type& result_)
+    : init_value(InitWrapper<value_type>::value()), result(result_) {}
+
+  // Same, but with a caller-supplied init_value.
+  Prod(value_type& result_, const value_type& init_value_)
+    : init_value(init_value_), result(&result_) {}
+  Prod(const result_view_type& result_, const value_type& init_value_)
+    : init_value(init_value_), result(result_) {}
+
+  // Required: fold src into dest by multiplication.
+  KOKKOS_INLINE_FUNCTION
+  void join(value_type& dest, const value_type& src) const {
+    dest *= src;
+  }
+
+  KOKKOS_INLINE_FUNCTION
+  void join(volatile value_type& dest, const volatile value_type& src) const {
+    dest *= src;
+  }
+
+  // Optional: seed a partial result with init_value.
+  KOKKOS_INLINE_FUNCTION
+  void init(value_type& val) const {
+    val = init_value;
+  }
+
+  // Hand back the view the reducer was constructed with.
+  result_view_type result_view() const {
+    return result;
+  }
+};
+
+template<class Scalar, class Space = HostSpace>
+struct Min {
+public:
+  // Required: reducer tag and the cv-stripped scalar type being reduced.
+  using reducer_type = Min;
+  using value_type   = typename std::remove_cv<Scalar>::type;
+
+  using result_view_type = Kokkos::View<value_type, Space, Kokkos::MemoryTraits<Kokkos::Unmanaged> >;
+
+  value_type init_value;  // value that init() assigns
+
+private:
+  result_view_type result;  // returned by result_view()
+
+  // Default init_value: numeric_limits max() for arithmetic types, value_type() otherwise.
+  template<class ValueType, bool is_arithmetic = std::is_arithmetic<ValueType>::value >
+  struct InitWrapper;
+
+  template<class ValueType >
+  struct InitWrapper<ValueType,true> {
+    static ValueType value() { return std::numeric_limits<value_type>::max(); }
+  };
+
+  template<class ValueType >
+  struct InitWrapper<ValueType,false> {
+    static ValueType value() { return value_type(); }
+  };
+
+public:
+  // Construct from a reference or an existing view, using the default init_value.
+  Min(value_type& result_)
+    : init_value(InitWrapper<value_type>::value()), result(&result_) {}
+  Min(const result_view_type& result_)
+    : init_value(InitWrapper<value_type>::value()), result(result_) {}
+
+  // Same, but with a caller-supplied init_value.
+  Min(value_type& result_, const value_type& init_value_)
+    : init_value(init_value_), result(&result_) {}
+  Min(const result_view_type& result_, const value_type& init_value_)
+    : init_value(init_value_), result(result_) {}
+
+  // Required: keep the smaller of dest and src in dest.
+  KOKKOS_INLINE_FUNCTION
+  void join(value_type& dest, const value_type& src) const {
+    if ( src < dest )
+      dest = src;
+  }
+
+  KOKKOS_INLINE_FUNCTION
+  void join(volatile value_type& dest, const volatile value_type& src) const {
+    if ( src < dest )
+      dest = src;
+  }
+
+  // Optional: seed a partial result with init_value.
+  KOKKOS_INLINE_FUNCTION
+  void init(value_type& val) const {
+    val = init_value;
+  }
+
+  // Hand back the view the reducer was constructed with.
+  result_view_type result_view() const {
+    return result;
+  }
+};
+
+template<class Scalar, class Space = HostSpace>
+struct Max {
+public:
+  // Required: reducer tag and the cv-stripped scalar type being reduced.
+  typedef Max reducer_type;
+  typedef typename std::remove_cv<Scalar>::type value_type;
+
+  typedef Kokkos::View<value_type, Space, Kokkos::MemoryTraits<Kokkos::Unmanaged> > result_view_type;
+
+  value_type init_value;  // value that init() assigns
+
+private:
+  result_view_type result;  // returned by result_view()
+
+  // Default init_value: numeric_limits lowest() for arithmetic types, value_type() otherwise.
+  template<class ValueType, bool is_arithmetic = std::is_arithmetic<ValueType>::value >
+  struct InitWrapper;
+
+  template<class ValueType >
+  struct InitWrapper<ValueType,true> {
+    // lowest(), not min(): for floating-point types min() is the smallest
+    // positive normal value, which is larger than every negative input.
+    static ValueType value() { return std::numeric_limits<value_type>::lowest(); }
+  };
+
+  template<class ValueType >
+  struct InitWrapper<ValueType,false> {
+    static ValueType value() { return value_type(); }
+  };
+
+public:
+  // Construct from a reference or an existing view, using the default init_value.
+  Max(value_type& result_):
+    init_value(InitWrapper<value_type>::value()),result(&result_) {}
+  Max(const result_view_type& result_):
+    init_value(InitWrapper<value_type>::value()),result(result_) {}
+  // Same, but with a caller-supplied init_value.
+  Max(value_type& result_, const value_type& init_value_):
+    init_value(init_value_),result(&result_) {}
+  Max(const result_view_type& result_, const value_type& init_value_):
+    init_value(init_value_),result(result_) {}
+
+  // Required: keep the larger of dest and src in dest.
+  KOKKOS_INLINE_FUNCTION
+  void join(value_type& dest, const value_type& src)  const {
+    if ( src > dest )
+      dest = src;
+  }
+
+  KOKKOS_INLINE_FUNCTION
+  void join(volatile value_type& dest, const volatile value_type& src) const {
+    if ( src > dest )
+      dest = src;
+  }
+
+  // Optional: seed a partial result with init_value.
+  KOKKOS_INLINE_FUNCTION
+  void init( value_type& val)  const {
+    val = init_value;
+  }
+
+  result_view_type result_view() const {
+    return result;
+  }
+};
+
+template<class Scalar, class Space = HostSpace>
+struct LAnd {
+public:
+  // Required: reducer tag and the scalar type being reduced.
+  using reducer_type = LAnd;
+  using value_type   = Scalar;
+
+  using result_view_type = Kokkos::View<value_type, Space, Kokkos::MemoryTraits<Kokkos::Unmanaged> >;
+
+private:
+  result_view_type result;  // returned by result_view()
+
+public:
+  // Construct from a reference to the result value or from an existing view.
+  LAnd(value_type& result_) : result(&result_) {}
+  LAnd(const result_view_type& result_) : result(result_) {}
+
+  // Required: dest becomes the logical AND of dest and src.
+  KOKKOS_INLINE_FUNCTION
+  void join(value_type& dest, const value_type& src) const {
+    dest = dest && src;
+  }
+
+  KOKKOS_INLINE_FUNCTION
+  void join(volatile value_type& dest, const volatile value_type& src) const {
+    dest = dest && src;
+  }
+
+  // Optional: seed a partial result with 1.
+  KOKKOS_INLINE_FUNCTION
+  void init(value_type& val) const {
+    val = 1;
+  }
+
+  result_view_type result_view() const {
+    return result;
+  }
+};
+
+template<class Scalar, class Space = HostSpace>
+struct LOr {
+public:
+  // Required: reducer tag and the scalar type being reduced.
+  using reducer_type = LOr;
+  using value_type   = Scalar;
+
+  using result_view_type = Kokkos::View<value_type, Space, Kokkos::MemoryTraits<Kokkos::Unmanaged> >;
+
+private:
+  result_view_type result;  // returned by result_view()
+
+public:
+  // Construct from a reference to the result value or from an existing view.
+  LOr(value_type& result_) : result(&result_) {}
+  LOr(const result_view_type& result_) : result(result_) {}
+
+  // Required: dest becomes the logical OR of dest and src.
+  KOKKOS_INLINE_FUNCTION
+  void join(value_type& dest, const value_type& src) const {
+    dest = dest || src;
+  }
+
+  KOKKOS_INLINE_FUNCTION
+  void join(volatile value_type& dest, const volatile value_type& src) const {
+    dest = dest || src;
+  }
+
+  // Optional: seed a partial result with 0.
+  KOKKOS_INLINE_FUNCTION
+  void init(value_type& val) const {
+    val = 0;
+  }
+
+  result_view_type result_view() const {
+    return result;
+  }
+};
+
+template<class Scalar, class Space = HostSpace>
+struct LXor {
+public:
+  // Required: reducer tag and the scalar type being reduced.
+  using reducer_type = LXor;
+  using value_type   = Scalar;
+
+  using result_view_type = Kokkos::View<value_type, Space, Kokkos::MemoryTraits<Kokkos::Unmanaged> >;
+
+private:
+  result_view_type result;  // returned by result_view()
+
+public:
+  // Construct from a reference to the result value or from an existing view.
+  LXor(value_type& result_) : result(&result_) {}
+  LXor(const result_view_type& result_) : result(result_) {}
+
+  // Required: when dest is true it becomes !src, otherwise it becomes src.
+  KOKKOS_INLINE_FUNCTION
+  void join(value_type& dest, const value_type& src) const {
+    dest = dest? (!src) : src;
+  }
+
+  KOKKOS_INLINE_FUNCTION
+  void join(volatile value_type& dest, const volatile value_type& src) const {
+    dest = dest? (!src) : src;
+  }
+
+  // Optional: seed a partial result with 0.
+  KOKKOS_INLINE_FUNCTION
+  void init(value_type& val) const {
+    val = 0;
+  }
+
+  result_view_type result_view() const {
+    return result;
+  }
+};
+
+template<class Scalar, class Space = HostSpace>
+struct BAnd {
+public:
+  // Required: reducer tag and the cv-stripped scalar type being reduced.
+  using reducer_type = BAnd;
+  using value_type   = typename std::remove_cv<Scalar>::type;
+
+  using result_view_type = Kokkos::View<value_type, Space, Kokkos::MemoryTraits<Kokkos::Unmanaged> >;
+
+  value_type init_value;  // value that init() assigns
+
+private:
+  result_view_type result;  // returned by result_view()
+
+public:
+  // init_value is a value-initialized value_type OR-ed with its own complement.
+  BAnd(value_type& result_)
+    : init_value(value_type() | (~value_type())), result(&result_) {}
+  BAnd(const result_view_type& result_)
+    : init_value(value_type() | (~value_type())), result(result_) {}
+
+  // Required: dest becomes the bitwise AND of dest and src.
+  KOKKOS_INLINE_FUNCTION
+  void join(value_type& dest, const value_type& src) const {
+    dest = dest & src;
+  }
+
+  KOKKOS_INLINE_FUNCTION
+  void join(volatile value_type& dest, const volatile value_type& src) const {
+    dest = dest & src;
+  }
+
+  // Optional: seed a partial result with init_value.
+  KOKKOS_INLINE_FUNCTION
+  void init(value_type& val) const {
+    val = init_value;
+  }
+
+  result_view_type result_view() const {
+    return result;
+  }
+};
+
+template<class Scalar, class Space = HostSpace>
+struct BOr {
+public:
+  // Required: reducer tag and the cv-stripped scalar type being reduced.
+  using reducer_type = BOr;
+  using value_type   = typename std::remove_cv<Scalar>::type;
+
+  using result_view_type = Kokkos::View<value_type, Space, Kokkos::MemoryTraits<Kokkos::Unmanaged> >;
+
+  value_type init_value;  // value that init() assigns
+
+private:
+  result_view_type result;  // returned by result_view()
+
+public:
+  // init_value is a value-initialized value_type AND-ed with its own complement.
+  BOr(value_type& result_)
+    : init_value(value_type() & (~value_type())), result(&result_) {}
+  BOr(const result_view_type& result_)
+    : init_value(value_type() & (~value_type())), result(result_) {}
+
+  // Required: dest becomes the bitwise OR of dest and src.
+  KOKKOS_INLINE_FUNCTION
+  void join(value_type& dest, const value_type& src) const {
+    dest = dest | src;
+  }
+
+  KOKKOS_INLINE_FUNCTION
+  void join(volatile value_type& dest, const volatile value_type& src) const {
+    dest = dest | src;
+  }
+
+  // Optional: seed a partial result with init_value.
+  KOKKOS_INLINE_FUNCTION
+  void init(value_type& val) const {
+    val = init_value;
+  }
+
+  result_view_type result_view() const {
+    return result;
+  }
+};
+
+/// \brief Reducer that combines all contributions with bitwise XOR.
+///
+/// \tparam Scalar Type of the reduced value; cv-qualifiers are stripped.
+/// \tparam Space  Memory space of the unmanaged view that exposes the result.
+template<class Scalar, class Space = HostSpace>
+struct BXor {
+public:
+  // Required members of the reducer interface.
+  typedef BXor                                      reducer_type;
+  typedef typename std::remove_cv<Scalar>::type     value_type;
+
+  // Unmanaged view type through which the result is exposed.
+  typedef Kokkos::View< value_type
+                      , Space
+                      , Kokkos::MemoryTraits<Kokkos::Unmanaged>
+                      > result_view_type;
+
+  // Starting value handed out by init(): value_type() & ~value_type(),
+  // which is "no bits set" for the built-in integer types.
+  value_type init_value;
+
+private:
+  result_view_type result;
+
+public:
+
+  // Wrap a caller-owned variable as the result location.
+  BXor(value_type& result_)
+    : init_value(value_type() & (~value_type()))
+    , result(&result_)
+  {}
+
+  // Use an existing result view.
+  BXor(const result_view_type& result_)
+    : init_value(value_type() & (~value_type()))
+    , result(result_)
+  {}
+
+  // Required: fold src into dest.
+  KOKKOS_INLINE_FUNCTION
+  void join(value_type& dest, const value_type& src) const
+  {
+    dest = dest ^ src;
+  }
+
+  // Required: volatile flavour of join().
+  KOKKOS_INLINE_FUNCTION
+  void join(volatile value_type& dest, const volatile value_type& src) const
+  {
+    dest = dest ^ src;
+  }
+
+  // Optional: seed a reduction value with init_value.
+  KOKKOS_INLINE_FUNCTION
+  void init(value_type& val) const
+  {
+    val = init_value;
+  }
+
+  // Accessor for the view that holds the result.
+  result_view_type result_view() const
+  {
+    return result;
+  }
+};
+
+/// \brief Pair of a value and an index, used as the value_type of MinLoc and MaxLoc.
+template<class Scalar, class Index>
+struct ValLocScalar {
+  Scalar val;   // the value being compared
+  Index  loc;   // index stored alongside val
+
+  // Member-wise assignment from a non-volatile source.
+  KOKKOS_INLINE_FUNCTION
+  void operator=(const ValLocScalar& rhs)
+  {
+    val = rhs.val;
+    loc = rhs.loc;
+  }
+
+  // Member-wise assignment between volatile objects; the volatile join()
+  // overloads of MinLoc/MaxLoc assign through volatile references.
+  KOKKOS_INLINE_FUNCTION
+  void operator=(const volatile ValLocScalar& rhs) volatile
+  {
+    val = rhs.val;
+    loc = rhs.loc;
+  }
+};
+
+/// \brief Reducer that keeps the smallest value together with its index.
+///
+/// \tparam Scalar Type of the compared values; cv-qualifiers are stripped.
+/// \tparam Index  Type of the stored index; cv-qualifiers are stripped.
+/// \tparam Space  Memory space of the unmanaged view that exposes the result.
+template<class Scalar, class Index, class Space = HostSpace>
+struct MinLoc {
+private:
+  typedef typename std::remove_cv<Scalar>::type scalar_type;
+  typedef typename std::remove_cv<Index>::type  index_type;
+
+public:
+  // Required members of the reducer interface.
+  typedef MinLoc                               reducer_type;
+  typedef ValLocScalar<scalar_type,index_type> value_type;
+
+  // Unmanaged view type through which the result is exposed.
+  typedef Kokkos::View< value_type
+                      , Space
+                      , Kokkos::MemoryTraits<Kokkos::Unmanaged>
+                      > result_view_type;
+
+  // Value that init() stores into the val member.
+  scalar_type init_value;
+
+private:
+  result_view_type result;
+
+  // Supplies the default init_value: numeric_limits::max() for arithmetic
+  // scalar types, a value-initialized scalar_type otherwise.
+  template< class ValueType
+          , bool is_arithmetic = std::is_arithmetic<ValueType>::value >
+  struct InitWrapper;
+
+  template<class ValueType>
+  struct InitWrapper<ValueType,true> {
+    static ValueType value()
+    {
+      return std::numeric_limits<scalar_type>::max();
+    }
+  };
+
+  template<class ValueType>
+  struct InitWrapper<ValueType,false> {
+    static ValueType value()
+    {
+      return scalar_type();
+    }
+  };
+
+public:
+
+  // Result in a caller-owned variable, default starting value.
+  MinLoc(value_type& result_)
+    : init_value(InitWrapper<scalar_type>::value())
+    , result(&result_)
+  {}
+
+  // Result in an existing view, default starting value.
+  MinLoc(const result_view_type& result_)
+    : init_value(InitWrapper<scalar_type>::value())
+    , result(result_)
+  {}
+
+  // Result in a caller-owned variable, caller-chosen starting value.
+  MinLoc(value_type& result_, const scalar_type& init_value_)
+    : init_value(init_value_)
+    , result(&result_)
+  {}
+
+  // Result in an existing view, caller-chosen starting value.
+  MinLoc(const result_view_type& result_, const scalar_type& init_value_)
+    : init_value(init_value_)
+    , result(result_)
+  {}
+
+
+  // Required: dest takes over src (value and index) when src is strictly smaller.
+  KOKKOS_INLINE_FUNCTION
+  void join(value_type& dest, const value_type& src) const
+  {
+    if ( src.val < dest.val ) {
+      dest = src;
+    }
+  }
+
+  // Required: volatile flavour of join().
+  KOKKOS_INLINE_FUNCTION
+  void join(volatile value_type& dest, const volatile value_type& src) const
+  {
+    if ( src.val < dest.val ) {
+      dest = src;
+    }
+  }
+
+  // Optional: seeds only the val member; loc is left untouched.
+  KOKKOS_INLINE_FUNCTION
+  void init(value_type& val) const
+  {
+    val.val = init_value;
+  }
+
+  // Accessor for the view that holds the result.
+  result_view_type result_view() const
+  {
+    return result;
+  }
+};
+
+/// \brief Reducer that keeps the largest value together with its index.
+///
+/// \tparam Scalar Type of the compared values; cv-qualifiers are stripped.
+/// \tparam Index  Type of the stored index; cv-qualifiers are stripped.
+/// \tparam Space  Memory space of the unmanaged view that exposes the result.
+template<class Scalar, class Index, class Space = HostSpace>
+struct MaxLoc {
+private:
+  typedef typename std::remove_cv<Scalar>::type scalar_type;
+  typedef typename std::remove_cv<Index>::type index_type;
+
+public:
+  //Required
+  typedef MaxLoc reducer_type;
+  typedef ValLocScalar<scalar_type,index_type> value_type;
+
+  // Unmanaged view type through which the result is exposed.
+  typedef Kokkos::View<value_type, Space, Kokkos::MemoryTraits<Kokkos::Unmanaged> > result_view_type;
+
+  // Value that init() stores into the val member.
+  scalar_type init_value;
+
+private:
+  result_view_type result;
+
+  // Supplies the default init_value: the most negative finite value for
+  // arithmetic scalar types, a value-initialized scalar_type otherwise.
+  template<class ValueType, bool is_arithmetic = std::is_arithmetic<ValueType>::value >
+  struct InitWrapper;
+
+  template<class ValueType >
+  struct InitWrapper<ValueType,true> {
+    static ValueType value() {
+      // lowest(), not min(): for floating-point types min() is the smallest
+      // POSITIVE normalized value, which would beat every negative input.
+      // For integer types lowest() and min() are the same value.
+      return std::numeric_limits<scalar_type>::lowest();
+    }
+  };
+
+  template<class ValueType >
+  struct InitWrapper<ValueType,false> {
+    static ValueType value() {
+      return scalar_type();
+    }
+  };
+
+public:
+
+  // Result in a caller-owned variable, default starting value.
+  MaxLoc(value_type& result_):
+    init_value(InitWrapper<scalar_type>::value()),result(&result_) {}
+  // Result in an existing view, default starting value.
+  MaxLoc(const result_view_type& result_):
+    init_value(InitWrapper<scalar_type>::value()),result(result_) {}
+  // Result in a caller-owned variable, caller-chosen starting value.
+  MaxLoc(value_type& result_, const scalar_type& init_value_):
+    init_value(init_value_),result(&result_) {}
+  // Result in an existing view, caller-chosen starting value.
+  MaxLoc(const result_view_type& result_, const scalar_type& init_value_):
+    init_value(init_value_),result(result_) {}
+
+  //Required: dest takes over src (value and index) when src is strictly larger.
+  KOKKOS_INLINE_FUNCTION
+  void join(value_type& dest, const value_type& src)  const {
+    if ( src.val > dest.val )
+      dest = src;
+  }
+
+  //Required: volatile flavour of join().
+  KOKKOS_INLINE_FUNCTION
+  void join(volatile value_type& dest, const volatile value_type& src) const {
+    if ( src.val > dest.val )
+      dest = src;
+  }
+
+  //Optional: seeds only the val member; loc is left untouched.
+  KOKKOS_INLINE_FUNCTION
+  void init( value_type& val)  const {
+    val.val = init_value;
+  }
+
+  // Accessor for the view that holds the result.
+  result_view_type result_view() const {
+    return result;
+  }
+};
+
+/// \brief Minimum and maximum values with their indices; value_type of MinMaxLoc.
+template<class Scalar, class Index>
+struct MinMaxLocScalar {
+  Scalar min_val, max_val;   // smallest / largest value
+  Index  min_loc, max_loc;   // indices stored alongside min_val / max_val
+
+  // Member-wise assignment from a non-volatile source.
+  KOKKOS_INLINE_FUNCTION
+  void operator=(const MinMaxLocScalar& rhs)
+  {
+    min_val = rhs.min_val;
+    min_loc = rhs.min_loc;
+    max_val = rhs.max_val;
+    max_loc = rhs.max_loc;
+  }
+
+  // Member-wise assignment between volatile objects.
+  KOKKOS_INLINE_FUNCTION
+  void operator=(const volatile MinMaxLocScalar& rhs) volatile
+  {
+    min_val = rhs.min_val;
+    min_loc = rhs.min_loc;
+    max_val = rhs.max_val;
+    max_loc = rhs.max_loc;
+  }
+};
+
+/// \brief Reducer that tracks both the smallest and the largest value,
+///        each together with its index.
+///
+/// \tparam Scalar Type of the compared values; cv-qualifiers are stripped.
+/// \tparam Index  Type of the stored indices; cv-qualifiers are stripped.
+/// \tparam Space  Memory space of the unmanaged view that exposes the result.
+template<class Scalar, class Index, class Space = HostSpace>
+struct MinMaxLoc {
+private:
+  typedef typename std::remove_cv<Scalar>::type scalar_type;
+  typedef typename std::remove_cv<Index>::type index_type;
+
+public:
+  //Required
+  typedef MinMaxLoc reducer_type;
+  typedef MinMaxLocScalar<scalar_type,index_type> value_type;
+
+  // Unmanaged view type through which the result is exposed.
+  typedef Kokkos::View<value_type, Space, Kokkos::MemoryTraits<Kokkos::Unmanaged> > result_view_type;
+
+  // Values that init() stores into min_val and max_val respectively.
+  scalar_type min_init_value;
+  scalar_type max_init_value;
+
+private:
+  result_view_type result;
+
+  // Default starting value for the minimum: numeric_limits::max() for
+  // arithmetic scalar types, a value-initialized scalar_type otherwise.
+  template<class ValueType, bool is_arithmetic = std::is_arithmetic<ValueType>::value >
+  struct MinInitWrapper;
+
+  template<class ValueType >
+  struct MinInitWrapper<ValueType,true> {
+    static ValueType value() {
+      return std::numeric_limits<scalar_type>::max();
+    }
+  };
+
+  template<class ValueType >
+  struct MinInitWrapper<ValueType,false> {
+    static ValueType value() {
+      return scalar_type();
+    }
+  };
+
+  // Default starting value for the maximum: the most negative finite value
+  // for arithmetic scalar types, a value-initialized scalar_type otherwise.
+  template<class ValueType, bool is_arithmetic = std::is_arithmetic<ValueType>::value >
+  struct MaxInitWrapper;
+
+  template<class ValueType >
+  struct MaxInitWrapper<ValueType,true> {
+    static ValueType value() {
+      // lowest(), not min(): for floating-point types min() is the smallest
+      // POSITIVE normalized value, which would beat every negative input.
+      // For integer types lowest() and min() are the same value.
+      return std::numeric_limits<scalar_type>::lowest();
+    }
+  };
+
+  template<class ValueType >
+  struct MaxInitWrapper<ValueType,false> {
+    static ValueType value() {
+      return scalar_type();
+    }
+  };
+
+public:
+
+  // Result in a caller-owned variable, default starting values.
+  MinMaxLoc(value_type& result_):
+    min_init_value(MinInitWrapper<scalar_type>::value()),max_init_value(MaxInitWrapper<scalar_type>::value()),result(&result_) {}
+  // Result in an existing view, default starting values.
+  MinMaxLoc(const result_view_type& result_):
+    min_init_value(MinInitWrapper<scalar_type>::value()),max_init_value(MaxInitWrapper<scalar_type>::value()),result(result_) {}
+  // Result in a caller-owned variable, caller-chosen starting values.
+  MinMaxLoc(value_type& result_, const scalar_type& min_init_value_, const scalar_type& max_init_value_):
+    min_init_value(min_init_value_),max_init_value(max_init_value_),result(&result_) {}
+  // Result in an existing view, caller-chosen starting values.
+  MinMaxLoc(const result_view_type& result_, const scalar_type& min_init_value_, const scalar_type& max_init_value_):
+    min_init_value(min_init_value_),max_init_value(max_init_value_),result(result_) {}
+
+  //Required: the minimum and maximum halves are merged independently.
+  KOKKOS_INLINE_FUNCTION
+  void join(value_type& dest, const value_type& src)  const {
+    if ( src.min_val < dest.min_val ) {
+      dest.min_val = src.min_val;
+      dest.min_loc = src.min_loc;
+    }
+    if ( src.max_val > dest.max_val ) {
+      dest.max_val = src.max_val;
+      dest.max_loc = src.max_loc;
+    }
+  }
+
+  //Required: volatile flavour of join().
+  KOKKOS_INLINE_FUNCTION
+  void join(volatile value_type& dest, const volatile value_type& src) const {
+    if ( src.min_val < dest.min_val ) {
+      dest.min_val = src.min_val;
+      dest.min_loc = src.min_loc;
+    }
+    if ( src.max_val > dest.max_val ) {
+      dest.max_val = src.max_val;
+      dest.max_loc = src.max_loc;
+    }
+  }
+
+  //Optional: seeds only min_val and max_val; the loc members are left untouched.
+  KOKKOS_INLINE_FUNCTION
+  void init( value_type& val)  const {
+    val.min_val = min_init_value;
+    val.max_val = max_init_value;
+  }
+
+  // Accessor for the view that holds the result.
+  result_view_type result_view() const {
+    return result;
+  }
+};
+}
+}
+
+
+namespace Kokkos {
+namespace Impl {
+
+// Primary template, declared only.  The first parameter is the SFINAE slot
+// filled by std::enable_if in each partial specialization.
+template< class Enable , class ReturnType , class FunctorType >
+struct ParallelReduceReturnValue ;
+
+// Return argument is already a Kokkos::View: hand it back unchanged.
+template< class ReturnType , class FunctorType >
+struct ParallelReduceReturnValue
+  < typename std::enable_if< Kokkos::is_view<ReturnType>::value >::type
+  , ReturnType
+  , FunctorType
+  >
+{
+  typedef ReturnType  return_type ;
+  typedef InvalidType reducer_type ;
+
+  // Candidate value types: a single element, or an array of unknown bound.
+  typedef typename return_type::value_type value_type_scalar ;
+  typedef typename return_type::value_type value_type_array[] ;
+
+  // A rank-0 view selects the scalar form, every other rank the array form.
+  typedef typename if_c< return_type::rank == 0
+                       , value_type_scalar
+                       , value_type_array
+                       >::type value_type ;
+
+  static return_type & return_value( ReturnType & return_val , const FunctorType & )
+  {
+    return return_val ;
+  }
+};
+
+// Return argument is a plain value (not a view, array, pointer or reducer):
+// wrap its address in an unmanaged HostSpace view.
+template< class ReturnType , class FunctorType >
+struct ParallelReduceReturnValue
+  < typename std::enable_if<
+      !Kokkos::is_view<ReturnType>::value &&
+      ( !std::is_array<ReturnType>::value && !std::is_pointer<ReturnType>::value ) &&
+      !Kokkos::is_reducer_type<ReturnType>::value
+    >::type
+  , ReturnType
+  , FunctorType
+  >
+{
+  typedef Kokkos::View< ReturnType
+                      , Kokkos::HostSpace
+                      , Kokkos::MemoryUnmanaged
+                      > return_type ;
+
+  typedef InvalidType reducer_type ;
+
+  typedef typename return_type::value_type value_type ;
+
+  static return_type return_value( ReturnType & return_val , const FunctorType & )
+  {
+    return return_type( &return_val );
+  }
+};
+
+// Return argument is an array or a pointer: wrap it in an unmanaged
+// HostSpace view whose extent is taken from functor.value_count.
+template< class ReturnType , class FunctorType >
+struct ParallelReduceReturnValue
+  < typename std::enable_if<
+      ( is_array<ReturnType>::value || std::is_pointer<ReturnType>::value )
+    >::type
+  , ReturnType
+  , FunctorType
+  >
+{
+  typedef Kokkos::View< typename std::remove_const<ReturnType>::type
+                      , Kokkos::HostSpace
+                      , Kokkos::MemoryUnmanaged
+                      > return_type ;
+
+  typedef InvalidType reducer_type ;
+
+  typedef typename return_type::value_type value_type[] ;
+
+  static return_type return_value( ReturnType & return_val
+                                 , const FunctorType & functor )
+  {
+    return return_type( return_val , functor.value_count );
+  }
+};
+
+// Return argument is a reducer: it is both the returned object and the
+// reducer_type; the functor is not consulted.
+template< class ReturnType , class FunctorType >
+struct ParallelReduceReturnValue
+  < typename std::enable_if< Kokkos::is_reducer_type<ReturnType>::value >::type
+  , ReturnType
+  , FunctorType
+  >
+{
+  typedef ReturnType                        return_type ;
+  typedef ReturnType                        reducer_type ;
+  typedef typename return_type::value_type  value_type ;
+
+  static return_type return_value( ReturnType & return_val
+                                 , const FunctorType & )
+  {
+    return return_val ;
+  }
+};
+}
+
+namespace Impl {
+
+// Maps the "policy" argument of parallel_reduce onto a concrete policy type.
+// The first parameter is the SFINAE slot filled by std::enable_if below.
+template< class Enable , class PolicyType , class FunctorType >
+struct ParallelReducePolicyType ;
+
+// Argument is already an execution policy: pass it through.
+template< class PolicyType , class FunctorType >
+struct ParallelReducePolicyType
+  < typename std::enable_if< Kokkos::Impl::is_execution_policy<PolicyType>::value >::type
+  , PolicyType
+  , FunctorType
+  >
+{
+  typedef PolicyType policy_type ;
+
+  static PolicyType policy( const PolicyType & policy_ )
+  {
+    return policy_ ;
+  }
+};
+
+// Argument is an integer: build a RangePolicy [0, policy_) on the execution
+// space that FunctorPolicyExecutionSpace selects for the functor.
+template< class PolicyType , class FunctorType >
+struct ParallelReducePolicyType
+  < typename std::enable_if< std::is_integral<PolicyType>::value >::type
+  , PolicyType
+  , FunctorType
+  >
+{
+  typedef typename
+    Impl::FunctorPolicyExecutionSpace< FunctorType , void >::execution_space
+      execution_space ;
+
+  typedef Kokkos::RangePolicy< execution_space > policy_type ;
+
+  static policy_type policy( const PolicyType & policy_ )
+  {
+    return policy_type( 0 , policy_ );
+  }
+};
+
+}
+
+namespace Impl {
+  // Generic functor adaptor: the functor is used as-is.
+  // ExecPolicy, ValueType and ExecutionSpace are not used by this definition.
+  template< class FunctorType , class ExecPolicy , class ValueType , class ExecutionSpace >
+  struct ParallelReduceFunctorType {
+    typedef FunctorType functor_type ;
+
+    // Identity: returns a reference to the very functor it was given.
+    static const functor_type & functor( const functor_type & f )
+    {
+      return f ;
+    }
+  };
+}
+
+namespace Impl {
+
+  // Common back end of every parallel_reduce overload: adapts the return
+  // argument, builds the Impl::ParallelReduce closure and executes it.
+  template< class PolicyType, class FunctorType, class ReturnType >
+  struct ParallelReduceAdaptor {
+    // Turns the user's return argument into a view or reducer (see
+    // ParallelReduceReturnValue) and names the matching reducer_type.
+    typedef Impl::ParallelReduceReturnValue<void,ReturnType,FunctorType> return_value_adapter;
+    #ifdef KOKKOS_IMPL_NEED_FUNCTOR_WRAPPER
+    // Only when the build requests it: route the functor through
+    // ParallelReduceFunctorType before it reaches the closure.
+    typedef Impl::ParallelReduceFunctorType<FunctorType,PolicyType,
+                                            typename return_value_adapter::value_type,
+                                            typename PolicyType::execution_space> functor_adaptor;
+    #endif
+    // \param label        Name reported to the profiling interface; when it is
+    //                     empty, typeid(FunctorType).name() is reported instead.
+    // \param policy       Execution policy handed to the closure.
+    // \param functor      User functor handed to the closure.
+    // \param return_value User return argument, adapted by return_value_adapter.
+    static inline
+    void execute(const std::string& label,
+        const PolicyType& policy,
+        const FunctorType& functor,
+        ReturnType& return_value) {
+          #if (KOKKOS_ENABLE_PROFILING)
+            // kpID is filled in by beginParallelReduce and passed back to
+            // endParallelReduce below.
+            uint64_t kpID = 0;
+            if(Kokkos::Profiling::profileLibraryLoaded()) {
+              Kokkos::Profiling::beginParallelReduce("" == label ? typeid(FunctorType).name() : label, 0, &kpID);
+            }
+          #endif
+
+          // Allocation tracking is disabled only while the closure is being
+          // constructed and re-enabled before it runs.
+          // NOTE(review): presumably so views copied into the closure are not
+          // reference-counted - confirm against the tracking implementation.
+          Kokkos::Impl::shared_allocation_tracking_claim_and_disable();
+          #ifdef KOKKOS_IMPL_NEED_FUNCTOR_WRAPPER
+          Impl::ParallelReduce<typename functor_adaptor::functor_type, PolicyType, typename return_value_adapter::reducer_type >
+             closure(functor_adaptor::functor(functor),
+                     policy,
+                     return_value_adapter::return_value(return_value,functor));
+          #else
+          Impl::ParallelReduce<FunctorType, PolicyType, typename return_value_adapter::reducer_type >
+             closure(functor,
+                     policy,
+                     return_value_adapter::return_value(return_value,functor));
+          #endif
+          Kokkos::Impl::shared_allocation_tracking_release_and_enable();
+          closure.execute();
+
+          #if (KOKKOS_ENABLE_PROFILING)
+            if(Kokkos::Profiling::profileLibraryLoaded()) {
+              Kokkos::Profiling::endParallelReduce(kpID);
+            }
+          #endif
+        }
+
+  };
+}
+/*! \fn void parallel_reduce(label,policy,functor,return_argument)
+    \brief Perform a parallel reduction.
+    \param label An optional Label giving the call name. Must be able to construct a std::string from the argument.
+    \param policy A Kokkos Execution Policy, such as an integer, a RangePolicy or a TeamPolicy.
+    \param functor A functor with a reduction operator, and optional init, join and final functions.
+    \param return_argument A return argument which can be a scalar, a View, or a ReducerStruct. This argument can be left out if the functor has a final function.
+*/
+
+/** \brief  Parallel reduction
+ *
+ * parallel_reduce performs parallel reductions with arbitrary functions - i.e.
+ * it is not solely data based. The call expects up to 4 arguments: an optional
+ * label, an execution policy, the functor, and an optional return argument.
+ *
+ * Example of a parallel_reduce functor for a POD (plain old data) value type:
+ * \code
+ *  class FunctorType { // For POD value type
+ *  public:
+ *    typedef    ...     execution_space ;
+ *    typedef <podType>  value_type ;
+ *    void operator()( <intType> iwork , <podType> & update ) const ;
+ *    void init( <podType> & update ) const ;
+ *    void join( volatile       <podType> & update ,
+ *               volatile const <podType> & input ) const ;
+ *
+ *    typedef true_type has_final ;
+ *    void final( <podType> & update ) const ;
+ *  };
+ * \endcode
+ *
+ * Example of a parallel_reduce functor for an array of POD (plain old data) values:
+ * \code
+ *  class FunctorType { // For array of POD value
+ *  public:
+ *    typedef    ...     execution_space ;
+ *    typedef <podType>  value_type[] ;
+ *    void operator()( <intType> , <podType> update[] ) const ;
+ *    void init( <podType> update[] ) const ;
+ *    void join( volatile       <podType> update[] ,
+ *               volatile const <podType> input[] ) const ;
+ *
+ *    typedef true_type has_final ;
+ *    void final( <podType> update[] ) const ;
+ *  };
+ * \endcode
+ */
+
+// ReturnValue is scalar or array: take by reference
+
+// Labeled reduction over an execution policy; the return argument is taken
+// by non-const reference and forwarded to ParallelReduceAdaptor.
+template< class PolicyType, class FunctorType, class ReturnType >
+inline
+void parallel_reduce( const std::string & label
+                    , const PolicyType  & policy
+                    , const FunctorType & functor
+                    , ReturnType        & return_value
+                    , typename Impl::enable_if<
+                        Kokkos::Impl::is_execution_policy<PolicyType>::value
+                      >::type * = 0 )
+{
+  typedef Impl::ParallelReduceAdaptor< PolicyType , FunctorType , ReturnType > adaptor_type ;
+  adaptor_type::execute( label , policy , functor , return_value );
+}
+
+// Unlabeled reduction over an execution policy; the return argument is taken
+// by non-const reference.  An empty label is passed on.
+template< class PolicyType, class FunctorType, class ReturnType >
+inline
+void parallel_reduce( const PolicyType  & policy
+                    , const FunctorType & functor
+                    , ReturnType        & return_value
+                    , typename Impl::enable_if<
+                        Kokkos::Impl::is_execution_policy<PolicyType>::value
+                      >::type * = 0 )
+{
+  typedef Impl::ParallelReduceAdaptor< PolicyType , FunctorType , ReturnType > adaptor_type ;
+  adaptor_type::execute( "" , policy , functor , return_value );
+}
+
+// Unlabeled reduction over the index range [0, policy); the return argument
+// is taken by non-const reference.
+template< class FunctorType, class ReturnType >
+inline
+void parallel_reduce( const size_t      & policy
+                    , const FunctorType & functor
+                    , ReturnType        & return_value )
+{
+  typedef typename Impl::ParallelReducePolicyType< void , size_t , FunctorType >::policy_type policy_type ;
+  typedef Impl::ParallelReduceAdaptor< policy_type , FunctorType , ReturnType > adaptor_type ;
+
+  const policy_type range( 0 , policy );
+  adaptor_type::execute( "" , range , functor , return_value );
+}
+
+// Labeled reduction over the index range [0, policy); the return argument is
+// taken by non-const reference.
+template< class FunctorType, class ReturnType >
+inline
+void parallel_reduce( const std::string & label
+                    , const size_t      & policy
+                    , const FunctorType & functor
+                    , ReturnType        & return_value )
+{
+  typedef typename Impl::ParallelReducePolicyType< void , size_t , FunctorType >::policy_type policy_type ;
+  typedef Impl::ParallelReduceAdaptor< policy_type , FunctorType , ReturnType > adaptor_type ;
+
+  const policy_type range( 0 , policy );
+  adaptor_type::execute( label , range , functor , return_value );
+}
+
+// ReturnValue as View or Reducer: take by const reference to allow for inline construction
+
+// Labeled reduction over an execution policy; the return argument is taken
+// by const reference, so a temporary can be passed.
+template< class PolicyType, class FunctorType, class ReturnType >
+inline
+void parallel_reduce( const std::string & label
+                    , const PolicyType  & policy
+                    , const FunctorType & functor
+                    , const ReturnType  & return_value
+                    , typename Impl::enable_if<
+                        Kokkos::Impl::is_execution_policy<PolicyType>::value
+                      >::type * = 0 )
+{
+  typedef Impl::ParallelReduceAdaptor< PolicyType , FunctorType , const ReturnType > adaptor_type ;
+  adaptor_type::execute( label , policy , functor , return_value );
+}
+
+// Unlabeled reduction over an execution policy; the return argument is taken
+// by const reference, so a temporary can be passed.
+template< class PolicyType, class FunctorType, class ReturnType >
+inline
+void parallel_reduce( const PolicyType  & policy
+                    , const FunctorType & functor
+                    , const ReturnType  & return_value
+                    , typename Impl::enable_if<
+                        Kokkos::Impl::is_execution_policy<PolicyType>::value
+                      >::type * = 0 )
+{
+  typedef Impl::ParallelReduceAdaptor< PolicyType , FunctorType , const ReturnType > adaptor_type ;
+  adaptor_type::execute( "" , policy , functor , return_value );
+}
+
+// Unlabeled reduction over the index range [0, policy); the return argument
+// is taken by const reference, so a temporary can be passed.
+template< class FunctorType, class ReturnType >
+inline
+void parallel_reduce( const size_t      & policy
+                    , const FunctorType & functor
+                    , const ReturnType  & return_value )
+{
+  typedef typename Impl::ParallelReducePolicyType< void , size_t , FunctorType >::policy_type policy_type ;
+  typedef Impl::ParallelReduceAdaptor< policy_type , FunctorType , const ReturnType > adaptor_type ;
+
+  const policy_type range( 0 , policy );
+  adaptor_type::execute( "" , range , functor , return_value );
+}
+
+// Labeled reduction over the index range [0, policy); the return argument is
+// taken by const reference, so a temporary can be passed.
+template< class FunctorType, class ReturnType >
+inline
+void parallel_reduce( const std::string & label
+                    , const size_t      & policy
+                    , const FunctorType & functor
+                    , const ReturnType  & return_value )
+{
+  typedef typename Impl::ParallelReducePolicyType< void , size_t , FunctorType >::policy_type policy_type ;
+  typedef Impl::ParallelReduceAdaptor< policy_type , FunctorType , const ReturnType > adaptor_type ;
+
+  const policy_type range( 0 , policy );
+  adaptor_type::execute( label , range , functor , return_value );
+}
+
+// No Return Argument
+
+// Labeled reduction over an execution policy with no return argument.
+// A default-constructed unmanaged host view is passed as the result.
+template< class PolicyType, class FunctorType>
+inline
+void parallel_reduce( const std::string & label
+                    , const PolicyType  & policy
+                    , const FunctorType & functor
+                    , typename Impl::enable_if<
+                        Kokkos::Impl::is_execution_policy<PolicyType>::value
+                      >::type * = 0 )
+{
+  typedef Kokkos::Impl::FunctorValueTraits< FunctorType , void > functor_value_traits ;
+
+  // Non-zero static size -> the value type itself; otherwise its pointer type.
+  typedef typename Kokkos::Impl::if_c< ( functor_value_traits::StaticValueSize != 0 )
+                                     , typename functor_value_traits::value_type
+                                     , typename functor_value_traits::pointer_type
+                                     >::type reduced_type ;
+
+  typedef Kokkos::View< reduced_type
+                      , Kokkos::HostSpace
+                      , Kokkos::MemoryUnmanaged
+                      > empty_view_type ;
+  typedef Impl::ParallelReduceAdaptor< PolicyType , FunctorType , empty_view_type > adaptor_type ;
+
+  empty_view_type no_result ;
+  adaptor_type::execute( label , policy , functor , no_result );
+}
+
+// Unlabeled reduction over an execution policy with no return argument.
+// A default-constructed unmanaged host view is passed as the result.
+template< class PolicyType, class FunctorType >
+inline
+void parallel_reduce( const PolicyType  & policy
+                    , const FunctorType & functor
+                    , typename Impl::enable_if<
+                        Kokkos::Impl::is_execution_policy<PolicyType>::value
+                      >::type * = 0 )
+{
+  typedef Kokkos::Impl::FunctorValueTraits< FunctorType , void > functor_value_traits ;
+
+  // Non-zero static size -> the value type itself; otherwise its pointer type.
+  typedef typename Kokkos::Impl::if_c< ( functor_value_traits::StaticValueSize != 0 )
+                                     , typename functor_value_traits::value_type
+                                     , typename functor_value_traits::pointer_type
+                                     >::type reduced_type ;
+
+  typedef Kokkos::View< reduced_type
+                      , Kokkos::HostSpace
+                      , Kokkos::MemoryUnmanaged
+                      > empty_view_type ;
+  typedef Impl::ParallelReduceAdaptor< PolicyType , FunctorType , empty_view_type > adaptor_type ;
+
+  empty_view_type no_result ;
+  adaptor_type::execute( "" , policy , functor , no_result );
+}
+
+// Unlabeled reduction over the index range [0, policy) with no return
+// argument.  A default-constructed unmanaged host view is passed as the result.
+template< class FunctorType >
+inline
+void parallel_reduce( const size_t      & policy
+                    , const FunctorType & functor )
+{
+  typedef typename Impl::ParallelReducePolicyType< void , size_t , FunctorType >::policy_type policy_type ;
+  typedef Kokkos::Impl::FunctorValueTraits< FunctorType , void > functor_value_traits ;
+
+  // Non-zero static size -> the value type itself; otherwise its pointer type.
+  typedef typename Kokkos::Impl::if_c< ( functor_value_traits::StaticValueSize != 0 )
+                                     , typename functor_value_traits::value_type
+                                     , typename functor_value_traits::pointer_type
+                                     >::type reduced_type ;
+
+  typedef Kokkos::View< reduced_type
+                      , Kokkos::HostSpace
+                      , Kokkos::MemoryUnmanaged
+                      > empty_view_type ;
+  typedef Impl::ParallelReduceAdaptor< policy_type , FunctorType , empty_view_type > adaptor_type ;
+
+  empty_view_type no_result ;
+  const policy_type range( 0 , policy );
+  adaptor_type::execute( "" , range , functor , no_result );
+}
+
+// Labeled reduction over the index range [0, policy) with no return
+// argument.  A default-constructed unmanaged host view is passed as the result.
+template< class FunctorType>
+inline
+void parallel_reduce( const std::string & label
+                    , const size_t      & policy
+                    , const FunctorType & functor )
+{
+  typedef typename Impl::ParallelReducePolicyType< void , size_t , FunctorType >::policy_type policy_type ;
+  typedef Kokkos::Impl::FunctorValueTraits< FunctorType , void > functor_value_traits ;
+
+  // Non-zero static size -> the value type itself; otherwise its pointer type.
+  typedef typename Kokkos::Impl::if_c< ( functor_value_traits::StaticValueSize != 0 )
+                                     , typename functor_value_traits::value_type
+                                     , typename functor_value_traits::pointer_type
+                                     >::type reduced_type ;
+
+  typedef Kokkos::View< reduced_type
+                      , Kokkos::HostSpace
+                      , Kokkos::MemoryUnmanaged
+                      > empty_view_type ;
+  typedef Impl::ParallelReduceAdaptor< policy_type , FunctorType , empty_view_type > adaptor_type ;
+
+  empty_view_type no_result ;
+  const policy_type range( 0 , policy );
+  adaptor_type::execute( label , range , functor , no_result );
+}
+
+
+
+} //namespace Kokkos
diff --git a/lib/kokkos/core/src/Kokkos_Qthread.hpp b/lib/kokkos/core/src/Kokkos_Qthread.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..d61f8d518e6641debd19d4975b2535a6bfbcad8f
--- /dev/null
+++ b/lib/kokkos/core/src/Kokkos_Qthread.hpp
@@ -0,0 +1,172 @@
+/*
+//@HEADER
+// ************************************************************************
+// 
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+// 
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+// 
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact  H. Carter Edwards (hcedwar@sandia.gov)
+// 
+// ************************************************************************
+//@HEADER
+*/
+
+#ifndef KOKKOS_QTHREAD_HPP
+#define KOKKOS_QTHREAD_HPP
+
+#include <cstddef>
+#include <iosfwd>
+#include <Kokkos_Core.hpp>
+#include <Kokkos_Layout.hpp>
+#include <Kokkos_MemoryTraits.hpp>
+#include <Kokkos_HostSpace.hpp>
+#include <Kokkos_ExecPolicy.hpp>
+#include <impl/Kokkos_Tags.hpp>
+
+/*--------------------------------------------------------------------------*/
+
+namespace Kokkos {
+namespace Impl {
+// Forward declaration only; the definition is expected to come from
+// Qthread/Kokkos_QthreadExec.hpp, included at the end of this header.
+class QthreadExec ;
+} // namespace Impl
+} // namespace Kokkos
+
+/*--------------------------------------------------------------------------*/
+
+namespace Kokkos {
+
+/** \brief  Execution space supported by Qthread.
+ *
+ *  This class only declares the interface; the member functions are
+ *  defined outside this header.
+ */
+class Qthread {
+public:
+  //! \name Type declarations that all Kokkos devices must provide.
+  //@{
+
+  //! Tag this class as an execution space
+  typedef Qthread                  execution_space ;
+  //! Memory space used by this execution space
+  typedef Kokkos::HostSpace        memory_space ;
+  //! This execution space's preferred device_type
+  typedef Kokkos::Device<execution_space,memory_space> device_type;
+
+  //! Array layout associated with this execution space
+  typedef Kokkos::LayoutRight      array_layout ;
+  //! Size type, taken from the memory space
+  typedef memory_space::size_type  size_type ;
+
+  //! Scratch memory space associated with this execution space
+  typedef ScratchMemorySpace< Qthread > scratch_memory_space ;
+
+  //@}
+  /*------------------------------------------------------------------------*/
+
+  /** \brief  Initialization will construct one or more instances */
+  static Qthread & instance( int = 0 );
+
+  /** \brief  Set the execution space to a "sleep" state.
+   *
+   * This function sets the "sleep" state in which it is not ready for work.
+   * This may consume fewer resources than in a "ready" state,
+   * but it may also take time to transition to the "ready" state.
+   *
+   * \return True if enters or is in the "sleep" state.
+   *         False if functions are currently executing.
+   *
+   * NOTE(review): unlike wake(), this member is not static - confirm
+   * whether that asymmetry is intended.
+   */
+  bool sleep();
+
+  /** \brief  Wake from the sleep state.
+   *
+   *  \return True if enters or is in the "ready" state.
+   *          False if functions are currently executing.
+   */
+  static bool wake();
+
+  /** \brief Wait until all dispatched functions complete.
+   *
+   *  The parallel_for or parallel_reduce dispatch of a functor may
+   *  return asynchronously, before the functor completes.  This
+   *  method does not return until all dispatched functors on this
+   *  device have completed.
+   */
+  static void fence();
+
+  /*------------------------------------------------------------------------*/
+
+  /** \brief  Query for being inside a parallel dispatch.
+   *
+   *  NOTE(review): the meaning of the int result is defined by the
+   *  implementation, which is not visible in this header.
+   */
+  static int in_parallel();
+
+  /** \brief  Query for initialize() having been called.
+   *
+   *  NOTE(review): the meaning of the int result is defined by the
+   *  implementation, which is not visible in this header.
+   */
+  static int is_initialized();
+
+  /** \brief  Return maximum amount of concurrency */
+  static int concurrency();
+
+  /** \brief  Initialize the execution space with the requested thread count. */
+  static void initialize( int thread_count );
+  /** \brief  Finalize the execution space. */
+  static void finalize();
+
+  /** \brief Print configuration information to the given output stream. */
+  static void print_configuration( std::ostream & , const bool detail = false );
+
+  // NOTE(review): presumably the number of shepherds and the number of
+  // workers per shepherd - confirm against the implementation.
+  int shepherd_size() const ;
+  int shepherd_worker_size() const ;
+};
+
+/*--------------------------------------------------------------------------*/
+
+} // namespace Kokkos
+
+/*--------------------------------------------------------------------------*/
+/*--------------------------------------------------------------------------*/
+
+namespace Kokkos {
+namespace Impl {
+
+// Full specialization for the pair (Qthread memory space, Qthread scratch
+// memory space): access is reported as allowed and both verify() overloads
+// do nothing.
+template<>
+struct VerifyExecutionCanAccessMemorySpace< Kokkos::Qthread::memory_space
+                                          , Kokkos::Qthread::scratch_memory_space >
+{
+  enum { value = true };
+
+  inline static void verify() {}
+  inline static void verify( const void * ) {}
+};
+
+} // namespace Impl
+} // namespace Kokkos
+
+/*--------------------------------------------------------------------------*/
+/*--------------------------------------------------------------------------*/
+
+#include <Kokkos_Parallel.hpp>
+#include <Qthread/Kokkos_QthreadExec.hpp>
+#include <Qthread/Kokkos_Qthread_Parallel.hpp>
+
+#endif /* #define KOKKOS_QTHREAD_HPP */
+
+//----------------------------------------------------------------------------
+//----------------------------------------------------------------------------
+
diff --git a/lib/kokkos/core/src/Kokkos_ScratchSpace.hpp b/lib/kokkos/core/src/Kokkos_ScratchSpace.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..09a5993863e56835276b88003d59a98ba8e5b6b6
--- /dev/null
+++ b/lib/kokkos/core/src/Kokkos_ScratchSpace.hpp
@@ -0,0 +1,166 @@
+/*
+//@HEADER
+// ************************************************************************
+// 
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+// 
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+// 
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact  H. Carter Edwards (hcedwar@sandia.gov)
+// 
+// ************************************************************************
+//@HEADER
+*/
+
+#ifndef KOKKOS_SCRATCHSPACE_HPP
+#define KOKKOS_SCRATCHSPACE_HPP
+
+#include <stdio.h>
+#include <Kokkos_Core_fwd.hpp>
+#include <impl/Kokkos_Tags.hpp>
+
+/*--------------------------------------------------------------------------*/
+
+namespace Kokkos {
+
+/** \brief  Scratch memory space associated with an execution space.
+ *
+ */
+template< class ExecSpace >
+class ScratchMemorySpace {
+  static_assert (Impl::is_execution_space<ExecSpace>::value,"Instantiating ScratchMemorySpace on non-execution-space type.");
+public:
+  //! Byte granularity to which 'align' rounds sizes; must be a power of two.
+  enum { ALIGN = 8 };
+
+private:
+
+  // Current position and end of the level-0 and level-1 regions.  The
+  // positions are mutable because get_shmem() is const yet advances them.
+  mutable char * m_iter_L0 ;
+  char *         m_end_L0 ;
+  mutable char * m_iter_L1 ;
+  char *         m_end_L1 ;
+
+  // Installed by set_team_thread_mode() and read by get_shmem().
+  mutable int m_multiplier;     // scales how far a request advances the position
+  mutable int m_offset;         // scales the displacement of the returned pointer
+  mutable int m_default_level;  // level used when get_shmem() receives level == -1
+
+  // Declared private, so outside code can neither default-construct nor assign.
+  ScratchMemorySpace();
+  ScratchMemorySpace & operator = ( const ScratchMemorySpace & );
+
+  enum { MASK = ALIGN - 1 }; // Alignment used by View::shmem_size
+
+public:
+
+  //! Tag this class as a memory space
+  typedef ScratchMemorySpace                memory_space ;
+  typedef ExecSpace                         execution_space ;
+  //! Device type pairing the execution space with this memory space
+  typedef Kokkos::Device<execution_space,memory_space> device_type;
+
+  typedef typename ExecSpace::array_layout  array_layout ;
+  typedef typename ExecSpace::size_type     size_type ;
+
+  //! Round 'size' up to the next multiple of ALIGN.
+  template< typename IntType >
+  KOKKOS_INLINE_FUNCTION static
+  IntType align( const IntType & size )
+    { return ( size + MASK ) & ~MASK ; }
+
+  /** \brief  Carve room for 'size' bytes out of one scratch level.
+   *
+   *  \param size   Requested byte count; rounded up with align().
+   *  \param level  0 selects level 0, -1 selects the default level set by
+   *                set_team_thread_mode(), any other value selects level 1.
+   *  \return  The current position displaced by m_offset aligned sizes, or
+   *           0 when advancing by m_multiplier aligned sizes would pass the
+   *           end of the level (the position is then restored).
+   */
+  template< typename IntType >
+  KOKKOS_INLINE_FUNCTION
+  void* get_shmem (const IntType& size, int level = -1) const {
+    if(level == -1) { level = m_default_level; }
+    const bool   use_L0 = ( level == 0 );
+    char * &     cursor = use_L0 ? m_iter_L0 : m_iter_L1 ;
+    char * const limit  = use_L0 ? m_end_L0  : m_end_L1 ;
+    void* chunk = cursor + m_offset * align (size);
+    cursor += align (size) * m_multiplier;
+    if (limit < cursor) {
+      cursor -= align (size) * m_multiplier; // put it back like it was
+      #ifdef KOKKOS_HAVE_DEBUG
+      // mfh 23 Jun 2015: printf call consumes 25 registers
+      // in a CUDA build, so only print in debug mode.  The
+      // function still returns NULL if not enough memory.
+      printf ("ScratchMemorySpace<...>::get_shmem: Failed to allocate "
+              "%ld byte(s); remaining capacity is %ld byte(s)\n", long(size),
+              long(limit-cursor));
+      #endif // KOKKOS_HAVE_DEBUG
+      chunk = 0;
+    }
+    return chunk;
+  }
+
+  /** \brief  Adopt [ptr_L0, ptr_L0+size_L0) as level 0 and, optionally,
+   *          [ptr_L1, ptr_L1+size_L1) as level 1.  The new object starts
+   *          with multiplier 1, offset 0 and default level 0.  */
+  template< typename IntType >
+  KOKKOS_INLINE_FUNCTION
+  ScratchMemorySpace( void * ptr_L0 , const IntType & size_L0 , void * ptr_L1 = NULL , const IntType & size_L1 = 0)
+    : m_iter_L0( static_cast<char *>( ptr_L0 ) )
+    , m_end_L0(  m_iter_L0 + size_L0 )
+    , m_iter_L1( static_cast<char *>( ptr_L1 ) )
+    , m_end_L1(  m_iter_L1 + size_L1 )
+    , m_multiplier( 1 )
+    , m_offset( 0 )
+    , m_default_level( 0 )
+    {}
+
+  //! Install default level, multiplier and offset for later get_shmem() calls.
+  KOKKOS_INLINE_FUNCTION
+  const ScratchMemorySpace& set_team_thread_mode(const int& level, const int& multiplier, const int& offset) const {
+    m_multiplier    = multiplier;
+    m_offset        = offset;
+    m_default_level = level;
+    return *this;
+  }
+};
+
+} // namespace Kokkos
+
+#endif /* #ifndef KOKKOS_SCRATCHSPACE_HPP */
+
+//----------------------------------------------------------------------------
+//----------------------------------------------------------------------------
+
diff --git a/lib/kokkos/core/src/Kokkos_Serial.hpp b/lib/kokkos/core/src/Kokkos_Serial.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..233b56c93956f7898346780d1bfe327fd11afb03
--- /dev/null
+++ b/lib/kokkos/core/src/Kokkos_Serial.hpp
@@ -0,0 +1,1116 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+//
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact  H. Carter Edwards (hcedwar@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+/// \file Kokkos_Serial.hpp
+/// \brief Declaration and definition of Kokkos::Serial device.
+
+#ifndef KOKKOS_SERIAL_HPP
+#define KOKKOS_SERIAL_HPP
+
+#include <cstddef>
+#include <iosfwd>
+#include <Kokkos_Parallel.hpp>
+#include <Kokkos_TaskPolicy.hpp>
+#include <Kokkos_Layout.hpp>
+#include <Kokkos_HostSpace.hpp>
+#include <Kokkos_ScratchSpace.hpp>
+#include <Kokkos_MemoryTraits.hpp>
+#include <impl/Kokkos_Tags.hpp>
+#include <impl/Kokkos_FunctorAdapter.hpp>
+#include <impl/Kokkos_Profiling_Interface.hpp>
+
+
+#include <KokkosExp_MDRangePolicy.hpp>
+
+#if defined( KOKKOS_HAVE_SERIAL )
+
+namespace Kokkos {
+
+/// \class Serial
+/// \brief Kokkos device for non-parallel execution
+///
+/// A "device" represents a parallel execution model.  It tells Kokkos
+/// how to parallelize the execution of kernels in a parallel_for or
+/// parallel_reduce.  For example, the Threads device uses Pthreads or
+/// C++11 threads on a CPU, the OpenMP device uses the OpenMP language
+/// extensions, and the Cuda device uses NVIDIA's CUDA programming
+/// model.  The Serial device executes "parallel" kernels
+/// sequentially.  This is useful if you really do not want to use
+/// threads, or if you want to explore different combinations of MPI
+/// and shared-memory parallel programming models.
+class Serial {
+public:
+  //! \name Types that every Kokkos execution space exports.
+  //@{
+
+  //! Serial is its own execution space.
+  typedef Serial                execution_space ;
+  //! Size type, borrowed from HostSpace.
+  typedef HostSpace::size_type  size_type ;
+  //! Memory space used by default.
+  typedef HostSpace             memory_space ;
+  //! Device type pairing this execution space with its memory space.
+  typedef Kokkos::Device<execution_space,memory_space> device_type;
+
+  //! Array layout used by default.
+  typedef LayoutRight           array_layout ;
+
+  //! Scratch memory space type of this execution space.
+  typedef ScratchMemorySpace< Kokkos::Serial >  scratch_memory_space ;
+
+  //@}
+
+  /// \brief Report whether the caller is inside a thread-parallel
+  ///   function.
+  ///
+  /// Serial executes parallel_for and parallel_reduce sequentially,
+  /// so the result is <i>always</i> false (the integer 0).
+  ///
+  static inline int in_parallel() { return 0 ; }
+
+  /** \brief  Ask the device to enter a "sleep" state.
+   *
+   * A sleeping device is not ready for work.  Sleeping may consume
+   * fewer resources than staying "awake", but returning from sleep
+   * to a ready state may itself take time.
+   *
+   * Only declared here; the definition is not in this class body.
+   *
+   * \return True if the device is in the "sleep" state, false if it
+   *   is actively working and could not enter that state.
+   */
+  static bool sleep();
+
+  /// \brief Bring the device out of the "sleep" state so it can accept work.
+  ///
+  /// \return True if the device is in the "ready" state, false if it
+  ///  is actively working (and therefore already awake).  Only
+  ///  declared here; the definition is not in this class body.
+  static bool wake();
+
+  /// \brief Block until every dispatched functor has finished.
+  ///
+  /// In general a parallel_for or parallel_reduce dispatch may return
+  /// before its functor completes, and fence() waits for completion.
+  /// Here the body is empty, so the call returns immediately.
+  ///
+  static void fence() {}
+
+  static void initialize( unsigned threads_count = 1 ,
+                          unsigned use_numa_count = 0 ,
+                          unsigned use_cores_per_numa = 0 ,
+                          bool allow_asynchronous_threadpool = false) {
+    // Serial ignores all four arguments; the casts below only silence
+    // unused-parameter warnings.
+    (void) threads_count; (void) use_numa_count;
+    (void) use_cores_per_numa; (void) allow_asynchronous_threadpool;
+
+    // Set up the array of locks used for arbitrarily sized atomics
+    Impl::init_lock_array_host_space();
+    #if (KOKKOS_ENABLE_PROFILING)
+      Kokkos::Profiling::initialize();
+    #endif
+  }
+
+  static int is_initialized() { return 1 ; } // unconditional: no state is consulted
+
+  /** \brief  Maximum amount of concurrency: always 1.  */
+  static int concurrency() { return 1 ; }
+
+  //! Shut down; the only action is finalizing profiling when it is enabled.
+  static void finalize() {
+    #if (KOKKOS_ENABLE_PROFILING)
+      Kokkos::Profiling::finalize();
+    #endif
+  }
+
+  //! Configuration printer; the body is empty, so nothing is written to the stream.
+  static void print_configuration( std::ostream & , const bool /* detail */ = false ) {}
+
+  //--------------------------------------------------------------------------
+  // Thread-pool queries: a pool of one thread whose rank is 0.
+  static inline int thread_pool_size( int = 0 ) { return 1 ; }
+  KOKKOS_INLINE_FUNCTION static int thread_pool_rank() { return 0 ; }
+
+  //--------------------------------------------------------------------------
+  // Hardware-thread queries, forwarded to the thread-pool queries above.
+  KOKKOS_INLINE_FUNCTION static unsigned hardware_thread_id() { return thread_pool_rank(); }
+  static inline unsigned max_hardware_threads() { return thread_pool_size(0); }
+
+  //--------------------------------------------------------------------------
+  // NOTE(review): declared only; presumably returns scratch sized for reduce_size + shared_size bytes -- confirm in the definition.
+  static void * scratch_memory_resize( unsigned reduce_size , unsigned shared_size );
+
+  //--------------------------------------------------------------------------
+};
+
+} // namespace Kokkos
+
+/*--------------------------------------------------------------------------*/
+/*--------------------------------------------------------------------------*/
+
+namespace Kokkos {
+namespace Impl {
+
+// Specialization for the pair ( Serial::memory_space , Serial::scratch_memory_space ):
+// 'value' is true and both verify() overloads are deliberately empty.
+template<>
+struct VerifyExecutionCanAccessMemorySpace
+  < Kokkos::Serial::memory_space , Kokkos::Serial::scratch_memory_space >
+{
+  enum { value = true };
+  static inline void verify() {}
+  static inline void verify( const void * ) {}
+};
+
+namespace SerialImpl {
+// Everything in this namespace is only declared at this point of the header.
+struct Sentinel {
+  // NOTE(review): presumably the state behind Serial::scratch_memory_resize -- confirm against the definitions.
+  void *   m_scratch ;     // untyped pointer; ownership is not visible here
+  unsigned m_reduce_end ;  // presumably the end offset of the reduce region -- TODO confirm
+  unsigned m_shared_end ;  // presumably the end offset of the shared region -- TODO confirm
+
+  Sentinel();
+  ~Sentinel();
+  static Sentinel & singleton();  // static accessor returning a Sentinel by reference
+};
+
+inline  // declared inline: a definition must be visible in every translation unit that calls it
+unsigned align( unsigned n );
+}
+} // namespace Impl
+} // namespace Kokkos
+
+/*--------------------------------------------------------------------------*/
+/*--------------------------------------------------------------------------*/
+
+namespace Kokkos {
+namespace Impl {
+
+class SerialTeamMember {
+private:
+  typedef Kokkos::ScratchMemorySpace< Kokkos::Serial > scratch_memory_space ;
+  const scratch_memory_space  m_space ;        // returned by all three scratch accessors
+  const int                   m_league_rank ;
+  const int                   m_league_size ;
+
+  SerialTeamMember & operator = ( const SerialTeamMember & ); // private: not assignable from outside
+
+public:
+
+  KOKKOS_INLINE_FUNCTION
+  const scratch_memory_space & team_shmem() const { return m_space ; }
+
+  //! Same object as team_shmem(); the int argument is ignored.
+  KOKKOS_INLINE_FUNCTION
+  const scratch_memory_space & team_scratch(int) const { return m_space ; }
+
+  //! Same object as team_shmem(); the int argument is ignored.
+  KOKKOS_INLINE_FUNCTION
+  const scratch_memory_space & thread_scratch(int) const { return m_space ; }
+
+  // The team is fixed at one thread whose rank is 0.
+  KOKKOS_INLINE_FUNCTION int league_rank() const { return m_league_rank ; }
+  KOKKOS_INLINE_FUNCTION int league_size() const { return m_league_size ; }
+  KOKKOS_INLINE_FUNCTION int team_rank() const { return 0 ; }
+  KOKKOS_INLINE_FUNCTION int team_size() const { return 1 ; }
+
+  KOKKOS_INLINE_FUNCTION void team_barrier() const {} // empty body: no-op
+
+  //! Empty body: both arguments are ignored.
+  template<class ValueType>
+  KOKKOS_INLINE_FUNCTION
+  void team_broadcast(const ValueType& , const int& ) const {}
+
+  //! Returns 'value' unchanged; the join operator is never invoked.
+  template< class ValueType, class JoinOp >
+  KOKKOS_INLINE_FUNCTION
+  ValueType team_reduce( const ValueType & value , const JoinOp & ) const
+    { return value ; }
+
+  /** \brief  Exclusive prefix sum across the team, optionally chained
+   *          into a league-wide accumulator.
+   *
+   *  With one thread per team the intra-team prefix is empty.  When
+   *  global_accum is non-null the call returns the accumulator's value
+   *  on entry and then adds 'value' to it; when it is null the call
+   *  returns Type(0) and touches nothing.
+   */
+  template< typename Type >
+  KOKKOS_INLINE_FUNCTION Type team_scan( const Type & value , Type * const global_accum ) const
+    {
+      if ( ! global_accum ) { return Type(0) ; }
+      const Type before = *global_accum ;
+      *global_accum += value ;
+      return before ;
+    }
+
+  /** \brief  Intra-team exclusive prefix sum with team_rank() ordering.
+   *
+   *  The argument is ignored and Type(0) is returned, so the caller
+   *  obtains its total as  dev.team_scan( value ) + value .
+   */
+  template< typename Type >
+  KOKKOS_INLINE_FUNCTION Type team_scan( const Type & ) const
+    { return Type(0); }
+
+  //----------------------------------------
+  // Execution space specific:
+
+  SerialTeamMember( int arg_league_rank
+                  , int arg_league_size
+                  , int arg_shared_size
+                  ); // only declared in this class body
+};
+
+} // namespace Impl
+
+
+/*
+ * < Kokkos::Serial , WorkArgTag >
+ * < WorkArgTag , Impl::enable_if< Impl::is_same< Kokkos::Serial , Kokkos::DefaultExecutionSpace >::value >::type >
+ *
+ */
+namespace Impl {
+template< class ... Properties >
+class TeamPolicyInternal< Kokkos::Serial , Properties ... >:public PolicyTraits<Properties...>
+{
+private:
+  // Scratch requests in bytes, indexed by scratch level (0 or 1).
+  size_t m_team_scratch_size[2] ;
+  size_t m_thread_scratch_size[2] ;
+  int    m_league_size ;
+  int    m_chunk_size;
+
+public:
+
+  //! Tag this class as a kokkos execution policy
+  typedef TeamPolicyInternal      execution_policy ;
+
+  typedef PolicyTraits<Properties ... > traits;
+
+  //! Execution space of this execution policy:
+  typedef Kokkos::Serial  execution_space ;
+
+  TeamPolicyInternal& operator = (const TeamPolicyInternal& p) {
+    m_league_size = p.m_league_size;
+    m_chunk_size  = p.m_chunk_size;
+    for ( int level = 0 ; level < 2 ; ++level ) {
+      m_team_scratch_size[level]   = p.m_team_scratch_size[level];
+      m_thread_scratch_size[level] = p.m_thread_scratch_size[level];
+    }
+    return *this;
+  }
+
+  //----------------------------------------
+  // Team-size queries: the answer is 1 whatever the functor.
+  template< class FunctorType >
+  static int team_size_max( const FunctorType & ) { return 1 ; }
+
+  template< class FunctorType >
+  static int team_size_recommended( const FunctorType & ) { return 1 ; }
+
+  //! Overload with an extra int argument, which is ignored.
+  template< class FunctorType >
+  static int team_size_recommended( const FunctorType & , const int& ) { return 1 ; }
+
+  //----------------------------------------
+  // scratch_size() adds the per-team and per-thread requests of one level;
+  // 'level' indexes the arrays without a range check and the second
+  // argument is ignored.
+  inline int team_size() const { return 1 ; }
+  inline int league_size() const { return m_league_size ; }
+  inline size_t scratch_size(const int& level, int = 0) const { return m_team_scratch_size[level] + m_thread_scratch_size[level]; }
+
+  /** \brief  Store the league size; the execution space instance and the team-size and
+   *          vector-length requests are ignored.  Scratch sizes start at 0, chunk size at 32. */
+  TeamPolicyInternal( execution_space &
+                    , int league_size_request
+                    , int /* team_size_request */
+                    , int /* vector_length_request */ = 1 )
+    : m_team_scratch_size { 0 , 0 } , m_thread_scratch_size { 0 , 0 }
+    , m_league_size( league_size_request )
+    , m_chunk_size( 32 )
+    {}
+
+  //! Same initialization, with Kokkos::AUTO as the (ignored) team-size request.
+  TeamPolicyInternal( execution_space &
+                    , int league_size_request
+                    , const Kokkos::AUTO_t & /* team_size_request */
+                    , int /* vector_length_request */ = 1 )
+    : m_team_scratch_size { 0 , 0 } , m_thread_scratch_size { 0 , 0 }
+    , m_league_size( league_size_request )
+    , m_chunk_size( 32 )
+    {}
+
+  //! Same initialization, without an execution space argument.
+  TeamPolicyInternal( int league_size_request
+                    , int /* team_size_request */
+                    , int /* vector_length_request */ = 1 )
+    : m_team_scratch_size { 0 , 0 } , m_thread_scratch_size { 0 , 0 }
+    , m_league_size( league_size_request )
+    , m_chunk_size( 32 )
+    {}
+
+  //! Same initialization, without an execution space and with Kokkos::AUTO.
+  TeamPolicyInternal( int league_size_request
+                    , const Kokkos::AUTO_t & /* team_size_request */
+                    , int /* vector_length_request */ = 1 )
+    : m_team_scratch_size { 0 , 0 } , m_thread_scratch_size { 0 , 0 }
+    , m_league_size( league_size_request )
+    , m_chunk_size( 32 )
+    {}
+
+  //! Current chunk size.
+  inline int chunk_size() const { return m_chunk_size ; }
+
+  /** \brief  Return a copy of this policy whose chunk size is chunk_size_. */
+  inline TeamPolicyInternal set_chunk_size(typename traits::index_type chunk_size_) const {
+    TeamPolicyInternal updated = *this;
+    updated.m_chunk_size = chunk_size_;
+    return updated;
+  }
+
+  /** \brief  Return a copy with the per-team scratch size of 'level' replaced. */
+  inline TeamPolicyInternal set_scratch_size(const int& level, const PerTeamValue& per_team) const {
+    TeamPolicyInternal updated = *this;
+    updated.m_team_scratch_size[level] = per_team.value;
+    return updated;
+  }
+
+  /** \brief  Return a copy with the per-thread scratch size of 'level' replaced. */
+  inline TeamPolicyInternal set_scratch_size(const int& level, const PerThreadValue& per_thread) const {
+    TeamPolicyInternal updated = *this;
+    updated.m_thread_scratch_size[level] = per_thread.value;
+    return updated;
+  }
+
+  /** \brief  Return a copy with both the per-team and per-thread scratch sizes of 'level' replaced. */
+  inline TeamPolicyInternal set_scratch_size(const int& level, const PerTeamValue& per_team, const PerThreadValue& per_thread) const {
+    TeamPolicyInternal updated = *this;
+    updated.m_thread_scratch_size[level] = per_thread.value;
+    updated.m_team_scratch_size[level]   = per_team.value;
+    return updated;
+  }
+
+  typedef Impl::SerialTeamMember  member_type ;
+};
+} /* namespace Impl */
+} /* namespace Kokkos */
+
+/*--------------------------------------------------------------------------*/
+/*--------------------------------------------------------------------------*/
+
+/*--------------------------------------------------------------------------*/
+/*--------------------------------------------------------------------------*/
+/* Parallel patterns for Kokkos::Serial with RangePolicy */
+
+namespace Kokkos {
+namespace Impl {
+
+/// parallel_for over a RangePolicy on Kokkos::Serial: the functor is invoked
+/// once per index of [ policy.begin() , policy.end() ), in increasing order,
+/// by a plain loop inside execute().
+template< class FunctorType , class ... Traits >
+class ParallelFor< FunctorType , Kokkos::RangePolicy< Traits ... > , Kokkos::Serial >
+{
+private:
+
+  typedef Kokkos::RangePolicy< Traits ... > Policy ;
+  typedef typename Policy::member_type      Member ;
+
+  const FunctorType m_functor ;
+  const Policy      m_policy ;
+
+  // Selected when the policy has no work tag: calls m_functor( i ).
+  template< class TagType >
+  typename std::enable_if< std::is_same< TagType , void >::value >::type
+  exec() const
+    {
+      const Member iend = m_policy.end();
+      for ( Member iwork = m_policy.begin() ; iwork < iend ; ++iwork ) { m_functor( iwork ); }
+    }
+
+  // Selected when the policy has a work tag: calls m_functor( TagType() , i ).
+  template< class TagType >
+  typename std::enable_if< ! std::is_same< TagType , void >::value >::type
+  exec() const
+    {
+      const TagType tag{} ;
+      const Member iend = m_policy.end();
+      for ( Member iwork = m_policy.begin() ; iwork < iend ; ++iwork ) { m_functor( tag , iwork ); }
+    }
+
+public:
+
+  //! Run the loop, dispatching on the policy's work tag.
+  inline void execute() const
+    { this-> template exec< typename Policy::work_tag >(); }
+
+  //! Store copies of the functor and the policy; no work is done here.
+  inline
+  ParallelFor( const FunctorType & arg_functor
+             , const Policy      & arg_policy )
+    : m_functor( arg_functor )
+    , m_policy(  arg_policy )
+    {}
+};
+
+/*--------------------------------------------------------------------------*/
+
+/// parallel_reduce over a RangePolicy on Kokkos::Serial.  The value lives at the result
+/// pointer when that is non-null, else at the pointer from Serial::scratch_memory_resize.
+template< class FunctorType , class ReducerType , class ... Traits >
+class ParallelReduce< FunctorType , Kokkos::RangePolicy< Traits ... > , ReducerType , Kokkos::Serial >
+{
+private:
+
+  typedef Kokkos::RangePolicy< Traits ... > Policy ;
+  typedef typename Policy::work_tag         WorkTag ;
+  typedef typename Policy::member_type      Member ;
+
+  // Value traits come from FunctorType when ReducerType is InvalidType, else from ReducerType.
+  typedef Kokkos::Impl::if_c< std::is_same<InvalidType,ReducerType>::value, FunctorType, ReducerType> ReducerConditional;
+  typedef typename ReducerConditional::type ReducerTypeFwd;
+
+  typedef Kokkos::Impl::FunctorValueTraits< ReducerTypeFwd , WorkTag >  ValueTraits ;
+  typedef Kokkos::Impl::FunctorValueInit<   ReducerTypeFwd , WorkTag >  ValueInit ;
+
+  typedef typename ValueTraits::pointer_type    pointer_type ;
+  typedef typename ValueTraits::reference_type  reference_type ;
+
+  const FunctorType   m_functor ;
+  const Policy        m_policy ;
+  const ReducerType   m_reducer ;
+  const pointer_type  m_result_ptr ;
+
+  // Untagged policy: init the value at ptr, call m_functor( i , update ) per index, then final.
+  template< class TagType >
+  inline
+  typename std::enable_if< std::is_same< TagType , void >::value >::type
+  exec( pointer_type ptr ) const
+    {
+      reference_type update = ValueInit::init(  ReducerConditional::select(m_functor , m_reducer) , ptr );
+
+      const Member iend = m_policy.end();
+      for ( Member iwork = m_policy.begin() ; iwork < iend ; ++iwork ) { m_functor( iwork , update ); }
+
+      typedef Kokkos::Impl::FunctorFinal< ReducerTypeFwd , TagType > Final ;
+      Final::final(  ReducerConditional::select(m_functor , m_reducer) , ptr );
+    }
+
+  // Tagged policy: same sequence, calling m_functor( TagType() , i , update ).
+  template< class TagType >
+  inline
+  typename std::enable_if< ! std::is_same< TagType , void >::value >::type
+  exec( pointer_type ptr ) const
+    {
+      const TagType tag{} ;
+      reference_type update = ValueInit::init(  ReducerConditional::select(m_functor , m_reducer) , ptr );
+
+      const Member iend = m_policy.end();
+      for ( Member iwork = m_policy.begin() ; iwork < iend ; ++iwork ) { m_functor( tag , iwork , update ); }
+
+      typedef Kokkos::Impl::FunctorFinal< ReducerTypeFwd , TagType > Final ;
+      Final::final(  ReducerConditional::select(m_functor , m_reducer) , ptr );
+    }
+
+public:
+
+  //! Run the reduction; scratch_memory_resize is called even when m_result_ptr is non-null.
+  inline
+  void execute() const
+    {
+      pointer_type scratch = (pointer_type) Kokkos::Serial::scratch_memory_resize
+           ( ValueTraits::value_size(  ReducerConditional::select(m_functor , m_reducer) ) , 0 );
+
+      this-> template exec< WorkTag >( m_result_ptr ? m_result_ptr : scratch );
+    }
+
+  //! Functor-only reduction (enabled when ReducerType is not a reducer): the result
+  //! pointer is arg_result_view.ptr_on_device(); the view must be in HostSpace.
+  template< class HostViewType >
+  ParallelReduce( const FunctorType  & arg_functor ,
+                  const Policy       & arg_policy ,
+                  const HostViewType & arg_result_view ,
+                  typename std::enable_if<
+                               Kokkos::is_view< HostViewType >::value &&
+                              !Kokkos::is_reducer_type<ReducerType>::value
+                  ,void*>::type = NULL)
+    : m_functor( arg_functor )
+    , m_policy( arg_policy )
+    , m_reducer( InvalidType() )
+    , m_result_ptr( arg_result_view.ptr_on_device() )
+    {
+      static_assert( Kokkos::is_view< HostViewType >::value
+        , "Kokkos::Serial reduce result must be a View" );
+
+      static_assert( std::is_same< typename HostViewType::memory_space , HostSpace >::value
+        , "Kokkos::Serial reduce result must be a View in HostSpace" );
+    }
+
+  //! Reducer-based reduction: the result pointer is reducer.result_view().data().
+  inline
+  ParallelReduce( const FunctorType & arg_functor
+                , Policy       arg_policy
+                , const ReducerType& reducer )
+    : m_functor( arg_functor )
+    , m_policy(  arg_policy )
+    , m_reducer( reducer )
+    , m_result_ptr(  reducer.result_view().data() )
+    {
+      // No check is made here that the reducer's result view lives in
+      // HostSpace (a static_assert to that effect was left disabled).
+    }
+};
+
+/*--------------------------------------------------------------------------*/
+
+/// parallel_scan over a RangePolicy on Kokkos::Serial: a single pass in
+/// which every functor call receives 'true' as its last argument.
+template< class FunctorType , class ... Traits >
+class ParallelScan< FunctorType , Kokkos::RangePolicy< Traits ... > , Kokkos::Serial >
+{
+private:
+
+  typedef Kokkos::RangePolicy< Traits ... > Policy ;
+  typedef typename Policy::work_tag                                  WorkTag ;
+  typedef typename Policy::member_type                               Member ;
+  typedef Kokkos::Impl::FunctorValueTraits< FunctorType , WorkTag >  ValueTraits ;
+  typedef Kokkos::Impl::FunctorValueInit<   FunctorType , WorkTag >  ValueInit ;
+
+  typedef typename ValueTraits::pointer_type    pointer_type ;
+  typedef typename ValueTraits::reference_type  reference_type ;
+
+  const FunctorType   m_functor ;
+  const Policy        m_policy ;
+
+  // Untagged policy: init the value at ptr, then m_functor( i , update , true ).
+  template< class TagType >
+  inline
+  typename std::enable_if< std::is_same< TagType , void >::value >::type
+  exec( pointer_type ptr ) const
+    {
+      reference_type update = ValueInit::init( m_functor , ptr );
+
+      const Member iend = m_policy.end();
+      for ( Member iwork = m_policy.begin() ; iwork < iend ; ++iwork ) { m_functor( iwork , update , true ); }
+    }
+
+  // Tagged policy: init the value at ptr, then m_functor( TagType() , i , update , true ).
+  template< class TagType >
+  inline
+  typename std::enable_if< ! std::is_same< TagType , void >::value >::type
+  exec( pointer_type ptr ) const
+    {
+      const TagType tag{} ;
+      reference_type update = ValueInit::init( m_functor , ptr );
+
+      const Member iend = m_policy.end();
+      for ( Member iwork = m_policy.begin() ; iwork < iend ; ++iwork ) { m_functor( tag , iwork , update , true ); }
+    }
+
+public:
+
+  //! Take value storage from Serial::scratch_memory_resize and run the scan over it.
+  inline
+  void execute() const
+    {
+      pointer_type scratch = (pointer_type)
+        Kokkos::Serial::scratch_memory_resize( ValueTraits::value_size( m_functor ) , 0 );
+      this-> template exec< WorkTag >( scratch );
+    }
+
+  //! Store copies of the functor and the policy; no work is done here.
+  inline
+  ParallelScan( const FunctorType & arg_functor
+              , const Policy      & arg_policy
+              )
+    : m_functor( arg_functor )
+    , m_policy(  arg_policy )
+    {}
+};
+
+} // namespace Impl
+} // namespace Kokkos
+
+/*--------------------------------------------------------------------------*/
+/*--------------------------------------------------------------------------*/
+/* Parallel patterns for Kokkos::Serial with TeamPolicy */
+
+namespace Kokkos {
+namespace Impl {
+
+/// parallel_for over a TeamPolicy on Kokkos::Serial: the functor is called once per
+/// league rank, in increasing order, each time with a newly constructed team member.
+template< class FunctorType , class ... Properties >
+class ParallelFor< FunctorType , Kokkos::TeamPolicy< Properties ... > , Kokkos::Serial >
+{
+private:
+
+  typedef TeamPolicyInternal< Kokkos::Serial , Properties ...> Policy ;
+  typedef typename Policy::member_type                       Member ;
+
+  const FunctorType  m_functor ;
+  const int          m_league ;  // league size taken from the policy
+  const int          m_shared ;  // scratch size computed in the constructor
+
+  // Untagged policy: m_functor( member ).
+  template< class TagType >
+  inline typename std::enable_if< std::is_same< TagType , void >::value >::type
+  exec() const
+    {
+      for ( int league_rank = 0 ; league_rank < m_league ; ++league_rank ) {
+        m_functor( Member(league_rank,m_league,m_shared) );
+      }
+    }
+
+  // Tagged policy: m_functor( TagType() , member ).
+  template< class TagType >
+  inline typename std::enable_if< ! std::is_same< TagType , void >::value >::type
+  exec() const
+    {
+      const TagType tag{} ;
+      for ( int league_rank = 0 ; league_rank < m_league ; ++league_rank ) {
+        m_functor( tag , Member(league_rank,m_league,m_shared) );
+      }
+    }
+
+public:
+
+  //! Call Serial::scratch_memory_resize( 0 , m_shared ), then run the league loop.
+  inline void execute() const
+    {
+      Kokkos::Serial::scratch_memory_resize( 0 , m_shared );
+      this-> template exec< typename Policy::work_tag >();
+    }
+
+  //! m_shared = policy scratch sizes of levels 0 and 1 + FunctorTeamShmemSize value for argument 1.
+  ParallelFor( const FunctorType & arg_functor
+             , const Policy      & arg_policy )
+    : m_functor( arg_functor )
+    , m_league(  arg_policy.league_size() )
+    , m_shared( arg_policy.scratch_size(0) + arg_policy.scratch_size(1) + FunctorTeamShmemSize< FunctorType >::value( arg_functor , 1 ) )
+    { }
+};
+
+/*--------------------------------------------------------------------------*/
+/* parallel_reduce over a Kokkos::TeamPolicy executed by Kokkos::Serial: one sequential pass over the league. */
+template< class FunctorType , class ReducerType , class ... Properties >
+class ParallelReduce< FunctorType
+                    , Kokkos::TeamPolicy< Properties ... >
+                    , ReducerType
+                    , Kokkos::Serial
+                    >
+{
+private:
+
+  typedef TeamPolicyInternal< Kokkos::Serial, Properties ... > Policy ;
+  typedef typename Policy::member_type                       Member ;
+  typedef typename Policy::work_tag                          WorkTag ;
+  // Chooses between FunctorType and ReducerType; presumably FunctorType when ReducerType is InvalidType -- confirm against if_c.
+  typedef Kokkos::Impl::if_c< std::is_same<InvalidType,ReducerType>::value, FunctorType, ReducerType> ReducerConditional;
+  typedef typename ReducerConditional::type ReducerTypeFwd;
+
+  typedef Kokkos::Impl::FunctorValueTraits< ReducerTypeFwd , WorkTag >  ValueTraits ;
+  typedef Kokkos::Impl::FunctorValueInit<   ReducerTypeFwd , WorkTag >  ValueInit ;
+
+  typedef typename ValueTraits::pointer_type    pointer_type ;
+  typedef typename ValueTraits::reference_type  reference_type ;
+  // Declaration order matters: the View constructor's m_shared initializer reads m_functor.
+  const FunctorType  m_functor ;
+  const int          m_league ;
+  const ReducerType  m_reducer ;
+        pointer_type m_result_ptr ;
+  const int          m_shared ;
+  // Untagged work: each league rank calls functor( member , update ).
+  template< class TagType >
+  inline
+  typename std::enable_if< std::is_same< TagType , void >::value >::type
+  exec( pointer_type ptr ) const
+    {
+      reference_type update = ValueInit::init(  ReducerConditional::select(m_functor , m_reducer) , ptr );
+      // All league ranks accumulate into the same 'update' reference.
+      for ( int ileague = 0 ; ileague < m_league ; ++ileague ) {
+        m_functor( Member(ileague,m_league,m_shared) , update );
+      }
+      // The final step is delegated to FunctorFinal on the selected functor/reducer.
+      Kokkos::Impl::FunctorFinal< ReducerTypeFwd , TagType >::
+        final(  ReducerConditional::select(m_functor , m_reducer) , ptr );
+    }
+  // Tagged work: each league rank calls functor( TagType{} , member , update ).
+  template< class TagType >
+  inline
+  typename std::enable_if< ! std::is_same< TagType , void >::value >::type
+  exec( pointer_type ptr ) const
+    {
+      const TagType t{} ;
+
+      reference_type update = ValueInit::init(  ReducerConditional::select(m_functor , m_reducer) , ptr );
+
+      for ( int ileague = 0 ; ileague < m_league ; ++ileague ) {
+        m_functor( t , Member(ileague,m_league,m_shared) , update );
+      }
+
+      Kokkos::Impl::FunctorFinal< ReducerTypeFwd , TagType >::
+        final(  ReducerConditional::select(m_functor , m_reducer) , ptr );
+    }
+
+public:
+  // Runs the reduction; the pointer returned by scratch_memory_resize is the fallback result storage.
+  inline
+  void execute() const
+    {
+      pointer_type ptr = (pointer_type) Kokkos::Serial::scratch_memory_resize
+           ( ValueTraits::value_size(  ReducerConditional::select(m_functor , m_reducer) ) , m_shared );
+      // Reduce straight into the caller's storage when one was supplied, else into ptr.
+      this-> template exec< WorkTag >( m_result_ptr ? m_result_ptr : ptr );
+    }
+  // Constructor for a View result; enabled only when ViewType is a View and ReducerType is not a reducer.
+  template< class ViewType >
+  ParallelReduce( const FunctorType  & arg_functor
+                , const Policy       & arg_policy
+                , const ViewType     & arg_result ,
+                typename std::enable_if<
+                  Kokkos::is_view< ViewType >::value &&
+                  !Kokkos::is_reducer_type<ReducerType>::value
+                  ,void*>::type = NULL)
+    : m_functor( arg_functor )
+    , m_league( arg_policy.league_size() )
+    , m_reducer( InvalidType() )
+    , m_result_ptr( arg_result.ptr_on_device() )
+    , m_shared( arg_policy.scratch_size(0) + arg_policy.scratch_size(1) + FunctorTeamShmemSize< FunctorType >::value( m_functor , 1 ) )
+    {
+      static_assert( Kokkos::is_view< ViewType >::value
+        , "Reduction result on Kokkos::Serial must be a Kokkos::View" );
+
+      static_assert( std::is_same< typename ViewType::memory_space
+                                      , Kokkos::HostSpace >::value
+        , "Reduction result on Kokkos::Serial must be a Kokkos::View in HostSpace" );
+    }
+  // Constructor for a reducer object; the result pointer comes from reducer.result_view().data().
+  inline
+  ParallelReduce( const FunctorType & arg_functor
+    , Policy       arg_policy
+    , const ReducerType& reducer )
+  : m_functor( arg_functor )
+  , m_league(  arg_policy.league_size() )
+  , m_reducer( reducer )
+  , m_result_ptr(  reducer.result_view().data() )
+  , m_shared( arg_policy.scratch_size(0) + arg_policy.scratch_size(1) + FunctorTeamShmemSize< FunctorType >::value( arg_functor , arg_policy.team_size() ) )
+  {
+  // NOTE(review): the HostSpace static_assert used by the View constructor is not
+  // repeated here because this overload has no ViewType to inspect.  Presumably
+  // reducer.result_view() is expected to live in host memory -- confirm.
+  }
+
+};
+
+} // namespace Impl
+} // namespace Kokkos
+
+/*--------------------------------------------------------------------------*/
+/*--------------------------------------------------------------------------*/
+/* Nested parallel patterns for Kokkos::Serial with TeamPolicy */
+
+namespace Kokkos {
+namespace Impl {
+
+/** \brief  Iteration bounds [begin,end) with unit stride for a TeamThreadRange
+ *          on a Kokkos::Serial team member; keeps a reference to that member. */
+template<typename iType>
+struct TeamThreadRangeBoundariesStruct<iType,SerialTeamMember> {
+  using index_type = iType ;
+
+  const iType begin ;               // first index
+  const iType end ;                 // one past the last index
+  enum {increment = 1};             // loop stride
+  const SerialTeamMember& thread;   // team member the range refers to (not owned)
+
+  // Range [0,arg_count).
+  KOKKOS_INLINE_FUNCTION
+  TeamThreadRangeBoundariesStruct( const SerialTeamMember & arg_thread , const iType & arg_count )
+    : begin( 0 ) , end( arg_count ) , thread( arg_thread ) {}
+
+  // Range [arg_begin,arg_end).
+  KOKKOS_INLINE_FUNCTION
+  TeamThreadRangeBoundariesStruct( const SerialTeamMember & arg_thread
+                                 , const iType & arg_begin , const iType & arg_end )
+    : begin( arg_begin ) , end( arg_end ) , thread( arg_thread ) {}
+};
+
+  // Vector-level range [0,count) with unit stride for a Kokkos::Serial team member.
+  template<typename iType>
+  struct ThreadVectorRangeBoundariesStruct<iType,SerialTeamMember> {
+    using index_type = iType ;
+    enum {start = 0};        // loops always begin at zero
+    const iType end;         // one past the last index
+    enum {increment = 1};    // loop stride
+
+    // The team-member argument is not stored.
+    KOKKOS_INLINE_FUNCTION
+    ThreadVectorRangeBoundariesStruct( const SerialTeamMember & thread , const iType & count ) : end( count ) {}
+  };
+
+} // namespace Impl
+
+// Team-level range [0,count) bound to the given Kokkos::Serial team member.
+template<typename iType> KOKKOS_INLINE_FUNCTION
+Impl::TeamThreadRangeBoundariesStruct<iType,Impl::SerialTeamMember>
+TeamThreadRange( const Impl::SerialTeamMember & thread , const iType & count ) {
+  typedef Impl::TeamThreadRangeBoundariesStruct<iType,Impl::SerialTeamMember> range_type ;
+  return range_type( thread , count );
+}
+
+// Team-level range [begin,end) bound to the given Kokkos::Serial team member.
+template<typename iType> KOKKOS_INLINE_FUNCTION
+Impl::TeamThreadRangeBoundariesStruct<iType,Impl::SerialTeamMember>
+TeamThreadRange( const Impl::SerialTeamMember & thread , const iType & begin , const iType & end ) {
+  typedef Impl::TeamThreadRangeBoundariesStruct<iType,Impl::SerialTeamMember> range_type ;
+  return range_type( thread , begin , end );
+}
+
+template<typename iType> KOKKOS_INLINE_FUNCTION  // vector-level range [0,count)
+Impl::ThreadVectorRangeBoundariesStruct<iType,Impl::SerialTeamMember >
+ThreadVectorRange( const Impl::SerialTeamMember & thread , const iType & count ) {
+  typedef Impl::ThreadVectorRangeBoundariesStruct<iType,Impl::SerialTeamMember > range_type ;
+  return range_type( thread , count );
+}
+
+KOKKOS_INLINE_FUNCTION  // PerTeam: wraps the member in the ThreadSingleStruct taken by single()
+Impl::ThreadSingleStruct<Impl::SerialTeamMember>
+PerTeam( const Impl::SerialTeamMember & thread )
+{ return Impl::ThreadSingleStruct<Impl::SerialTeamMember>( thread ); }
+
+KOKKOS_INLINE_FUNCTION  // PerThread: wraps the member in the VectorSingleStruct taken by single()
+Impl::VectorSingleStruct<Impl::SerialTeamMember>
+PerThread( const Impl::SerialTeamMember & thread )
+{ return Impl::VectorSingleStruct<Impl::SerialTeamMember>( thread ); }
+
+} // namespace Kokkos
+
+namespace Kokkos {
+
+  /** \brief  Team-level parallel_for over a TeamThreadRange on Kokkos::Serial.
+   *
+   * Calls lambda(i) for every i in [begin,end) of loop_boundaries, in increasing
+   * order, on the calling thread.  This functionality requires C++11 support.*/
+template<typename iType, class Lambda> KOKKOS_INLINE_FUNCTION
+void parallel_for( const Impl::TeamThreadRangeBoundariesStruct<iType,Impl::SerialTeamMember> & loop_boundaries
+                 , const Lambda & lambda ) {
+  const iType last = loop_boundaries.end ;
+  for ( iType idx = loop_boundaries.begin ; idx < last ; idx += loop_boundaries.increment ) { lambda(idx); }
+}
+
+/** \brief  Team-level sum reduction over a TeamThreadRange on Kokkos::Serial.
+ *
+ * result is reset to ValueType(); then, for every i in [begin,end), lambda(i,tmp)
+ * runs on a fresh tmp = ValueType() and tmp is added to result.  The total is
+ * finally passed through the member's team_reduce with Impl::JoinAdd.
+ * This functionality requires C++11 support.*/
+template< typename iType, class Lambda, typename ValueType >
+KOKKOS_INLINE_FUNCTION
+void parallel_reduce( const Impl::TeamThreadRangeBoundariesStruct<iType,Impl::SerialTeamMember> & loop_boundaries
+                    , const Lambda & lambda , ValueType & result )
+{
+  result = ValueType();
+  for ( iType idx = loop_boundaries.begin ; idx < loop_boundaries.end ; idx += loop_boundaries.increment ) {
+    ValueType contribution = ValueType();
+    lambda( idx , contribution );
+    result += contribution ;
+  }
+  result = loop_boundaries.thread.team_reduce( result , Impl::JoinAdd<ValueType>() );
+}
+
+/** \brief  Team-level reduction with a user join over a TeamThreadRange on Kokkos::Serial.
+ *
+ * The running value starts as a copy of init_result.  For every i in [begin,end),
+ * lambda(i,tmp) runs on a fresh tmp = ValueType() and join(running,tmp) folds it in.
+ * The outcome is passed through the member's team_reduce (join wrapped in
+ * Impl::JoinLambdaAdapter) and stored back into init_result.  Because init_result
+ * seeds the reduction it should be the neutral element of the join operation
+ * (e.g. '0 for +-' or '1 for *').  This functionality requires C++11 support.*/
+template< typename iType, class Lambda, typename ValueType, class JoinType >
+KOKKOS_INLINE_FUNCTION
+void parallel_reduce( const Impl::TeamThreadRangeBoundariesStruct<iType,Impl::SerialTeamMember> & loop_boundaries
+                    , const Lambda & lambda , const JoinType & join , ValueType & init_result )
+{
+  ValueType running = init_result ;
+
+  for ( iType idx = loop_boundaries.begin ; idx < loop_boundaries.end ; idx += loop_boundaries.increment ) {
+    ValueType contribution = ValueType();
+    lambda( idx , contribution );
+    join( running , contribution );
+  }
+  init_result = loop_boundaries.thread.team_reduce( running , Impl::JoinLambdaAdapter<ValueType,JoinType>(join) );
+}
+
+} //namespace Kokkos
+
+namespace Kokkos {
+/** \brief  Vector-level parallel_for over a ThreadVectorRange on Kokkos::Serial.
+ *
+ * Calls lambda(i) for every i in [0,end) of loop_boundaries on the calling thread;
+ * the loop is marked ivdep when KOKKOS_HAVE_PRAGMA_IVDEP is defined.
+ * This functionality requires C++11 support.*/
+template<typename iType, class Lambda> KOKKOS_INLINE_FUNCTION
+void parallel_for( const Impl::ThreadVectorRangeBoundariesStruct<iType,Impl::SerialTeamMember > & loop_boundaries
+                 , const Lambda & lambda )
+{
+#ifdef KOKKOS_HAVE_PRAGMA_IVDEP
+#pragma ivdep
+#endif
+  for ( iType lane = loop_boundaries.start ; lane < loop_boundaries.end ; lane += loop_boundaries.increment ) { lambda(lane); }
+}
+
+/** \brief  Vector-level sum reduction over a ThreadVectorRange on Kokkos::Serial.
+ *
+ * result is reset to ValueType(); then, for every i in [0,end), lambda(i,tmp)
+ * runs on a fresh tmp = ValueType() and tmp is added to result.
+ * This functionality requires C++11 support.*/
+template< typename iType, class Lambda, typename ValueType > KOKKOS_INLINE_FUNCTION
+void parallel_reduce( const Impl::ThreadVectorRangeBoundariesStruct<iType,Impl::SerialTeamMember > & loop_boundaries
+                    , const Lambda & lambda , ValueType & result ) {
+  result = ValueType();
+#ifdef KOKKOS_HAVE_PRAGMA_IVDEP
+#pragma ivdep
+#endif
+  for ( iType lane = loop_boundaries.start ; lane < loop_boundaries.end ; lane += loop_boundaries.increment ) {
+    ValueType contribution = ValueType();
+    lambda( lane , contribution );
+    result += contribution ;
+  }
+}
+
+/** \brief  Vector-level reduction with a user join over a ThreadVectorRange on Kokkos::Serial.
+ *
+ * The running value starts as a copy of init_result.  For every i in [0,end),
+ * lambda(i,tmp) runs on a fresh tmp = ValueType() and join(running,tmp) folds it
+ * in; the outcome is stored back into init_result.  Because init_result seeds the
+ * reduction it should be the neutral element of the join operation
+ * (e.g. '0 for +-' or '1 for *').  This functionality requires C++11 support.*/
+template< typename iType, class Lambda, typename ValueType, class JoinType >
+KOKKOS_INLINE_FUNCTION
+void parallel_reduce( const Impl::ThreadVectorRangeBoundariesStruct<iType,Impl::SerialTeamMember > & loop_boundaries
+                    , const Lambda & lambda , const JoinType & join , ValueType & init_result )
+{
+  ValueType running = init_result ;
+#ifdef KOKKOS_HAVE_PRAGMA_IVDEP
+#pragma ivdep
+#endif
+  for ( iType lane = loop_boundaries.start ; lane < loop_boundaries.end ; lane += loop_boundaries.increment ) {
+    ValueType contribution = ValueType();
+    lambda( lane , contribution );
+    join( running , contribution );
+  }
+  init_result = running ;
+}
+
+/** \brief  Vector-level exclusive prefix sum over a ThreadVectorRange on Kokkos::Serial.
+ *
+ * Calls lambda(i,val,final) for every i in [0,end) of loop_boundaries.
+ * Depending on the target execution space the operator might be called twice
+ * (final=false, then final=true); this Serial version makes a single pass with
+ * final==true.  val starts at value_type() and is the same accumulator for all
+ * iterations, so on entry it holds the prefix sum and the operator must add the
+ * contribution of "i" to it.  The accumulator is local to this function: the
+ * final total is not handed back to the caller.
+ * This functionality requires C++11 support.*/
+template< typename iType, class FunctorType >
+KOKKOS_INLINE_FUNCTION
+void parallel_scan( const Impl::ThreadVectorRangeBoundariesStruct<iType,Impl::SerialTeamMember > & loop_boundaries
+                  , const FunctorType & lambda )
+{
+  // The value type is taken from the functor through FunctorValueTraits.
+  typedef typename Kokkos::Impl::FunctorValueTraits< FunctorType , void >::value_type value_type ;
+
+  value_type running_total = value_type();
+
+#ifdef KOKKOS_HAVE_PRAGMA_IVDEP
+#pragma ivdep
+#endif
+  for ( iType lane = loop_boundaries.start ; lane < loop_boundaries.end ; lane += loop_boundaries.increment ) {
+    lambda( lane , running_total , true );
+  }
+}
+
+} // namespace Kokkos
+
+namespace Kokkos {
+
+// PerThread single on Kokkos::Serial: the functor is invoked once, directly.
+template<class FunctorType> KOKKOS_INLINE_FUNCTION
+void single( const Impl::VectorSingleStruct<Impl::SerialTeamMember> & /* single_struct */
+           , const FunctorType & lambda )
+{ lambda(); }
+
+// PerTeam single on Kokkos::Serial: the functor is invoked once, directly.
+template<class FunctorType> KOKKOS_INLINE_FUNCTION
+void single( const Impl::ThreadSingleStruct<Impl::SerialTeamMember> & /* single_struct */
+           , const FunctorType & lambda )
+{ lambda(); }
+
+// PerThread single with a value on Kokkos::Serial: lambda(val) is invoked once, directly.
+template<class FunctorType, class ValueType> KOKKOS_INLINE_FUNCTION
+void single( const Impl::VectorSingleStruct<Impl::SerialTeamMember> & /* single_struct */
+           , const FunctorType & lambda , ValueType & val )
+{ lambda(val); }
+
+// PerTeam single with a value on Kokkos::Serial: lambda(val) is invoked once, directly.
+template<class FunctorType, class ValueType> KOKKOS_INLINE_FUNCTION
+void single( const Impl::ThreadSingleStruct<Impl::SerialTeamMember> & /* single_struct */
+           , const FunctorType & lambda , ValueType & val )
+{ lambda(val); }
+}
+
+//----------------------------------------------------------------------------
+
+#include <impl/Kokkos_Serial_Task.hpp>
+
+#endif // defined( KOKKOS_HAVE_SERIAL )
+#endif /* #define KOKKOS_SERIAL_HPP */
+
+//----------------------------------------------------------------------------
+//----------------------------------------------------------------------------
+
diff --git a/lib/kokkos/core/src/Kokkos_TaskPolicy.hpp b/lib/kokkos/core/src/Kokkos_TaskPolicy.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..fc9113b75052e91fc260f95725fe360b98e548e8
--- /dev/null
+++ b/lib/kokkos/core/src/Kokkos_TaskPolicy.hpp
@@ -0,0 +1,1109 @@
+/*
+//@HEADER
+// ************************************************************************
+// 
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+// 
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+// 
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact  H. Carter Edwards (hcedwar@sandia.gov)
+// 
+// ************************************************************************
+//@HEADER
+*/
+
+// Experimental unified task-data parallel manycore LDRD
+
+#ifndef KOKKOS_TASKPOLICY_HPP
+#define KOKKOS_TASKPOLICY_HPP
+
+//----------------------------------------------------------------------------
+
+#include <Kokkos_Core_fwd.hpp>
+
+// If compiling with CUDA then must be using CUDA 8 or better
+// and use relocatable device code to enable the task policy.
+// nvcc relocatable device code option: --relocatable-device-code=true
+
+#if ( defined( KOKKOS_COMPILER_NVCC ) )
+  #if ( 8000 <= CUDA_VERSION ) && \
+      defined( KOKKOS_CUDA_USE_RELOCATABLE_DEVICE_CODE )
+
+  #define KOKKOS_ENABLE_TASKPOLICY
+
+  #endif
+#else
+
+#define KOKKOS_ENABLE_TASKPOLICY
+
+#endif
+
+
+#if defined( KOKKOS_ENABLE_TASKPOLICY )
+
+//----------------------------------------------------------------------------
+
+#include <Kokkos_MemoryPool.hpp>
+#include <impl/Kokkos_Tags.hpp>
+#include <impl/Kokkos_TaskQueue.hpp>
+
+//----------------------------------------------------------------------------
+
+namespace Kokkos {
+
+enum TaskType { TaskTeam   = Impl::TaskBase<void,void,void>::TaskTeam  // values mirror the TaskBase constants
+              , TaskSingle = Impl::TaskBase<void,void,void>::TaskSingle };
+// Priority option accepted by TaskPolicy's spawn functions.
+enum TaskPriority { TaskHighPriority    = 0
+                  , TaskRegularPriority = 1
+                  , TaskLowPriority     = 2 };
+// Forward declaration; the class is defined further down in this header.
+template< typename Space >
+class TaskPolicy ;
+// Declared here only; the definition is not part of this portion of the header.
+template< typename Space >
+void wait( TaskPolicy< Space > const & );
+
+} // namespace Kokkos
+
+//----------------------------------------------------------------------------
+
+namespace Kokkos {
+namespace Impl {
+
+/*\brief  Implementation data for task data management, access, and execution.
+ *
+ *  CRTP Inheritance structure to allow static_cast from the
+ *  task root type and a task's FunctorType.
+ *
+ *    TaskBase< Space , ResultType , FunctorType >
+ *      : TaskBase< Space , ResultType , void >
+ *      , FunctorType
+ *      { ... };
+ *
+ *    TaskBase< Space , ResultType , void >
+ *      : TaskBase< Space , void , void >
+ *      { ... };
+ */
+template< typename Space , typename ResultType , typename FunctorType >
+class TaskBase ;
+
+template< typename Space >
+class TaskExec ;
+
+}} // namespace Kokkos::Impl
+
+//----------------------------------------------------------------------------
+
+namespace Kokkos {
+
+/**\brief  Handle to a task (Impl::TaskBase) with a queryable reference count.
+ *
+ *  Future< space >  // value_type == void
+ *  Future< value >  // space == Default
+ *  Future< value , space >
+ *  NOTE(review): with no space argument the aliases below give Space == void, not a default space -- confirm.
+ */
+template< typename Arg1 /* = void */ , typename Arg2 /* = void */ >
+class Future {
+private:
+
+  template< typename > friend class TaskPolicy ;
+  template< typename , typename > friend class Future ;
+  template< typename , typename , typename > friend class Impl::TaskBase ;
+
+  enum { Arg1_is_space  = Kokkos::Impl::is_space< Arg1 >::value };
+  enum { Arg2_is_space  = Kokkos::Impl::is_space< Arg2 >::value };
+  enum { Arg1_is_value  = ! Arg1_is_space &&
+                          ! std::is_same< Arg1 , void >::value };
+  enum { Arg2_is_value  = ! Arg2_is_space &&
+                          ! std::is_same< Arg2 , void >::value };
+
+  static_assert( ! ( Arg1_is_space && Arg2_is_space )
+               , "Future cannot be given two spaces" );
+
+  static_assert( ! ( Arg1_is_value && Arg2_is_value )
+               , "Future cannot be given two value types" );
+  // Value type: whichever argument is neither a space nor void; otherwise void.
+  using ValueType =
+    typename std::conditional< Arg1_is_value , Arg1 ,
+    typename std::conditional< Arg2_is_value , Arg2 , void
+    >::type >::type ;
+  // Space: whichever argument is a space; otherwise void.
+  using Space =
+    typename std::conditional< Arg1_is_space , Arg1 ,
+    typename std::conditional< Arg2_is_space , Arg2 , void
+    >::type >::type ;
+
+  using task_base  = Impl::TaskBase< Space , ValueType , void > ;
+  using queue_type = Impl::TaskQueue< Space > ;
+
+  task_base * m_task ;  // held task; 0 means the future is null
+  // Private constructor from a raw task pointer (reachable only through the friends above).
+  KOKKOS_INLINE_FUNCTION explicit
+  Future( task_base * task ) : m_task(0)
+    { if ( task ) queue_type::assign( & m_task , task ); }
+
+  //----------------------------------------
+
+public:
+
+  using execution_space = typename Space::execution_space ;
+  using value_type      = ValueType ;
+
+  //----------------------------------------
+
+  KOKKOS_INLINE_FUNCTION
+  bool is_null() const { return 0 == m_task ; }
+  // Reference count reported by the held task; 0 for a null future.
+  KOKKOS_INLINE_FUNCTION
+  int reference_count() const
+    { return 0 != m_task ? m_task->reference_count() : 0 ; }
+
+  //----------------------------------------
+  // Releases the held task, if any, by assigning null through queue_type::assign.
+  KOKKOS_INLINE_FUNCTION
+  ~Future() { if ( m_task ) queue_type::assign( & m_task , (task_base*)0 ); }
+
+  //----------------------------------------
+
+  KOKKOS_INLINE_FUNCTION
+  constexpr Future() noexcept : m_task(0) {}
+  // Move: takes over rhs's pointer and nulls rhs; queue_type::assign is not called.
+  KOKKOS_INLINE_FUNCTION
+  Future( Future && rhs )
+    : m_task( rhs.m_task ) { rhs.m_task = 0 ; }
+  // Copy: a non-null rhs task is acquired through queue_type::assign.
+  KOKKOS_INLINE_FUNCTION
+  Future( const Future & rhs )
+    : m_task(0)
+    { if ( rhs.m_task ) queue_type::assign( & m_task , rhs.m_task ); }
+  // Move-assign: releases the current task first, then takes over rhs's pointer.
+  KOKKOS_INLINE_FUNCTION
+  Future & operator = ( Future && rhs )
+    {
+      if ( m_task ) queue_type::assign( & m_task , (task_base*)0 );
+      m_task = rhs.m_task ;
+      rhs.m_task = 0 ;
+      return *this ;
+    }
+  // Copy-assign: nothing is done when both sides are null.
+  KOKKOS_INLINE_FUNCTION
+  Future & operator = ( const Future & rhs )
+    {
+      if ( m_task || rhs.m_task ) queue_type::assign( & m_task , rhs.m_task );
+      return *this ;
+    }
+
+  //----------------------------------------
+  // Conversions from other Future instantiations; the static_asserts require this Space / value_type to be void or identical.
+  template< class A1 , class A2 >
+  KOKKOS_INLINE_FUNCTION
+  Future( Future<A1,A2> && rhs )
+    : m_task( rhs.m_task )
+    {
+      static_assert
+        ( std::is_same< Space , void >::value ||
+          std::is_same< Space , typename Future<A1,A2>::Space >::value
+        , "Assigned Futures must have the same space" );
+
+      static_assert
+        ( std::is_same< value_type , void >::value ||
+          std::is_same< value_type , typename Future<A1,A2>::value_type >::value
+        , "Assigned Futures must have the same value_type" );
+
+      rhs.m_task = 0 ;
+    }
+
+  template< class A1 , class A2 >
+  KOKKOS_INLINE_FUNCTION
+  Future( const Future<A1,A2> & rhs )
+    : m_task(0)
+    {
+      static_assert
+        ( std::is_same< Space , void >::value ||
+          std::is_same< Space , typename Future<A1,A2>::Space >::value
+        , "Assigned Futures must have the same space" );
+
+      static_assert
+        ( std::is_same< value_type , void >::value ||
+          std::is_same< value_type , typename Future<A1,A2>::value_type >::value
+        , "Assigned Futures must have the same value_type" );
+
+      if ( rhs.m_task ) queue_type::assign( & m_task , rhs.m_task );
+    }
+
+  template< class A1 , class A2 >
+  KOKKOS_INLINE_FUNCTION
+  Future & operator = ( const Future<A1,A2> & rhs )
+    {
+      static_assert
+        ( std::is_same< Space , void >::value ||
+          std::is_same< Space , typename Future<A1,A2>::Space >::value
+        , "Assigned Futures must have the same space" );
+
+      static_assert
+        ( std::is_same< value_type , void >::value ||
+          std::is_same< value_type , typename Future<A1,A2>::value_type >::value
+        , "Assigned Futures must have the same value_type" );
+
+      if ( m_task || rhs.m_task ) queue_type::assign( & m_task , rhs.m_task );
+      return *this ;
+    }
+
+  template< class A1 , class A2 >
+  KOKKOS_INLINE_FUNCTION
+  Future & operator = ( Future<A1,A2> && rhs )
+    {
+      static_assert
+        ( std::is_same< Space , void >::value ||
+          std::is_same< Space , typename Future<A1,A2>::Space >::value
+        , "Assigned Futures must have the same space" );
+
+      static_assert
+        ( std::is_same< value_type , void >::value ||
+          std::is_same< value_type , typename Future<A1,A2>::value_type >::value
+        , "Assigned Futures must have the same value_type" );
+
+      if ( m_task ) queue_type::assign( & m_task , (task_base*) 0 );
+      m_task = rhs.m_task ;
+      rhs.m_task = 0 ;
+      return *this ;
+    }
+
+  //----------------------------------------
+  // Returns m_task->get(); calls Kokkos::abort when the future is null.
+  KOKKOS_INLINE_FUNCTION
+  typename task_base::get_return_type
+  get() const
+    {
+      if ( 0 == m_task ) {
+        Kokkos::abort( "Kokkos:::Future::get ERROR: is_null()");
+      }
+      return m_task->get();
+    }
+};
+
+} // namespace Kokkos
+
+//----------------------------------------------------------------------------
+//----------------------------------------------------------------------------
+
+namespace Kokkos {
+
+template< typename ExecSpace >
+class TaskPolicy
+{
+private:
+
+  using track_type = Kokkos::Experimental::Impl::SharedAllocationTracker ;
+  using queue_type = Kokkos::Impl::TaskQueue< ExecSpace > ;
+  using task_base  = Impl::TaskBase< ExecSpace , void , void > ;
+
+  track_type   m_track ;
+  queue_type * m_queue ;
+
+  //----------------------------------------
+  // Process optional arguments to spawn and respawn functions
+
+  KOKKOS_INLINE_FUNCTION  // recursion terminator: no options left to process
+  static void assign( task_base * const /* task */ ) {}
+
+  // Option handler for TaskTeam / TaskSingle: stores the task type on the
+  // task, then recurses on the remaining options.
+  template< typename ... Options >
+  KOKKOS_INLINE_FUNCTION
+  static void assign( task_base * const task , TaskType const & arg , Options const & ... opts )
+  {
+    task->m_task_type = arg ;
+    // Overload resolution on the next option's type selects the next handler.
+    assign( task , opts ... );
+  }
+
+  // Option handler for TaskHighPriority / TaskRegularPriority / TaskLowPriority:
+  // stores the priority on the task, then recurses on the remaining options.
+  template< typename ... Options >
+  KOKKOS_INLINE_FUNCTION
+  static void assign( task_base * const task , TaskPriority const & arg , Options const & ... opts )
+  {
+    task->m_priority = arg ;
+    // Overload resolution on the next option's type selects the next handler.
+    assign( task , opts ... );
+  }
+
+  // Option handler for a Future: records arg's task as the dependence of 'task'.
+  template< typename A1 , typename A2 , typename ... Options >
+  KOKKOS_INLINE_FUNCTION static
+  void assign( task_base * const task
+             , Future< A1 , A2 > const & arg 
+             , Options const & ... opts )
+    {
+      // Assign dependence to task->m_next
+      // which will be processed within subsequent call to schedule.
+      // Error if the dependence is reset.
+      // atomic_exchange hands back the previous m_next; non-zero means a dependence was already set.
+      if ( 0 != Kokkos::atomic_exchange(& task->m_next, arg.m_task) ) {
+        Kokkos::abort("TaskPolicy ERROR: resetting task dependence");
+      }
+
+      if ( 0 != arg.m_task ) {
+        // The future may be destroyed upon returning from this call
+        // so increment reference count to track this assignment.
+        Kokkos::atomic_fetch_add( &(arg.m_task->m_ref_count) , 1 );
+      }
+      // Continue with the remaining options.
+      assign( task , opts ... );
+    }
+
+  //----------------------------------------
+
+public:
+
+  using execution_policy = TaskPolicy ;
+  using execution_space  = ExecSpace ;
+  using memory_space     = typename queue_type::memory_space ;
+  using member_type      = Kokkos::Impl::TaskExec< ExecSpace > ;
+  // Default construction leaves the policy without a queue (m_queue == 0).
+  KOKKOS_INLINE_FUNCTION
+  TaskPolicy() : m_track(), m_queue(0) {}
+  // Copy and move use the compiler-generated member-wise operations on m_track and m_queue.
+  KOKKOS_INLINE_FUNCTION
+  TaskPolicy( TaskPolicy && rhs ) = default ;
+
+  KOKKOS_INLINE_FUNCTION
+  TaskPolicy( TaskPolicy const & rhs ) = default ;
+
+  KOKKOS_INLINE_FUNCTION
+  TaskPolicy & operator = ( TaskPolicy && rhs ) = default ;
+
+  KOKKOS_INLINE_FUNCTION
+  TaskPolicy & operator = ( TaskPolicy const & rhs ) = default ;
+
+  TaskPolicy( memory_space const & arg_memory_space  // builds the task queue inside a tracked allocation
+            , unsigned const arg_memory_pool_capacity
+            , unsigned const arg_memory_pool_log2_superblock = 12 )
+    : m_track()
+    , m_queue(0)
+    {
+      typedef Kokkos::Experimental::Impl::SharedAllocationRecord
+        < memory_space , typename queue_type::Destroy >
+          record_type ;
+      // Allocate a record labelled "TaskQueue" that is large enough for one queue_type.
+      record_type * record =
+        record_type::allocate( arg_memory_space
+                             , "TaskQueue"
+                             , sizeof(queue_type)
+                             );
+      // Construct the queue in place inside the record's data.
+      m_queue = new( record->data() )
+        queue_type( arg_memory_space
+                  , arg_memory_pool_capacity
+                  , arg_memory_pool_log2_superblock );
+      // Give the record's Destroy member the queue pointer (presumably used when the record is released -- confirm).
+      record->m_destroy.m_queue = m_queue ;
+      // m_track takes over the freshly allocated record.
+      m_track.assign_allocated_record_to_uninitialized( record );
+    }
+
+  //----------------------------------------
+  /**\brief  Allocation size for a spawned task.
+   *
+   *  Forwards sizeof( TaskBase< execution_space , value_type , FunctorType > )
+   *  to the queue's allocate_block_size(). */
+  template< typename FunctorType >
+  KOKKOS_FUNCTION
+  size_t spawn_allocation_size() const
+  {
+    using result_type = typename FunctorType::value_type ;
+    return m_queue->allocate_block_size( sizeof( Impl::TaskBase< execution_space , result_type , FunctorType > ) );
+  }
+
+  /**\brief  Allocation size for a when_all aggregate of narg futures,
+   *         requested as sizeof(task_base) plus narg task_base pointers. */
+  KOKKOS_FUNCTION
+  size_t when_all_allocation_size( int narg ) const
+  {
+    const size_t requested = sizeof(task_base) + narg * sizeof(task_base*) ;
+    return m_queue->allocate_block_size( requested );
+  }
+
+  //----------------------------------------
+
+  /**\brief  A task spawns a task with options
+   *         (returns a null Future when the queue cannot allocate the task)
+   *  1) High, Regular, or Low priority (TaskPriority)
+   *  2) With or without dependence (a Future)
+   *  3) TaskTeam or TaskSingle (TaskType)
+   */
+  template< typename FunctorType , typename ... Options >
+  KOKKOS_FUNCTION
+  Future< typename FunctorType::value_type , ExecSpace >
+  task_spawn( FunctorType const & arg_functor 
+            , Options const & ... arg_options
+            ) const
+    {
+      using value_type  = typename FunctorType::value_type ;
+      using future_type = Future< value_type , execution_space > ;
+      using task_type   = Impl::TaskBase< execution_space
+                                        , value_type
+                                        , FunctorType > ;
+
+      //----------------------------------------
+      // Give single-thread back-ends an opportunity to clear
+      // queue of ready tasks before allocating a new task
+      // NOTE(review): m_queue is dereferenced unchecked; a default-constructed TaskPolicy has m_queue == 0.
+      m_queue->iff_single_thread_recursive_execute();
+
+      //----------------------------------------
+
+      future_type f ;
+
+      // Allocate task from memory pool
+      f.m_task =
+        reinterpret_cast< task_type * >(m_queue->allocate(sizeof(task_type)));
+
+      if ( f.m_task ) {
+
+        // Placement new construction
+        new ( f.m_task ) task_type( arg_functor );
+
+        // Reference count starts at two
+        // +1 for matching decrement when task is complete
+        // +1 for future
+        f.m_task->m_queue      = m_queue ;
+        f.m_task->m_ref_count  = 2 ;
+        f.m_task->m_alloc_size = sizeof(task_type);
+        // Apply the optional TaskType / TaskPriority / Future arguments.
+        assign( f.m_task , arg_options... );
+
+        // Spawning from within the execution space so the
+        // apply function pointer is guaranteed to be valid
+        f.m_task->m_apply = task_type::apply ;
+
+        m_queue->schedule( f.m_task );
+        // this task may be updated or executed at any moment
+      }
+      // f is still null here when the allocation above returned 0.
+      return f ;
+    }
+
+  /**\brief  The host process spawns a task with options
+   *         (returns a null Future when the queue cannot allocate the task)
+   *  1) High, Regular, or Low priority (TaskPriority)
+   *  2) With or without dependence (a Future)
+   *  3) TaskTeam or TaskSingle (TaskType)
+   */
+  template< typename FunctorType , typename ... Options >
+  inline
+  Future< typename FunctorType::value_type , ExecSpace >
+  host_spawn( FunctorType const & arg_functor 
+            , Options const & ... arg_options
+            ) const
+    {
+      using value_type  = typename FunctorType::value_type ;
+      using future_type = Future< value_type , execution_space > ;
+      using task_type   = Impl::TaskBase< execution_space
+                                        , value_type
+                                        , FunctorType > ;
+
+      future_type f ;
+      // NOTE(review): m_queue is dereferenced unchecked; a default-constructed TaskPolicy has m_queue == 0.
+      // Allocate task from memory pool
+      f.m_task = 
+        reinterpret_cast<task_type*>( m_queue->allocate(sizeof(task_type)) );
+
+      if ( f.m_task ) {
+
+        // Placement new construction
+        new( f.m_task ) task_type( arg_functor );
+
+        // Reference count starts at two:
+        // +1 to match decrement when task completes
+        // +1 for the future
+        f.m_task->m_queue      = m_queue ;
+        f.m_task->m_ref_count  = 2 ;
+        f.m_task->m_alloc_size = sizeof(task_type);
+        // Apply the optional TaskType / TaskPriority / Future arguments.
+        assign( f.m_task , arg_options... );
+
+        // Potentially spawning outside execution space so the
+        // apply function pointer must be obtained from execution space.
+        // Required for Cuda execution space function pointer.
+        queue_type::specialization::template
+          proc_set_apply< FunctorType >( & f.m_task->m_apply );
+
+        m_queue->schedule( f.m_task );
+      }
+      return f ;
+    }
+
+  /**\brief  Return a future that is complete
+   *         when all input futures are complete.
+   */
+  template< typename A1 , typename A2 >
+  KOKKOS_FUNCTION
+  Future< ExecSpace >
+  when_all( int narg , Future< A1 , A2 > const * const arg ) const
+    {
+      static_assert
+        ( std::is_same< execution_space
+                      , typename Future< A1 , A2 >::execution_space
+                      >::value
+        , "Future must have same execution space" );
+      // arg[0] .. arg[narg-1] are the input futures read by the loop below.
+      using future_type = Future< ExecSpace > ;
+      using task_base   = Kokkos::Impl::TaskBase< ExecSpace , void , void > ;
+
+      future_type f ;
+      // The allocation is sized for the task itself plus narg dependence pointers.
+      size_t const size  = sizeof(task_base) + narg * sizeof(task_base*);
+
+      f.m_task =
+        reinterpret_cast< task_base * >( m_queue->allocate( size ) );
+      // If allocation fails nothing is scheduled and f is returned with a null task.
+      if ( f.m_task ) {
+
+        new( f.m_task ) task_base();
+
+        // Reference count starts at two:
+        // +1 to match decrement when task completes
+        // +1 for the future
+        f.m_task->m_queue      = m_queue ;
+        f.m_task->m_ref_count  = 2 ;
+        f.m_task->m_alloc_size = size ;
+        f.m_task->m_dep_count  = narg ;
+        f.m_task->m_task_type  = task_base::Aggregate ;
+        // Pointer to the dependence slots; dep[0] .. dep[narg-1] are written below.
+        task_base ** const dep = f.m_task->aggregate_dependences();
+
+        // Assign dependences to increment their reference count
+        // The futures may be destroyed upon returning from this call
+        // so increment reference count to track this assignment.
+        // A null input future is stored as a null slot; no count is incremented for it.
+        for ( int i = 0 ; i < narg ; ++i ) {
+          task_base * const t = dep[i] = arg[i].m_task ;
+          if ( 0 != t ) {
+            Kokkos::atomic_fetch_add( &(t->m_ref_count) , 1 );
+          }
+        }
+
+        m_queue->schedule( f.m_task );
+        // this when_all may be processed at any moment
+      }
+
+      return f ;
+    }
+
+  /**\brief  An executing task respawns itself with options
+   *
+   *  1) High, Normal, or Low priority
+   *  2) With or without dependence
+   */
+  template< class FunctorType , typename ... Options >
+  KOKKOS_FUNCTION
+  void respawn( FunctorType * task_self
+              , Options const & ... arg_options ) const
+    {
+      using value_type  = typename FunctorType::value_type ;
+      using task_type   = Impl::TaskBase< execution_space
+                                        , value_type
+                                        , FunctorType > ;
+      // task_self is cast to its task below (presumably task_type derives from FunctorType).
+      task_base * const zero = (task_base *) 0 ;
+      task_base * const lock = (task_base *) task_base::LockTag ;
+      task_type * const task = static_cast< task_type * >( task_self );
+
+      // Precondition:
+      //   task is in Executing state
+      //   therefore  m_next == LockTag
+      //
+      // Change to m_next == 0 for no dependence
+      // The exchange doubles as the check: any previous value other than LockTag aborts.
+      if ( lock != Kokkos::atomic_exchange( & task->m_next, zero ) ) {
+        Kokkos::abort("TaskPolicy::respawn ERROR: already respawned");
+      }
+      // Apply the respawn options (priority, dependence) to the task.
+      assign( task , arg_options... );
+
+      // Postcondition:
+      //   task is in Executing-Respawn state
+      //   therefore  m_next == dependence or 0
+    }
+
+  //----------------------------------------
+
+  template< typename S >
+  friend  // lets the free function Kokkos::wait() use this policy's m_queue
+  void Kokkos::wait( Kokkos::TaskPolicy< S > const & );
+
+  //----------------------------------------
+
+  inline  // host-only (not KOKKOS_INLINE_FUNCTION): get_mem_size() of the queue's m_memory, as int
+  int allocation_capacity() const noexcept
+    { return m_queue->m_memory.get_mem_size(); }
+  // Current value of the queue's m_count_alloc counter.
+  KOKKOS_INLINE_FUNCTION
+  int allocated_task_count() const noexcept
+    { return m_queue->m_count_alloc ; }
+  // Value of the queue's m_max_alloc counter (presumably the high-water mark -- confirm).
+  KOKKOS_INLINE_FUNCTION
+  int allocated_task_count_max() const noexcept
+    { return m_queue->m_max_alloc ; }
+  // Value of the queue's m_accum_alloc counter (presumably a running total -- confirm).
+  KOKKOS_INLINE_FUNCTION
+  long allocated_task_count_accum() const noexcept
+    { return m_queue->m_accum_alloc ; }
+
+};
+
+/* Calls execute() on the task queue held by 'policy'. */
+template< typename ExecSpace > inline
+void wait( TaskPolicy< ExecSpace > const & policy )
+{ policy.m_queue->execute(); }
+
+} // namespace Kokkos
+
+//----------------------------------------------------------------------------
+//----------------------------------------------------------------------------
+//----------------------------------------------------------------------------
+//----------------------------------------------------------------------------
+
+namespace Kokkos {
+namespace Experimental {
+namespace Impl {
+
+struct FutureValueTypeIsVoidError {};  // empty tag type; its use is not visible in this part of the header
+// Forward declaration only: the Future classes below befriend TaskMember and build their task types from it.
+template < class ExecSpace , class ResultType , class FunctorType >
+class TaskMember ;
+
+} /* namespace Impl */
+} /* namespace Experimental */
+} /* namespace Kokkos */
+
+//----------------------------------------------------------------------------
+
+namespace Kokkos {
+namespace Experimental {
+
+/**\brief  Life-cycle states reported for a task */
+enum TaskState {
+  TASK_STATE_NULL         = 0 ,  ///<  Does not exist
+  TASK_STATE_CONSTRUCTING = 1 ,  ///<  Is under construction
+  TASK_STATE_WAITING      = 2 ,  ///<  Is waiting for execution
+  TASK_STATE_EXECUTING    = 4 ,  ///<  Is executing
+  TASK_STATE_COMPLETE     = 8    ///<  Execution is complete
+};
+
+/**\brief  Tag for Future<Latch,Space>: selects the Future< Latch , Arg2 >
+ *         partial specialization declared later in this header. */
+struct Latch {};
+
+/** \brief  Reference to a task in an execution space; accepted forms:
+ *
+ *  Future< space >          // value_type == void
+ *  Future< value >          // space == Kokkos::DefaultExecutionSpace
+ *  Future< value , space >
+ *
+ *  Other argument combinations are routed to Impl::StaticAssert (see OptOK). */
+template< class Arg1 = void , class Arg2 = void >
+class Future {
+private:
+
+  template< class , class , class > friend class Impl::TaskMember ;
+  template< class > friend class TaskPolicy ;
+  template< class , class > friend class Future ;
+
+  // Argument #2, if not void, must be the space.
+  enum { Arg1_is_space  = Kokkos::Impl::is_execution_space< Arg1 >::value };
+  enum { Arg2_is_space  = Kokkos::Impl::is_execution_space< Arg2 >::value };
+  enum { Arg2_is_void   = std::is_same< Arg2 , void >::value };
+
+  struct ErrorNoExecutionSpace {};
+  // Opt1..Opt3 are the three accepted forms; OptOK ties "one of them holds" to Impl::StaticAssert.
+  enum { Opt1  =   Arg1_is_space && Arg2_is_void
+       , Opt2  = ! Arg1_is_space && Arg2_is_void
+       , Opt3  = ! Arg1_is_space && Arg2_is_space
+       , OptOK = Kokkos::Impl::StaticAssert< Opt1 || Opt2 || Opt3 , ErrorNoExecutionSpace >::value
+       };
+  // Value type: Arg1, except void when Arg1 is the execution space (Opt1).
+  typedef typename
+    Kokkos::Impl::if_c< Opt2 || Opt3 , Arg1 , void >::type
+      ValueType ;
+  // Execution space: Arg1 (Opt1), Kokkos::DefaultExecutionSpace (Opt2) or Arg2 (Opt3).
+  typedef typename
+    Kokkos::Impl::if_c< Opt1 , Arg1 , typename
+    Kokkos::Impl::if_c< Opt2 , Kokkos::DefaultExecutionSpace , typename
+    Kokkos::Impl::if_c< Opt3 , Arg2 , void
+    >::type >::type >::type
+      ExecutionSpace ;
+  // TaskRoot: TaskMember with void result; TaskValue: TaskMember with ValueType result.
+  typedef Impl::TaskMember< ExecutionSpace , void , void >       TaskRoot ;
+  typedef Impl::TaskMember< ExecutionSpace , ValueType , void >  TaskValue ;
+  // Referenced task, 0 when null; after construction it is only changed through TaskRoot::assign.
+  TaskRoot * m_task ;
+  // Private: wrap a raw task pointer after passing it through verify_type< ValueType >.
+  KOKKOS_INLINE_FUNCTION explicit
+  Future( TaskRoot * task )
+    : m_task(0)
+    { TaskRoot::assign( & m_task , TaskRoot::template verify_type< ValueType >( task ) ); }
+
+  //----------------------------------------
+
+public:
+
+  typedef ValueType       value_type;
+  typedef ExecutionSpace  execution_space ;
+
+  //----------------------------------------
+  // State of the referenced task, or TASK_STATE_NULL for a null future.
+  KOKKOS_INLINE_FUNCTION
+  TaskState get_task_state() const
+    { return 0 != m_task ? m_task->get_state() : TASK_STATE_NULL ; }
+  // True when no task is referenced.
+  KOKKOS_INLINE_FUNCTION
+  bool is_null() const { return 0 == m_task ; }
+  // Reference count reported by the task, or 0 for a null future.
+  KOKKOS_INLINE_FUNCTION
+  int reference_count() const
+    { return 0 != m_task ? m_task->reference_count() : 0 ; }
+
+  //----------------------------------------
+  // Destruction assigns 0 through TaskRoot::assign (presumably releasing this reference -- confirm).
+  KOKKOS_INLINE_FUNCTION
+  ~Future() { TaskRoot::assign( & m_task , 0 ); }
+
+  //----------------------------------------
+  // Default: a null future.
+  KOKKOS_INLINE_FUNCTION
+  Future() : m_task(0) {}
+  // Copy: start null, then take rhs's task through TaskRoot::assign.
+  KOKKOS_INLINE_FUNCTION
+  Future( const Future & rhs )
+    : m_task(0)
+    { TaskRoot::assign( & m_task , rhs.m_task ); }
+  // Copy assignment: take rhs's task through TaskRoot::assign.
+  KOKKOS_INLINE_FUNCTION
+  Future & operator = ( const Future & rhs )
+    { TaskRoot::assign( & m_task , rhs.m_task ); return *this ; }
+
+  //----------------------------------------
+  // Converting copy from another Future<A1,A2>; rhs's task is passed through verify_type< value_type >.
+  template< class A1 , class A2 >
+  KOKKOS_INLINE_FUNCTION
+  Future( const Future<A1,A2> & rhs )
+    : m_task(0)
+    { TaskRoot::assign( & m_task , TaskRoot::template verify_type< value_type >( rhs.m_task ) ); }
+  // Converting assignment; same verify_type< value_type > step as the converting copy.
+  template< class A1 , class A2 >
+  KOKKOS_INLINE_FUNCTION
+  Future & operator = ( const Future<A1,A2> & rhs )
+    { TaskRoot::assign( & m_task , TaskRoot::template verify_type< value_type >( rhs.m_task ) ); return *this ; }
+
+  //----------------------------------------
+  // Result type of get(), as declared by TaskValue.
+  typedef typename TaskValue::get_result_type get_result_type ;
+  // Aborts on a null future; otherwise returns the result of TaskValue::get() on the task.
+  KOKKOS_INLINE_FUNCTION
+  get_result_type get() const
+    {
+      if ( 0 == m_task ) {
+        Kokkos::abort( "Kokkos::Experimental::Future::get ERROR: is_null()");
+      }
+      return static_cast<TaskValue*>( m_task )->get();  
+    }
+
+  //----------------------------------------
+};
+
+template< class Arg2 >
+class Future< Latch , Arg2 > {
+private:
+  // Latch form of Future: value_type is void and add() forwards to the task's latch_add().
+  template< class , class , class > friend class Impl::TaskMember ;
+  template< class > friend class TaskPolicy ;
+  template< class , class > friend class Future ;
+
+  // Argument #2, if not void, must be the space.
+  enum { Arg2_is_space  = Kokkos::Impl::is_execution_space< Arg2 >::value };
+  enum { Arg2_is_void   = std::is_same< Arg2 , void >::value };
+
+  static_assert( Arg2_is_space || Arg2_is_void 
+               , "Future template argument #2 must be a space" );
+  // Execution space: Arg2 when it is a space, otherwise Kokkos::DefaultExecutionSpace.
+  typedef typename
+    std::conditional< Arg2_is_space , Arg2 , Kokkos::DefaultExecutionSpace >
+     ::type ExecutionSpace ;
+
+  typedef Impl::TaskMember< ExecutionSpace , void , void >  TaskRoot ;
+  // Referenced task, 0 when null; after construction it is only changed through TaskRoot::assign.
+  TaskRoot * m_task ;
+  // Private: wrap a raw task pointer (no verify_type step in this specialization).
+  KOKKOS_INLINE_FUNCTION explicit
+  Future( TaskRoot * task )
+    : m_task(0)
+    { TaskRoot::assign( & m_task , task ); }
+
+  //----------------------------------------
+
+public:
+
+  typedef void            value_type;
+  typedef ExecutionSpace  execution_space ;
+
+  //----------------------------------------
+  // Forward k to the task's latch_add(); does nothing on a null future.
+  KOKKOS_INLINE_FUNCTION
+  void add( const int k ) const
+    { if ( 0 != m_task ) m_task->latch_add(k); }
+
+  //----------------------------------------
+  // State of the referenced task, or TASK_STATE_NULL for a null future.
+  KOKKOS_INLINE_FUNCTION
+  TaskState get_task_state() const
+    { return 0 != m_task ? m_task->get_state() : TASK_STATE_NULL ; }
+  // True when no task is referenced.
+  KOKKOS_INLINE_FUNCTION
+  bool is_null() const { return 0 == m_task ; }
+
+  //----------------------------------------
+  // Destruction assigns 0 through TaskRoot::assign (presumably releasing this reference -- confirm).
+  KOKKOS_INLINE_FUNCTION
+  ~Future() { TaskRoot::assign( & m_task , 0 ); }
+
+  //----------------------------------------
+  // Default: a null future.
+  KOKKOS_INLINE_FUNCTION
+  Future() : m_task(0) {}
+  // Copy: start null, then take rhs's task through TaskRoot::assign.
+  KOKKOS_INLINE_FUNCTION
+  Future( const Future & rhs )
+    : m_task(0)
+    { TaskRoot::assign( & m_task , rhs.m_task ); }
+  // Copy assignment: take rhs's task through TaskRoot::assign.
+  KOKKOS_INLINE_FUNCTION
+  Future & operator = ( const Future & rhs )
+    { TaskRoot::assign( & m_task , rhs.m_task ); return *this ; }
+
+  //----------------------------------------
+  // A latch carries no value: get() is an empty function returning void.
+  typedef void get_result_type ;
+
+  KOKKOS_INLINE_FUNCTION
+  void get() const {}
+
+  //----------------------------------------
+
+};
+
+namespace Impl {
+
+/* Trait: std::true_type only for instantiations of Kokkos::Experimental::Future. */
+template< class T > struct is_future : std::false_type {};
+
+template< class A , class B >
+struct is_future< Kokkos::Experimental::Future< A , B > >
+  : std::true_type {};
+
+} /* namespace Impl */
+} /* namespace Experimental */
+} /* namespace Kokkos */
+
+//----------------------------------------------------------------------------
+//----------------------------------------------------------------------------
+
+namespace Kokkos {
+namespace Experimental {
+
+/** \brief  Task policy for Arg0::execution_space: creates tasks, sets their dependences and spawns them */
+template< class Arg0 = Kokkos::DefaultExecutionSpace >
+class TaskPolicy {
+public:
+  // Execution space taken from the template argument.
+  typedef typename Arg0::execution_space  execution_space ;
+
+  //----------------------------------------
+  // NOTE(review): only declared here; argument meanings are inferred from their names -- confirm.
+  TaskPolicy
+    ( const unsigned arg_task_max_count
+    , const unsigned arg_task_max_size
+    , const unsigned arg_task_default_dependence_capacity = 4
+    , const unsigned arg_task_team_size = 0 /* choose default */
+    );
+  // Default construction, copy and move are compiler-generated.
+  TaskPolicy() = default ;
+  TaskPolicy( TaskPolicy && rhs ) = default ;
+  TaskPolicy( const TaskPolicy & rhs ) = default ;
+  TaskPolicy & operator = ( TaskPolicy && rhs ) = default ;
+  TaskPolicy & operator = ( const TaskPolicy & rhs ) = default ;
+
+  //----------------------------------------
+  /** \brief  Create a serial task with storage for dependences.
+   *
+   *  Postcondition: Task is in the 'constructing' state.
+   */
+  template< class FunctorType >
+  Future< typename FunctorType::value_type , execution_space >
+  create( const FunctorType & functor
+        , const unsigned      dependence_capacity /* = default */ );
+  /** \brief  Team variant of create(); same parameters (semantics not shown in this declaration). */
+  template< class FunctorType >
+  KOKKOS_INLINE_FUNCTION
+  Future< typename FunctorType::value_type , execution_space >
+  create_team( const FunctorType & functor
+             , const unsigned dependence_capacity /* = default */ );
+
+  /** \brief  Set dependence that 'after' cannot start execution
+   *          until 'before' has completed.
+   *
+   *  Precondition: The 'after' task must be in the 'Constructing' state.
+   */
+  template< class TA , class TB >
+  void add_dependence( const Future<TA,execution_space> & after
+                     , const Future<TB,execution_space> & before ) const ;
+
+  /** \brief  Spawn a task in the 'Constructing' state
+   *
+   *  Precondition:  Task is in the 'constructing' state.
+   *  Postcondition: Task is waiting, executing, or complete.
+   */
+  template< class T >
+  const Future<T,execution_space> &
+  spawn( const Future<T,execution_space> & ) const ;
+
+  //----------------------------------------
+  /** \brief  Query dependence of an executing task */
+
+  template< class FunctorType >
+  Future< execution_space >
+  get_dependence( FunctorType * , const int ) const ;
+
+  //----------------------------------------
+  /** \brief  Clear current dependences of an executing task
+   *          in preparation for setting new dependences and
+   *          respawning.
+   *
+   * Precondition: The functor must be a task in the executing state.
+   */
+  template< class FunctorType >
+  void clear_dependence( FunctorType * ) const ;
+
+  /** \brief  Set dependence that 'after' cannot resume execution
+   *          until 'before' has completed.
+   *
+   *  The 'after' functor must be in the executing state
+   */
+  template< class FunctorType , class TB >
+  void add_dependence( FunctorType * after
+                     , const Future<TB,execution_space> & before ) const ;
+
+  /** \brief  Respawn (reschedule) an executing task to be called again
+   *          after all dependences have completed.
+   */
+  template< class FunctorType >
+  void respawn( FunctorType * ) const ;
+};
+
+//----------------------------------------------------------------------------
+/** \brief  Build a single-thread task from \a functor and spawn it at once */
+template< class ExecSpace , class FunctorType >
+inline Future< typename FunctorType::value_type , ExecSpace >
+spawn( TaskPolicy<ExecSpace> & policy , const FunctorType & functor )
+{ // NOTE(review): needs a create() callable with one argument; the generic TaskPolicy declares two.
+  return policy.spawn( policy.create( functor ) ); }
+
+/** \brief  Create a single-thread task that depends on two futures, then spawn it */
+template< class ExecSpace , class FunctorType , class Arg0 , class Arg1 >
+inline Future< typename FunctorType::value_type , ExecSpace >
+spawn( TaskPolicy<ExecSpace>   & policy
+     , const FunctorType       & functor
+     , const Future<Arg0,Arg1> & before_0
+     , const Future<Arg0,Arg1> & before_1 )
+{
+  typedef Future< typename FunctorType::value_type , ExecSpace > future_type ;
+  future_type task ;
+  task = policy.create( functor , 2 );  // dependence capacity of two
+  policy.add_dependence( task , before_0 );
+  policy.add_dependence( task , before_1 );
+  policy.spawn( task );
+  return task ;
+}
+
+//----------------------------------------------------------------------------
+/** \brief  Create a parallel_for task over \a parallel_policy and spawn it */
+template< class ExecSpace , class ParallelPolicyType , class FunctorType >
+inline Future< typename FunctorType::value_type , ExecSpace >
+spawn_foreach( TaskPolicy<ExecSpace>     & task_policy
+             , const ParallelPolicyType  & parallel_policy
+             , const FunctorType         & functor )
+{ return task_policy.spawn(
+    task_policy.create_foreach( parallel_policy , functor ) ); }
+
+/** \brief  Create a parallel_reduce task over \a parallel_policy and spawn it */
+template< class ExecSpace , class ParallelPolicyType , class FunctorType >
+inline Future< typename FunctorType::value_type , ExecSpace >
+spawn_reduce( TaskPolicy<ExecSpace>     & task_policy
+            , const ParallelPolicyType  & parallel_policy
+            , const FunctorType         & functor )
+{ return task_policy.spawn(
+    task_policy.create_reduce( parallel_policy , functor ) ); }
+
+//----------------------------------------------------------------------------
+/** \brief  Replace a task functor's dependences with two futures and respawn it */
+template< class ExecSpace , class FunctorType , class Arg0 , class Arg1 >
+inline void
+respawn( TaskPolicy<ExecSpace>   & policy
+       , FunctorType *             functor
+       , const Future<Arg0,Arg1> & before_0
+       , const Future<Arg0,Arg1> & before_1 )
+{
+  // Clear the current dependences, add exactly these two, then respawn.
+  policy.clear_dependence( functor );
+  policy.add_dependence( functor , before_0 );
+  policy.add_dependence( functor , before_1 );
+  policy.respawn( functor );
+}
+
+//----------------------------------------------------------------------------
+
+template< class ExecSpace >  // declaration only; no definition follows in this header
+void wait( TaskPolicy< ExecSpace > & );
+
+} /* namespace Experimental */
+} /* namespace Kokkos */
+
+//----------------------------------------------------------------------------
+//----------------------------------------------------------------------------
+
+#endif /* #if defined( KOKKOS_ENABLE_TASKPOLICY ) */
+#endif /* #ifndef KOKKOS_TASKPOLICY_HPP */
+
diff --git a/lib/kokkos/core/src/Kokkos_Threads.hpp b/lib/kokkos/core/src/Kokkos_Threads.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..c9ebbf92652b5d9a2e859cf2587b8089897d3c62
--- /dev/null
+++ b/lib/kokkos/core/src/Kokkos_Threads.hpp
@@ -0,0 +1,222 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+//
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact  H. Carter Edwards (hcedwar@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#ifndef KOKKOS_THREADS_HPP
+#define KOKKOS_THREADS_HPP
+
+#include <Kokkos_Core_fwd.hpp>
+
+#if defined( KOKKOS_HAVE_PTHREAD )
+
+#include <cstddef>
+#include <iosfwd>
+#include <Kokkos_HostSpace.hpp>
+#include <Kokkos_ScratchSpace.hpp>
+#include <Kokkos_Layout.hpp>
+#include <Kokkos_MemoryTraits.hpp>
+#include <impl/Kokkos_Tags.hpp>
+
+/*--------------------------------------------------------------------------*/
+
+namespace Kokkos {
+namespace Impl {
+class ThreadsExec ;  // forward declaration; presumably defined in <Threads/Kokkos_ThreadsExec.hpp>, included below
+} // namespace Impl
+} // namespace Kokkos
+
+/*--------------------------------------------------------------------------*/
+
+namespace Kokkos {
+
+/** \brief  Execution space for a pool of Pthreads or C11 threads on a CPU. */
+class Threads {
+public:
+  //! \name Type declarations that all Kokkos devices must provide.
+  //@{
+  //! Tag this class as a kokkos execution space
+  typedef Threads                  execution_space ;
+  typedef Kokkos::HostSpace        memory_space ;
+
+  //! This execution space's preferred device_type
+  typedef Kokkos::Device<execution_space,memory_space> device_type;
+  //! Array layout (LayoutRight) and size type (from memory_space) used with this space
+  typedef Kokkos::LayoutRight      array_layout ;
+  typedef memory_space::size_type  size_type ;
+  //! Scratch memory space type parameterized on Threads
+  typedef ScratchMemorySpace< Threads >  scratch_memory_space ;
+
+
+  //@}
+  /*------------------------------------------------------------------------*/
+  //! \name Static functions that all Kokkos devices must implement.
+  //@{
+
+  /// \brief True if and only if this method is being called in a
+  ///   thread-parallel function.
+  static int in_parallel();
+
+  /** \brief  Set the device in a "sleep" state.
+   *
+   * This function sets the device in a "sleep" state in which it is
+   * not ready for work.  This may consume less resources than if the
+   * device were in an "awake" state, but it may also take time to
+   * bring the device from a sleep state to be ready for work.
+   *
+   * \return True if the device is in the "sleep" state, else false if
+   *   the device is actively working and could not enter the "sleep"
+   *   state.
+   */
+  static bool sleep();
+
+  /// \brief Wake the device from the 'sleep' state so it is ready for work.
+  ///
+  /// \return True if the device is in the "ready" state, else "false"
+  ///  if the device is actively working (which also means that it's
+  ///  awake).
+  static bool wake();
+
+  /// \brief Wait until all dispatched functors complete.
+  ///
+  /// The parallel_for or parallel_reduce dispatch of a functor may
+  /// return asynchronously, before the functor completes.  This
+  /// method does not return until all dispatched functors on this
+  /// device have completed.
+  static void fence();
+
+  /// \brief Free any resources being consumed by the device.
+  ///
+  /// For the Threads device, this terminates spawned worker threads.
+  static void finalize();
+
+  /// \brief Print configuration information to the given output stream.
+  static void print_configuration( std::ostream & , const bool detail = false );
+
+  //@}
+  /*------------------------------------------------------------------------*/
+  /*------------------------------------------------------------------------*/
+  //! \name Space-specific functions
+  //@{
+
+  /** \brief Initialize the device in the "ready to work" state.
+   *
+   *  The device is initialized in a "ready to work" or "awake" state.
+   *  This state reduces latency and thus improves performance when
+   *  dispatching work.  However, the "awake" state consumes resources
+   *  even when no work is being done.  You may call sleep() to put
+   *  the device in a "sleeping" state that does not consume as many
+   *  resources, but it will take time (latency) to awaken the device
+   *  again (via the wake() method) so that it is ready for work.
+   *
+   *  Teams of threads are distributed as evenly as possible across
+   *  the requested number of numa regions and cores per numa region.
+   *  A team will not be split across a numa region.
+   *
+   *  If the 'use_' arguments are not supplied the hwloc is queried
+   *  to use all available cores.
+   */
+  static void initialize( unsigned threads_count = 0 ,
+                          unsigned use_numa_count = 0 ,
+                          unsigned use_cores_per_numa = 0 ,
+                          bool allow_asynchronous_threadpool = false );
+  /// \brief Initialization query returning int (presumably nonzero after initialize() -- confirm).
+  static int is_initialized();
+
+  /** \brief  Return the maximum amount of concurrency.  */
+  static int concurrency();
+  /// \brief Access a Threads instance; the int argument defaults to 0 (its meaning is not shown here).
+  static Threads & instance( int = 0 );
+
+  //----------------------------------------
+  /// \brief Pool-size query taking a depth (default 0); the meaning of depth is set by the implementation.
+  static int thread_pool_size( int depth = 0 );
+#if defined( KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST )
+  static int thread_pool_rank();
+#else /* without KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST the rank is a constant 0 */
+  KOKKOS_INLINE_FUNCTION static int thread_pool_rank() { return 0 ; }
+#endif
+  // Aliases: max_hardware_threads() is thread_pool_size(0); hardware_thread_id() is thread_pool_rank().
+  inline static unsigned max_hardware_threads() { return thread_pool_size(0); }
+  KOKKOS_INLINE_FUNCTION static unsigned hardware_thread_id() { return thread_pool_rank(); }
+
+  //@}
+  //----------------------------------------
+};
+
+} // namespace Kokkos
+
+/*--------------------------------------------------------------------------*/
+
+namespace Kokkos {
+namespace Impl {
+
+template<>
+struct VerifyExecutionCanAccessMemorySpace< Kokkos::Threads::memory_space
+                                          , Kokkos::Threads::scratch_memory_space >
+{
+  // Marks the (Threads memory space, Threads scratch space) pair as accessible:
+  // 'value' is true and both verify() overloads do nothing.
+  enum { value = true };
+  static void verify() {}
+  static void verify( const void * ) {}
+};
+
+} // namespace Impl
+} // namespace Kokkos
+
+/*--------------------------------------------------------------------------*/
+
+#include <Kokkos_ExecPolicy.hpp>
+#include <Kokkos_Parallel.hpp>
+#include <Threads/Kokkos_ThreadsExec.hpp>
+#include <Threads/Kokkos_ThreadsTeam.hpp>
+#include <Threads/Kokkos_Threads_Parallel.hpp>
+
+#include <KokkosExp_MDRangePolicy.hpp>
+
+//----------------------------------------------------------------------------
+//----------------------------------------------------------------------------
+
+#endif /* #if defined( KOKKOS_HAVE_PTHREAD ) */
+#endif /* #ifndef KOKKOS_THREADS_HPP */
+
+
diff --git a/lib/kokkos/core/src/Kokkos_Vectorization.hpp b/lib/kokkos/core/src/Kokkos_Vectorization.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..a60c0ecaa7b83bd49fb187bf37ca5a84d6360744
--- /dev/null
+++ b/lib/kokkos/core/src/Kokkos_Vectorization.hpp
@@ -0,0 +1,53 @@
+/*
+//@HEADER
+// ************************************************************************
+// 
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+// 
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+// 
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact  H. Carter Edwards (hcedwar@sandia.gov)
+// 
+// ************************************************************************
+//@HEADER
+*/
+
+/// \file Kokkos_Vectorization.hpp
+/// \brief Declaration and definition of Kokkos::Vectorization interface.
+#ifndef KOKKOS_VECTORIZATION_HPP
+#define KOKKOS_VECTORIZATION_HPP
+
+#if defined( KOKKOS_HAVE_CUDA )
+#include <Cuda/Kokkos_Cuda_Vectorization.hpp>
+#endif
+
+#endif
diff --git a/lib/kokkos/core/src/Kokkos_View.hpp b/lib/kokkos/core/src/Kokkos_View.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..1cc8b0338155c8f8be724181806097a927d606d2
--- /dev/null
+++ b/lib/kokkos/core/src/Kokkos_View.hpp
@@ -0,0 +1,2384 @@
+/*
+//@HEADER
+// ************************************************************************
+// 
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+// 
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+// 
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact  H. Carter Edwards (hcedwar@sandia.gov)
+// 
+// ************************************************************************
+//@HEADER
+*/
+
+#ifndef KOKKOS_VIEW_HPP
+#define KOKKOS_VIEW_HPP
+
+#include <type_traits>
+#include <string>
+#include <algorithm>
+#include <initializer_list>
+
+#include <Kokkos_Core_fwd.hpp>
+#include <Kokkos_HostSpace.hpp>
+#include <Kokkos_MemoryTraits.hpp>
+#include <Kokkos_ExecPolicy.hpp>
+
+//----------------------------------------------------------------------------
+//----------------------------------------------------------------------------
+
+namespace Kokkos {
+namespace Experimental {
+namespace Impl {
+
+template< class DstMemorySpace , class SrcMemorySpace >
+struct DeepCopy ;
+
+template< class DataType >
+struct ViewArrayAnalysis ;
+
+template< class DataType , class ArrayLayout
+        , typename ValueType =
+          typename ViewArrayAnalysis< DataType >::non_const_value_type
+        >
+struct ViewDataAnalysis ;
+
+template< class , class ... > // primary template, selected when no specialization matches the argument list
+class ViewMapping { public: enum { is_assignable = false }; }; // by default a mapping reports itself as not assignable
+
+template< class MemorySpace >
+struct ViewOperatorBoundsErrorAbort ;
+
+template<>
+struct ViewOperatorBoundsErrorAbort< Kokkos::HostSpace > { // explicit specialization for HostSpace
+  static void apply( const size_t rank // declaration only; the definition is not part of this header section
+                   , const size_t n0 , const size_t n1 // n0..n7: presumably the view extents -- TODO confirm against the definition
+                   , const size_t n2 , const size_t n3
+                   , const size_t n4 , const size_t n5
+                   , const size_t n6 , const size_t n7
+                   , const size_t i0 , const size_t i1 // i0..i7: presumably the indices being checked -- TODO confirm against the definition
+                   , const size_t i2 , const size_t i3
+                   , const size_t i4 , const size_t i5
+                   , const size_t i6 , const size_t i7 );
+};
+
+} /* namespace Impl */
+} /* namespace Experimental */
+} /* namespace Kokkos */
+
+//----------------------------------------------------------------------------
+//----------------------------------------------------------------------------
+
+namespace Kokkos {
+namespace Experimental {
+
+/** \class ViewTraits
+ *  \brief Traits class for accessing attributes of a View.
+ *
+ * This is an implementation detail of View.  It is only of interest
+ * to developers implementing a new specialization of View.
+ *
+ * Template argument options:
+ *   - View< DataType >
+ *   - View< DataType , Space >
+ *   - View< DataType , Space , MemoryTraits >
+ *   - View< DataType , ArrayLayout >
+ *   - View< DataType , ArrayLayout , Space >
+ *   - View< DataType , ArrayLayout , MemoryTraits >
+ *   - View< DataType , ArrayLayout , Space , MemoryTraits >
+ *   - View< DataType , MemoryTraits >
+ */
+
+template< class DataType , class ... Properties >
+struct ViewTraits ;
+
+template<>
+struct ViewTraits< void > // end of the property list: nothing was specified, so every trait is reported as 'void'
+{
+  typedef void  execution_space ;
+  typedef void  memory_space ;
+  typedef void  HostMirrorSpace ;
+  typedef void  array_layout ;
+  typedef void  memory_traits ;
+};
+
+template< class ... Prop >
+struct ViewTraits< void , void , Prop ... >
+{
+  // Ignore an extraneous 'void' property: every trait is forwarded from the remaining properties
+  typedef typename ViewTraits<void,Prop...>::execution_space  execution_space ;
+  typedef typename ViewTraits<void,Prop...>::memory_space     memory_space ;
+  typedef typename ViewTraits<void,Prop...>::HostMirrorSpace  HostMirrorSpace ;
+  typedef typename ViewTraits<void,Prop...>::array_layout     array_layout ;
+  typedef typename ViewTraits<void,Prop...>::memory_traits    memory_traits ;
+};
+
+template< class ArrayLayout , class ... Prop >
+struct ViewTraits< typename std::enable_if< Kokkos::Impl::is_array_layout<ArrayLayout>::value >::type , ArrayLayout , Prop ... >
+{
+  // Leading property is an array layout: record it and take all other traits from the remaining properties
+
+  typedef typename ViewTraits<void,Prop...>::execution_space  execution_space ;
+  typedef typename ViewTraits<void,Prop...>::memory_space     memory_space ;
+  typedef typename ViewTraits<void,Prop...>::HostMirrorSpace  HostMirrorSpace ;
+  typedef          ArrayLayout                                array_layout ;
+  typedef typename ViewTraits<void,Prop...>::memory_traits    memory_traits ;
+};
+
+template< class Space , class ... Prop >
+struct ViewTraits< typename std::enable_if< Kokkos::Impl::is_space<Space>::value >::type , Space , Prop ... >
+{
+  // Leading property is a Space: the remaining properties may only contribute memory traits (enforced below).
+
+  static_assert( std::is_same< typename ViewTraits<void,Prop...>::execution_space , void >::value &&
+                 std::is_same< typename ViewTraits<void,Prop...>::memory_space    , void >::value &&
+                 std::is_same< typename ViewTraits<void,Prop...>::HostMirrorSpace , void >::value &&
+                 std::is_same< typename ViewTraits<void,Prop...>::array_layout    , void >::value
+               , "Only one View Execution or Memory Space template argument" );
+
+  typedef typename Space::execution_space                   execution_space ;
+  typedef typename Space::memory_space                      memory_space ;
+  typedef typename Kokkos::Impl::is_space< Space >::host_mirror_space
+      HostMirrorSpace ;
+  typedef typename execution_space::array_layout            array_layout ; // layout is taken from the Space's execution space
+  typedef typename ViewTraits<void,Prop...>::memory_traits  memory_traits ;
+};
+
+template< class MemoryTraits , class ... Prop >
+struct ViewTraits< typename std::enable_if< Kokkos::Impl::is_memory_traits<MemoryTraits>::value >::type , MemoryTraits , Prop ... >
+{
+  // Leading property is a memory trait: the assertion below rejects any further space, layout, or memory-traits property
+
+  static_assert( std::is_same< typename ViewTraits<void,Prop...>::execution_space , void >::value &&
+                 std::is_same< typename ViewTraits<void,Prop...>::memory_space    , void >::value &&
+                 std::is_same< typename ViewTraits<void,Prop...>::array_layout    , void >::value &&
+                 std::is_same< typename ViewTraits<void,Prop...>::memory_traits   , void >::value
+               , "MemoryTrait is the final optional template argument for a View" );
+
+  typedef void          execution_space ;
+  typedef void          memory_space ;
+  typedef void          HostMirrorSpace ;
+  typedef void          array_layout ;
+  typedef MemoryTraits  memory_traits ;
+};
+
+
+template< class DataType , class ... Properties >
+struct ViewTraits {
+private:
+
+  // Unpack the properties arguments; any trait left as 'void' receives a default below
+  typedef ViewTraits< void , Properties ... >  prop ;
+
+  typedef typename
+    std::conditional< ! std::is_same< typename prop::execution_space , void >::value
+                    , typename prop::execution_space
+                    , Kokkos::DefaultExecutionSpace
+                    >::type
+      ExecutionSpace ;
+
+  typedef typename
+    std::conditional< ! std::is_same< typename prop::memory_space , void >::value
+                    , typename prop::memory_space
+                    , typename ExecutionSpace::memory_space
+                    >::type
+      MemorySpace ;
+
+  typedef typename
+    std::conditional< ! std::is_same< typename prop::array_layout , void >::value
+                    , typename prop::array_layout
+                    , typename ExecutionSpace::array_layout
+                    >::type
+      ArrayLayout ;
+
+  typedef typename
+    std::conditional
+      < ! std::is_same< typename prop::HostMirrorSpace , void >::value
+      , typename prop::HostMirrorSpace
+      , typename Kokkos::Impl::is_space< ExecutionSpace >::host_mirror_space
+      >::type
+      HostMirrorSpace ;
+
+  typedef typename
+    std::conditional< ! std::is_same< typename prop::memory_traits , void >::value
+                    , typename prop::memory_traits
+                    , typename Kokkos::MemoryManaged
+                    >::type
+      MemoryTraits ;
+
+  // Analyze data type's properties,
+  // May be specialized based upon the layout and value type
+  typedef Kokkos::Experimental::Impl::ViewDataAnalysis< DataType , ArrayLayout > data_analysis ;
+
+public:
+
+  //------------------------------------
+  // Data type traits (as reported by data_analysis):
+
+  typedef typename data_analysis::type            data_type ;
+  typedef typename data_analysis::const_type      const_data_type ;
+  typedef typename data_analysis::non_const_type  non_const_data_type ;
+
+  //------------------------------------
+  // Compatible array of trivial type traits:
+
+  typedef typename data_analysis::scalar_array_type            scalar_array_type ;
+  typedef typename data_analysis::const_scalar_array_type      const_scalar_array_type ;
+  typedef typename data_analysis::non_const_scalar_array_type  non_const_scalar_array_type ;
+
+  //------------------------------------
+  // Value type traits:
+
+  typedef typename data_analysis::value_type            value_type ;
+  typedef typename data_analysis::const_value_type      const_value_type ;
+  typedef typename data_analysis::non_const_value_type  non_const_value_type ;
+
+  //------------------------------------
+  // Mapping traits:
+
+  typedef ArrayLayout                         array_layout ;
+  typedef typename data_analysis::dimension   dimension ;
+  typedef typename data_analysis::specialize  specialize /* mapping specialization tag */ ;
+
+  enum { rank         = dimension::rank };
+  enum { rank_dynamic = dimension::rank_dynamic };
+
+  //------------------------------------
+  // Execution space, memory space, memory access traits, and host mirror space.
+
+  typedef ExecutionSpace                              execution_space ;
+  typedef MemorySpace                                 memory_space ;
+  typedef Kokkos::Device<ExecutionSpace,MemorySpace>  device_type ;
+  typedef MemoryTraits                                memory_traits ;
+  typedef HostMirrorSpace                             host_mirror_space ;
+
+  typedef typename MemorySpace::size_type  size_type ;
+
+  enum { is_hostspace      = std::is_same< MemorySpace , HostSpace >::value };
+  enum { is_managed        = MemoryTraits::Unmanaged    == 0 };
+  enum { is_random_access  = MemoryTraits::RandomAccess != 0 }; // non-zero test: holds whether the trait is a 0/1 flag or a masked flag bit
+
+  //------------------------------------
+};
+
+/** \class View
+ *  \brief View to an array of data.
+ *
+ * A View represents an array of one or more dimensions.
+ * For details, please refer to Kokkos' tutorial materials.
+ *
+ * \section Kokkos_View_TemplateParameters Template parameters
+ *
+ * This class has both required and optional template parameters.  The
+ * \c DataType parameter must always be provided, and must always be
+ * first.  The remaining parameters are passed through the variadic
+ * \c Properties pack, whose members are placeholders for the optional
+ * layout, space, and memory-traits arguments.  When explaining the
+ * template parameters, we won't refer to positions within
+ * \c Properties; instead, we will refer to the valid categories of
+ * template parameters, in the orders in which they are permitted to
+ * occur (listed below).
+ *
+ * Valid ways in which template arguments may be specified:
+ *   - View< DataType >
+ *   - View< DataType , Layout >
+ *   - View< DataType , Layout , Space >
+ *   - View< DataType , Layout , Space , MemoryTraits >
+ *   - View< DataType , Space >
+ *   - View< DataType , Space , MemoryTraits >
+ *   - View< DataType , MemoryTraits >
+ *
+ * \tparam DataType (required) This indicates both the type of each
+ *   entry of the array, and the combination of compile-time and
+ *   run-time array dimension(s).  For example, <tt>double*</tt>
+ *   indicates a one-dimensional array of \c double with run-time
+ *   dimension, and <tt>int*[3]</tt> a two-dimensional array of \c int
+ *   with run-time first dimension and compile-time second dimension
+ *   (of 3).  In general, the run-time dimensions (if any) must go
+ *   first, followed by zero or more compile-time dimensions.  For
+ *   more examples, please refer to the tutorial materials.
+ *
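+ * \tparam Space (optional) The execution or memory space.  If not specified, this defaults to Kokkos::DefaultExecutionSpace and its memory space.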
+ *
+ * \tparam Layout (optional) The array's layout in memory.  For
+ *   example, LayoutLeft indicates a column-major (Fortran style)
+ *   layout, and LayoutRight a row-major (C style) layout.  If not
+ *   specified, this defaults to the preferred layout for the
+ *   <tt>Space</tt>.
+ *
+ * \tparam MemoryTraits (optional) Assertion of the user's intended
+ *   access behavior.  For example, RandomAccess indicates read-only
+ *   access with limited spatial locality, and Unmanaged lets users
+ *   wrap externally allocated memory in a View without automatic
+ *   deallocation.
+ *
+ * \section Kokkos_View_MT MemoryTraits discussion
+ *
+ * \subsection Kokkos_View_MT_Interp MemoryTraits interpretation depends on Space
+ *
+ * Some \c MemoryTraits options may have different interpretations for
+ * different \c Space types.  For example, with the Cuda device,
+ * \c RandomAccess tells Kokkos to fetch the data through the texture
+ * cache, whereas the non-GPU devices have no such hardware construct.
+ *
+ * \subsection Kokkos_View_MT_PrefUse Preferred use of MemoryTraits
+ *
+ * Users should defer applying the optional \c MemoryTraits parameter
+ * until the point at which they actually plan to rely on it in a
+ * computational kernel.  This minimizes the number of template
+ * parameters exposed in their code, which reduces the cost of
+ * compilation.  Users may always assign a View without specified
+ * \c MemoryTraits to a compatible View with that specification.
+ * For example:
+ * \code
+ * // Pass in the simplest types of View possible.
+ * void
+ * doSomething (View<double*, Cuda> out,
+ *              View<const double*, Cuda> in)
+ * {
+ *   // Assign the "generic" View in to a RandomAccess View in_rr.
+ *   // Note that RandomAccess View objects must have const data.
+ *   View<const double*, Cuda, MemoryTraits<RandomAccess> > in_rr = in;
+ *   // ... do something with in_rr and out ...
+ * }
+ * \endcode
+ */
+template< class DataType , class ... Properties >
+class View ;
+
+} /* namespace Experimental */
+} /* namespace Kokkos */
+
+//----------------------------------------------------------------------------
+//----------------------------------------------------------------------------
+
+#include <impl/KokkosExp_ViewMapping.hpp>
+#include <impl/KokkosExp_ViewArray.hpp>
+
+//----------------------------------------------------------------------------
+//----------------------------------------------------------------------------
+
+namespace Kokkos {
+namespace Experimental {
+
+namespace { // unnamed namespace: each translation unit that includes this header gets its own copy of these constants
+
+constexpr Kokkos::Experimental::Impl::ALL_t
+  ALL = Kokkos::Experimental::Impl::ALL_t(); // default-constructed tag object of type ALL_t
+
+constexpr Kokkos::Experimental::Impl::WithoutInitializing_t
+  WithoutInitializing = Kokkos::Experimental::Impl::WithoutInitializing_t(); // default-constructed tag object of type WithoutInitializing_t
+
+constexpr Kokkos::Experimental::Impl::AllowPadding_t
+  AllowPadding        = Kokkos::Experimental::Impl::AllowPadding_t(); // default-constructed tag object of type AllowPadding_t
+
+}
+
+/** \brief  Create View allocation parameter bundle from argument list.
+ *
+ *  Valid argument list members are:
+ *    1) label as a "string" or std::string
+ *    2) memory space instance of the View::memory_space type
+ *    3) execution space instance compatible with the View::memory_space
+ *    4) Kokkos::WithoutInitializing to bypass initialization
+ *    5) Kokkos::AllowPadding to allow allocation to pad dimensions for memory alignment
+ */
+template< class ... Args >
+inline
+Impl::ViewCtorProp< typename Impl::ViewCtorProp< void , Args >::type ... >
+view_alloc( Args const & ... args )
+{
+  typedef
+    Impl::ViewCtorProp< typename Impl::ViewCtorProp< void , Args >::type ... >
+      return_type ;
+
+  static_assert( ! return_type::has_pointer // a pointer argument is rejected at compile time
+               , "Cannot give pointer-to-memory for view allocation" );
+
+  return return_type( args... ); // every argument is forwarded into the property bundle
+}
+
+template< class ... Args > // Builds the constructor-property bundle used to wrap existing memory in a View
+inline
+Impl::ViewCtorProp< typename Impl::ViewCtorProp< void , Args >::type ... >
+view_wrap( Args const & ... args )
+{
+  typedef
+    Impl::ViewCtorProp< typename Impl::ViewCtorProp< void , Args >::type ... >
+      return_type ;
+
+  static_assert( ! return_type::has_memory_space && // a pointer is required; space and label arguments are rejected at compile time
+                 ! return_type::has_execution_space &&
+                 ! return_type::has_label &&
+                 return_type::has_pointer
+               , "Must only give pointer-to-memory for view wrapping" );
+
+  return return_type( args... ); // every argument is forwarded into the property bundle
+}
+
+} /* namespace Experimental */
+} /* namespace Kokkos */
+
+//----------------------------------------------------------------------------
+//----------------------------------------------------------------------------
+
+namespace Kokkos {
+namespace Experimental {
+
+template< class DataType , class ... Properties >
+class View ;
+
+template< class > struct is_view : public std::false_type {}; // default: the type is not a View
+
+template< class D, class ... P >
+struct is_view< View<D,P...> > : public std::true_type {}; // any View instantiation
+
+template< class D, class ... P >
+struct is_view< const View<D,P...> > : public std::true_type {}; // const-qualified View instantiation
+
+template< class DataType , class ... Properties >
+class View : public ViewTraits< DataType , Properties ... > {
+private:
+
+  template< class , class ... > friend class View ;
+  template< class , class ... > friend class Impl::ViewMapping ;
+
+public:
+
+  typedef ViewTraits< DataType , Properties ... > traits ;
+
+private:
+
+  typedef Kokkos::Experimental::Impl::ViewMapping< traits , void > map_type ;
+  typedef Kokkos::Experimental::Impl::SharedAllocationTracker      track_type ;
+
+  track_type  m_track ;
+  map_type    m_map ;
+
+public:
+
+  //----------------------------------------
+  /** \brief  Compatible view of array of scalar types */
+  typedef View< typename traits::scalar_array_type ,
+                typename traits::array_layout ,
+                typename traits::device_type ,
+                typename traits::memory_traits >
+    array_type ;
+
+  /** \brief  Compatible view of const data type */
+  typedef View< typename traits::const_data_type ,
+                typename traits::array_layout ,
+                typename traits::device_type ,
+                typename traits::memory_traits >
+    const_type ;
+
+  /** \brief  Compatible view of non-const data type */
+  typedef View< typename traits::non_const_data_type ,
+                typename traits::array_layout ,
+                typename traits::device_type ,
+                typename traits::memory_traits >
+    non_const_type ;
+
+  /** \brief  Compatible HostMirror view */
+  typedef View< typename traits::non_const_data_type ,
+                typename traits::array_layout ,
+                typename traits::host_mirror_space >
+    HostMirror ;
+
+  //----------------------------------------
+  // Domain rank and extents
+
+  enum { Rank = map_type::Rank };
+
+ /** \brief rank() to be implemented
+  */
+  //KOKKOS_INLINE_FUNCTION
+  //static
+  //constexpr unsigned rank() { return map_type::Rank; }
+
+  template< typename iType > // participates only when iType is an integral type
+  KOKKOS_INLINE_FUNCTION constexpr
+  typename std::enable_if< std::is_integral<iType>::value , size_t >::type
+  extent( const iType & r ) const
+    { return m_map.extent(r); } // forwards the query for dimension 'r' to the mapping
+
+  template< typename iType > // participates only when iType is an integral type
+  KOKKOS_INLINE_FUNCTION constexpr
+  typename std::enable_if< std::is_integral<iType>::value , int >::type
+  extent_int( const iType & r ) const
+    { return static_cast<int>(m_map.extent(r)); } // same query as extent(), converted to int; no range check is made here
+
+  KOKKOS_INLINE_FUNCTION constexpr
+  typename traits::array_layout layout() const
+    { return m_map.layout(); } // layout object as reported by the mapping
+
+  //----------------------------------------
+  /*  Deprecate all 'dimension' functions in favor of
+   *  ISO/C++ vocabulary 'extent'.
+   */
+
+  template< typename iType > // deprecated spelling of extent(); participates only for integral iType
+  KOKKOS_INLINE_FUNCTION constexpr
+  typename std::enable_if< std::is_integral<iType>::value , size_t >::type
+  dimension( const iType & r ) const { return extent( r ); }
+
+  KOKKOS_INLINE_FUNCTION constexpr size_t dimension_0() const { return m_map.dimension_0(); } // dimension_N(): each forwards to the mapping's accessor of the same name
+  KOKKOS_INLINE_FUNCTION constexpr size_t dimension_1() const { return m_map.dimension_1(); }
+  KOKKOS_INLINE_FUNCTION constexpr size_t dimension_2() const { return m_map.dimension_2(); }
+  KOKKOS_INLINE_FUNCTION constexpr size_t dimension_3() const { return m_map.dimension_3(); }
+  KOKKOS_INLINE_FUNCTION constexpr size_t dimension_4() const { return m_map.dimension_4(); }
+  KOKKOS_INLINE_FUNCTION constexpr size_t dimension_5() const { return m_map.dimension_5(); }
+  KOKKOS_INLINE_FUNCTION constexpr size_t dimension_6() const { return m_map.dimension_6(); }
+  KOKKOS_INLINE_FUNCTION constexpr size_t dimension_7() const { return m_map.dimension_7(); }
+
+  //----------------------------------------
+
+  KOKKOS_INLINE_FUNCTION constexpr size_t size() const { return m_map.dimension_0() * // product of all eight dimension_N() values of the mapping
+                                                                m_map.dimension_1() *
+                                                                m_map.dimension_2() *
+                                                                m_map.dimension_3() *
+                                                                m_map.dimension_4() *
+                                                                m_map.dimension_5() *
+                                                                m_map.dimension_6() *
+                                                                m_map.dimension_7(); }
+
+  KOKKOS_INLINE_FUNCTION constexpr size_t stride_0() const { return m_map.stride_0(); } // stride_N(): each forwards to the mapping's accessor of the same name
+  KOKKOS_INLINE_FUNCTION constexpr size_t stride_1() const { return m_map.stride_1(); }
+  KOKKOS_INLINE_FUNCTION constexpr size_t stride_2() const { return m_map.stride_2(); }
+  KOKKOS_INLINE_FUNCTION constexpr size_t stride_3() const { return m_map.stride_3(); }
+  KOKKOS_INLINE_FUNCTION constexpr size_t stride_4() const { return m_map.stride_4(); }
+  KOKKOS_INLINE_FUNCTION constexpr size_t stride_5() const { return m_map.stride_5(); }
+  KOKKOS_INLINE_FUNCTION constexpr size_t stride_6() const { return m_map.stride_6(); }
+  KOKKOS_INLINE_FUNCTION constexpr size_t stride_7() const { return m_map.stride_7(); }
+
+  template< typename iType > // 's' is handed to the mapping, which is expected to write the strides into it; required array length is not visible here -- TODO confirm
+  KOKKOS_INLINE_FUNCTION void stride( iType * const s ) const { m_map.stride(s); }
+
+  //----------------------------------------
+  // Range span is the span which contains all members.
+
+  typedef typename map_type::reference_type  reference_type ;
+  typedef typename map_type::pointer_type    pointer_type ;
+
+  enum { reference_type_is_lvalue_reference = std::is_lvalue_reference< reference_type >::value };
+
+  KOKKOS_INLINE_FUNCTION constexpr size_t span() const { return m_map.span(); }
+  // Deprecated, use 'span()' instead
+  KOKKOS_INLINE_FUNCTION constexpr size_t capacity() const { return m_map.span(); }
+  KOKKOS_INLINE_FUNCTION constexpr bool   span_is_contiguous() const { return m_map.span_is_contiguous(); }
+  KOKKOS_INLINE_FUNCTION constexpr pointer_type data() const { return m_map.data(); }
+
+  // Deprecated, use 'span_is_contiguous()' instead
+  KOKKOS_INLINE_FUNCTION constexpr bool   is_contiguous() const { return m_map.span_is_contiguous(); }
+  // Deprecated, use 'data()' instead
+  KOKKOS_INLINE_FUNCTION constexpr pointer_type ptr_on_device() const { return m_map.data(); }
+
+  //----------------------------------------
+  // Allow specializations to query their specialized map
+
+  KOKKOS_INLINE_FUNCTION
+  const Kokkos::Experimental::Impl::ViewMapping< traits , void > &
+  implementation_map() const { return m_map ; } // read-only reference to this view's own mapping member
+
+  //----------------------------------------
+
+private:
+
+  enum { // compile-time flags used by the operator overloads below to select an indexing path
+    is_layout_left = std::is_same< typename traits::array_layout
+                                  , Kokkos::LayoutLeft >::value ,
+
+    is_layout_right = std::is_same< typename traits::array_layout
+                                  , Kokkos::LayoutRight >::value ,
+
+    is_layout_stride = std::is_same< typename traits::array_layout
+                                   , Kokkos::LayoutStride >::value ,
+
+    is_default_map = // true when the specialization tag is void and the layout is one of the three tested above
+      std::is_same< typename traits::specialize , void >::value &&
+      ( is_layout_left || is_layout_right || is_layout_stride )
+  };
+
+#if defined( KOKKOS_ENABLE_DEBUG_BOUNDS_CHECK ) // bounds checking enabled: verify memory-space access and the index bounds
+
+#define KOKKOS_VIEW_OPERATOR_VERIFY( ARG ) \
+  Kokkos::Impl::VerifyExecutionCanAccessMemorySpace \
+    < Kokkos::Impl::ActiveExecutionMemorySpace , typename traits::memory_space >::verify(); \
+  Kokkos::Experimental::Impl::view_verify_operator_bounds ARG ;
+
+#else // bounds checking disabled: verify memory-space access only; ARG is not used
+
+#define KOKKOS_VIEW_OPERATOR_VERIFY( ARG ) \
+  Kokkos::Impl::VerifyExecutionCanAccessMemorySpace \
+    < Kokkos::Impl::ActiveExecutionMemorySpace , typename traits::memory_space >::verify();
+
+#endif // KOKKOS_ENABLE_DEBUG_BOUNDS_CHECK
+
+public:
+
+  //------------------------------
+  // Rank 0 operator()
+
+  template< class ... Args >
+  KOKKOS_FORCEINLINE_FUNCTION
+  typename std::enable_if<( Kokkos::Impl::are_integral<Args...>::value
+                            && ( 0 == Rank )
+                          ), reference_type >::type
+  operator()( Args ... args ) const // rank-0 access: the arguments are passed to the verify macro only
+    {
+      KOKKOS_VIEW_OPERATOR_VERIFY( (m_map,args...) )
+
+      return m_map.reference(); // reference obtained from the mapping without any index
+    }
+
+  //------------------------------
+  // Rank 1 operator()
+
+  template< typename I0
+          , class ... Args >
+  KOKKOS_FORCEINLINE_FUNCTION
+  typename std::enable_if<
+    ( Kokkos::Impl::are_integral<I0,Args...>::value
+      && ( 1 == Rank )
+      && ! is_default_map
+    ), reference_type >::type
+  operator()( const I0 & i0
+            , Args ... args ) const // rank 1, non-default mapping; 'args' are passed to the verify macro only
+    {
+      KOKKOS_VIEW_OPERATOR_VERIFY( (m_map,i0,args...) )
+
+      return m_map.reference(i0); // element lookup is delegated to the mapping
+    }
+
+  template< typename I0
+          , class ... Args >
+  KOKKOS_FORCEINLINE_FUNCTION
+  typename std::enable_if<
+    ( Kokkos::Impl::are_integral<I0,Args...>::value
+      && ( 1 == Rank )
+      && is_default_map
+      && ! is_layout_stride
+    ), reference_type >::type
+  operator()( const I0 & i0
+            , Args ... args ) const // rank 1, default mapping, non-strided layout; 'args' are passed to the verify macro only
+    {
+      KOKKOS_VIEW_OPERATOR_VERIFY( (m_map,i0,args...) )
+
+      return m_map.m_handle[ i0 ]; // the index is used directly on the data handle
+    }
+
+  template< typename I0
+          , class ... Args >
+  KOKKOS_FORCEINLINE_FUNCTION
+  typename std::enable_if<
+    ( Kokkos::Impl::are_integral<I0,Args...>::value
+      && ( 1 == Rank )
+      && is_default_map
+      && is_layout_stride
+    ), reference_type >::type
+  operator()( const I0 & i0
+            , Args ... args ) const // rank 1, default mapping, LayoutStride; 'args' are passed to the verify macro only
+    {
+      KOKKOS_VIEW_OPERATOR_VERIFY( (m_map,i0,args...) )
+
+      return m_map.m_handle[ m_map.m_offset.m_stride.S0 * i0 ]; // the index is scaled by the stored stride S0
+    }
+
+  //------------------------------
+  // Rank 1 operator[]
+
+  template< typename I0 >
+  KOKKOS_FORCEINLINE_FUNCTION
+  typename std::enable_if<
+    ( Kokkos::Impl::are_integral<I0>::value
+      && ( 1 == Rank )
+      && ! is_default_map
+    ), reference_type >::type
+  operator[]( const I0 & i0 ) const // rank 1, non-default mapping
+    {
+      KOKKOS_VIEW_OPERATOR_VERIFY( (m_map,i0) )
+
+      return m_map.reference(i0); // element lookup is delegated to the mapping
+    }
+
+  template< typename I0 >
+  KOKKOS_FORCEINLINE_FUNCTION
+  typename std::enable_if<
+    ( Kokkos::Impl::are_integral<I0>::value
+      && ( 1 == Rank )
+      && is_default_map
+      && ! is_layout_stride
+    ), reference_type >::type
+  operator[]( const I0 & i0 ) const // rank 1, default mapping, non-strided layout
+    {
+      KOKKOS_VIEW_OPERATOR_VERIFY( (m_map,i0) )
+
+      return m_map.m_handle[ i0 ]; // the index is used directly on the data handle
+    }
+
+  template< typename I0 >
+  KOKKOS_FORCEINLINE_FUNCTION
+  typename std::enable_if<
+    ( Kokkos::Impl::are_integral<I0>::value
+      && ( 1 == Rank )
+      && is_default_map
+      && is_layout_stride
+    ), reference_type >::type
+  operator[]( const I0 & i0 ) const // rank 1, default mapping, LayoutStride
+    {
+      KOKKOS_VIEW_OPERATOR_VERIFY( (m_map,i0) )
+
+      return m_map.m_handle[ m_map.m_offset.m_stride.S0 * i0 ]; // the index is scaled by the stored stride S0
+    }
+
+  //------------------------------
+  // Rank 2
+
+  template< typename I0 , typename I1
+          , class ... Args >
+  KOKKOS_FORCEINLINE_FUNCTION
+  typename std::enable_if<
+    ( Kokkos::Impl::are_integral<I0,I1,Args...>::value
+      && ( 2 == Rank )
+      && ! is_default_map
+    ), reference_type >::type
+  operator()( const I0 & i0 , const I1 & i1
+            , Args ... args ) const // rank 2, non-default mapping; 'args' are passed to the verify macro only
+    {
+      KOKKOS_VIEW_OPERATOR_VERIFY( (m_map,i0,i1,args...) )
+
+      return m_map.reference(i0,i1); // element lookup is delegated to the mapping
+    }
+
+  template< typename I0 , typename I1
+          , class ... Args >
+  KOKKOS_FORCEINLINE_FUNCTION
+  typename std::enable_if<
+    ( Kokkos::Impl::are_integral<I0,I1,Args...>::value
+      && ( 2 == Rank )
+      && is_default_map
+      && is_layout_left && ( traits::rank_dynamic == 0 )
+    ), reference_type >::type
+  operator()( const I0 & i0 , const I1 & i1
+            , Args ... args ) const // rank 2, default mapping, LayoutLeft, no dynamic dimensions; 'args' go to the verify macro only
+    {
+      KOKKOS_VIEW_OPERATOR_VERIFY( (m_map,i0,i1,args...) )
+
+      return m_map.m_handle[ i0 + m_map.m_offset.m_dim.N0 * i1 ]; // i1 is scaled by dimension N0
+    }
+
+  template< typename I0 , typename I1
+          , class ... Args >
+  KOKKOS_FORCEINLINE_FUNCTION
+  typename std::enable_if<
+    ( Kokkos::Impl::are_integral<I0,I1,Args...>::value
+      && ( 2 == Rank )
+      && is_default_map
+      && is_layout_left && ( traits::rank_dynamic != 0 )
+    ), reference_type >::type
+  operator()( const I0 & i0 , const I1 & i1
+            , Args ... args ) const // rank 2, default mapping, LayoutLeft, at least one dynamic dimension; 'args' go to the verify macro only
+    {
+      KOKKOS_VIEW_OPERATOR_VERIFY( (m_map,i0,i1,args...) )
+
+      return m_map.m_handle[ i0 + m_map.m_offset.m_stride * i1 ]; // i1 is scaled by the offset's stored stride
+    }
+
+  template< typename I0 , typename I1
+          , class ... Args >
+  KOKKOS_FORCEINLINE_FUNCTION
+  typename std::enable_if<
+    ( Kokkos::Impl::are_integral<I0,I1,Args...>::value
+      && ( 2 == Rank )
+      && is_default_map
+      && is_layout_right && ( traits::rank_dynamic == 0 )
+    ), reference_type >::type
+  operator()( const I0 & i0 , const I1 & i1
+            , Args ... args ) const // rank 2, default mapping, LayoutRight, no dynamic dimensions; 'args' go to the verify macro only
+    {
+      KOKKOS_VIEW_OPERATOR_VERIFY( (m_map,i0,i1,args...) )
+
+      return m_map.m_handle[ i1 + m_map.m_offset.m_dim.N1 * i0 ]; // i0 is scaled by dimension N1
+    }
+
+  template< typename I0 , typename I1
+          , class ... Args >
+  KOKKOS_FORCEINLINE_FUNCTION
+  typename std::enable_if<
+    ( Kokkos::Impl::are_integral<I0,I1,Args...>::value
+      && ( 2 == Rank )
+      && is_default_map
+      && is_layout_right && ( traits::rank_dynamic != 0 )
+    ), reference_type >::type
+  operator()( const I0 & i0 , const I1 & i1
+            , Args ... args ) const // rank 2, default mapping, LayoutRight, at least one dynamic dimension; 'args' go to the verify macro only
+    {
+      KOKKOS_VIEW_OPERATOR_VERIFY( (m_map,i0,i1,args...) )
+
+      return m_map.m_handle[ i1 + m_map.m_offset.m_stride * i0 ]; // i0 is scaled by the offset's stored stride
+    }
+
+  template< typename I0 , typename I1  // Rank-2 access; enabled for default map with is_layout_stride
+          , class ... Args >            // extra indices: only handed to the verify macro below
+  KOKKOS_FORCEINLINE_FUNCTION
+  typename std::enable_if<
+    ( Kokkos::Impl::are_integral<I0,I1,Args...>::value
+      && ( 2 == Rank )
+      && is_default_map
+      && is_layout_stride
+    ), reference_type >::type
+  operator()( const I0 & i0 , const I1 & i1
+            , Args ... args ) const
+    {
+      KOKKOS_VIEW_OPERATOR_VERIFY( (m_map,i0,i1,args...) )
+      // Each index is scaled by its own stride (S0 for i0, S1 for i1) and the products are summed.
+      return m_map.m_handle[ i0 * m_map.m_offset.m_stride.S0 +
+                             i1 * m_map.m_offset.m_stride.S1 ];
+    }
+
+  //------------------------------
+  // Rank 3
+
+  template< typename I0 , typename I1 , typename I2  // Rank-3 access; enabled when is_default_map
+          , class ... Args >                          // extra indices: only handed to the verify macro below
+  KOKKOS_FORCEINLINE_FUNCTION
+  typename std::enable_if<
+    ( Kokkos::Impl::are_integral<I0,I1,I2,Args...>::value
+      && ( 3 == Rank )
+      && is_default_map
+    ), reference_type >::type
+  operator()( const I0 & i0 , const I1 & i1 , const I2 & i2
+            , Args ... args ) const
+    {
+      KOKKOS_VIEW_OPERATOR_VERIFY( (m_map,i0,i1,i2,args...) )
+      // The offset functor turns (i0,i1,i2) into a position in the handle.
+      return m_map.m_handle[ m_map.m_offset(i0,i1,i2) ];
+    }
+
+  template< typename I0 , typename I1 , typename I2  // Rank-3 access; enabled when ! is_default_map
+          , class ... Args >                          // extra indices: only handed to the verify macro below
+  KOKKOS_FORCEINLINE_FUNCTION
+  typename std::enable_if<
+    ( Kokkos::Impl::are_integral<I0,I1,I2,Args...>::value
+      && ( 3 == Rank )
+      && ! is_default_map
+    ), reference_type >::type
+  operator()( const I0 & i0 , const I1 & i1 , const I2 & i2
+            , Args ... args ) const
+    {
+      KOKKOS_VIEW_OPERATOR_VERIFY( (m_map,i0,i1,i2,args...) )
+      // Non-default mappings produce the reference themselves.
+      return m_map.reference(i0,i1,i2);
+    }
+
+  //------------------------------
+  // Rank 4
+
+  template< typename I0 , typename I1 , typename I2 , typename I3  // Rank-4 access; enabled when is_default_map
+          , class ... Args >  // extra indices: only handed to the verify macro below
+  KOKKOS_FORCEINLINE_FUNCTION
+  typename std::enable_if<
+    ( Kokkos::Impl::are_integral<I0,I1,I2,I3,Args...>::value
+      && ( 4 == Rank )
+      && is_default_map
+    ), reference_type >::type
+  operator()( const I0 & i0 , const I1 & i1 , const I2 & i2 , const I3 & i3
+            , Args ... args ) const
+    {
+      KOKKOS_VIEW_OPERATOR_VERIFY( (m_map,i0,i1,i2,i3,args...) )
+      // The offset functor turns (i0..i3) into a position in the handle.
+      return m_map.m_handle[ m_map.m_offset(i0,i1,i2,i3) ];
+    }
+
+  template< typename I0 , typename I1 , typename I2 , typename I3  // Rank-4 access; enabled when ! is_default_map
+          , class ... Args >  // extra indices: only handed to the verify macro below
+  KOKKOS_FORCEINLINE_FUNCTION
+  typename std::enable_if<
+    ( Kokkos::Impl::are_integral<I0,I1,I2,I3,Args...>::value
+      && ( 4 == Rank )
+      && ! is_default_map
+    ), reference_type >::type
+  operator()( const I0 & i0 , const I1 & i1 , const I2 & i2 , const I3 & i3
+            , Args ... args ) const
+    {
+      KOKKOS_VIEW_OPERATOR_VERIFY( (m_map,i0,i1,i2,i3,args...) )
+      // Non-default mappings produce the reference themselves.
+      return m_map.reference(i0,i1,i2,i3);
+    }
+
+  //------------------------------
+  // Rank 5
+
+  template< typename I0 , typename I1 , typename I2 , typename I3  // Rank-5 access; enabled when is_default_map
+          , typename I4
+          , class ... Args >  // extra indices: only handed to the verify macro below
+  KOKKOS_FORCEINLINE_FUNCTION
+  typename std::enable_if<
+    ( Kokkos::Impl::are_integral<I0,I1,I2,I3,I4,Args...>::value
+      && ( 5 == Rank )
+      && is_default_map
+    ), reference_type >::type
+  operator()( const I0 & i0 , const I1 & i1 , const I2 & i2 , const I3 & i3
+            , const I4 & i4
+            , Args ... args ) const
+    {
+      KOKKOS_VIEW_OPERATOR_VERIFY( (m_map,i0,i1,i2,i3,i4,args...) )
+      // The offset functor turns (i0..i4) into a position in the handle.
+      return m_map.m_handle[ m_map.m_offset(i0,i1,i2,i3,i4) ];
+    }
+
+  template< typename I0 , typename I1 , typename I2 , typename I3  // Rank-5 access; enabled when ! is_default_map
+          , typename I4
+          , class ... Args >  // extra indices: only handed to the verify macro below
+  KOKKOS_FORCEINLINE_FUNCTION
+  typename std::enable_if<
+    ( Kokkos::Impl::are_integral<I0,I1,I2,I3,I4,Args...>::value
+      && ( 5 == Rank )
+      && ! is_default_map
+    ), reference_type >::type
+  operator()( const I0 & i0 , const I1 & i1 , const I2 & i2 , const I3 & i3
+            , const I4 & i4
+            , Args ... args ) const
+    {
+      KOKKOS_VIEW_OPERATOR_VERIFY( (m_map,i0,i1,i2,i3,i4,args...) )
+      // Non-default mappings produce the reference themselves.
+      return m_map.reference(i0,i1,i2,i3,i4);
+    }
+
+  //------------------------------
+  // Rank 6
+
+  template< typename I0 , typename I1 , typename I2 , typename I3  // Rank-6 access; enabled when is_default_map
+          , typename I4 , typename I5
+          , class ... Args >  // extra indices: only handed to the verify macro below
+  KOKKOS_FORCEINLINE_FUNCTION
+  typename std::enable_if<
+    ( Kokkos::Impl::are_integral<I0,I1,I2,I3,I4,I5,Args...>::value
+      && ( 6 == Rank )
+      && is_default_map
+    ), reference_type >::type
+  operator()( const I0 & i0 , const I1 & i1 , const I2 & i2 , const I3 & i3
+            , const I4 & i4 , const I5 & i5
+            , Args ... args ) const
+    {
+      KOKKOS_VIEW_OPERATOR_VERIFY( (m_map,i0,i1,i2,i3,i4,i5,args...) )
+      // The offset functor turns (i0..i5) into a position in the handle.
+      return m_map.m_handle[ m_map.m_offset(i0,i1,i2,i3,i4,i5) ];
+    }
+
+  template< typename I0 , typename I1 , typename I2 , typename I3  // Rank-6 access; enabled when ! is_default_map
+          , typename I4 , typename I5
+          , class ... Args >  // extra indices: only handed to the verify macro below
+  KOKKOS_FORCEINLINE_FUNCTION
+  typename std::enable_if<
+    ( Kokkos::Impl::are_integral<I0,I1,I2,I3,I4,I5,Args...>::value
+      && ( 6 == Rank )
+      && ! is_default_map
+    ), reference_type >::type
+  operator()( const I0 & i0 , const I1 & i1 , const I2 & i2 , const I3 & i3
+            , const I4 & i4 , const I5 & i5
+            , Args ... args ) const
+    {
+      KOKKOS_VIEW_OPERATOR_VERIFY( (m_map,i0,i1,i2,i3,i4,i5,args...) )
+      // Non-default mappings produce the reference themselves.
+      return m_map.reference(i0,i1,i2,i3,i4,i5);
+    }
+
+  //------------------------------
+  // Rank 7
+
+  template< typename I0 , typename I1 , typename I2 , typename I3  // Rank-7 access; enabled when is_default_map
+          , typename I4 , typename I5 , typename I6
+          , class ... Args >  // extra indices: only handed to the verify macro below
+  KOKKOS_FORCEINLINE_FUNCTION
+  typename std::enable_if<
+    ( Kokkos::Impl::are_integral<I0,I1,I2,I3,I4,I5,I6,Args...>::value
+      && ( 7 == Rank )
+      && is_default_map
+    ), reference_type >::type
+  operator()( const I0 & i0 , const I1 & i1 , const I2 & i2 , const I3 & i3
+            , const I4 & i4 , const I5 & i5 , const I6 & i6
+            , Args ... args ) const
+    {
+      KOKKOS_VIEW_OPERATOR_VERIFY( (m_map,i0,i1,i2,i3,i4,i5,i6,args...) )
+      // The offset functor turns (i0..i6) into a position in the handle.
+      return m_map.m_handle[ m_map.m_offset(i0,i1,i2,i3,i4,i5,i6) ];
+    }
+
+  template< typename I0 , typename I1 , typename I2 , typename I3  // Rank-7 access; enabled when ! is_default_map
+          , typename I4 , typename I5 , typename I6
+          , class ... Args >  // extra indices: only handed to the verify macro below
+  KOKKOS_FORCEINLINE_FUNCTION
+  typename std::enable_if<
+    ( Kokkos::Impl::are_integral<I0,I1,I2,I3,I4,I5,I6,Args...>::value
+      && ( 7 == Rank )
+      && ! is_default_map
+    ), reference_type >::type
+  operator()( const I0 & i0 , const I1 & i1 , const I2 & i2 , const I3 & i3
+            , const I4 & i4 , const I5 & i5 , const I6 & i6
+            , Args ... args ) const
+    {
+      KOKKOS_VIEW_OPERATOR_VERIFY( (m_map,i0,i1,i2,i3,i4,i5,i6,args...) )
+      // Non-default mappings produce the reference themselves.
+      return m_map.reference(i0,i1,i2,i3,i4,i5,i6);
+    }
+
+  //------------------------------
+  // Rank 8
+
+  template< typename I0 , typename I1 , typename I2 , typename I3  // Rank-8 access; enabled when is_default_map
+          , typename I4 , typename I5 , typename I6 , typename I7
+          , class ... Args >  // extra indices: only handed to the verify macro below
+  KOKKOS_FORCEINLINE_FUNCTION
+  typename std::enable_if<
+    ( Kokkos::Impl::are_integral<I0,I1,I2,I3,I4,I5,I6,I7,Args...>::value
+      && ( 8 == Rank )
+      && is_default_map
+    ), reference_type >::type
+  operator()( const I0 & i0 , const I1 & i1 , const I2 & i2 , const I3 & i3
+            , const I4 & i4 , const I5 & i5 , const I6 & i6 , const I7 & i7
+            , Args ... args ) const
+    {
+      KOKKOS_VIEW_OPERATOR_VERIFY( (m_map,i0,i1,i2,i3,i4,i5,i6,i7,args...) )
+      // The offset functor turns (i0..i7) into a position in the handle.
+      return m_map.m_handle[ m_map.m_offset(i0,i1,i2,i3,i4,i5,i6,i7) ];
+    }
+
+  template< typename I0 , typename I1 , typename I2 , typename I3  // Rank-8 access; enabled when ! is_default_map
+          , typename I4 , typename I5 , typename I6 , typename I7
+          , class ... Args >  // extra indices: only handed to the verify macro below
+  KOKKOS_FORCEINLINE_FUNCTION
+  typename std::enable_if<
+    ( Kokkos::Impl::are_integral<I0,I1,I2,I3,I4,I5,I6,I7,Args...>::value
+      && ( 8 == Rank )
+      && ! is_default_map
+    ), reference_type >::type
+  operator()( const I0 & i0 , const I1 & i1 , const I2 & i2 , const I3 & i3
+            , const I4 & i4 , const I5 & i5 , const I6 & i6 , const I7 & i7
+            , Args ... args ) const
+    {
+      KOKKOS_VIEW_OPERATOR_VERIFY( (m_map,i0,i1,i2,i3,i4,i5,i6,i7,args...) )
+      // Non-default mappings produce the reference themselves.
+      return m_map.reference(i0,i1,i2,i3,i4,i5,i6,i7);
+    }
+
+#undef KOKKOS_VIEW_OPERATOR_VERIFY
+
+  //----------------------------------------
+  // Standard destructor, constructors, and assignment operators
+
+  KOKKOS_INLINE_FUNCTION
+  ~View() {}  // empty body: nothing is done beyond destroying m_track and m_map
+
+  KOKKOS_INLINE_FUNCTION
+  View() : m_track(), m_map() {}  // default: value-initialize the tracker and the mapping
+
+  KOKKOS_INLINE_FUNCTION
+  View( const View & rhs ) : m_track( rhs.m_track ), m_map( rhs.m_map ) {}  // member-wise copy of tracker and mapping
+
+  KOKKOS_INLINE_FUNCTION
+  View( View && rhs ) : m_track( rhs.m_track ), m_map( rhs.m_map ) {}  // NOTE: no std::move; members are initialized from rhs's members as lvalues
+
+  KOKKOS_INLINE_FUNCTION
+  View & operator = ( const View & rhs ) { m_track = rhs.m_track ; m_map = rhs.m_map ; return *this ; }  // tracker first, then mapping
+
+  KOKKOS_INLINE_FUNCTION
+  View & operator = ( View && rhs ) { m_track = rhs.m_track ; m_map = rhs.m_map ; return *this ; }  // NOTE: same statements as copy assignment; rhs is not moved from
+
+  //----------------------------------------
+  // Compatible view copy constructor and assignment
+  // may assign unmanaged from managed.
+
+  template< class RT , class ... RP >  // Converting constructor from a View with other template arguments
+  KOKKOS_INLINE_FUNCTION
+  View( const View<RT,RP...> & rhs )
+    : m_track( rhs.m_track , traits::is_managed )  // tracker built from rhs's tracker and this view's is_managed flag
+    , m_map()
+    {
+      typedef typename View<RT,RP...>::traits  SrcTraits ;
+      typedef Kokkos::Experimental::Impl::ViewMapping< traits , SrcTraits , void >  Mapping ;  // assignment rules for this (destination, source) pair
+      static_assert( Mapping::is_assignable , "Incompatible View copy construction" );  // incompatible pairs fail to compile
+      Mapping::assign( m_map , rhs.m_map , rhs.m_track );  // fill m_map from the source mapping
+    }
+
+  template< class RT , class ... RP >  // Converting assignment from a View with other template arguments
+  KOKKOS_INLINE_FUNCTION
+  View & operator = ( const View<RT,RP...> & rhs )
+    {
+      typedef typename View<RT,RP...>::traits  SrcTraits ;
+      typedef Kokkos::Experimental::Impl::ViewMapping< traits , SrcTraits , void >  Mapping ;  // assignment rules for this (destination, source) pair
+      static_assert( Mapping::is_assignable , "Incompatible View copy assignment" );  // incompatible pairs fail to compile
+      Mapping::assign( m_map , rhs.m_map , rhs.m_track );  // the mapping is assigned first ...
+      m_track.assign( rhs.m_track , traits::is_managed );  // ... then the tracker, with this view's is_managed flag
+      return *this ;
+    }
+
+  //----------------------------------------
+  // Compatible subview constructor
+  // may assign unmanaged from managed.
+
+  template< class RT , class ... RP , class Arg0 , class ... Args >  // Subview constructor: source view plus one argument per slice spec
+  KOKKOS_INLINE_FUNCTION
+  View( const View< RT , RP... > & src_view
+      , const Arg0 & arg0 , Args ... args )
+    : m_track( src_view.m_track , traits::is_managed )  // tracker built from the source's tracker and this view's is_managed flag
+    , m_map()
+    {
+      typedef View< RT , RP... > SrcType ;
+      // Mapping deduces the subview type produced by applying (Arg0, Args...) to the source traits.
+      typedef Kokkos::Experimental::Impl::ViewMapping
+        < void /* deduce destination view type from source view traits */
+        , typename SrcType::traits
+        , Arg0 , Args... > Mapping ;
+
+      typedef typename Mapping::type DstType ;  // the deduced subview type
+      // This view must be assignable from the deduced subview type, otherwise compilation fails.
+      static_assert( Kokkos::Experimental::Impl::ViewMapping< traits , typename DstType::traits , void >::is_assignable
+        , "Subview construction requires compatible view and subview arguments" );
+
+      Mapping::assign( m_map, src_view.m_map, arg0 , args... );  // fill m_map from the source mapping and the slice arguments
+    }
+
+  //----------------------------------------
+  // Allocation tracking properties
+
+  KOKKOS_INLINE_FUNCTION
+  int use_count() const  // forwards to the tracker's use_count()
+    { return m_track.use_count(); }
+
+  inline
+  const std::string label() const  // returns by value the label the tracker reports for this view's memory space
+    { return m_track.template get_label< typename traits::memory_space >(); }
+
+  //----------------------------------------
+  // Allocation according to allocation properties and array layout
+
+  template< class ... P >  // Allocating constructor: selected when the properties carry no pointer
+  explicit inline
+  View( const Impl::ViewCtorProp< P ... > & arg_prop
+      , typename std::enable_if< ! Impl::ViewCtorProp< P... >::has_pointer
+                               , typename traits::array_layout
+                               >::type const & arg_layout
+      )
+    : m_track()
+    , m_map()
+    {
+      // Append a label, memory space, and execution space if not input
+      typedef Impl::ViewCtorProp< P ... > alloc_prop_input ;
+
+      // When a property is already present, substitute a distinct
+      // 'std::integral_constant<unsigned,I>' placeholder to avoid a duplicate class error.
+      typedef Impl::ViewCtorProp
+        < P ...
+        , typename std::conditional
+            < alloc_prop_input::has_label
+            , std::integral_constant<unsigned,0>
+            , typename std::string
+            >::type
+        , typename std::conditional
+            < alloc_prop_input::has_memory_space
+            , std::integral_constant<unsigned,1>
+            , typename traits::device_type::memory_space
+            >::type
+        , typename std::conditional
+            < alloc_prop_input::has_execution_space
+            , std::integral_constant<unsigned,2>
+            , typename traits::device_type::execution_space
+            >::type
+        > alloc_prop ;
+
+      static_assert( traits::is_managed
+                   , "View allocation constructor requires managed memory" );
+
+      if ( alloc_prop::initialize &&
+           ! alloc_prop::execution_space::is_initialized() ) {
+        // If initializing view data then
+        // the execution space must be initialized.
+        Kokkos::Impl::throw_runtime_exception("Constructing View and initializing data with uninitialized execution space");
+      }
+
+      // Copy the input allocation properties with possibly defaulted properties
+      alloc_prop prop( arg_prop );
+
+//------------------------------------------------------------
+#if defined( KOKKOS_HAVE_CUDA )
+      // If allocating in CudaUVMSpace must fence before and after
+      // the allocation to protect against possible concurrent access
+      // on the CPU and the GPU.
+      // Fence using the trait's execution space (which will be Kokkos::Cuda)
+      // to avoid incomplete type errors from using Kokkos::Cuda directly.
+      if ( std::is_same< Kokkos::CudaUVMSpace , typename traits::device_type::memory_space >::value ) {
+        traits::device_type::memory_space::execution_space::fence();
+      }
+#endif
+//------------------------------------------------------------
+
+      Kokkos::Experimental::Impl::SharedAllocationRecord<> *
+        record = m_map.allocate_shared( prop , arg_layout );  // the mapping performs the allocation and returns its record
+
+//------------------------------------------------------------
+#if defined( KOKKOS_HAVE_CUDA )
+      if ( std::is_same< Kokkos::CudaUVMSpace , typename traits::device_type::memory_space >::value ) {
+        traits::device_type::memory_space::execution_space::fence();
+      }
+#endif
+//------------------------------------------------------------
+
+      // Setup and initialization complete, start tracking
+      m_track.assign_allocated_record_to_uninitialized( record );
+    }
+
+  // Wrap memory according to properties and array layout; selected when the properties carry a pointer
+  template< class ... P >
+  explicit KOKKOS_INLINE_FUNCTION
+  View( const Impl::ViewCtorProp< P ... > & arg_prop
+      , typename std::enable_if< Impl::ViewCtorProp< P... >::has_pointer
+                               , typename traits::array_layout
+                               >::type const & arg_layout
+      )
+    : m_track() // No memory tracking
+    , m_map( arg_prop , arg_layout )  // the mapping is built directly from the properties and the layout
+    {
+      static_assert(  // the pointer type in the properties must be exactly this view's pointer_type
+        std::is_same< pointer_type
+                    , typename Impl::ViewCtorProp< P... >::pointer_type
+                    >::value ,
+        "Constructing View to wrap user memory must supply matching pointer type" );
+    }
+
+  // Simple dimension-only layout, properties without a pointer: builds an array_layout and delegates
+  template< class ... P >
+  explicit inline
+  View( const Impl::ViewCtorProp< P ... > & arg_prop
+      , typename std::enable_if< ! Impl::ViewCtorProp< P... >::has_pointer
+                               , size_t
+                               >::type const arg_N0 = 0
+      , const size_t arg_N1 = 0
+      , const size_t arg_N2 = 0
+      , const size_t arg_N3 = 0
+      , const size_t arg_N4 = 0
+      , const size_t arg_N5 = 0
+      , const size_t arg_N6 = 0
+      , const size_t arg_N7 = 0
+      )
+    : View( arg_prop  // delegates to the (properties, array_layout) constructor
+          , typename traits::array_layout
+              ( arg_N0 , arg_N1 , arg_N2 , arg_N3
+              , arg_N4 , arg_N5 , arg_N6 , arg_N7 )
+          )
+    {}
+
+  template< class ... P >  // Dimension-only form for properties that carry a pointer
+  explicit KOKKOS_INLINE_FUNCTION
+  View( const Impl::ViewCtorProp< P ... > & arg_prop
+      , typename std::enable_if< Impl::ViewCtorProp< P... >::has_pointer
+                               , size_t
+                               >::type const arg_N0 = 0
+      , const size_t arg_N1 = 0
+      , const size_t arg_N2 = 0
+      , const size_t arg_N3 = 0
+      , const size_t arg_N4 = 0
+      , const size_t arg_N5 = 0
+      , const size_t arg_N6 = 0
+      , const size_t arg_N7 = 0
+      )
+    : View( arg_prop  // delegates to the (properties, array_layout) constructor
+          , typename traits::array_layout
+              ( arg_N0 , arg_N1 , arg_N2 , arg_N3
+              , arg_N4 , arg_N5 , arg_N6 , arg_N7 )
+          )
+    {}
+
+  // Allocate with label and layout; enabled only when is_view_label<Label>::value is true
+  template< typename Label >
+  explicit inline
+  View( const Label & arg_label
+      , typename std::enable_if<
+          Kokkos::Experimental::Impl::is_view_label<Label>::value ,
+          typename traits::array_layout >::type const & arg_layout
+      )
+    : View( Impl::ViewCtorProp< std::string >( arg_label ) , arg_layout )  // wraps the label in a property set and delegates
+    {}
+
+  // Allocate with label and dimensions; the is_view_label test disambiguates this from the subview constructor.
+  template< typename Label >
+  explicit inline
+  View( const Label & arg_label
+      , typename std::enable_if<
+          Kokkos::Experimental::Impl::is_view_label<Label>::value ,
+        const size_t >::type arg_N0 = 0
+      , const size_t arg_N1 = 0
+      , const size_t arg_N2 = 0
+      , const size_t arg_N3 = 0
+      , const size_t arg_N4 = 0
+      , const size_t arg_N5 = 0
+      , const size_t arg_N6 = 0
+      , const size_t arg_N7 = 0
+      )
+    : View( Impl::ViewCtorProp< std::string >( arg_label )  // wraps the label in a property set and delegates
+          , typename traits::array_layout
+              ( arg_N0 , arg_N1 , arg_N2 , arg_N3
+              , arg_N4 , arg_N5 , arg_N6 , arg_N7 )
+          )
+    {}
+
+  // For backward compatibility: ViewAllocateWithoutInitializing plus an explicit layout
+  explicit inline
+  View( const ViewAllocateWithoutInitializing & arg_prop
+      , const typename traits::array_layout & arg_layout
+      )
+    : View( Impl::ViewCtorProp< std::string , Kokkos::Experimental::Impl::WithoutInitializing_t >( arg_prop.label , Kokkos::Experimental::WithoutInitializing )  // label + WithoutInitializing tag
+          , arg_layout
+          )
+    {}
+
+  explicit inline  // Backward-compatible form: ViewAllocateWithoutInitializing plus up to eight dimensions
+  View( const ViewAllocateWithoutInitializing & arg_prop
+      , const size_t arg_N0 = 0
+      , const size_t arg_N1 = 0
+      , const size_t arg_N2 = 0
+      , const size_t arg_N3 = 0
+      , const size_t arg_N4 = 0
+      , const size_t arg_N5 = 0
+      , const size_t arg_N6 = 0
+      , const size_t arg_N7 = 0
+      )
+    : View( Impl::ViewCtorProp< std::string , Kokkos::Experimental::Impl::WithoutInitializing_t >( arg_prop.label , Kokkos::Experimental::WithoutInitializing )  // label + WithoutInitializing tag
+          , typename traits::array_layout
+              ( arg_N0 , arg_N1 , arg_N2 , arg_N3
+              , arg_N4 , arg_N5 , arg_N6 , arg_N7 )
+          )
+    {}
+
+  //----------------------------------------
+  // Memory span required to wrap these dimensions: map_type::memory_span() of the layout built from them.
+  static constexpr size_t required_allocation_size(
+                                       const size_t arg_N0 = 0
+                                     , const size_t arg_N1 = 0
+                                     , const size_t arg_N2 = 0
+                                     , const size_t arg_N3 = 0
+                                     , const size_t arg_N4 = 0
+                                     , const size_t arg_N5 = 0
+                                     , const size_t arg_N6 = 0
+                                     , const size_t arg_N7 = 0
+                                     )
+    {
+      return map_type::memory_span(  // single return statement, as written for constexpr
+        typename traits::array_layout
+          ( arg_N0 , arg_N1 , arg_N2 , arg_N3
+          , arg_N4 , arg_N5 , arg_N6 , arg_N7 ) );
+    }
+
+  explicit KOKKOS_INLINE_FUNCTION  // Wrap an existing pointer with up to eight dimensions
+  View( pointer_type arg_ptr
+      , const size_t arg_N0 = 0
+      , const size_t arg_N1 = 0
+      , const size_t arg_N2 = 0
+      , const size_t arg_N3 = 0
+      , const size_t arg_N4 = 0
+      , const size_t arg_N5 = 0
+      , const size_t arg_N6 = 0
+      , const size_t arg_N7 = 0
+      )
+    : View( Impl::ViewCtorProp<pointer_type>(arg_ptr)  // pointer-carrying properties: delegates to the (properties, layout) constructor
+          , typename traits::array_layout
+             ( arg_N0 , arg_N1 , arg_N2 , arg_N3
+             , arg_N4 , arg_N5 , arg_N6 , arg_N7 )
+          )
+    {}
+
+  explicit KOKKOS_INLINE_FUNCTION  // Wrap an existing pointer with an explicit layout
+  View( pointer_type arg_ptr
+      , const typename traits::array_layout & arg_layout
+      )
+    : View( Impl::ViewCtorProp<pointer_type>(arg_ptr) , arg_layout )  // pointer-carrying properties: delegates to the (properties, layout) constructor
+    {}
+
+  //----------------------------------------
+  // Shared scratch memory constructor
+
+  static inline
+  size_t shmem_size( const size_t arg_N0 = ~size_t(0) ,
+                     const size_t arg_N1 = ~size_t(0) ,
+                     const size_t arg_N2 = ~size_t(0) ,
+                     const size_t arg_N3 = ~size_t(0) ,
+                     const size_t arg_N4 = ~size_t(0) ,
+                     const size_t arg_N5 = ~size_t(0) ,
+                     const size_t arg_N6 = ~size_t(0) ,
+                     const size_t arg_N7 = ~size_t(0) )
+  {
+    const size_t unset = ~size_t(0);  // value every defaulted argument carries
+    const size_t extents[8] = { arg_N0 , arg_N1 , arg_N2 , arg_N3
+                              , arg_N4 , arg_N5 , arg_N6 , arg_N7 };
+    size_t supplied = 0 ;  // number of arguments that differ from the default
+    for ( int k = 0 ; k < 8 ; ++k ) { if ( extents[k] != unset ) { ++supplied ; } }
+    // For unspecialized views the supplied count must equal traits::rank_dynamic.
+    const bool unspecialized = std::is_same<typename traits::specialize,void>::value ;
+    if ( unspecialized && supplied != traits::rank_dynamic ) {
+      Kokkos::abort( "Kokkos::View::shmem_size() rank_dynamic != number of arguments.\n" );
+    }
+    // Size is the mapping's memory span for a layout built from all eight arguments.
+    const typename traits::array_layout layout( arg_N0 , arg_N1 , arg_N2 , arg_N3 , arg_N4 , arg_N5 , arg_N6 , arg_N7 );
+    return map_type::memory_span( layout );
+  }
+
+  explicit KOKKOS_INLINE_FUNCTION  // Build a view over memory obtained from a scratch memory space, explicit layout
+  View( const typename traits::execution_space::scratch_memory_space & arg_space
+      , const typename traits::array_layout & arg_layout )
+    : View( Impl::ViewCtorProp<pointer_type>(
+              reinterpret_cast<pointer_type>(  // get_shmem()'s result is cast to this view's pointer_type
+                arg_space.get_shmem( map_type::memory_span( arg_layout ) ) ) )  // request exactly the span the layout needs
+         , arg_layout )
+    {}
+
+  explicit KOKKOS_INLINE_FUNCTION  // Build a view over memory obtained from a scratch memory space, up to eight dimensions
+  View( const typename traits::execution_space::scratch_memory_space & arg_space
+      , const size_t arg_N0 = 0
+      , const size_t arg_N1 = 0
+      , const size_t arg_N2 = 0
+      , const size_t arg_N3 = 0
+      , const size_t arg_N4 = 0
+      , const size_t arg_N5 = 0
+      , const size_t arg_N6 = 0
+      , const size_t arg_N7 = 0 )
+    : View( Impl::ViewCtorProp<pointer_type>(
+              reinterpret_cast<pointer_type>(  // get_shmem()'s result is cast to this view's pointer_type
+                arg_space.get_shmem(
+                  map_type::memory_span(  // request exactly the span the layout needs
+                    typename traits::array_layout
+                     ( arg_N0 , arg_N1 , arg_N2 , arg_N3
+                     , arg_N4 , arg_N5 , arg_N6 , arg_N7 ) ) ) ) )
+          , typename traits::array_layout  // the same layout is built a second time for the delegated constructor
+             ( arg_N0 , arg_N1 , arg_N2 , arg_N3
+             , arg_N4 , arg_N5 , arg_N6 , arg_N7 )
+       )
+    {}
+};
+
+
+ /** \brief Temporary free function rank()
+  *         until rank() is implemented
+  *         in the View; returns the view's Rank member.
+  */
+  template < typename D , class ... P >
+  KOKKOS_INLINE_FUNCTION
+  constexpr unsigned rank( const View<D , P...> & V ) { return V.Rank; } //Temporary until added to view
+
+//----------------------------------------------------------------------------
+//----------------------------------------------------------------------------
+
+template< class V , class ... Args >  // Alias for the view type that ViewMapping deduces for V sliced by Args...
+using Subview =
+  typename Kokkos::Experimental::Impl::ViewMapping
+    < void /* deduce subview type from source view traits */
+    , typename V::traits
+    , Args ...
+    >::type ;
+
+template< class D, class ... P , class ... Args >  // Build a subview of src; the return type is deduced by ViewMapping
+KOKKOS_INLINE_FUNCTION
+typename Kokkos::Experimental::Impl::ViewMapping
+  < void /* deduce subview type from source view traits */
+  , ViewTraits< D , P... >
+  , Args ...
+  >::type
+subview( const View< D, P... > & src , Args ... args )
+{
+  static_assert( View< D , P... >::Rank == sizeof...(Args) ,  // exactly one argument per rank of the source view
+    "subview requires one argument for each source View rank" );
+  // Construct the deduced type from (src, args...), i.e. through its subview constructor.
+  return typename
+    Kokkos::Experimental::Impl::ViewMapping
+      < void /* deduce subview type from source view traits */
+      , ViewTraits< D , P ... >
+      , Args ... >::type( src , args ... );
+}
+
+template< class MemoryTraits , class D, class ... P , class ... Args >  // As subview(), with MemoryTraits applied to the deduced type
+KOKKOS_INLINE_FUNCTION
+typename Kokkos::Experimental::Impl::ViewMapping
+  < void /* deduce subview type from source view traits */
+  , ViewTraits< D , P... >
+  , Args ...
+  >::template apply< MemoryTraits >::type
+subview( const View< D, P... > & src , Args ... args )
+{
+  static_assert( View< D , P... >::Rank == sizeof...(Args) ,  // exactly one argument per rank of the source view
+    "subview requires one argument for each source View rank" );
+  // MemoryTraits must be given explicitly by the caller; it does not appear in the function parameters.
+  return typename
+    Kokkos::Experimental::Impl::ViewMapping
+      < void /* deduce subview type from source view traits */
+      , ViewTraits< D , P ... >
+      , Args ... >
+      ::template apply< MemoryTraits >
+      ::type( src , args ... );
+}
+
+
+
+} /* namespace Experimental */
+} /* namespace Kokkos */
+
+//----------------------------------------------------------------------------
+//----------------------------------------------------------------------------
+
+namespace Kokkos {
+namespace Experimental {
+
+template< class LT , class ... LP , class RT , class ... RP >
+KOKKOS_INLINE_FUNCTION
+bool operator == ( const View<LT,LP...> & lhs ,
+                   const View<RT,RP...> & rhs )
+{
+  // Equal when the traits agree and both views report the same data pointer, span, and extents.
+  using left_traits  = ViewTraits<LT,LP...> ;
+  using right_traits = ViewTraits<RT,RP...> ;
+
+  const bool traits_match =
+    std::is_same< typename left_traits::const_value_type ,
+                  typename right_traits::const_value_type >::value &&
+    std::is_same< typename left_traits::array_layout ,
+                  typename right_traits::array_layout >::value &&
+    std::is_same< typename left_traits::memory_space ,
+                  typename right_traits::memory_space >::value &&
+    unsigned(left_traits::rank) == unsigned(right_traits::rank) ;
+  return traits_match &&  // short-circuits: the runtime comparisons run only when the traits agree
+    lhs.data()        == rhs.data()        && lhs.span() == rhs.span() &&
+    lhs.dimension_0() == rhs.dimension_0() &&
+    lhs.dimension_1() == rhs.dimension_1() &&
+    lhs.dimension_2() == rhs.dimension_2() &&
+    lhs.dimension_3() == rhs.dimension_3() &&
+    lhs.dimension_4() == rhs.dimension_4() &&
+    lhs.dimension_5() == rhs.dimension_5() &&
+    lhs.dimension_6() == rhs.dimension_6() &&
+    lhs.dimension_7() == rhs.dimension_7();
+}
+
+template< class LT , class ... LP , class RT , class ... RP >  // Inequality of two views
+KOKKOS_INLINE_FUNCTION
+bool operator != ( const View<LT,LP...> & lhs ,
+                   const View<RT,RP...> & rhs )
+{
+  return ! ( operator==(lhs,rhs) );  // defined as the exact negation of operator==
+}
+
+} /* namespace Experimental */
+} /* namespace Kokkos */
+
+//----------------------------------------------------------------------------
+//----------------------------------------------------------------------------
+
+namespace Kokkos {
+namespace Impl {
+
+inline  // Kokkos::Impl wrapper that forwards to SharedAllocationRecord<void,void>::tracking_claim_and_disable()
+void shared_allocation_tracking_claim_and_disable()
+{ Kokkos::Experimental::Impl::SharedAllocationRecord<void,void>::tracking_claim_and_disable(); }
+
+inline  // Kokkos::Impl wrapper that forwards to SharedAllocationRecord<void,void>::tracking_release_and_enable()
+void shared_allocation_tracking_release_and_enable()
+{ Kokkos::Experimental::Impl::SharedAllocationRecord<void,void>::tracking_release_and_enable(); }
+
+} /* namespace Impl */
+} /* namespace Kokkos */
+
+//----------------------------------------------------------------------------
+//----------------------------------------------------------------------------
+
+namespace Kokkos {
+namespace Experimental {
+namespace Impl {
+
+template< class OutputView , typename Enable = void >
+struct ViewFill {
+  // Functor that stores one value into every entry of a view; constructing it runs the fill.
+  using const_value_type = typename OutputView::const_value_type ;
+
+  const OutputView output ;  // view being filled
+  const_value_type input ;   // value stored into each entry
+  // Kernel body for one leading index: sweep the remaining seven index ranges.
+  KOKKOS_INLINE_FUNCTION
+  void operator()( const size_t i0 ) const
+  {
+    const size_t ext1 = output.dimension_1();
+    const size_t ext2 = output.dimension_2();
+    const size_t ext3 = output.dimension_3();
+    const size_t ext4 = output.dimension_4();
+    const size_t ext5 = output.dimension_5();
+    const size_t ext6 = output.dimension_6();
+    const size_t ext7 = output.dimension_7();
+    // All eight indices are passed to output(); extents were read once above.
+    for ( size_t j1 = 0 ; j1 < ext1 ; ++j1 ) {
+    for ( size_t j2 = 0 ; j2 < ext2 ; ++j2 ) {
+    for ( size_t j3 = 0 ; j3 < ext3 ; ++j3 ) {
+    for ( size_t j4 = 0 ; j4 < ext4 ; ++j4 ) {
+    for ( size_t j5 = 0 ; j5 < ext5 ; ++j5 ) {
+    for ( size_t j6 = 0 ; j6 < ext6 ; ++j6 ) {
+    for ( size_t j7 = 0 ; j7 < ext7 ; ++j7 ) {
+      output(i0,j1,j2,j3,j4,j5,j6,j7) = input ;
+    }}}}}}}
+  }
+  // Stores the view and the value, then runs the fill over [0, dimension_0) and fences.
+  ViewFill( const OutputView & arg_out , const_value_type & arg_in )
+    : output( arg_out ), input( arg_in )
+    {
+      using exec_space   = typename OutputView::execution_space ;
+      using range_policy = Kokkos::RangePolicy< exec_space > ;
+      // One functor call per value of the leading index.
+      const Kokkos::Impl::ParallelFor< ViewFill , range_policy > fill_kernel( *this , range_policy( 0 , output.dimension_0() ) );
+      // Run the kernel, then fence the execution space before returning.
+      fill_kernel.execute();
+
+      exec_space::fence();
+    }
+};
+
+template< class OutputView >  // Specialization selected when OutputView::Rank == 0
+struct ViewFill< OutputView , typename std::enable_if< OutputView::Rank == 0 >::type > {
+  ViewFill( const OutputView & dst , const typename OutputView::const_value_type & src )
+    {
+      Kokkos::Impl::DeepCopy< typename OutputView::memory_space , Kokkos::HostSpace >  // copy from HostSpace into the view's memory space
+        ( dst.data() , & src , sizeof(typename OutputView::const_value_type) );  // one value's worth of bytes; no parallel kernel is launched
+    }
+};
+
+template< class OutputView , class InputView , class ExecSpace = typename OutputView::execution_space >
+struct ViewRemap {
+  // Functor copying input into output entry by entry over the extents common to both views.
+  const OutputView output ;
+  const InputView  input ;
+  const size_t n0 ;  // n0..n7: per-dimension minimum of the two views' extents
+  const size_t n1 ;
+  const size_t n2 ;
+  const size_t n3 ;
+  const size_t n4 ;
+  const size_t n5 ;
+  const size_t n6 ;
+  const size_t n7 ;
+  // Records both views and the common extents, then runs the copy over [0, n0).
+  ViewRemap( const OutputView & arg_out , const InputView & arg_in )
+    : output( arg_out ), input( arg_in )
+    , n0( std::min( static_cast<size_t>( arg_out.dimension_0() ) , static_cast<size_t>( arg_in.dimension_0() ) ) )
+    , n1( std::min( static_cast<size_t>( arg_out.dimension_1() ) , static_cast<size_t>( arg_in.dimension_1() ) ) )
+    , n2( std::min( static_cast<size_t>( arg_out.dimension_2() ) , static_cast<size_t>( arg_in.dimension_2() ) ) )
+    , n3( std::min( static_cast<size_t>( arg_out.dimension_3() ) , static_cast<size_t>( arg_in.dimension_3() ) ) )
+    , n4( std::min( static_cast<size_t>( arg_out.dimension_4() ) , static_cast<size_t>( arg_in.dimension_4() ) ) )
+    , n5( std::min( static_cast<size_t>( arg_out.dimension_5() ) , static_cast<size_t>( arg_in.dimension_5() ) ) )
+    , n6( std::min( static_cast<size_t>( arg_out.dimension_6() ) , static_cast<size_t>( arg_in.dimension_6() ) ) )
+    , n7( std::min( static_cast<size_t>( arg_out.dimension_7() ) , static_cast<size_t>( arg_in.dimension_7() ) ) )
+    {
+      using range_policy = Kokkos::RangePolicy< ExecSpace > ;
+      const Kokkos::Impl::ParallelFor< ViewRemap , range_policy > remap_kernel( *this , range_policy( 0 , n0 ) );
+      remap_kernel.execute();
+    }
+  // Kernel body for one leading index: copy every entry within the common extents.
+  KOKKOS_INLINE_FUNCTION
+  void operator()( const size_t i0 ) const
+  {
+    for ( size_t j1 = 0 ; j1 < n1 ; ++j1 ) {
+    for ( size_t j2 = 0 ; j2 < n2 ; ++j2 ) {
+    for ( size_t j3 = 0 ; j3 < n3 ; ++j3 ) {
+    for ( size_t j4 = 0 ; j4 < n4 ; ++j4 ) {
+    for ( size_t j5 = 0 ; j5 < n5 ; ++j5 ) {
+    for ( size_t j6 = 0 ; j6 < n6 ; ++j6 ) {
+    for ( size_t j7 = 0 ; j7 < n7 ; ++j7 ) {
+      output(i0,j1,j2,j3,j4,j5,j6,j7) = input(i0,j1,j2,j3,j4,j5,j6,j7);
+    }}}}}}}
+  }
+};
+
+} /* namespace Impl */
+} /* namespace Experimental */
+} /* namespace Kokkos */
+
+//----------------------------------------------------------------------------
+//----------------------------------------------------------------------------
+
+namespace Kokkos {
+namespace Experimental {
+
+/** \brief  Fill a view of the default specialization with a single value.
+ *
+ *  \param dst    View to be filled; its value type must be non-const.
+ *  \param value  Value handed to ViewFill together with dst.
+ */
+template< class DT , class ... DP >
+inline
+void deep_copy
+  ( const View<DT,DP...> & dst
+  , typename ViewTraits<DT,DP...>::const_value_type & value
+  , typename std::enable_if<
+    std::is_same< typename ViewTraits<DT,DP...>::specialize , void >::value
+    >::type * = 0 )
+{
+  typedef ViewTraits<DT,DP...>  dst_traits ;
+  typedef View<DT,DP...>        dst_type ;
+
+  // A const value type cannot be written to.
+  static_assert(
+    std::is_same< typename dst_traits::non_const_value_type ,
+                  typename dst_traits::value_type >::value
+    , "deep_copy requires non-const type" );
+
+  Kokkos::Experimental::Impl::ViewFill< dst_type >( dst , value );
+}
+
+/** \brief  Copy the single entry of a rank-zero view into a host variable.
+ *
+ *  \param dst  Host variable receiving sizeof(ST) bytes.
+ *  \param src  Rank-zero view of the default specialization.
+ */
+template< class ST , class ... SP >
+inline
+void deep_copy
+  ( typename ViewTraits<ST,SP...>::non_const_value_type & dst
+  , const View<ST,SP...> & src
+  , typename std::enable_if<
+    std::is_same< typename ViewTraits<ST,SP...>::specialize , void >::value
+    >::type * = 0 )
+{
+  typedef ViewTraits<ST,SP...>  src_traits ;
+
+  static_assert( src_traits::rank == 0
+               , "ERROR: Non-rank-zero view in deep_copy( value , View )" );
+
+  // Destination is always host memory; the source space comes from the view.
+  Kokkos::Impl::DeepCopy< HostSpace , typename src_traits::memory_space >
+    ( & dst , src.data() , sizeof(ST) );
+}
+
+//----------------------------------------------------------------------------
+/** \brief  Deep copy between two rank-zero views of the default specialization.
+ *
+ *  Copies sizeof(value_type) bytes from src to dst with DeepCopy between the
+ *  two memory spaces.  Nothing is done when both views report the same data
+ *  pointer.
+ */
+template< class DT , class ... DP , class ST , class ... SP >
+inline
+void deep_copy
+  ( const View<DT,DP...> & dst
+  , const View<ST,SP...> & src
+  , typename std::enable_if<(
+    std::is_same< typename ViewTraits<DT,DP...>::specialize , void >::value &&
+    std::is_same< typename ViewTraits<ST,SP...>::specialize , void >::value &&
+    ( unsigned(ViewTraits<DT,DP...>::rank) == unsigned(0) &&
+      unsigned(ViewTraits<ST,SP...>::rank) == unsigned(0) )
+  )>::type * = 0 )
+{
+  typedef View<DT,DP...>  dst_type ;
+  typedef View<ST,SP...>  src_type ;
+
+  static_assert(
+    std::is_same< typename ViewTraits<DT,DP...>::value_type ,
+                  typename ViewTraits<ST,SP...>::non_const_value_type >::value
+    , "deep_copy requires matching non-const destination type" );
+
+  // Source and destination are the same entry: nothing to copy.
+  if ( dst.data() == src.data() ) return ;
+
+  Kokkos::Impl::DeepCopy< typename dst_type::memory_space , typename src_type::memory_space >
+    ( dst.data() , src.data() , sizeof(typename dst_type::value_type) );
+}
+
+//----------------------------------------------------------------------------
+/** \brief  A deep copy between views of the default specialization and of
+ *          equal, non-zero rank.
+ *
+ *  Nothing is done when both views report the same data pointer.  Otherwise:
+ *   - one DeepCopy of the whole span is issued when the value types match,
+ *     both spans are contiguous and equal, all eight extents are equal, and
+ *     the layouts are compatible (both LayoutLeft, both LayoutRight, both
+ *     rank one, or both LayoutStride with equal strides);
+ *   - else the common index range is copied entry by entry with ViewRemap,
+ *     run on the destination's execution space if it can access the source
+ *     memory, or on the source's execution space if that can access the
+ *     destination memory;
+ *   - else a runtime exception is thrown.
+ */
+template< class DT , class ... DP , class ST , class ... SP >
+inline
+void deep_copy
+  ( const View<DT,DP...> & dst
+  , const View<ST,SP...> & src
+  , typename std::enable_if<(
+    std::is_same< typename ViewTraits<DT,DP...>::specialize , void >::value &&
+    std::is_same< typename ViewTraits<ST,SP...>::specialize , void >::value &&
+    ( unsigned(ViewTraits<DT,DP...>::rank) != 0 ||
+      unsigned(ViewTraits<ST,SP...>::rank) != 0 )
+  )>::type * = 0 )
+{
+  typedef ViewTraits<DT,DP...>  dst_traits ;
+  typedef ViewTraits<ST,SP...>  src_traits ;
+  typedef View<DT,DP...>        dst_type ;
+  typedef View<ST,SP...>        src_type ;
+
+  static_assert(
+    std::is_same< typename dst_traits::value_type ,
+                  typename dst_traits::non_const_value_type >::value
+    , "deep_copy requires non-const destination type" );
+
+  static_assert(
+    ( unsigned(dst_traits::rank) == unsigned(src_traits::rank) )
+    , "deep_copy requires Views of equal rank" );
+
+  typedef typename dst_type::execution_space  dst_execution_space ;
+  typedef typename src_type::execution_space  src_execution_space ;
+  typedef typename dst_type::memory_space     dst_memory_space ;
+  typedef typename src_type::memory_space     src_memory_space ;
+
+  enum { DstExecCanAccessSrc =
+   Kokkos::Impl::VerifyExecutionCanAccessMemorySpace< typename dst_execution_space::memory_space , src_memory_space >::value };
+
+  enum { SrcExecCanAccessDst =
+   Kokkos::Impl::VerifyExecutionCanAccessMemorySpace< typename src_execution_space::memory_space , dst_memory_space >::value };
+
+  // Same allocation on both sides: nothing to copy.
+  if ( (void *) dst.data() == (void*) src.data() ) return ;
+
+  // Concern: If overlapping views then a parallel copy will be erroneous.
+
+  // Compile-time properties of the two view types.
+  const bool same_value_type =
+    std::is_same< typename dst_traits::value_type ,
+                  typename src_traits::non_const_value_type >::value ;
+
+  const bool same_layout =
+    std::is_same< typename dst_traits::array_layout ,
+                  typename src_traits::array_layout >::value ;
+
+  const bool left_or_right_layout =
+    std::is_same< typename dst_traits::array_layout , typename Kokkos::LayoutLeft >::value ||
+    std::is_same< typename dst_traits::array_layout , typename Kokkos::LayoutRight >::value ;
+
+  const bool strided_layout =
+    std::is_same< typename dst_traits::array_layout , typename Kokkos::LayoutStride >::value ;
+
+  const bool both_rank_one = ( dst_traits::rank == 1 && src_traits::rank == 1 );
+
+  // Run-time shape agreement required by every byte-wise copy.
+  const bool same_contiguous_shape =
+    dst.span_is_contiguous() &&
+    src.span_is_contiguous() &&
+    dst.span() == src.span() &&
+    dst.dimension_0() == src.dimension_0() &&
+    dst.dimension_1() == src.dimension_1() &&
+    dst.dimension_2() == src.dimension_2() &&
+    dst.dimension_3() == src.dimension_3() &&
+    dst.dimension_4() == src.dimension_4() &&
+    dst.dimension_5() == src.dimension_5() &&
+    dst.dimension_6() == src.dimension_6() &&
+    dst.dimension_7() == src.dimension_7() ;
+
+  // Strided views must additionally agree on every stride; the strides are
+  // only inspected when neither of the simpler layout cases applies.
+  const bool layout_compatible =
+    ( same_layout && left_or_right_layout ) ||
+    both_rank_one ||
+    ( same_layout && strided_layout &&
+      dst.stride_0() == src.stride_0() &&
+      dst.stride_1() == src.stride_1() &&
+      dst.stride_2() == src.stride_2() &&
+      dst.stride_3() == src.stride_3() &&
+      dst.stride_4() == src.stride_4() &&
+      dst.stride_5() == src.stride_5() &&
+      dst.stride_6() == src.stride_6() &&
+      dst.stride_7() == src.stride_7() );
+
+  if ( same_value_type && same_contiguous_shape && layout_compatible ) {
+    // Identical memory images: copy the whole span in one call.
+    const size_t nbytes = sizeof(typename dst_type::value_type) * dst.span();
+
+    Kokkos::Impl::DeepCopy< dst_memory_space , src_memory_space >( dst.data() , src.data() , nbytes );
+  }
+  else if ( DstExecCanAccessSrc ) {
+    // Copying data between views in accessible memory spaces and either non-contiguous or incompatible shape.
+    Kokkos::Experimental::Impl::ViewRemap< dst_type , src_type >( dst , src );
+  }
+  else if ( SrcExecCanAccessDst ) {
+    // Copying data between views in accessible memory spaces and either non-contiguous or incompatible shape.
+    Kokkos::Experimental::Impl::ViewRemap< dst_type , src_type , src_execution_space >( dst , src );
+  }
+  else {
+    Kokkos::Impl::throw_runtime_exception("deep_copy given views that would require a temporary allocation");
+  }
+}
+
+} /* namespace Experimental */
+} /* namespace Kokkos */
+
+//----------------------------------------------------------------------------
+//----------------------------------------------------------------------------
+
+namespace Kokkos {
+namespace Experimental {
+
+/** \brief  Fill a view of the default specialization with a single value
+ *          (overload taking an execution space instance).
+ *
+ *  The execution space argument only selects this overload; it is unnamed
+ *  and is not forwarded to ViewFill.
+ *
+ *  \param dst    View to be filled; its value type must be non-const.
+ *  \param value  Value handed to ViewFill together with dst.
+ */
+template< class ExecSpace ,class DT , class ... DP >
+inline
+void deep_copy
+  ( const ExecSpace &
+  , const View<DT,DP...> & dst
+  , typename ViewTraits<DT,DP...>::const_value_type & value
+  , typename std::enable_if<
+    Kokkos::Impl::is_execution_space< ExecSpace >::value &&
+    std::is_same< typename ViewTraits<DT,DP...>::specialize , void >::value
+    >::type * = 0 )
+{
+  typedef ViewTraits<DT,DP...>  dst_traits ;
+  typedef View<DT,DP...>        dst_type ;
+
+  // A const value type cannot be written to.
+  static_assert(
+    std::is_same< typename dst_traits::non_const_value_type ,
+                  typename dst_traits::value_type >::value
+    , "deep_copy requires non-const type" );
+
+  Kokkos::Experimental::Impl::ViewFill< dst_type >( dst , value );
+}
+
+/** \brief  Copy the single entry of a rank-zero view into a host variable
+ *          (overload taking an execution space instance).
+ *
+ *  \param exec_space  Execution space instance forwarded to DeepCopy.
+ *  \param dst         Host variable receiving sizeof(ST) bytes.
+ *  \param src         Rank-zero view of the default specialization.
+ */
+template< class ExecSpace , class ST , class ... SP >
+inline
+void deep_copy
+  ( const ExecSpace & exec_space
+  , typename ViewTraits<ST,SP...>::non_const_value_type & dst
+  , const View<ST,SP...> & src
+  , typename std::enable_if<
+    Kokkos::Impl::is_execution_space< ExecSpace >::value &&
+    std::is_same< typename ViewTraits<ST,SP...>::specialize , void >::value
+    >::type * = 0 )
+{
+  typedef ViewTraits<ST,SP...>  src_traits ;
+
+  static_assert( src_traits::rank == 0
+               , "ERROR: Non-rank-zero view in deep_copy( value , View )" );
+
+  // Destination is always host memory; the source space comes from the view.
+  Kokkos::Impl::DeepCopy< HostSpace , typename src_traits::memory_space , ExecSpace >
+    ( exec_space , & dst , src.data() , sizeof(ST) );
+}
+
+//----------------------------------------------------------------------------
+/** \brief  Deep copy between two rank-zero views of the default specialization
+ *          (overload taking an execution space instance).
+ *
+ *  Copies sizeof(value_type) bytes from src to dst with DeepCopy, forwarding
+ *  exec_space.  Nothing is done when both views report the same data pointer.
+ */
+template< class ExecSpace , class DT , class ... DP , class ST , class ... SP >
+inline
+void deep_copy
+  ( const ExecSpace & exec_space
+  , const View<DT,DP...> & dst
+  , const View<ST,SP...> & src
+  , typename std::enable_if<(
+    Kokkos::Impl::is_execution_space< ExecSpace >::value &&
+    std::is_same< typename ViewTraits<DT,DP...>::specialize , void >::value &&
+    std::is_same< typename ViewTraits<ST,SP...>::specialize , void >::value &&
+    ( unsigned(ViewTraits<DT,DP...>::rank) == unsigned(0) &&
+      unsigned(ViewTraits<ST,SP...>::rank) == unsigned(0) )
+  )>::type * = 0 )
+{
+  typedef View<DT,DP...>  dst_type ;
+  typedef View<ST,SP...>  src_type ;
+
+  static_assert(
+    std::is_same< typename ViewTraits<DT,DP...>::value_type ,
+                  typename ViewTraits<ST,SP...>::non_const_value_type >::value
+    , "deep_copy requires matching non-const destination type" );
+
+  // Source and destination are the same entry: nothing to copy.
+  if ( dst.data() == src.data() ) return ;
+
+  Kokkos::Impl::DeepCopy< typename dst_type::memory_space , typename src_type::memory_space , ExecSpace >
+    ( exec_space , dst.data() , src.data() , sizeof(typename dst_type::value_type) );
+}
+
+//----------------------------------------------------------------------------
+/** \brief  A deep copy between views of the default specialization and of
+ *          equal, non-zero rank (overload taking an execution space instance).
+ *
+ *  Nothing is done when both views report the same data pointer.  Otherwise:
+ *   - one DeepCopy of the whole span is issued, forwarding exec_space, when
+ *     the value types match, both spans are contiguous and equal, all eight
+ *     extents are equal, and the views are either both rank one or share
+ *     the same layout type.  When that shared layout is LayoutStride the
+ *     strides must also be equal, because equal extents alone do not imply
+ *     that two strided views order their entries identically in memory;
+ *   - else the common index range is copied entry by entry with ViewRemap,
+ *     run on the destination's execution space if it can access the source
+ *     memory, or on the source's execution space if that can access the
+ *     destination memory;
+ *   - else a runtime exception is thrown.
+ *
+ *  \param exec_space  Execution space instance forwarded to DeepCopy.
+ *  \param dst         Destination view; its value type must be non-const.
+ *  \param src         Source view of the same rank as dst.
+ */
+template< class ExecSpace , class DT, class ... DP, class ST, class ... SP >
+inline
+void deep_copy
+  ( const ExecSpace & exec_space
+  , const View<DT,DP...> & dst
+  , const View<ST,SP...> & src
+  , typename std::enable_if<(
+    Kokkos::Impl::is_execution_space< ExecSpace >::value &&
+    std::is_same< typename ViewTraits<DT,DP...>::specialize , void >::value &&
+    std::is_same< typename ViewTraits<ST,SP...>::specialize , void >::value &&
+    ( unsigned(ViewTraits<DT,DP...>::rank) != 0 ||
+      unsigned(ViewTraits<ST,SP...>::rank) != 0 )
+  )>::type * = 0 )
+{
+  static_assert(
+    std::is_same< typename ViewTraits<DT,DP...>::value_type ,
+                  typename ViewTraits<DT,DP...>::non_const_value_type >::value
+    , "deep_copy requires non-const destination type" );
+
+  static_assert(
+    ( unsigned(ViewTraits<DT,DP...>::rank) ==
+      unsigned(ViewTraits<ST,SP...>::rank) )
+    , "deep_copy requires Views of equal rank" );
+
+  typedef View<DT,DP...>  dst_type ;
+  typedef View<ST,SP...>  src_type ;
+
+  typedef typename dst_type::execution_space  dst_execution_space ;
+  typedef typename src_type::execution_space  src_execution_space ;
+  typedef typename dst_type::memory_space     dst_memory_space ;
+  typedef typename src_type::memory_space     src_memory_space ;
+
+  enum { DstExecCanAccessSrc =
+   Kokkos::Impl::VerifyExecutionCanAccessMemorySpace< typename dst_execution_space::memory_space , src_memory_space >::value };
+
+  enum { SrcExecCanAccessDst =
+   Kokkos::Impl::VerifyExecutionCanAccessMemorySpace< typename src_execution_space::memory_space , dst_memory_space >::value };
+
+  if ( (void *) dst.data() != (void*) src.data() ) {
+
+    // Concern: If overlapping views then a parallel copy will be erroneous.
+    // ...
+
+    // Compile-time properties of the two view types.
+    const bool same_value_type =
+      std::is_same< typename ViewTraits<DT,DP...>::value_type ,
+                    typename ViewTraits<ST,SP...>::non_const_value_type >::value ;
+
+    const bool same_layout =
+      std::is_same< typename ViewTraits<DT,DP...>::array_layout ,
+                    typename ViewTraits<ST,SP...>::array_layout >::value ;
+
+    const bool strided_layout =
+      std::is_same< typename ViewTraits<DT,DP...>::array_layout ,
+                    typename Kokkos::LayoutStride >::value ;
+
+    const bool both_rank_one =
+      ( ViewTraits<DT,DP...>::rank == 1 &&
+        ViewTraits<ST,SP...>::rank == 1 );
+
+    // Run-time shape agreement required by the byte-wise copy.
+    const bool same_contiguous_shape =
+      dst.span_is_contiguous() &&
+      src.span_is_contiguous() &&
+      dst.span() == src.span() &&
+      dst.dimension_0() == src.dimension_0() &&
+      dst.dimension_1() == src.dimension_1() &&
+      dst.dimension_2() == src.dimension_2() &&
+      dst.dimension_3() == src.dimension_3() &&
+      dst.dimension_4() == src.dimension_4() &&
+      dst.dimension_5() == src.dimension_5() &&
+      dst.dimension_6() == src.dimension_6() &&
+      dst.dimension_7() == src.dimension_7() ;
+
+    // Two LayoutStride views of equal extents may still order their entries
+    // differently, so a byte-wise copy between them also needs equal strides.
+    // Rank-one views and all other shared layouts keep the previous rule.
+    const bool layout_compatible =
+      both_rank_one ||
+      ( same_layout &&
+        ( ! strided_layout ||
+          ( dst.stride_0() == src.stride_0() &&
+            dst.stride_1() == src.stride_1() &&
+            dst.stride_2() == src.stride_2() &&
+            dst.stride_3() == src.stride_3() &&
+            dst.stride_4() == src.stride_4() &&
+            dst.stride_5() == src.stride_5() &&
+            dst.stride_6() == src.stride_6() &&
+            dst.stride_7() == src.stride_7() ) ) );
+
+    if ( same_value_type && same_contiguous_shape && layout_compatible ) {
+
+      const size_t nbytes = sizeof(typename dst_type::value_type) * dst.span();
+
+      Kokkos::Impl::DeepCopy< dst_memory_space , src_memory_space , ExecSpace >
+        ( exec_space , dst.data() , src.data() , nbytes );
+    }
+    else if ( DstExecCanAccessSrc ) {
+      // Copying data between views in accessible memory spaces and either non-contiguous or incompatible shape.
+      // NOTE(review): exec_space is not passed to ViewRemap here - confirm this is intended.
+      Kokkos::Experimental::Impl::ViewRemap< dst_type , src_type >( dst , src );
+    }
+    else if ( SrcExecCanAccessDst ) {
+      // Copying data between views in accessible memory spaces and either non-contiguous or incompatible shape.
+      Kokkos::Experimental::Impl::ViewRemap< dst_type , src_type , src_execution_space >( dst , src );
+    }
+    else {
+      Kokkos::Impl::throw_runtime_exception("deep_copy given views that would require a temporary allocation");
+    }
+  }
+}
+
+} /* namespace Experimental */
+} /* namespace Kokkos */
+
+//----------------------------------------------------------------------------
+//----------------------------------------------------------------------------
+
+namespace Kokkos {
+namespace Experimental {
+namespace Impl {
+
+// Deduce the view type returned by create_mirror_view( Space , View<T,P...> ):
+// the source view type itself when Space's memory space equals the source's,
+// otherwise a view with the same layout and non-const data type in Space.
+template<class Space, class T, class ... P>
+struct MirrorViewType {
+  // The incoming view_type
+  typedef typename Kokkos::Experimental::View<T,P...> src_view_type;
+  // The memory space for the mirror view
+  typedef typename Space::memory_space memory_space;
+  // Check whether it is the same memory space
+  enum { is_same_memspace = std::is_same<memory_space,typename src_view_type::memory_space>::value };
+  // The array_layout
+  typedef typename src_view_type::array_layout array_layout;
+  // The data type (we probably want it non-const, since otherwise we can't even deep_copy to it).
+  typedef typename src_view_type::non_const_data_type data_type;
+  // The destination view type if it is not the same memory space
+  typedef Kokkos::Experimental::View<data_type,array_layout,Space> dest_view_type;
+  // If it is the same memory_space return the existing view_type.
+  // This will also keep the unmanaged trait if necessary.
+  typedef typename std::conditional<is_same_memspace,src_view_type,dest_view_type>::type view_type;
+};
+
+// Deduce the view type returned by create_mirror( Space , View<T,P...> ):
+// always a view in Space with the source's layout and non-const data type,
+// whether or not the memory spaces are the same.
+template<class Space, class T, class ... P>
+struct MirrorType {
+  // The incoming view_type
+  typedef typename Kokkos::Experimental::View<T,P...> src_view_type;
+  // The memory space for the mirror view
+  typedef typename Space::memory_space memory_space;
+  // Check whether it is the same memory space
+  enum { is_same_memspace = std::is_same<memory_space,typename src_view_type::memory_space>::value };
+  // The array_layout
+  typedef typename src_view_type::array_layout array_layout;
+  // The data type (we probably want it non-const, since otherwise we can't even deep_copy to it).
+  typedef typename src_view_type::non_const_data_type data_type;
+  // The mirror view type; unlike MirrorViewType it does not depend on is_same_memspace.
+  typedef Kokkos::Experimental::View<data_type,array_layout,Space> view_type;
+};
+
+}
+
+/** \brief  Allocate a HostMirror view with the extents of src
+ *          (overload for every layout except LayoutStride).
+ *
+ *  The new view is labeled with the label of src followed by "_mirror".
+ *  No data is copied.
+ */
+template< class T , class ... P >
+inline
+typename Kokkos::Experimental::View<T,P...>::HostMirror
+create_mirror( const Kokkos::Experimental::View<T,P...> & src
+             , typename std::enable_if<
+                 ! std::is_same< typename Kokkos::Experimental::ViewTraits<T,P...>::array_layout
+                               , Kokkos::LayoutStride >::value
+               >::type * = 0
+             )
+{
+  typedef typename View<T,P...>::HostMirror  mirror_type ;
+
+  std::string mirror_label( src.label() );
+  mirror_label.append("_mirror");
+
+  return mirror_type( mirror_label
+                    , src.dimension_0() , src.dimension_1()
+                    , src.dimension_2() , src.dimension_3()
+                    , src.dimension_4() , src.dimension_5()
+                    , src.dimension_6() , src.dimension_7() );
+}
+
+/** \brief  Allocate a HostMirror view matching src (overload for LayoutStride).
+ *
+ *  All eight extents and strides of src are written into a LayoutStride
+ *  object that is then used to construct the mirror.  The new view is labeled
+ *  with the label of src followed by "_mirror".  No data is copied.
+ */
+template< class T , class ... P >
+inline
+typename Kokkos::Experimental::View<T,P...>::HostMirror
+create_mirror( const Kokkos::Experimental::View<T,P...> & src
+             , typename std::enable_if<
+                 std::is_same< typename Kokkos::Experimental::ViewTraits<T,P...>::array_layout
+                             , Kokkos::LayoutStride >::value
+               >::type * = 0
+             )
+{
+  typedef typename View<T,P...>::HostMirror  mirror_type ;
+
+  Kokkos::LayoutStride mirror_layout ;
+
+  // Extent and stride of each dimension, taken from the source view.
+  mirror_layout.dimension[0] = src.dimension_0();  mirror_layout.stride[0] = src.stride_0();
+  mirror_layout.dimension[1] = src.dimension_1();  mirror_layout.stride[1] = src.stride_1();
+  mirror_layout.dimension[2] = src.dimension_2();  mirror_layout.stride[2] = src.stride_2();
+  mirror_layout.dimension[3] = src.dimension_3();  mirror_layout.stride[3] = src.stride_3();
+  mirror_layout.dimension[4] = src.dimension_4();  mirror_layout.stride[4] = src.stride_4();
+  mirror_layout.dimension[5] = src.dimension_5();  mirror_layout.stride[5] = src.stride_5();
+  mirror_layout.dimension[6] = src.dimension_6();  mirror_layout.stride[6] = src.stride_6();
+  mirror_layout.dimension[7] = src.dimension_7();  mirror_layout.stride[7] = src.stride_7();
+
+  std::string mirror_label( src.label() );
+  mirror_label.append("_mirror");
+
+  return mirror_type( mirror_label , mirror_layout );
+}
+
+
+// Create a mirror in the given space: a new view of type
+// Impl::MirrorType<Space,T,P...>::view_type is constructed from the label and
+// layout of src.  The Space argument is only used to select the type.
+template<class Space, class T, class ... P>
+typename Impl::MirrorType<Space,T,P ...>::view_type create_mirror(const Space& , const Kokkos::Experimental::View<T,P...> & src) {
+  typedef typename Impl::MirrorType<Space,T,P ...>::view_type  mirror_type ;
+  return mirror_type( src.label() , src.layout() );
+}
+
+/** \brief  Host mirror view of src when no new allocation is needed.
+ *
+ *  Selected when the view's memory space and data type are the same as those
+ *  of its HostMirror type; src itself is returned.
+ */
+template< class T , class ... P >
+inline
+typename Kokkos::Experimental::View<T,P...>::HostMirror
+create_mirror_view( const Kokkos::Experimental::View<T,P...> & src
+                  , typename std::enable_if<(
+                      std::is_same< typename Kokkos::Experimental::View<T,P...>::memory_space
+                                  , typename Kokkos::Experimental::View<T,P...>::HostMirror::memory_space
+                                  >::value
+                      &&
+                      std::is_same< typename Kokkos::Experimental::View<T,P...>::data_type
+                                  , typename Kokkos::Experimental::View<T,P...>::HostMirror::data_type
+                                  >::value
+                    )>::type * = 0
+                  )
+{
+  return src ;
+}
+
+/** \brief  Host mirror view of src when a new allocation is needed.
+ *
+ *  Selected when the view's memory space or data type differs from that of
+ *  its HostMirror type; the result of create_mirror( src ) is returned.
+ */
+template< class T , class ... P >
+inline
+typename Kokkos::Experimental::View<T,P...>::HostMirror
+create_mirror_view( const Kokkos::Experimental::View<T,P...> & src
+                  , typename std::enable_if< ! (
+                      std::is_same< typename Kokkos::Experimental::View<T,P...>::memory_space
+                                  , typename Kokkos::Experimental::View<T,P...>::HostMirror::memory_space
+                                  >::value
+                      &&
+                      std::is_same< typename Kokkos::Experimental::View<T,P...>::data_type
+                                  , typename Kokkos::Experimental::View<T,P...>::HostMirror::data_type
+                                  >::value
+                    )>::type * = 0
+                  )
+{
+  return Kokkos::Experimental::create_mirror( src );
+}
+
+// Create a mirror view in a given space (overload for the same memory space):
+// enabled when Space's memory space equals that of src, in which case src
+// itself is returned and nothing is allocated.
+template<class Space, class T, class ... P>
+typename Impl::MirrorViewType<Space,T,P ...>::view_type
+create_mirror_view(const Space& , const Kokkos::Experimental::View<T,P...> & src
+  , typename std::enable_if<Impl::MirrorViewType<Space,T,P ...>::is_same_memspace>::type* = 0 ) {
+  return src;
+}
+
+// Create a mirror view in a given space (overload for a different memory
+// space): enabled when Space's memory space differs from that of src.  A new
+// view of the deduced mirror type is constructed from the label and layout of
+// src; no data is copied here.
+template<class Space, class T, class ... P>
+typename Impl::MirrorViewType<Space,T,P ...>::view_type
+create_mirror_view(const Space& , const Kokkos::Experimental::View<T,P...> & src
+  , typename std::enable_if<!Impl::MirrorViewType<Space,T,P ...>::is_same_memspace>::type* = 0 ) {
+  return typename Impl::MirrorViewType<Space,T,P ...>::view_type(src.label(),src.layout());
+}
+
+} /* namespace Experimental */
+} /* namespace Kokkos */
+
+//----------------------------------------------------------------------------
+//----------------------------------------------------------------------------
+
+namespace Kokkos {
+namespace Experimental {
+
+/** \brief  Resize a managed view, keeping the entries whose indices exist in
+ *          both the old and the new shape.
+ *
+ *  A new view with the same label and the requested extents is constructed,
+ *  the common index range is copied into it with ViewRemap, and the new view
+ *  is then assigned to v.
+ *
+ *  \param v        View to resize; must be a managed view.
+ *  \param n0..n7   Requested extent of each dimension (default 0).
+ */
+template< class T , class ... P >
+inline
+void resize( Kokkos::Experimental::View<T,P...> & v ,
+             const size_t n0 = 0 ,
+             const size_t n1 = 0 ,
+             const size_t n2 = 0 ,
+             const size_t n3 = 0 ,
+             const size_t n4 = 0 ,
+             const size_t n5 = 0 ,
+             const size_t n6 = 0 ,
+             const size_t n7 = 0 )
+{
+  static_assert( Kokkos::Experimental::ViewTraits<T,P...>::is_managed , "Can only resize managed views" );
+
+  typedef Kokkos::Experimental::View<T,P...>                            view_type ;
+  typedef Kokkos::Experimental::Impl::ViewRemap< view_type , view_type > remap_type ;
+
+  view_type replacement( v.label(), n0, n1, n2, n3, n4, n5, n6, n7 );
+
+  // Constructing the functor performs the copy old -> new.
+  remap_type( replacement , v );
+
+  v = replacement ;
+}
+
+/** \brief  Reallocate a managed view with new extents, keeping its label.
+ *
+ *  Unlike resize(), nothing is copied from the old view into the new one.
+ *
+ *  \param v        View to reallocate; must be a managed view.
+ *  \param n0..n7   Requested extent of each dimension (default 0).
+ */
+template< class T , class ... P >
+inline
+void realloc( Kokkos::Experimental::View<T,P...> & v ,
+              const size_t n0 = 0 ,
+              const size_t n1 = 0 ,
+              const size_t n2 = 0 ,
+              const size_t n3 = 0 ,
+              const size_t n4 = 0 ,
+              const size_t n5 = 0 ,
+              const size_t n6 = 0 ,
+              const size_t n7 = 0 )
+{
+  typedef Kokkos::Experimental::View<T,P...>  view_type ;
+
+  static_assert( Kokkos::Experimental::ViewTraits<T,P...>::is_managed , "Can only realloc managed views" );
+
+  // Save the label before v is reset below.
+  const std::string label = v.label();
+
+  v = view_type(); // Deallocate first, if the only view to allocation
+  v = view_type( label, n0, n1, n2, n3, n4, n5, n6, n7 );
+}
+
+} /* namespace Experimental */
+} /* namespace Kokkos */
+
+//----------------------------------------------------------------------------
+//----------------------------------------------------------------------------
+
+// Make the Kokkos::Experimental view types and functions available directly
+// in namespace Kokkos, and declare the Impl names used with them.
+namespace Kokkos {
+
+template< class D , class ... P >
+using ViewTraits = Kokkos::Experimental::ViewTraits<D,P...> ;
+
+// View is imported with a using-declaration rather than the alias template
+// shown below, to work around a gcc parser bug:
+using Experimental::View ;
+//template< class D , class ... P >
+//using View = Kokkos::Experimental::View<D,P...> ;
+
+// View construction helpers.
+using Kokkos::Experimental::ALL ;
+using Kokkos::Experimental::WithoutInitializing ;
+using Kokkos::Experimental::AllowPadding ;
+using Kokkos::Experimental::view_alloc ;
+using Kokkos::Experimental::view_wrap ;
+
+// Free functions operating on views.
+using Kokkos::Experimental::deep_copy ;
+using Kokkos::Experimental::create_mirror ;
+using Kokkos::Experimental::create_mirror_view ;
+using Kokkos::Experimental::subview ;
+using Kokkos::Experimental::resize ;
+using Kokkos::Experimental::realloc ;
+using Kokkos::Experimental::is_view ;
+
+namespace Impl {
+
+using Kokkos::Experimental::is_view ;
+
+// Empty tag class.
+class ViewDefault {};
+
+// Declared here only; per the trailing comment a definition is expected to
+// provide a nested 'type'.
+template< class SrcViewType
+        , class Arg0Type
+        , class Arg1Type
+        , class Arg2Type
+        , class Arg3Type
+        , class Arg4Type
+        , class Arg5Type
+        , class Arg6Type
+        , class Arg7Type
+        >
+struct ViewSubview /* { typedef ... type ; } */ ;
+
+}
+
+} /* namespace Kokkos */
+
+#include <impl/Kokkos_Atomic_View.hpp>
+
+//----------------------------------------------------------------------------
+//----------------------------------------------------------------------------
+
+#endif /* #ifndef KOKKOS_VIEW_HPP */
+
diff --git a/lib/kokkos/core/src/Kokkos_hwloc.hpp b/lib/kokkos/core/src/Kokkos_hwloc.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..ff713c95239197e57b51fafe51d9a6b69bb1472e
--- /dev/null
+++ b/lib/kokkos/core/src/Kokkos_hwloc.hpp
@@ -0,0 +1,144 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+//
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact  H. Carter Edwards (hcedwar@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#ifndef KOKKOS_HWLOC_HPP
+#define KOKKOS_HWLOC_HPP
+
+#include <utility>
+
+namespace Kokkos {
+
+/** \brief  Minimal subset of logical 'hwloc' functionality available
+ *          from http://www.open-mpi.org/projects/hwloc/.
+ *
+ *  The calls are NOT thread safe in order to avoid mutexes,
+ *  memory allocations, or other actions which could give the
+ *  runtime system an opportunity to migrate the threads or
+ *  touch allocated memory during the function calls.
+ *
+ *  All calls to these functions should be performed by a thread
+ *  when it has guaranteed exclusive access; e.g., for OpenMP
+ *  within a 'critical' region.
+ */
+namespace hwloc {
+
+/** \brief  Query if hwloc is available. */
+bool available();
+
+/** \brief  Query the number of available NUMA regions.
+ *          This will be less than the hardware capacity
+ *          if the MPI process is pinned to a NUMA region.
+ */
+unsigned get_available_numa_count();
+
+/** \brief  Query the number of available cores per NUMA region.
+ *          This will be less than the hardware capacity
+ *          if the MPI process is pinned to a set of cores.
+ */
+unsigned get_available_cores_per_numa();
+
+/** \brief  Query the number of available "hard" threads per core; i.e., hyperthreads. */
+unsigned get_available_threads_per_core();
+
+} /* namespace hwloc */
+} /* namespace Kokkos */
+
+//----------------------------------------------------------------------------
+//----------------------------------------------------------------------------
+// Internal functions for binding persistent spawned threads.
+
+namespace Kokkos {
+namespace hwloc {
+
+/** \brief  Recommend mapping of threads onto cores.
+ *
+ * If thread_count == 0 then choose and set a value.
+ * If use_numa_count == 0 then choose and set a value.
+ * If use_cores_per_numa == 0 then choose and set a value.
+ *
+ * Return 0 if asynchronous,
+ * Return 1 if synchronous and threads_coord[0] is process core
+ *
+ * \param label               Text label; presumably used in diagnostics - TODO confirm.
+ * \param allow_async         Whether an asynchronous mapping (return value 0) may be chosen
+ *                            - presumably; confirm against the implementation.
+ * \param thread_count        In/out: requested thread count, or 0 to have one chosen.
+ * \param use_numa_count      In/out: NUMA regions to use, or 0 to have a value chosen.
+ * \param use_cores_per_numa  In/out: cores per NUMA region to use, or 0 to have a value chosen.
+ * \param threads_coord       Output array of per-thread coordinates; presumably
+ *                            (NUMA rank, core rank) with at least thread_count
+ *                            entries - TODO confirm.
+ */
+unsigned thread_mapping( const char * const label ,
+                         const bool allow_async ,
+                         unsigned & thread_count ,
+                         unsigned & use_numa_count ,
+                         unsigned & use_cores_per_numa ,
+                         std::pair<unsigned,unsigned> threads_coord[] );
+
+/** \brief  Query core-coordinate of the current thread
+ *          with respect to the core_topology.
+ *
+ *  As long as the thread is running within the
+ *  process binding the following condition holds.
+ *
+ *  core_coordinate.first  < core_topology.first
+ *  core_coordinate.second < core_topology.second
+ *
+ *  NOTE(review): 'core_topology' is not declared in this header; presumably
+ *  it denotes the pair (NUMA count, cores per NUMA) - confirm.
+ */
+std::pair<unsigned,unsigned> get_this_thread_coordinate();
+
+/** \brief  Bind the current thread to a core, given by its coordinate. */
+bool bind_this_thread( const std::pair<unsigned,unsigned> );
+
+
+/** \brief Can hwloc bind threads? */
+bool can_bind_threads();
+
+/** \brief  Bind the current thread to one of the cores in the list.
+ *          Set that entry to (~0,~0) and return the index.
+ *          If binding fails return ~0.
+ *
+ *  \param coordinate_count  Number of entries in coordinate[].
+ *  \param coordinate        In/out list of candidate core coordinates.
+ */
+unsigned bind_this_thread( const unsigned               coordinate_count ,
+                           std::pair<unsigned,unsigned> coordinate[] );
+
+/** \brief  Unbind the current thread back to the original process binding */
+bool unbind_this_thread();
+
+} /* namespace hwloc */
+} /* namespace Kokkos */
+
+//----------------------------------------------------------------------------
+//----------------------------------------------------------------------------
+
+#endif /* #ifndef KOKKOS_HWLOC_HPP */
+
diff --git a/lib/kokkos/core/src/Makefile b/lib/kokkos/core/src/Makefile
new file mode 100644
index 0000000000000000000000000000000000000000..dc27d341ac8ee4a40150bc93476b994666189739
--- /dev/null
+++ b/lib/kokkos/core/src/Makefile
@@ -0,0 +1,124 @@
+KOKKOS_PATH = ../..
+# Installation root; only set here when not already defined by the caller.
+PREFIX ?= /usr/local/lib/kokkos
+.PHONY: default messages build-makefile-kokkos build-lib mkdir copy-cuda copy-threads copy-qthread copy-openmp install clean
+default: messages build-lib
+	echo "End Build"
+# Shared Kokkos build settings; presumably the source of the KOKKOS_* and NVCC_WRAPPER variables used below.
+include $(KOKKOS_PATH)/Makefile.kokkos
+# With CUDA enabled, CXX and LINK are forced to NVCC_WRAPPER; otherwise g++ is only a default that the caller may override.
+ifeq ($(KOKKOS_INTERNAL_USE_CUDA), 1)
+	CXX = $(NVCC_WRAPPER)
+	CXXFLAGS ?= -O3
+	LINK = $(NVCC_WRAPPER)
+	LINKFLAGS ?= 
+else
+	CXX ?= g++
+	CXXFLAGS ?= -O3
+	LINK ?= g++
+	LINKFLAGS ?=  
+endif
+# Build directory; the generated Makefile.kokkos has its -L$(PWD) entries rewritten to the install lib directory.
+PWD = $(shell pwd)
+# Headers copied by 'install': KOKKOS_HEADERS_INCLUDE goes to $(PREFIX)/include, KOKKOS_HEADERS_INCLUDE_IMPL to $(PREFIX)/include/impl.
+KOKKOS_HEADERS_INCLUDE = $(wildcard $(KOKKOS_PATH)/core/src/*.hpp)
+KOKKOS_HEADERS_INCLUDE_IMPL = $(wildcard $(KOKKOS_PATH)/core/src/impl/*.hpp)
+KOKKOS_HEADERS_INCLUDE += $(wildcard $(KOKKOS_PATH)/containers/src/*.hpp)
+KOKKOS_HEADERS_INCLUDE_IMPL += $(wildcard $(KOKKOS_PATH)/containers/src/impl/*.hpp)
+KOKKOS_HEADERS_INCLUDE += $(wildcard $(KOKKOS_PATH)/algorithms/src/*.hpp)
+# Extra prerequisites of 'install'; one copy-<backend> rule is appended for each enabled backend.
+CONDITIONAL_COPIES =
+
+ifeq ($(KOKKOS_INTERNAL_USE_CUDA), 1)
+	KOKKOS_HEADERS_CUDA += $(wildcard $(KOKKOS_PATH)/core/src/Cuda/*.hpp)
+	CONDITIONAL_COPIES += copy-cuda
+endif
+
+ifeq ($(KOKKOS_INTERNAL_USE_PTHREADS), 1)
+	KOKKOS_HEADERS_THREADS += $(wildcard $(KOKKOS_PATH)/core/src/Threads/*.hpp)
+	CONDITIONAL_COPIES += copy-threads
+endif
+
+ifeq ($(KOKKOS_INTERNAL_USE_QTHREAD), 1)
+	KOKKOS_HEADERS_QTHREAD += $(wildcard $(KOKKOS_PATH)/core/src/Qthread/*.hpp)
+	CONDITIONAL_COPIES += copy-qthread
+endif
+
+ifeq ($(KOKKOS_INTERNAL_USE_OPENMP), 1)
+	KOKKOS_HEADERS_OPENMP += $(wildcard $(KOKKOS_PATH)/core/src/OpenMP/*.hpp)
+	CONDITIONAL_COPIES += copy-openmp
+endif
+# Announce the start of the build; first prerequisite of 'default'.
+messages: 
+	echo "Start Build"
+# Write a fresh Makefile.kokkos in the build directory recording the current settings, then use sed to rewrite source-tree and build-directory paths to locations under $(PREFIX).
+build-makefile-kokkos:
+	rm -f Makefile.kokkos
+	echo "#Global Settings used to generate this library" >> Makefile.kokkos
+	echo "KOKKOS_PATH = $(PREFIX)" >> Makefile.kokkos
+	echo "KOKKOS_DEVICES = $(KOKKOS_DEVICES)" >> Makefile.kokkos
+	echo "KOKKOS_ARCH = $(KOKKOS_ARCH)" >> Makefile.kokkos
+	echo "KOKKOS_DEBUG = $(KOKKOS_DEBUG)" >> Makefile.kokkos
+	echo "KOKKOS_USE_TPLS = $(KOKKOS_USE_TPLS)" >> Makefile.kokkos
+	echo "KOKKOS_CXX_STANDARD = $(KOKKOS_CXX_STANDARD)" >> Makefile.kokkos
+	echo "KOKKOS_OPTIONS = $(KOKKOS_OPTIONS)" >> Makefile.kokkos
+	echo "KOKKOS_CUDA_OPTIONS = $(KOKKOS_CUDA_OPTIONS)" >> Makefile.kokkos
+	echo "CXX ?= $(CXX)" >> Makefile.kokkos 
+	echo "NVCC_WRAPPER ?= $(PREFIX)/bin/nvcc_wrapper" >> Makefile.kokkos
+	echo "" >> Makefile.kokkos  
+	echo "#Source and Header files of Kokkos relative to KOKKOS_PATH" >> Makefile.kokkos
+	echo "KOKKOS_HEADERS = $(KOKKOS_HEADERS)" >> Makefile.kokkos
+	echo "KOKKOS_SRC = $(KOKKOS_SRC)" >> Makefile.kokkos
+	echo "" >> Makefile.kokkos  
+	echo "#Variables used in application Makefiles" >> Makefile.kokkos
+	echo "KOKKOS_CPP_DEPENDS = $(KOKKOS_CPP_DEPENDS)" >> Makefile.kokkos
+	echo "KOKKOS_CXXFLAGS = $(KOKKOS_CXXFLAGS)" >> Makefile.kokkos
+	echo "KOKKOS_CPPFLAGS = $(KOKKOS_CPPFLAGS)" >> Makefile.kokkos
+	echo "KOKKOS_LINK_DEPENDS  = $(KOKKOS_LINK_DEPENDS)" >> Makefile.kokkos
+	echo "KOKKOS_LIBS = $(KOKKOS_LIBS)" >> Makefile.kokkos
+	echo "KOKKOS_LDFLAGS = $(KOKKOS_LDFLAGS)" >> Makefile.kokkos
+	sed \
+		-e 's|$(KOKKOS_PATH)/core/src|$(PREFIX)/include|g' \
+		-e 's|$(KOKKOS_PATH)/containers/src|$(PREFIX)/include|g' \
+		-e 's|$(KOKKOS_PATH)/algorithms/src|$(PREFIX)/include|g' \
+		-e 's|-L$(PWD)|-L$(PREFIX)/lib|g' \
+		-e 's|= libkokkos.a|= $(PREFIX)/lib/libkokkos.a|g' \
+		-e 's|= KokkosCore_config.h|= $(PREFIX)/include/KokkosCore_config.h|g' Makefile.kokkos \
+		> Makefile.kokkos.tmp
+	mv -f Makefile.kokkos.tmp Makefile.kokkos
+# Aggregate target: generate Makefile.kokkos and build everything listed in KOKKOS_LINK_DEPENDS.
+build-lib: build-makefile-kokkos $(KOKKOS_LINK_DEPENDS)
+# Create the installation directory layout (bin, include, include/impl, lib) under $(PREFIX).
+mkdir: 
+	mkdir -p $(PREFIX)
+	mkdir -p $(PREFIX)/bin
+	mkdir -p $(PREFIX)/include
+	mkdir -p $(PREFIX)/lib
+	mkdir -p $(PREFIX)/include/impl
+# Per-backend header installation; each rule below is added to CONDITIONAL_COPIES only when its backend is enabled.
+copy-cuda: mkdir
+	mkdir -p $(PREFIX)/include/Cuda
+	cp $(KOKKOS_HEADERS_CUDA) $(PREFIX)/include/Cuda
+
+copy-threads: mkdir
+	mkdir -p $(PREFIX)/include/Threads
+	cp $(KOKKOS_HEADERS_THREADS) $(PREFIX)/include/Threads
+
+copy-qthread: mkdir
+	mkdir -p $(PREFIX)/include/Qthread
+	cp $(KOKKOS_HEADERS_QTHREAD) $(PREFIX)/include/Qthread
+
+copy-openmp: mkdir
+	mkdir -p $(PREFIX)/include/OpenMP
+	cp $(KOKKOS_HEADERS_OPENMP) $(PREFIX)/include/OpenMP
+# Copy the nvcc wrapper, headers, generated Makefile.kokkos, static library and config header into $(PREFIX).
+install: mkdir $(CONDITIONAL_COPIES) build-lib 
+	cp $(NVCC_WRAPPER) $(PREFIX)/bin
+	cp $(KOKKOS_HEADERS_INCLUDE) $(PREFIX)/include
+	cp $(KOKKOS_HEADERS_INCLUDE_IMPL) $(PREFIX)/include/impl
+	cp Makefile.kokkos $(PREFIX)
+	cp libkokkos.a $(PREFIX)/lib
+	cp KokkosCore_config.h $(PREFIX)/include
+# Run the kokkos-clean prerequisite (not defined in this Makefile), then delete the generated Makefile.kokkos.
+clean: kokkos-clean
+	rm -f Makefile.kokkos
diff --git a/lib/kokkos/core/src/OpenMP/Kokkos_OpenMP_Parallel.hpp b/lib/kokkos/core/src/OpenMP/Kokkos_OpenMP_Parallel.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..27ae5803cebef27646b16ef360d896ee919a9692
--- /dev/null
+++ b/lib/kokkos/core/src/OpenMP/Kokkos_OpenMP_Parallel.hpp
@@ -0,0 +1,750 @@
+/*
+//@HEADER
+// ************************************************************************
+// 
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+// 
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+// 
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact  H. Carter Edwards (hcedwar@sandia.gov)
+// 
+// ************************************************************************
+//@HEADER
+*/
+
+#ifndef KOKKOS_OPENMP_PARALLEL_HPP
+#define KOKKOS_OPENMP_PARALLEL_HPP
+
+#include <omp.h>
+#include <iostream>
+#include <Kokkos_Parallel.hpp>
+#include <OpenMP/Kokkos_OpenMPexec.hpp>
+#include <impl/Kokkos_FunctorAdapter.hpp>
+
+//----------------------------------------------------------------------------
+//----------------------------------------------------------------------------
+
+namespace Kokkos {
+namespace Impl {
+/** \brief  Kokkos::OpenMP back-end of parallel_for for a RangePolicy: inside an OpenMP parallel region every pool thread applies the functor to its share of the index range. */
+template< class FunctorType , class ... Traits >
+class ParallelFor< FunctorType
+                 , Kokkos::RangePolicy< Traits ... >
+                 , Kokkos::OpenMP 
+                 >
+{
+private:
+
+  typedef Kokkos::RangePolicy< Traits ...  > Policy ;
+  typedef typename Policy::work_tag     WorkTag ;
+  typedef typename Policy::WorkRange    WorkRange ;
+  typedef typename Policy::member_type  Member ;
+
+  const FunctorType m_functor ;
+  const Policy      m_policy ;
+  // Untagged overload (TagType is void): call functor(i) for every i in [ibeg,iend).
+  template< class TagType >
+  inline static
+  typename std::enable_if< std::is_same< TagType , void >::value >::type
+  exec_range( const FunctorType & functor
+            , const Member ibeg , const Member iend )
+    {
+      #ifdef KOKKOS_OPT_RANGE_AGGRESSIVE_VECTORIZATION
+      #ifdef KOKKOS_HAVE_PRAGMA_IVDEP
+      #pragma ivdep
+      #endif
+      #endif
+      for ( Member iwork = ibeg ; iwork < iend ; ++iwork ) {
+        functor( iwork );
+      }
+    }
+  // Tagged overload (TagType is not void): call functor(TagType{},i) for every i in [ibeg,iend).
+  template< class TagType >
+  inline static
+  typename std::enable_if< ! std::is_same< TagType , void >::value >::type
+  exec_range( const FunctorType & functor
+            , const Member ibeg , const Member iend )
+    {
+      const TagType t{} ;
+      #ifdef KOKKOS_OPT_RANGE_AGGRESSIVE_VECTORIZATION
+      #ifdef KOKKOS_HAVE_PRAGMA_IVDEP
+      #pragma ivdep
+      #endif
+      #endif
+      for ( Member iwork = ibeg ; iwork < iend ; ++iwork ) {
+        functor( t , iwork );
+      }
+    }
+
+public:
+  // Entry point: dispatch on the schedule type declared by the policy (the overloads below handle Kokkos::Static and Kokkos::Dynamic).
+  inline void execute() const {
+    this->template execute_schedule<typename Policy::schedule_type::type>();
+  }
+  // Static schedule: each thread executes the WorkRange built from the policy, its pool rank and the pool size.
+  template<class Schedule>
+  inline
+  typename std::enable_if< std::is_same<Schedule,Kokkos::Static>::value >::type
+    execute_schedule() const
+    {
+      OpenMPexec::verify_is_process("Kokkos::OpenMP parallel_for");
+      OpenMPexec::verify_initialized("Kokkos::OpenMP parallel_for");
+
+#pragma omp parallel
+      {
+        OpenMPexec & exec = * OpenMPexec::get_thread_omp();
+
+        const WorkRange range( m_policy, exec.pool_rank(), exec.pool_size() );
+
+        ParallelFor::template exec_range< WorkTag >( m_functor , range.begin() , range.end() );
+      }
+/* END #pragma omp parallel */
+    }
+  // Dynamic schedule: each thread hands its WorkRange and the chunk size to its OpenMPexec, then runs chunk indices from get_work_index() until it returns -1; chunk k spans [k*chunk_size, min((k+1)*chunk_size, m_policy.end())).
+  template<class Schedule>
+  inline
+  typename std::enable_if< std::is_same<Schedule,Kokkos::Dynamic>::value >::type
+    execute_schedule() const
+    {
+      OpenMPexec::verify_is_process("Kokkos::OpenMP parallel_for");
+      OpenMPexec::verify_initialized("Kokkos::OpenMP parallel_for");
+
+#pragma omp parallel
+      {
+        OpenMPexec & exec = * OpenMPexec::get_thread_omp();
+
+        const WorkRange range( m_policy, exec.pool_rank(), exec.pool_size() );
+
+        exec.set_work_range(range.begin(),range.end(),m_policy.chunk_size());
+        exec.reset_steal_target();
+        #pragma omp barrier
+        // NOTE(review): the barrier presumably ensures every thread has set its work range before any chunk is fetched; confirm in OpenMPexec.
+        long work_index = exec.get_work_index();
+
+        while(work_index != -1) {
+          const Member begin = static_cast<Member>(work_index) * m_policy.chunk_size();
+          const Member end = begin + m_policy.chunk_size() < m_policy.end()?begin+m_policy.chunk_size():m_policy.end();
+          ParallelFor::template exec_range< WorkTag >( m_functor , begin, end );
+          work_index = exec.get_work_index();
+        }
+
+      }
+/* END #pragma omp parallel */
+    }
+  // Stores copies of the functor and the policy; nothing runs until execute() is called.
+  inline
+  ParallelFor( const FunctorType & arg_functor
+             , Policy arg_policy )
+    : m_functor( arg_functor )
+    , m_policy(  arg_policy )
+    {}
+};
+
+} // namespace Impl
+} // namespace Kokkos
+
+//----------------------------------------------------------------------------
+//----------------------------------------------------------------------------
+
+namespace Kokkos {
+namespace Impl {
+
+/** \brief  Kokkos::OpenMP back-end of parallel_reduce for a RangePolicy.
+ *  Every pool thread reduces its share of the range into its own scratch buffer; the calling thread then joins the per-thread values serially.
+ */
+template< class FunctorType , class ReducerType, class ... Traits >
+class ParallelReduce< FunctorType
+                    , Kokkos::RangePolicy< Traits ...>
+                    , ReducerType
+                    , Kokkos::OpenMP
+                    >
+{
+private:
+
+  typedef Kokkos::RangePolicy< Traits ... > Policy ;
+
+  typedef typename Policy::work_tag     WorkTag ;
+  typedef typename Policy::WorkRange    WorkRange ;
+  typedef typename Policy::member_type  Member ;
+  // Value traits, init, join and final are taken from the functor when ReducerType is InvalidType, otherwise from the reducer.
+  typedef Kokkos::Impl::if_c< std::is_same<InvalidType,ReducerType>::value, FunctorType, ReducerType> ReducerConditional;
+  typedef typename ReducerConditional::type ReducerTypeFwd;
+
+  // Static Assert WorkTag void if ReducerType not InvalidType
+
+  typedef Kokkos::Impl::FunctorValueTraits< ReducerTypeFwd, WorkTag > ValueTraits ;
+  typedef Kokkos::Impl::FunctorValueInit<   ReducerTypeFwd, WorkTag > ValueInit ;
+  typedef Kokkos::Impl::FunctorValueJoin<   ReducerTypeFwd, WorkTag > ValueJoin ;
+
+  typedef typename ValueTraits::pointer_type    pointer_type ;
+  typedef typename ValueTraits::reference_type  reference_type ;
+
+  const FunctorType   m_functor ;
+  const Policy        m_policy ;
+  const ReducerType   m_reducer ;
+  const pointer_type  m_result_ptr ;
+
+  // Untagged overload (TagType is void): call functor(i,update) for every i in [ibeg,iend).
+  template< class TagType >
+  inline static
+  typename std::enable_if< std::is_same< TagType , void >::value >::type
+  exec_range( const FunctorType & functor
+            , const Member ibeg , const Member iend
+            , reference_type update )
+    {
+      #ifdef KOKKOS_OPT_RANGE_AGGRESSIVE_VECTORIZATION
+      #ifdef KOKKOS_HAVE_PRAGMA_IVDEP
+      #pragma ivdep
+      #endif
+      #endif
+      for ( Member iwork = ibeg ; iwork < iend ; ++iwork ) {
+        functor( iwork , update );
+      }
+    }
+  // Tagged overload (TagType is not void): call functor(TagType{},i,update) for every i in [ibeg,iend).
+  template< class TagType >
+  inline static
+  typename std::enable_if< ! std::is_same< TagType , void >::value >::type
+  exec_range( const FunctorType & functor
+            , const Member ibeg , const Member iend
+            , reference_type update )
+    {
+      const TagType t{} ;
+      #ifdef KOKKOS_OPT_RANGE_AGGRESSIVE_VECTORIZATION
+      #ifdef KOKKOS_HAVE_PRAGMA_IVDEP
+      #pragma ivdep
+      #endif
+      #endif
+      for ( Member iwork = ibeg ; iwork < iend ; ++iwork ) {
+        functor( t , iwork , update );
+      }
+    }
+
+  /** Serial tail shared by both schedules: join the scratch value of every
+   *  pool_rev(i), i >= 1, into the scratch buffer of pool_rev(0), apply the final
+   *  operation and, when a result pointer was supplied, copy value_count entries out.
+   */
+  inline void join_and_store_result() const
+    {
+      const pointer_type ptr = pointer_type( OpenMPexec::pool_rev(0)->scratch_reduce() );
+
+      for ( int i = 1 ; i < OpenMPexec::pool_size() ; ++i ) {
+        ValueJoin::join( ReducerConditional::select(m_functor , m_reducer) , ptr , OpenMPexec::pool_rev(i)->scratch_reduce() );
+      }
+
+      Kokkos::Impl::FunctorFinal<  ReducerTypeFwd , WorkTag >::final( ReducerConditional::select(m_functor , m_reducer) , ptr );
+
+      if ( m_result_ptr ) {
+        const int n = ValueTraits::value_count( ReducerConditional::select(m_functor , m_reducer) );
+
+        for ( int j = 0 ; j < n ; ++j ) { m_result_ptr[j] = ptr[j] ; }
+      }
+    }
+
+public:
+
+  // Entry point: dispatch on the schedule type declared by the policy (the overloads below handle Kokkos::Static and Kokkos::Dynamic).
+  inline void execute() const {
+    this->template execute_schedule<typename Policy::schedule_type::type>();
+  }
+  // Static schedule: each thread initialises its scratch value and reduces the WorkRange built from its pool rank and the pool size.
+  template<class Schedule>
+  inline
+  typename std::enable_if< std::is_same<Schedule,Kokkos::Static>::value >::type
+    execute_schedule() const
+    {
+      OpenMPexec::verify_is_process("Kokkos::OpenMP parallel_reduce");
+      OpenMPexec::verify_initialized("Kokkos::OpenMP parallel_reduce");
+
+      OpenMPexec::resize_scratch( ValueTraits::value_size( ReducerConditional::select(m_functor , m_reducer) ) , 0 );
+
+#pragma omp parallel
+      {
+        OpenMPexec & exec = * OpenMPexec::get_thread_omp();
+        const WorkRange range( m_policy, exec.pool_rank(), exec.pool_size() );
+        ParallelReduce::template exec_range< WorkTag >
+          ( m_functor , range.begin() , range.end()
+          , ValueInit::init( ReducerConditional::select(m_functor , m_reducer), exec.scratch_reduce() ) );
+      }
+/* END #pragma omp parallel */
+
+      // Reduction: serial join, final and result copy on the calling thread.
+      join_and_store_result();
+    }
+  // Dynamic schedule: each thread registers its WorkRange and chunk size, then reduces chunk indices from get_work_index() into its scratch value until -1 is returned.
+  template<class Schedule>
+  inline
+  typename std::enable_if< std::is_same<Schedule,Kokkos::Dynamic>::value >::type
+    execute_schedule() const
+    {
+      OpenMPexec::verify_is_process("Kokkos::OpenMP parallel_reduce");
+      OpenMPexec::verify_initialized("Kokkos::OpenMP parallel_reduce");
+
+      OpenMPexec::resize_scratch( ValueTraits::value_size( ReducerConditional::select(m_functor , m_reducer) ) , 0 );
+
+#pragma omp parallel
+      {
+        OpenMPexec & exec = * OpenMPexec::get_thread_omp();
+        const WorkRange range( m_policy, exec.pool_rank(), exec.pool_size() );
+
+        exec.set_work_range(range.begin(),range.end(),m_policy.chunk_size());
+        exec.reset_steal_target();
+        #pragma omp barrier
+
+        long work_index = exec.get_work_index();
+
+        reference_type update = ValueInit::init( ReducerConditional::select(m_functor , m_reducer) , exec.scratch_reduce() );
+        while(work_index != -1) {
+          const Member begin = static_cast<Member>(work_index) * m_policy.chunk_size();
+          const Member end = begin + m_policy.chunk_size() < m_policy.end()?begin+m_policy.chunk_size():m_policy.end();
+          ParallelReduce::template exec_range< WorkTag >
+            ( m_functor , begin,end
+            , update );
+          work_index = exec.get_work_index();
+        }
+      }
+/* END #pragma omp parallel */
+
+      // Reduction: serial join, final and result copy on the calling thread.
+      join_and_store_result();
+    }
+
+  //----------------------------------------
+
+  template< class ViewType >
+  inline
+  ParallelReduce( const FunctorType & arg_functor
+                , Policy       arg_policy
+                , const ViewType    & arg_result_view
+                , typename std::enable_if<
+                           Kokkos::is_view< ViewType >::value &&
+                           !Kokkos::is_reducer_type<ReducerType>::value
+                  ,void*>::type = nullptr)
+    : m_functor( arg_functor )
+    , m_policy(  arg_policy )
+    , m_reducer( InvalidType() )
+    , m_result_ptr(  arg_result_view.data() )
+    {
+      /*static_assert( std::is_same< typename ViewType::memory_space
+                                      , Kokkos::HostSpace >::value
+        , "Reduction result on Kokkos::OpenMP must be a Kokkos::View in HostSpace" );*/
+    }
+
+  inline
+  ParallelReduce( const FunctorType & arg_functor
+                , Policy       arg_policy
+                , const ReducerType& reducer )
+    : m_functor( arg_functor )
+    , m_policy(  arg_policy )
+    , m_reducer( reducer )
+    , m_result_ptr(  reducer.result_view().data() )
+    {
+      /*static_assert( std::is_same< typename ViewType::memory_space
+                                      , Kokkos::HostSpace >::value
+        , "Reduction result on Kokkos::OpenMP must be a Kokkos::View in HostSpace" );*/
+    }
+
+};
+
+} // namespace Impl
+} // namespace Kokkos
+
+//----------------------------------------------------------------------------
+//----------------------------------------------------------------------------
+
+namespace Kokkos {
+namespace Impl {
+/** \brief  Kokkos::OpenMP back-end of parallel_scan for a RangePolicy: a non-final parallel pass, a serial pass combining per-thread values, then a final parallel pass. */
+template< class FunctorType , class ... Traits >
+class ParallelScan< FunctorType
+                  , Kokkos::RangePolicy< Traits ... >
+                  , Kokkos::OpenMP
+                  >
+{
+private:
+
+  typedef Kokkos::RangePolicy< Traits ... > Policy ;
+
+  typedef typename Policy::work_tag     WorkTag ;
+  typedef typename Policy::WorkRange    WorkRange ;
+  typedef typename Policy::member_type  Member ;
+
+  typedef Kokkos::Impl::FunctorValueTraits< FunctorType, WorkTag > ValueTraits ;
+  typedef Kokkos::Impl::FunctorValueInit<   FunctorType, WorkTag > ValueInit ;
+  typedef Kokkos::Impl::FunctorValueJoin<   FunctorType, WorkTag > ValueJoin ;
+  typedef Kokkos::Impl::FunctorValueOps<    FunctorType, WorkTag > ValueOps ;
+
+  typedef typename ValueTraits::pointer_type    pointer_type ;
+  typedef typename ValueTraits::reference_type  reference_type ;
+
+  const FunctorType   m_functor ;
+  const Policy        m_policy ;
+  // Untagged overload (TagType is void): call functor(i,update,final) for every i in [ibeg,iend).
+  template< class TagType >
+  inline static
+  typename std::enable_if< std::is_same< TagType , void >::value >::type
+  exec_range( const FunctorType & functor
+            , const Member ibeg , const Member iend
+            , reference_type update , const bool final )
+    {
+      #ifdef KOKKOS_OPT_RANGE_AGGRESSIVE_VECTORIZATION
+      #ifdef KOKKOS_HAVE_PRAGMA_IVDEP
+      #pragma ivdep
+      #endif
+      #endif
+      for ( Member iwork = ibeg ; iwork < iend ; ++iwork ) {
+        functor( iwork , update , final );
+      }
+    }
+  // Tagged overload (TagType is not void): call functor(TagType{},i,update,final) for every i in [ibeg,iend).
+  template< class TagType >
+  inline static
+  typename std::enable_if< ! std::is_same< TagType , void >::value >::type
+  exec_range( const FunctorType & functor
+            , const Member ibeg , const Member iend
+            , reference_type update , const bool final )
+    {
+      const TagType t{} ;
+      #ifdef KOKKOS_OPT_RANGE_AGGRESSIVE_VECTORIZATION
+      #ifdef KOKKOS_HAVE_PRAGMA_IVDEP
+      #pragma ivdep
+      #endif
+      #endif
+      for ( Member iwork = ibeg ; iwork < iend ; ++iwork ) {
+        functor( t , iwork , update , final );
+      }
+    }
+
+public:
+  // Run the scan: pass 1 (final == false) accumulates per-thread totals, a serial pass turns them into per-thread starting values, pass 2 (final == true) reruns the ranges from those values.
+  inline
+  void execute() const
+    {
+      OpenMPexec::verify_is_process("Kokkos::OpenMP parallel_scan");
+      OpenMPexec::verify_initialized("Kokkos::OpenMP parallel_scan");
+      // Two value slots per thread: slot 0 at scratch_reduce(), slot 1 at scratch_reduce() + value_count.
+      OpenMPexec::resize_scratch( 2 * ValueTraits::value_size( m_functor ) , 0 );
+
+#pragma omp parallel
+      {
+        OpenMPexec & exec = * OpenMPexec::get_thread_omp();
+        const WorkRange range( m_policy, exec.pool_rank(), exec.pool_size() );
+        const pointer_type ptr =
+          pointer_type( exec.scratch_reduce() ) +
+          ValueTraits::value_count( m_functor );
+        ParallelScan::template exec_range< WorkTag >
+          ( m_functor , range.begin() , range.end()
+          , ValueInit::init( m_functor , ptr ) , false );
+      }
+/* END #pragma omp parallel */
+      // Serial pass, visiting pool_rev(thread_count-1) down to pool_rev(0): slot 0 receives the previously visited thread's slot 1 (or an init value for the first one visited) and is then joined into this thread's slot 1.
+      {
+        const unsigned thread_count = OpenMPexec::pool_size();
+        const unsigned value_count  = ValueTraits::value_count( m_functor );
+
+        pointer_type ptr_prev = 0 ;
+
+        for ( unsigned rank_rev = thread_count ; rank_rev-- ; ) {
+
+          pointer_type ptr = pointer_type( OpenMPexec::pool_rev(rank_rev)->scratch_reduce() );
+
+          if ( ptr_prev ) {
+            for ( unsigned i = 0 ; i < value_count ; ++i ) { ptr[i] = ptr_prev[ i + value_count ] ; }
+            ValueJoin::join( m_functor , ptr + value_count , ptr );
+          }
+          else {
+            ValueInit::init( m_functor , ptr );
+          }
+
+          ptr_prev = ptr ;
+        }
+      }
+      // Pass 2 (final == true): each thread reruns its range with slot 0 of its scratch buffer as the update value.
+#pragma omp parallel
+      {
+        OpenMPexec & exec = * OpenMPexec::get_thread_omp();
+        const WorkRange range( m_policy, exec.pool_rank(), exec.pool_size() );
+        const pointer_type ptr = pointer_type( exec.scratch_reduce() );
+        ParallelScan::template exec_range< WorkTag >
+          ( m_functor , range.begin() , range.end()
+          , ValueOps::reference( ptr ) , true );
+      }
+/* END #pragma omp parallel */
+    }
+
+  //----------------------------------------
+  // Stores copies of the functor and the policy; nothing runs until execute() is called.
+  inline
+  ParallelScan( const FunctorType & arg_functor
+              , const Policy      & arg_policy )
+    : m_functor( arg_functor )
+    , m_policy(  arg_policy )
+  {}
+
+  //----------------------------------------
+};
+
+} // namespace Impl
+} // namespace Kokkos
+
+//----------------------------------------------------------------------------
+//----------------------------------------------------------------------------
+
+namespace Kokkos {
+namespace Impl {
+/** \brief  Kokkos::OpenMP back-end of parallel_for for a TeamPolicy: inside an OpenMP parallel region every thread builds a team Member and calls the functor for each league entry that Member yields. */
+template< class FunctorType , class ... Properties >
+class ParallelFor< FunctorType
+                 , Kokkos::TeamPolicy< Properties ... >
+                 , Kokkos::OpenMP
+                 >
+{
+private:
+
+  typedef Kokkos::Impl::TeamPolicyInternal< Kokkos::OpenMP, Properties ... > Policy ;
+  typedef typename Policy::work_tag     WorkTag ;
+  typedef typename Policy::member_type  Member ;
+
+  const FunctorType  m_functor ;
+  const Policy       m_policy ;
+  const int          m_shmem_size ;
+  // Static schedule, untagged: call functor(member) while member.valid_static(), advancing with next_static().
+  template< class TagType, class Schedule >
+  inline static
+  typename std::enable_if< std::is_same< TagType , void >::value && std::is_same<Schedule,Kokkos::Static>::value>::type
+  exec_team( const FunctorType & functor , Member member )
+    {
+      for ( ; member.valid_static() ; member.next_static() ) {
+        functor( member );
+      }
+    }
+  // Static schedule, tagged: same iteration, calling functor(TagType{},member).
+  template< class TagType, class Schedule >
+  inline static
+  typename std::enable_if< (! std::is_same< TagType , void >::value) && std::is_same<Schedule,Kokkos::Static>::value >::type
+  exec_team( const FunctorType & functor , Member member )
+    {
+      const TagType t{} ;
+      for ( ; member.valid_static() ; member.next_static() ) {
+        functor( t , member );
+      }
+    }
+  // Dynamic schedule, untagged: after an OpenMP barrier, call functor(member) while member.valid_dynamic(), advancing with next_dynamic().
+  template< class TagType, class Schedule >
+  inline static
+  typename std::enable_if< std::is_same< TagType , void >::value && std::is_same<Schedule,Kokkos::Dynamic>::value>::type
+  exec_team( const FunctorType & functor , Member member )
+    {
+      #pragma omp barrier
+      for ( ; member.valid_dynamic() ; member.next_dynamic() ) {
+        functor( member );
+      }
+    }
+  // Dynamic schedule, tagged: same as above, calling functor(TagType{},member).
+  template< class TagType, class Schedule >
+  inline static
+  typename std::enable_if< (! std::is_same< TagType , void >::value) && std::is_same<Schedule,Kokkos::Dynamic>::value >::type
+  exec_team( const FunctorType & functor , Member member )
+    {
+      #pragma omp barrier
+      const TagType t{} ;
+      for ( ; member.valid_dynamic() ; member.next_dynamic() ) {
+        functor( t , member );
+      }
+    }
+
+public:
+  // Entry point: size the scratch space, then run exec_team for the policy's work tag and schedule on every thread.
+  inline
+  void execute() const
+    {
+      OpenMPexec::verify_is_process("Kokkos::OpenMP parallel_for");
+      OpenMPexec::verify_initialized("Kokkos::OpenMP parallel_for");
+
+      const size_t team_reduce_size = Policy::member_type::team_reduce_size();
+      // NOTE(review): m_shmem_size already contains scratch_size(1) (see the constructor), so it is counted twice here; confirm whether that is intentional.
+      OpenMPexec::resize_scratch( 0 , team_reduce_size + m_shmem_size + m_policy.scratch_size(1));
+
+#pragma omp parallel
+      {
+        ParallelFor::template exec_team< WorkTag, typename Policy::schedule_type::type>
+          ( m_functor
+          , Member( * OpenMPexec::get_thread_omp(), m_policy, m_shmem_size, 0) );
+      }
+/* END #pragma omp parallel */
+    }
+  // Stores the functor and policy; m_shmem_size = scratch_size(0) + scratch_size(1) + the functor's team shared-memory size for the policy's team size.
+  inline
+  ParallelFor( const FunctorType & arg_functor ,
+               const Policy      & arg_policy )
+    : m_functor( arg_functor )
+    , m_policy(  arg_policy )
+    , m_shmem_size( arg_policy.scratch_size(0) + arg_policy.scratch_size(1) + FunctorTeamShmemSize< FunctorType >::value( arg_functor , arg_policy.team_size() ) )
+    {}
+};
+
+/** \brief  Kokkos::OpenMP back-end of parallel_reduce for a TeamPolicy; exec_team always iterates with valid_static()/next_static(), no Dynamic-schedule variant is provided here. */
+template< class FunctorType , class ReducerType, class ... Properties >
+class ParallelReduce< FunctorType
+                    , Kokkos::TeamPolicy< Properties ... >
+                    , ReducerType
+                    , Kokkos::OpenMP
+                    >
+{
+private:
+
+  typedef Kokkos::Impl::TeamPolicyInternal< Kokkos::OpenMP, Properties ... >         Policy ;
+
+  typedef typename Policy::work_tag     WorkTag ;
+  typedef typename Policy::member_type  Member ;
+
+  typedef Kokkos::Impl::if_c< std::is_same<InvalidType,ReducerType>::value, FunctorType, ReducerType> ReducerConditional;
+  typedef typename ReducerConditional::type ReducerTypeFwd;
+
+  typedef Kokkos::Impl::FunctorValueTraits< ReducerTypeFwd , WorkTag >  ValueTraits ;
+  typedef Kokkos::Impl::FunctorValueInit<   ReducerTypeFwd , WorkTag >  ValueInit ;
+  typedef Kokkos::Impl::FunctorValueJoin<   ReducerTypeFwd , WorkTag >  ValueJoin ;
+
+  typedef typename ValueTraits::pointer_type    pointer_type ;
+  typedef typename ValueTraits::reference_type  reference_type ;
+
+  const FunctorType  m_functor ;
+  const Policy       m_policy ;
+  const ReducerType  m_reducer ;
+  const pointer_type m_result_ptr ;
+  const int          m_shmem_size ;
+  // Untagged overload (TagType is void): call functor(member,update) while member.valid_static(), advancing with next_static().
+  template< class TagType >
+  inline static
+  typename std::enable_if< std::is_same< TagType , void >::value >::type
+  exec_team( const FunctorType & functor , Member member , reference_type update )
+    {
+      for ( ; member.valid_static() ; member.next_static() ) {
+        functor( member , update );
+      }
+    }
+  // Tagged overload (TagType is not void): same iteration, calling functor(TagType{},member,update).
+  template< class TagType >
+  inline static
+  typename std::enable_if< ! std::is_same< TagType , void >::value >::type
+  exec_team( const FunctorType & functor , Member member , reference_type update )
+    {
+      const TagType t{} ;
+      for ( ; member.valid_static() ; member.next_static() ) {
+        functor( t , member , update );
+      }
+    }
+
+public:
+  // Entry point: every thread reduces into its own scratch value in parallel; the calling thread then joins, finalises and copies out the result.
+  inline
+  void execute() const
+    {
+      OpenMPexec::verify_is_process("Kokkos::OpenMP parallel_reduce");
+      OpenMPexec::verify_initialized("Kokkos::OpenMP parallel_reduce");
+      const size_t team_reduce_size = Policy::member_type::team_reduce_size();
+
+      OpenMPexec::resize_scratch( ValueTraits::value_size( ReducerConditional::select(m_functor , m_reducer) ) , team_reduce_size + m_shmem_size );
+
+#pragma omp parallel
+      {
+        OpenMPexec & exec = * OpenMPexec::get_thread_omp();
+
+        ParallelReduce::template exec_team< WorkTag >
+          ( m_functor
+          , Member( exec , m_policy , m_shmem_size, 0 )
+          , ValueInit::init( ReducerConditional::select(m_functor , m_reducer) , exec.scratch_reduce() ) );
+      }
+/* END #pragma omp parallel */
+      // Serial join into pool_rev(0)'s scratch value, limited to min(pool_size, league_size*team_size) per-thread values.
+      {
+        const pointer_type ptr = pointer_type( OpenMPexec::pool_rev(0)->scratch_reduce() );
+
+        int max_active_threads = OpenMPexec::pool_size();
+        if( max_active_threads > m_policy.league_size()* m_policy.team_size() )
+          max_active_threads = m_policy.league_size()* m_policy.team_size();
+
+        for ( int i = 1 ; i < max_active_threads ; ++i ) {
+          ValueJoin::join( ReducerConditional::select(m_functor , m_reducer) , ptr , OpenMPexec::pool_rev(i)->scratch_reduce() );
+        }
+
+        Kokkos::Impl::FunctorFinal< ReducerTypeFwd , WorkTag >::final( ReducerConditional::select(m_functor , m_reducer) , ptr );
+
+        if ( m_result_ptr ) {
+          const int n = ValueTraits::value_count( ReducerConditional::select(m_functor , m_reducer) );
+
+          for ( int j = 0 ; j < n ; ++j ) { m_result_ptr[j] = ptr[j] ; }
+        }
+      }
+    }
+  // View-result constructor (enabled only when ReducerType is not a reducer): the result is written through arg_result.ptr_on_device().
+  template< class ViewType >
+  inline
+  ParallelReduce( const FunctorType  & arg_functor ,
+                  const Policy       & arg_policy ,
+                  const ViewType     & arg_result ,
+                  typename std::enable_if<
+                    Kokkos::is_view< ViewType >::value &&
+                    !Kokkos::is_reducer_type<ReducerType>::value
+                    ,void*>::type = nullptr)
+    : m_functor( arg_functor )
+    , m_policy(  arg_policy )
+    , m_reducer( InvalidType() )
+    , m_result_ptr( arg_result.ptr_on_device() )
+    , m_shmem_size( arg_policy.scratch_size(0) + arg_policy.scratch_size(1) + FunctorTeamShmemSize< FunctorType >::value( arg_functor , arg_policy.team_size() ) )
+    {}
+  // Reducer constructor: the result is written through reducer.result_view().data().
+  inline
+  ParallelReduce( const FunctorType & arg_functor
+    , Policy       arg_policy
+    , const ReducerType& reducer )
+  : m_functor( arg_functor )
+  , m_policy(  arg_policy )
+  , m_reducer( reducer )
+  , m_result_ptr(  reducer.result_view().data() )
+  , m_shmem_size( arg_policy.scratch_size(0) + arg_policy.scratch_size(1) + FunctorTeamShmemSize< FunctorType >::value( arg_functor , arg_policy.team_size() ) )
+  {
+  /*static_assert( std::is_same< typename ViewType::memory_space
+                          , Kokkos::HostSpace >::value
+  , "Reduction result on Kokkos::OpenMP must be a Kokkos::View in HostSpace" );*/
+  }
+
+};
+
+} // namespace Impl
+} // namespace Kokkos
+
+//----------------------------------------------------------------------------
+//----------------------------------------------------------------------------
+
+#endif /* KOKKOS_OPENMP_PARALLEL_HPP */
+
diff --git a/lib/kokkos/core/src/OpenMP/Kokkos_OpenMP_Task.cpp b/lib/kokkos/core/src/OpenMP/Kokkos_OpenMP_Task.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..3e22033f7c058dc6c084c445685c80beb8620da8
--- /dev/null
+++ b/lib/kokkos/core/src/OpenMP/Kokkos_OpenMP_Task.cpp
@@ -0,0 +1,329 @@
+/*
+//@HEADER
+// ************************************************************************
+// 
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+// 
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+// 
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact  H. Carter Edwards (hcedwar@sandia.gov)
+// 
+// ************************************************************************
+//@HEADER
+*/
+
+#include <Kokkos_Core.hpp>
+
+#if defined( KOKKOS_HAVE_OPENMP ) && defined( KOKKOS_ENABLE_TASKPOLICY )
+
+#include <impl/Kokkos_TaskQueue_impl.hpp>
+
+//----------------------------------------------------------------------------
+//----------------------------------------------------------------------------
+
+namespace Kokkos {
+namespace Impl {
+
+template class TaskQueue< Kokkos::OpenMP > ;
+
+//----------------------------------------------------------------------------
+
+TaskExec< Kokkos::OpenMP >::
+TaskExec()                // single-thread member: not attached to any pool data
+  : m_self_exec( 0 )      // no per-thread pool entry
+  , m_team_exec( 0 )      // no team entry, so team_shared() yields a null pointer
+  , m_sync_mask( 0 )
+  , m_sync_value( 0 )
+  , m_sync_step( 0 )
+  , m_group_rank( 0 )
+  , m_team_rank( 0 )
+  , m_team_size( 1 )      // team_barrier() does nothing unless the team size exceeds 1
+{
+}
+
+TaskExec< Kokkos::OpenMP >::
+TaskExec( Kokkos::Impl::OpenMPexec & arg_exec , int const arg_team_size )
+  : m_self_exec( & arg_exec )
+  , m_team_exec( arg_exec.pool_rev(arg_exec.pool_rank_rev() / arg_team_size) )
+  , m_sync_mask( 0 )
+  , m_sync_value( 0 )
+  , m_sync_step( 0 )
+  , m_group_rank( arg_exec.pool_rank_rev() / arg_team_size )
+  , m_team_rank(  arg_exec.pool_rank_rev() % arg_team_size )
+  , m_team_size(  arg_team_size )
+{
+  // The team is described as spanning
+  //    m_self_exec->pool_rev( team_size * group_rank ) through
+  //    m_self_exec->pool_rev( team_size * ( group_rank + 1 ) - 1 )
+
+  // Zero the two barrier words held in this thread's reduce scratch.
+  int64_t volatile * const barrier_words = (int64_t *) m_self_exec->scratch_reduce();
+  for ( int w = 0 ; w < 2 ; ++w ) { barrier_words[w] = int64_t(0) ; }
+
+  // One byte per team member: 0x01 seeds the expected value, 0x03 the toggle mask.
+  for ( int shift = 0 ; shift < 8 * m_team_size ; shift += 8 ) {
+    m_sync_value |= int64_t(1) << shift ;
+    m_sync_mask  |= int64_t(3) << shift ;
+  }
+
+  Kokkos::memory_fence();
+}
+
+#if defined( KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST )
+
+void TaskExec< Kokkos::OpenMP >::team_barrier_impl() const
+{
+  if ( m_team_exec->scratch_reduce_size() < int(2 * sizeof(int64_t)) ) {
+    Kokkos::abort("TaskQueue<OpenMP> scratch_reduce memory too small");
+  }
+
+  // Use the team's reduce scratch to synchronize: two int64_t words, with
+  // the low bit of m_sync_step choosing the word.  Alternating words between
+  // barriers avoids a sequence of barriers overtaking one another.
+
+  int64_t volatile * const sync =
+    ((int64_t *) m_team_exec->scratch_reduce()) + ( m_sync_step & 0x01 );
+
+  // This team member sets one byte, at offset m_team_rank, within the sync word
+  int8_t volatile * const sync_self =
+   ((int8_t *) sync) + m_team_rank ;
+
+#if 0
+fprintf( stdout
+       , "barrier group(%d) member(%d) step(%d) wait(%lx) : before(%lx)\n"
+       , m_group_rank
+       , m_team_rank
+       , m_sync_step
+       , m_sync_value
+       , *sync
+       );
+fflush(stdout);
+#endif
+
+  *sync_self = int8_t( m_sync_value & 0x03 ); // signal arrival
+
+  while ( m_sync_value != *sync ); // spin until the whole word equals the expected value
+
+#if 0
+fprintf( stdout
+       , "barrier group(%d) member(%d) step(%d) wait(%lx) : after(%lx)\n"
+       , m_group_rank
+       , m_team_rank
+       , m_sync_step
+       , m_sync_value
+       , *sync
+       );
+fflush(stdout);
+#endif
+
+  ++m_sync_step ;
+
+  if ( 0 == ( 0x01 & m_sync_step ) ) { // Every other step, i.e. after both words were used
+    m_sync_value ^= m_sync_mask ;      // flip the expected per-member byte value
+    if ( 1000 < m_sync_step ) m_sync_step = 0 ; // keep the step counter bounded
+  }
+}
+
+#endif
+
+//----------------------------------------------------------------------------
+
+void TaskQueueSpecialization< Kokkos::OpenMP >::execute
+  ( TaskQueue< Kokkos::OpenMP > * const queue )
+{
+  using execution_space = Kokkos::OpenMP ;
+  using queue_type      = TaskQueue< execution_space > ;
+  using task_root_type  = TaskBase< execution_space , void , void > ;
+  using PoolExec        = Kokkos::Impl::OpenMPexec ;
+  using Member          = TaskExec< execution_space > ;
+
+  task_root_type * const end = (task_root_type *) task_root_type::EndTag ;
+
+  // Required:  team_size <= 8 (the team barrier uses one byte of an int64_t per member)
+
+  const int team_size = PoolExec::pool_size(2); // Threads per core
+  // const int team_size = PoolExec::pool_size(1); // Threads per NUMA
+
+  if ( 8 < team_size ) {
+    Kokkos::abort("TaskQueue<OpenMP> unsupported team size");
+  }
+
+#pragma omp parallel
+  {
+    PoolExec & self = *PoolExec::get_thread_omp();
+
+    Member single_exec ;                  // team-of-one member handed to single thread tasks
+    Member team_exec( self , team_size ); // this thread's membership in its team
+
+    // Team shared memory: one task pointer slot in the team's thread scratch
+    task_root_type * volatile * const task_shared =
+      (task_root_type **) team_exec.m_team_exec->scratch_thread();
+
+// Barrier across entire OpenMP thread pool to ensure initialization
+#pragma omp barrier
+
+    // Loop until all queues are empty and no tasks in flight
+
+    do {
+
+      task_root_type * task = 0 ;
+
+      // Each team lead attempts to acquire either a thread team task
+      // or a single thread task for the team.
+
+      if ( 0 == team_exec.team_rank() ) {
+
+        task = 0 < *((volatile int *) & queue->m_ready_count) ? end : 0 ;
+
+        // Loop by priority and then type
+        for ( int i = 0 ; i < queue_type::NumQueue && end == task ; ++i ) {
+          for ( int j = 0 ; j < 2 && end == task ; ++j ) {
+            task = queue_type::pop_task( & queue->m_ready[i][j] );
+          }
+        }
+      }
+
+      // Team lead broadcast acquired task to team members:
+
+      if ( 1 < team_exec.team_size() ) {
+
+        if ( 0 == team_exec.team_rank() ) *task_shared = task ;
+
+        // Fence to be sure task_shared is stored before the barrier
+        Kokkos::memory_fence();
+
+        // Whole team waits for every team member to reach this statement
+        team_exec.team_barrier();
+
+        // Fence again before task_shared is read back
+        Kokkos::memory_fence();
+
+        task = *task_shared ;
+      }
+
+#if 0
+fprintf( stdout
+       , "\nexecute group(%d) member(%d) task_shared(0x%lx) task(0x%lx)\n"
+       , team_exec.m_group_rank
+       , team_exec.m_team_rank
+       , uintptr_t(task_shared)
+       , uintptr_t(task)
+       );
+fflush(stdout);
+#endif
+
+      if ( 0 == task ) break ; // 0 == m_ready_count
+
+      if ( end == task ) {
+        // All team members wait for whole team to reach this statement.
+        // Is necessary to prevent task_shared from being updated
+        // before it is read by all threads.
+        team_exec.team_barrier();
+      }
+      else if ( task_root_type::TaskTeam == task->m_task_type ) {
+        // Thread Team Task: every team member calls the apply function
+        (*task->m_apply)( task , & team_exec );
+
+        // The m_apply function performs a barrier
+
+        if ( 0 == team_exec.team_rank() ) {
+          // team member #0 completes the task, which may delete the task
+          queue->complete( task ); 
+        }
+      }
+      else {
+        // Single Thread Task: only the team lead runs and completes it
+
+        if ( 0 == team_exec.team_rank() ) {
+
+          (*task->m_apply)( task , & single_exec );
+
+          queue->complete( task ); 
+        }
+
+        // All team members wait for whole team to reach this statement.
+        // Not necessary to complete the task.
+        // Is necessary to prevent task_shared from being updated
+        // before it is read by all threads.
+        team_exec.team_barrier();
+      }
+    } while(1);
+  }
+// END #pragma omp parallel
+
+}
+
+void TaskQueueSpecialization< Kokkos::OpenMP >::
+  iff_single_thread_recursive_execute
+    ( TaskQueue< Kokkos::OpenMP > * const queue )
+{
+  using execution_space = Kokkos::OpenMP ;
+  using queue_type      = TaskQueue< execution_space > ;
+  using task_root_type  = TaskBase< execution_space , void , void > ;
+  using Member          = TaskExec< execution_space > ;
+
+  // Only drain the queue here when omp_get_num_threads() reports one thread.
+  if ( 1 != omp_get_num_threads() ) return ;
+
+  // Sentinel compared against below to detect "nothing was popped".
+  task_root_type * const no_task = (task_root_type *) task_root_type::EndTag ;
+
+  Member single_exec ; // default-constructed member handed to each task
+
+  for (;;) {
+
+    task_root_type * ready = no_task ;
+
+    // Scan the ready queues by priority and then type, stopping at the
+    // first pop that yields something other than the sentinel.
+    for ( int prio = 0 ; prio < queue_type::NumQueue && no_task == ready ; ++prio ) {
+      for ( int type = 0 ; type < 2 && no_task == ready ; ++type ) {
+        ready = queue_type::pop_task( & queue->m_ready[prio][type] );
+      }
+    }
+
+    if ( no_task == ready ) break ; // no queue produced a task
+
+    // Run the task on this thread, then hand it back to the queue.
+    (*ready->m_apply)( ready , & single_exec );
+
+    queue->complete( ready );
+  }
+}
+
+}} /* namespace Kokkos::Impl */
+
+//----------------------------------------------------------------------------
+
+#endif /* #if defined( KOKKOS_HAVE_OPENMP ) && defined( KOKKOS_ENABLE_TASKPOLICY ) */
+
+
diff --git a/lib/kokkos/core/src/OpenMP/Kokkos_OpenMP_Task.hpp b/lib/kokkos/core/src/OpenMP/Kokkos_OpenMP_Task.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..2761247c40c930d1b454acfc373be2c8d8aaf4a3
--- /dev/null
+++ b/lib/kokkos/core/src/OpenMP/Kokkos_OpenMP_Task.hpp
@@ -0,0 +1,356 @@
+/*
+//@HEADER
+// ************************************************************************
+// 
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+// 
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+// 
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact  H. Carter Edwards (hcedwar@sandia.gov)
+// 
+// ************************************************************************
+//@HEADER
+*/
+
+#ifndef KOKKOS_IMPL_OPENMP_TASK_HPP
+#define KOKKOS_IMPL_OPENMP_TASK_HPP
+
+#if defined( KOKKOS_ENABLE_TASKPOLICY )
+
+//----------------------------------------------------------------------------
+//----------------------------------------------------------------------------
+
+namespace Kokkos {
+namespace Impl {
+
+template<>
+class TaskQueueSpecialization< Kokkos::OpenMP >
+{
+public:
+
+  using execution_space = Kokkos::OpenMP ;
+  using queue_type      = Kokkos::Impl::TaskQueue< execution_space > ;
+  using task_base_type  = Kokkos::Impl::TaskBase< execution_space , void , void > ;
+
+  // Must specify memory space
+  using memory_space = Kokkos::HostSpace ;
+
+  static
+  void iff_single_thread_recursive_execute( queue_type * const ); // runs ready tasks on the caller only if omp_get_num_threads() == 1
+
+  // Must provide task queue execution function
+  static void execute( queue_type * const );
+
+  // Must provide mechanism to set function pointer in
+  // execution space from the host process.
+  template< typename FunctorType >
+  static
+  void proc_set_apply( task_base_type::function_type * ptr )
+    {
+      using TaskType = TaskBase< Kokkos::OpenMP
+                               , typename FunctorType::value_type
+                               , FunctorType
+                               > ;
+       *ptr = TaskType::apply ; // store the apply entry point of the functor-specific task type
+    }
+};
+
+extern template class TaskQueue< Kokkos::OpenMP > ;
+
+//----------------------------------------------------------------------------
+
+template<>
+class TaskExec< Kokkos::OpenMP >
+{
+private:
+
+  TaskExec( TaskExec && ) = delete ;
+  TaskExec( TaskExec const & ) = delete ;
+  TaskExec & operator = ( TaskExec && ) = delete ;
+  TaskExec & operator = ( TaskExec const & ) = delete ;
+
+
+  using PoolExec = Kokkos::Impl::OpenMPexec ;
+
+  friend class Kokkos::Impl::TaskQueue< Kokkos::OpenMP > ;
+  friend class Kokkos::Impl::TaskQueueSpecialization< Kokkos::OpenMP > ;
+
+  PoolExec * const m_self_exec ;  ///< This thread's thread pool data structure 
+  PoolExec * const m_team_exec ;  ///< Team thread's thread pool data structure
+  int64_t          m_sync_mask ;  ///< XOR mask applied to m_sync_value by the team barrier
+  int64_t mutable  m_sync_value ; ///< Value the barrier's sync word must reach
+  int     mutable  m_sync_step ;  ///< Barrier step counter; its low bit selects the sync word
+  int              m_group_rank ; ///< Which "team" subset of thread pool
+  int              m_team_rank ;  ///< Which thread within a team
+  int              m_team_size ;  ///< Number of threads in the team
+
+  TaskExec();  ///< Single-thread member: null pool pointers, team size 1
+  TaskExec( PoolExec & arg_exec , int arg_team_size );
+
+  void team_barrier_impl() const ;
+
+public:
+
+#if defined( KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST )
+  void * team_shared() const  // null when this member has no team pool data
+    { return m_team_exec ? m_team_exec->scratch_thread() : (void*) 0 ; }
+
+  int team_shared_size() const
+    { return m_team_exec ? m_team_exec->scratch_thread_size() : 0 ; }
+
+  /**\brief  Whole team enters this function call
+   *         before any team member returns from
+   *         this function call.  Does nothing for a team of one.
+   */
+  void team_barrier() const { if ( 1 < m_team_size ) team_barrier_impl(); }
+#else
+  KOKKOS_INLINE_FUNCTION void team_barrier() const {}
+  KOKKOS_INLINE_FUNCTION void * team_shared() const { return 0 ; }
+  KOKKOS_INLINE_FUNCTION int team_shared_size() const { return 0 ; }
+#endif
+
+  KOKKOS_INLINE_FUNCTION
+  int team_rank() const { return m_team_rank ; }
+
+  KOKKOS_INLINE_FUNCTION
+  int team_size() const { return m_team_size ; }
+};
+
+}} /* namespace Kokkos::Impl */
+
+//----------------------------------------------------------------------------
+//----------------------------------------------------------------------------
+
+namespace Kokkos {
+
+template<typename iType>
+KOKKOS_INLINE_FUNCTION
+Impl::TeamThreadRangeBoundariesStruct<iType,Impl::TaskExec< Kokkos::OpenMP > >
+TeamThreadRange( Impl::TaskExec< Kokkos::OpenMP > & thread , const iType & count )
+{
+  // Wrap (thread,count) in the team-thread range descriptor for OpenMP task members.
+  typedef Impl::TeamThreadRangeBoundariesStruct<iType,Impl::TaskExec< Kokkos::OpenMP > > range_type ;
+  return range_type( thread , count );
+}
+
+template<typename iType>
+KOKKOS_INLINE_FUNCTION
+Impl::TeamThreadRangeBoundariesStruct<iType,Impl::TaskExec< Kokkos::OpenMP > >
+TeamThreadRange( Impl::TaskExec< Kokkos::OpenMP > & thread
+               , const iType & start , const iType & end )
+{
+  // Wrap (thread,start,end) in the team-thread range descriptor for OpenMP task members.
+  typedef Impl::TeamThreadRangeBoundariesStruct<iType,Impl::TaskExec< Kokkos::OpenMP > > range_type ;
+  return range_type( thread , start , end );
+}
+
+/** \brief  Inter-thread parallel_for. Executes lambda(iType i) for each i=0..N-1.
+ *
+ * The range i=0..N-1 is mapped to all threads of the calling thread team.
+ * This functionality requires C++11 support.
+ */
+template<typename iType, class Lambda>
+KOKKOS_INLINE_FUNCTION
+void parallel_for( const Impl::TeamThreadRangeBoundariesStruct<iType,Impl:: TaskExec< Kokkos::OpenMP > >& loop_boundaries
+                 , const Lambda& lambda )
+{
+  iType idx = loop_boundaries.start ;
+  while ( idx < loop_boundaries.end ) {
+    lambda(idx);
+    idx += loop_boundaries.increment ;
+  }
+}
+
+template<typename iType, class Lambda, typename ValueType>
+KOKKOS_INLINE_FUNCTION
+void parallel_reduce
+  ( const Impl::TeamThreadRangeBoundariesStruct<iType,Impl:: TaskExec< Kokkos::OpenMP > >& loop_boundaries
+  , const Lambda& lambda
+  , ValueType& initialized_result)
+{
+  // Sum-reduction over the team: each member folds its loop_boundaries
+  // range into a private copy seeded from initialized_result, then member 0
+  // combines the per-member partials with operator+=.
+  const int member = loop_boundaries.thread.team_rank();
+  ValueType partial = initialized_result;
+
+  for( iType k = loop_boundaries.start; k < loop_boundaries.end; k += loop_boundaries.increment ) {
+    lambda(k, partial);
+  }
+
+  // A team of one has nothing to combine.
+  if ( ! ( 1 < loop_boundaries.thread.team_size() ) ) {
+    initialized_result = partial ;
+    return ;
+  }
+
+  ValueType * const slots = (ValueType*) loop_boundaries.thread.team_shared();
+
+  loop_boundaries.thread.team_barrier();  // barrier before the slots are overwritten
+  slots[member] = partial;
+  loop_boundaries.thread.team_barrier();  // barrier after every member has stored
+
+  if ( 0 == member ) {
+    // Member 0 accumulates every other member's partial into slot 0.
+    for (int other = 1; other < loop_boundaries.thread.team_size(); ++other) {
+      slots[0] += slots[other];
+    }
+  }
+
+  loop_boundaries.thread.team_barrier();  // barrier before slot 0 is read back
+  initialized_result = slots[0];          // broadcast: every member reads slot 0
+}
+
+template< typename iType, class Lambda, typename ValueType, class JoinType >
+KOKKOS_INLINE_FUNCTION
+void parallel_reduce
+  (const Impl::TeamThreadRangeBoundariesStruct<iType,Impl::TaskExec< Kokkos::OpenMP > >& loop_boundaries,
+   const Lambda & lambda,
+   const JoinType & join,
+   ValueType& initialized_result)
+{
+  // Reduction with a caller-supplied join: each member folds its
+  // loop_boundaries range into a private copy seeded from initialized_result,
+  // then member 0 merges the per-member partials with join(dst,src).
+  const int member = loop_boundaries.thread.team_rank();
+  ValueType partial = initialized_result;
+
+  for( iType k = loop_boundaries.start; k < loop_boundaries.end; k += loop_boundaries.increment ) {
+    lambda(k, partial);
+  }
+
+  // A team of one has nothing to merge.
+  if ( ! ( 1 < loop_boundaries.thread.team_size() ) ) {
+    initialized_result = partial ;
+    return ;
+  }
+
+  ValueType * const slots = (ValueType*) loop_boundaries.thread.team_shared();
+  loop_boundaries.thread.team_barrier();  // barrier before the slots are overwritten
+  slots[member] = partial;
+  loop_boundaries.thread.team_barrier();  // barrier after every member has stored
+
+  if ( 0 == member ) {
+    // Member 0 merges every other member's partial into slot 0.
+    for (int other = 1; other < loop_boundaries.thread.team_size(); ++other) {
+      join(slots[0], slots[other]);
+    }
+  }
+
+  loop_boundaries.thread.team_barrier();  // barrier before slot 0 is read back
+  initialized_result = slots[0];          // broadcast: every member reads slot 0
+}
+
+// Placeholder for a future function: the body is empty, so lambda is never called and initialized_result is left unchanged.
+template< typename iType, class Lambda, typename ValueType >
+KOKKOS_INLINE_FUNCTION
+void parallel_reduce
+  (const Impl::ThreadVectorRangeBoundariesStruct<iType,Impl::TaskExec< Kokkos::OpenMP > >& loop_boundaries,
+   const Lambda & lambda,
+   ValueType& initialized_result)
+{
+}
+
+// Placeholder for a future function: the body is empty, so neither lambda nor join is called and initialized_result is left unchanged.
+template< typename iType, class Lambda, typename ValueType, class JoinType >
+KOKKOS_INLINE_FUNCTION
+void parallel_reduce
+  (const Impl::ThreadVectorRangeBoundariesStruct<iType,Impl::TaskExec< Kokkos::OpenMP > >& loop_boundaries,
+   const Lambda & lambda,
+   const JoinType & join,
+   ValueType& initialized_result)
+{
+}
+
+template< typename ValueType, typename iType, class Lambda >
+KOKKOS_INLINE_FUNCTION
+void parallel_scan
+  (const Impl::TeamThreadRangeBoundariesStruct<iType,Impl::TaskExec< Kokkos::OpenMP > >& loop_boundaries,
+   const Lambda & lambda)
+{
+  // Team-wide scan: lambda(i,value,false) accumulates the contribution of i, lambda(i,value,true) receives the running prefix.
+  ValueType accum = 0 ;
+  ValueType val, local_total;
+  const int team_size = loop_boundaries.thread.team_size();
+  const int team_rank = loop_boundaries.thread.team_rank(); // member num within the team
+
+  // Intra-member scan
+  for( iType i = loop_boundaries.start; i < loop_boundaries.end; i+=loop_boundaries.increment) {
+    local_total = 0;
+    lambda(i,local_total,false);
+    val = accum;
+    lambda(i,val,true);
+    accum += local_total;
+  }
+
+  // team_shared() is null for a single-thread member, so partials are only exchanged within a real team.
+  if ( 1 < team_size ) {
+    ValueType * const shared = (ValueType*) loop_boundaries.thread.team_shared();
+    shared[team_rank] = accum;
+    loop_boundaries.thread.team_barrier();
+    // Member 0 do scan on accumulated totals
+    if (team_rank == 0) {
+      for( int r = 1; r < team_size; ++r ) { shared[r] += shared[r-1]; }
+    }
+    loop_boundaries.thread.team_barrier();
+    if (team_rank != 0) { accum = shared[team_rank-1]; } else { accum = 0 ; }
+  }
+  else { accum = 0 ; } // lone member: nothing precedes it, restart the prefix at zero
+
+  // Inter-member scan adding in accumulated totals
+  for( iType i = loop_boundaries.start; i < loop_boundaries.end; i+=loop_boundaries.increment) {
+    local_total = 0;
+    lambda(i,local_total,false);
+    val = accum;
+    lambda(i,val,true);
+    accum += local_total;
+  }
+}
+
+// Placeholder for a future function: the body is empty, so lambda is never called.
+template< typename iType, class Lambda, typename ValueType >
+KOKKOS_INLINE_FUNCTION
+void parallel_scan
+  (const Impl::ThreadVectorRangeBoundariesStruct<iType,Impl::TaskExec< Kokkos::OpenMP > >& loop_boundaries,
+   const Lambda & lambda)
+{
+}
+
+
+} /* namespace Kokkos */
+
+//----------------------------------------------------------------------------
+//----------------------------------------------------------------------------
+
+#endif /* #if defined( KOKKOS_ENABLE_TASKPOLICY ) */
+#endif /* #ifndef KOKKOS_IMPL_OPENMP_TASK_HPP */
+
diff --git a/lib/kokkos/core/src/OpenMP/Kokkos_OpenMPexec.cpp b/lib/kokkos/core/src/OpenMP/Kokkos_OpenMPexec.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..7d06a2f66149f93bd43d6a4976ae9060b8833997
--- /dev/null
+++ b/lib/kokkos/core/src/OpenMP/Kokkos_OpenMPexec.cpp
@@ -0,0 +1,408 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+//
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact  H. Carter Edwards (hcedwar@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#include <stdio.h>
+#include <limits>
+#include <iostream>
+#include <vector>
+#include <Kokkos_Core.hpp>
+#include <impl/Kokkos_Error.hpp>
+#include <iostream>
+#include <impl/Kokkos_CPUDiscovery.hpp>
+#include <impl/Kokkos_Profiling_Interface.hpp>
+
+#ifdef KOKKOS_HAVE_OPENMP
+
+namespace Kokkos {
+namespace Impl {
+namespace {
+
+KOKKOS_INLINE_FUNCTION
+int kokkos_omp_in_parallel();
+
+int kokkos_omp_in_critical_region = ( Kokkos::HostSpace::register_in_parallel( kokkos_omp_in_parallel ) , 0 );
+
+KOKKOS_INLINE_FUNCTION
+int kokkos_omp_in_parallel()
+{
+#if defined( __CUDA_ARCH__ )
+  return 0; // this branch never queries the OpenMP runtime
+#else
+  return ( omp_in_parallel() && ! kokkos_omp_in_critical_region ) ? 1 : 0 ;
+#endif
+}
+
+bool s_using_hwloc = false;
+
+} // namespace
+} // namespace Impl
+} // namespace Kokkos
+
+
+namespace Kokkos {
+namespace Impl {
+
+int OpenMPexec::m_map_rank[ OpenMPexec::MAX_THREAD_COUNT ] = { 0 };
+
+int OpenMPexec::m_pool_topo[ 4 ] = { 0 };
+
+OpenMPexec * OpenMPexec::m_pool[ OpenMPexec::MAX_THREAD_COUNT ] = { 0 };
+
+void OpenMPexec::verify_is_process( const char * const label )
+{
+  // Being inside an OpenMP parallel region is the error reported here.
+  if ( ! omp_in_parallel() ) { return ; }
+
+  std::string msg = std::string( label ) + " ERROR: in parallel" ;
+  Kokkos::Impl::throw_runtime_exception( msg );
+}
+
+void OpenMPexec::verify_initialized( const char * const label )
+{
+  // The pool counts as initialized once slot 0 has been populated.
+  const bool pool_missing = ( 0 == m_pool[0] );
+  if ( pool_missing ) {
+    std::string msg = std::string( label ) + " ERROR: not initialized" ;
+    Kokkos::Impl::throw_runtime_exception( msg );
+  }
+
+  // The OpenMP runtime's maximum thread count must still match the pool size.
+  if ( Kokkos::OpenMP::thread_pool_size(0) != omp_get_max_threads() ) {
+    std::string msg = std::string( label ) + " ERROR: Initialized but threads modified inappropriately" ;
+    Kokkos::Impl::throw_runtime_exception( msg );
+  }
+}
+
+void OpenMPexec::clear_scratch()
+{
+  typedef Kokkos::Experimental::Impl::SharedAllocationRecord< Kokkos::HostSpace , void > Record ;
+  // Every OpenMP thread releases the pool entry selected by its mapped rank.
+#pragma omp parallel
+  {
+    OpenMPexec * & slot = m_pool[ m_map_rank[ omp_get_thread_num() ] ];
+    if ( 0 != slot ) {
+      Record * const owner = Record::get_record( slot );
+      slot = 0 ;  // clear the pool entry before dropping the reference
+      Record::decrement( owner );
+    }
+  }
+}
+
+void OpenMPexec::resize_scratch( size_t reduce_size , size_t thread_size )
+{
+  typedef Kokkos::Experimental::Impl::SharedAllocationRecord< Kokkos::HostSpace , void > Record ;
+
+  enum { ALIGN_MASK = Kokkos::Impl::MEMORY_ALIGNMENT - 1 };
+  enum { ALLOC_EXEC = ( sizeof(OpenMPexec) + ALIGN_MASK ) & ~ALIGN_MASK };
+
+  // Sizes recorded by the current pool entry 0 (zero when nothing is allocated).
+  size_t old_reduce_size = 0 ;
+  size_t old_thread_size = 0 ;
+  if ( m_pool[0] ) {
+    old_reduce_size = m_pool[0]->m_scratch_reduce_end ;
+    old_thread_size = m_pool[0]->m_scratch_thread_end - m_pool[0]->m_scratch_reduce_end ;
+  }
+
+  // Round both requests up with the alignment mask.
+  reduce_size = ( reduce_size + ALIGN_MASK ) & ~ALIGN_MASK ;
+  thread_size = ( thread_size + ALIGN_MASK ) & ~ALIGN_MASK ;
+
+  // Nothing to do unless at least one request exceeds the recorded size.
+  const bool grow_reduce = old_reduce_size < reduce_size ;
+  const bool grow_thread = old_thread_size < thread_size ;
+  if ( ! grow_reduce && ! grow_thread ) { return ; }
+
+  // The region that is not growing keeps its previous size.
+  if ( ! grow_reduce ) { reduce_size = old_reduce_size ; }
+  if ( ! grow_thread ) { thread_size = old_thread_size ; }
+
+  const size_t alloc_size = ALLOC_EXEC + reduce_size + thread_size ;
+  const int    pool_size  = m_pool_topo[0] ;
+
+  clear_scratch(); // release the existing per-thread allocations first
+
+  // Each thread allocates its own block and constructs its OpenMPexec in it.
+#pragma omp parallel
+  {
+    const int rank_rev = m_map_rank[ omp_get_thread_num() ];
+    const int rank     = pool_size - ( rank_rev + 1 );
+
+    Record * const rec = Record::allocate( Kokkos::HostSpace()
+                                         , "openmp_scratch"
+                                         , alloc_size );
+    Record::increment( rec );
+
+    m_pool[ rank_rev ] = reinterpret_cast<OpenMPexec*>( rec->data() );
+    new ( m_pool[ rank_rev ] ) OpenMPexec( rank , ALLOC_EXEC , reduce_size , thread_size );
+  }
+}
+
+} // namespace Impl
+} // namespace Kokkos
+
+//----------------------------------------------------------------------------
+//----------------------------------------------------------------------------
+
+namespace Kokkos {
+
+//----------------------------------------------------------------------------
+
+int OpenMP::is_initialized()
+{ return ( Impl::OpenMPexec::m_pool[0] != 0 ) ? 1 : 0 ; } // 1 once pool slot 0 is populated
+
+void OpenMP::initialize( unsigned thread_count ,
+                         unsigned use_numa_count ,
+                         unsigned use_cores_per_numa )
+{
+  // Before any other call to OMP query the maximum number of threads
+  // and save the value for re-initialization unit testing.
+
+  //Using omp_get_max_threads(); is problematic in conjunction with
+  //Hwloc on Intel (essentially an initial call to the OpenMP runtime
+  //without a parallel region before will set a process mask for a single core
+  //The runtime will than bind threads for a parallel region to other cores on the
+  //entering the first parallel region and make the process mask the aggregate of
+  //the thread masks. The intend seems to be to make serial code run fast, if you
+  //compile with OpenMP enabled but don't actually use parallel regions or so
+  //static int omp_max_threads = omp_get_max_threads();
+  int nthreads = 0;
+  #pragma omp parallel
+  {
+    #pragma omp atomic
+    nthreads++;
+  }
+
+  static int omp_max_threads = nthreads;
+
+  const bool is_initialized = 0 != Impl::OpenMPexec::m_pool[0] ;
+
+  bool thread_spawn_failed = false ;
+
+  if ( ! is_initialized ) {
+
+    // Use hwloc thread pinning if concerned with locality.
+    // If spreading threads across multiple NUMA regions.
+    // If hyperthreading is enabled.
+    Impl::s_using_hwloc = hwloc::available() && (
+                            ( 1 < Kokkos::hwloc::get_available_numa_count() ) ||
+                            ( 1 < Kokkos::hwloc::get_available_threads_per_core() ) );
+
+    std::pair<unsigned,unsigned> threads_coord[ Impl::OpenMPexec::MAX_THREAD_COUNT ];
+
+    // If hwloc available then use it's maximum value.
+
+    if ( thread_count == 0 ) {
+      thread_count = Impl::s_using_hwloc
+      ? Kokkos::hwloc::get_available_numa_count() *
+        Kokkos::hwloc::get_available_cores_per_numa() *
+        Kokkos::hwloc::get_available_threads_per_core()
+      : omp_max_threads ;
+    }
+
+    if(Impl::s_using_hwloc)
+      hwloc::thread_mapping( "Kokkos::OpenMP::initialize" ,
+                           false /* do not allow asynchronous */ ,
+                           thread_count ,
+                           use_numa_count ,
+                           use_cores_per_numa ,
+                           threads_coord );
+
+    // Spawn threads:
+
+    omp_set_num_threads( thread_count );
+
+    // Verify OMP interaction:
+    if ( int(thread_count) != omp_get_max_threads() ) {
+      thread_spawn_failed = true ;
+    }
+
+    // Verify spawning and bind threads:
+#pragma omp parallel
+    {
+#pragma omp critical
+      {
+        if ( int(thread_count) != omp_get_num_threads() ) {
+          thread_spawn_failed = true ;
+        }
+
+        // Call to 'bind_this_thread' is not thread safe so place this whole block in a critical region.
+        // Call to 'new' may not be thread safe as well.
+
+        // Reverse the rank for threads so that the scan operation reduces to the highest rank thread.
+
+        const unsigned omp_rank    = omp_get_thread_num();
+        const unsigned thread_r    = Impl::s_using_hwloc && Kokkos::hwloc::can_bind_threads()
+                                   ? Kokkos::hwloc::bind_this_thread( thread_count , threads_coord )
+                                   : omp_rank ;
+
+        Impl::OpenMPexec::m_map_rank[ omp_rank ] = thread_r ;
+      }
+/* END #pragma omp critical */
+    }
+/* END #pragma omp parallel */
+
+    if ( ! thread_spawn_failed ) {
+      Impl::OpenMPexec::m_pool_topo[0] = thread_count ;
+      Impl::OpenMPexec::m_pool_topo[1] = Impl::s_using_hwloc ? thread_count / use_numa_count : thread_count;
+      Impl::OpenMPexec::m_pool_topo[2] = Impl::s_using_hwloc ? thread_count / ( use_numa_count * use_cores_per_numa ) : 1;
+
+      Impl::OpenMPexec::resize_scratch( 1024 , 1024 );
+    }
+  }
+
+  if ( is_initialized || thread_spawn_failed ) {
+    std::string msg("Kokkos::OpenMP::initialize ERROR");
+
+    if ( is_initialized ) { msg.append(" : already initialized"); }
+    if ( thread_spawn_failed ) { msg.append(" : failed spawning threads"); }
+
+    Kokkos::Impl::throw_runtime_exception(msg);
+  }
+
+  // Check for over-subscription
+  if( Impl::mpi_ranks_per_node() * long(thread_count) > Impl::processors_per_node() ) {
+    std::cout << "Kokkos::OpenMP::initialize WARNING: You are likely oversubscribing your CPU cores." << std::endl;
+    std::cout << "                                    Detected: " << Impl::processors_per_node() << " cores per node." << std::endl;
+    std::cout << "                                    Detected: " << Impl::mpi_ranks_per_node() << " MPI_ranks per node." << std::endl;
+    std::cout << "                                    Requested: " << thread_count << " threads per process." << std::endl;
+  }
+  // Init the lock array used for arbitrarily sized atomics
+  Impl::init_lock_array_host_space();
+
+  #if (KOKKOS_ENABLE_PROFILING)
+    Kokkos::Profiling::initialize();
+  #endif
+}
+
+//----------------------------------------------------------------------------
+// Shut down the OpenMP back end: clear scratch, zero the pool topology and drop back to one OpenMP thread.
+void OpenMP::finalize()
+{
+  Impl::OpenMPexec::verify_initialized( "OpenMP::finalize" );
+  Impl::OpenMPexec::verify_is_process( "OpenMP::finalize" );
+
+  Impl::OpenMPexec::clear_scratch();
+  // Zero all three pool sizes (total, per NUMA, finest grain) reported by OpenMPexec::pool_size().
+  Impl::OpenMPexec::m_pool_topo[0] = 0 ;
+  Impl::OpenMPexec::m_pool_topo[1] = 0 ;
+  Impl::OpenMPexec::m_pool_topo[2] = 0 ;
+
+  omp_set_num_threads(1);
+  // Undo the calling thread's hwloc binding only when hwloc pinning was in use and binding is supported.
+  if ( Impl::s_using_hwloc && Kokkos::hwloc::can_bind_threads() ) {
+    hwloc::unbind_this_thread();
+  }
+
+  #if (KOKKOS_ENABLE_PROFILING)
+    Kokkos::Profiling::finalize();
+  #endif
+}
+
+//----------------------------------------------------------------------------
+// Write a description of the OpenMP back end to 's': build macros, hwloc topology, pool layout; 'detail' adds one line per thread.
+void OpenMP::print_configuration( std::ostream & s , const bool detail )
+{
+  Impl::OpenMPexec::verify_is_process( "OpenMP::print_configuration" );
+
+  s << "Kokkos::OpenMP" ;
+
+#if defined( KOKKOS_HAVE_OPENMP )
+  s << " KOKKOS_HAVE_OPENMP" ;
+#endif
+#if defined( KOKKOS_HAVE_HWLOC )
+  // Machine topology as reported by hwloc: NUMA regions x cores per NUMA x threads per core.
+  const unsigned numa_count_       = Kokkos::hwloc::get_available_numa_count();
+  const unsigned cores_per_numa   = Kokkos::hwloc::get_available_cores_per_numa();
+  const unsigned threads_per_core = Kokkos::hwloc::get_available_threads_per_core();
+
+  s << " hwloc[" << numa_count_ << "x" << cores_per_numa << "x" << threads_per_core << "]"
+    << " hwloc_binding_" << ( Impl::s_using_hwloc ? "enabled" : "disabled" )
+    ;
+#endif
+  // The pool is treated as initialized once its first slot is non-null.
+  const bool is_initialized = 0 != Impl::OpenMPexec::m_pool[0] ;
+
+  if ( is_initialized ) {
+    const int numa_count      = Kokkos::Impl::OpenMPexec::m_pool_topo[0] / Kokkos::Impl::OpenMPexec::m_pool_topo[1] ;
+    const int core_per_numa   = Kokkos::Impl::OpenMPexec::m_pool_topo[1] / Kokkos::Impl::OpenMPexec::m_pool_topo[2] ;
+    const int thread_per_core = Kokkos::Impl::OpenMPexec::m_pool_topo[2] ;
+
+    s << " thread_pool_topology[ " << numa_count
+      << " x " << core_per_numa
+      << " x " << thread_per_core
+      << " ]"
+      << std::endl ;
+
+    if ( detail ) {
+      std::vector< std::pair<unsigned,unsigned> > coord( Kokkos::Impl::OpenMPexec::m_pool_topo[0] );
+      // Each OpenMP thread stores its own hwloc coordinate, one thread at a time.
+#pragma omp parallel
+      {
+#pragma omp critical
+        {
+          coord[ omp_get_thread_num() ] = hwloc::get_this_thread_coordinate();
+        }
+/* END #pragma omp critical */
+      }
+/* END #pragma omp parallel */
+
+      for ( unsigned i = 0 ; i < coord.size() ; ++i ) {
+        s << "  thread omp_rank[" << i << "]"
+          << " kokkos_rank[" << Impl::OpenMPexec::m_map_rank[ i ] << "]"
+          << " hwloc_coord[" << coord[i].first << "." << coord[i].second << "]"
+          << std::endl ;
+      }
+    }
+  }
+  else {
+    s << " not initialized" << std::endl ;
+  }
+}
+// Returns thread_pool_size(0) -- presumably the total thread count of the pool (depth 0); confirm against thread_pool_size.
+int OpenMP::concurrency() {
+  return thread_pool_size(0);
+}
+
+} // namespace Kokkos
+
+#endif //KOKKOS_HAVE_OPENMP
diff --git a/lib/kokkos/core/src/OpenMP/Kokkos_OpenMPexec.hpp b/lib/kokkos/core/src/OpenMP/Kokkos_OpenMPexec.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..a01c9cb644e86f423409f1eeb56a014b68f87968
--- /dev/null
+++ b/lib/kokkos/core/src/OpenMP/Kokkos_OpenMPexec.hpp
@@ -0,0 +1,1083 @@
+/*
+//@HEADER
+// ************************************************************************
+// 
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+// 
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+// 
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact  H. Carter Edwards (hcedwar@sandia.gov)
+// 
+// ************************************************************************
+//@HEADER
+*/
+
+#ifndef KOKKOS_OPENMPEXEC_HPP
+#define KOKKOS_OPENMPEXEC_HPP
+
+#include <impl/Kokkos_Traits.hpp>
+#include <impl/Kokkos_spinwait.hpp>
+
+#include <Kokkos_Atomic.hpp>
+#include <iostream>
+#include <sstream>
+#include <fstream>
+namespace Kokkos {
+namespace Impl {
+
+//----------------------------------------------------------------------------
+/** \brief  Data for OpenMP thread execution */
+
+class OpenMPexec {
+public:
+
+  enum { MAX_THREAD_COUNT = 4096 };
+
+private:
+  static OpenMPexec * m_pool[ MAX_THREAD_COUNT ]; // Indexed by: m_pool_rank_rev
+
+  static int          m_pool_topo[ 4 ];
+  static int          m_map_rank[ MAX_THREAD_COUNT ];
+
+  friend class Kokkos::OpenMP ;
+
+  int const  m_pool_rank ;
+  int const  m_pool_rank_rev ;
+  int const  m_scratch_exec_end ;
+  int const  m_scratch_reduce_end ;
+  int const  m_scratch_thread_end ;
+
+  int volatile  m_barrier_state ;
+
+  // Members for dynamic scheduling
+  // Which thread am I stealing from currently
+  int m_current_steal_target;
+  // This thread's owned work_range, counted in chunks: [first,second)
+  Kokkos::pair<long,long> m_work_range KOKKOS_ALIGN_16;
+  // Team Offset if one thread determines work_range for others
+  long m_team_work_index;
+
+  // Is this thread stealing (i.e. its owned work_range is exhausted)
+  bool m_stealing;
+
+  OpenMPexec();
+  OpenMPexec( const OpenMPexec & );
+  OpenMPexec & operator = ( const OpenMPexec & );
+
+  static void clear_scratch();
+
+public:
+
+  // Topology of a cache coherent thread pool:
+  //   TOTAL = NUMA x GRAIN
+  //   pool_size( depth = 0 )
+  //   pool_size(0) = total number of threads
+  //   pool_size(1) = number of threads per NUMA
+  //   pool_size(2) = number of threads sharing finest grain memory hierarchy
+
+  inline static
+  int pool_size( int depth = 0 ) { return m_pool_topo[ depth ]; }
+
+  inline static
+  OpenMPexec * pool_rev( int pool_rank_rev ) { return m_pool[ pool_rank_rev ]; }
+
+  inline int pool_rank() const { return m_pool_rank ; }
+  inline int pool_rank_rev() const { return m_pool_rank_rev ; }
+
+  inline long team_work_index() const { return m_team_work_index ; }
+
+  inline int scratch_reduce_size() const
+    { return m_scratch_reduce_end - m_scratch_exec_end ; }
+
+  inline int scratch_thread_size() const
+    { return m_scratch_thread_end - m_scratch_reduce_end ; }
+  // Scratch regions are addressed as byte offsets from 'this'.
+  inline void * scratch_reduce() const { return ((char *) this) + m_scratch_exec_end ; }
+  inline void * scratch_thread() const { return ((char *) this) + m_scratch_reduce_end ; }
+
+  inline
+  void state_wait( int state )
+    { Impl::spinwait( m_barrier_state , state ); }
+
+  inline
+  void state_set( int state ) { m_barrier_state = state ; }
+
+  ~OpenMPexec() {}
+
+  OpenMPexec( const int arg_poolRank
+            , const int arg_scratch_exec_size
+            , const int arg_scratch_reduce_size
+            , const int arg_scratch_thread_size )
+    : m_pool_rank( arg_poolRank )
+    , m_pool_rank_rev( pool_size() - ( arg_poolRank + 1 ) )
+    , m_scratch_exec_end( arg_scratch_exec_size )
+    , m_scratch_reduce_end( m_scratch_exec_end   + arg_scratch_reduce_size )
+    , m_scratch_thread_end( m_scratch_reduce_end + arg_scratch_thread_size )
+    , m_barrier_state(0)
+    , m_current_steal_target(0) , m_work_range(0,0) , m_team_work_index(-1) , m_stealing(false)
+    {} // Scheduling state starts defined (empty range, no claimed index, not stealing): other threads may read it.
+
+  static void finalize();
+
+  static void initialize( const unsigned  team_count ,
+                          const unsigned threads_per_team ,
+                          const unsigned numa_count ,
+                          const unsigned cores_per_numa );
+
+  static void verify_is_process( const char * const );
+  static void verify_initialized( const char * const );
+
+  static void resize_scratch( size_t reduce_size , size_t thread_size );
+
+  inline static
+  OpenMPexec * get_thread_omp() { return m_pool[ m_map_rank[ omp_get_thread_num() ] ]; }
+
+  /* Dynamic Scheduling related functionality */
+  // Initialize the work range for this thread; begin/end are converted to chunk indices (rounded up)
+  inline void set_work_range(const long& begin, const long& end, const long& chunk_size) {
+    m_work_range.first = (begin+chunk_size-1)/chunk_size;
+    m_work_range.second = end>0?(end+chunk_size-1)/chunk_size:m_work_range.first;
+  }
+
+  // Claim an index from this thread's range from the beginning; returns -1 when the range is empty
+  inline long get_work_index_begin () {
+    Kokkos::pair<long,long> work_range_new = m_work_range;
+    Kokkos::pair<long,long> work_range_old = work_range_new;
+    if(work_range_old.first>=work_range_old.second)
+      return -1;
+
+    work_range_new.first+=1;
+
+    bool success = false;
+    while(!success) {
+      work_range_new = Kokkos::atomic_compare_exchange(&m_work_range,work_range_old,work_range_new);
+      success = ( (work_range_new == work_range_old) ||
+                  (work_range_new.first>=work_range_new.second));
+      work_range_old = work_range_new;
+      work_range_new.first+=1;
+    }
+    if(work_range_old.first<work_range_old.second)
+      return work_range_old.first;
+    else
+      return -1;
+  }
+
+  // Claim an index from this thread's range from the end; returns -1 when the range is empty
+  inline long get_work_index_end () {
+    Kokkos::pair<long,long> work_range_new = m_work_range;
+    Kokkos::pair<long,long> work_range_old = work_range_new;
+    if(work_range_old.first>=work_range_old.second)
+      return -1;
+    work_range_new.second-=1;
+    bool success = false;
+    while(!success) {
+      work_range_new = Kokkos::atomic_compare_exchange(&m_work_range,work_range_old,work_range_new);
+      success = ( (work_range_new == work_range_old) ||
+                  (work_range_new.first>=work_range_new.second) );
+      work_range_old = work_range_new;
+      work_range_new.second-=1;
+    }
+    if(work_range_old.first<work_range_old.second)
+      return work_range_old.second-1;
+    else
+      return -1;
+  }
+
+  // Reset the steal target to the next pool rank and leave stealing mode
+  inline void reset_steal_target() {
+    m_current_steal_target = (m_pool_rank+1)%m_pool_topo[0];
+    m_stealing = false;
+  }
+
+  // Reset the steal target for team scheduling: 'team_size' slots past this thread, wrapping to 0
+  inline void reset_steal_target(int team_size) {
+    m_current_steal_target = (m_pool_rank_rev+team_size);
+    if(m_current_steal_target>=m_pool_topo[0])
+      m_current_steal_target = 0;//m_pool_topo[0]-1;
+    m_stealing = false;
+  }
+
+  // Get a steal target; start with my-rank + 1 and go round robin, until arriving at this thread's rank
+  // Returns -1 if no active steal target available
+  inline int get_steal_target() {
+    while(( m_pool[m_current_steal_target]->m_work_range.second <=
+            m_pool[m_current_steal_target]->m_work_range.first  ) &&
+          (m_current_steal_target!=m_pool_rank) ) {
+      m_current_steal_target = (m_current_steal_target+1)%m_pool_topo[0];
+    }
+    if(m_current_steal_target == m_pool_rank)
+      return -1;
+    else
+      return m_current_steal_target;
+  }
+
+  inline int get_steal_target(int team_size) {
+
+    while(( m_pool[m_current_steal_target]->m_work_range.second <=
+            m_pool[m_current_steal_target]->m_work_range.first  ) &&
+          (m_current_steal_target!=m_pool_rank_rev) ) {
+      if(m_current_steal_target + team_size < m_pool_topo[0])
+        m_current_steal_target = (m_current_steal_target+team_size);
+      else
+        m_current_steal_target = 0;
+    }
+
+    if(m_current_steal_target == m_pool_rank_rev)
+      return -1;
+    else
+      return m_current_steal_target;
+  }
+
+  inline long steal_work_index (int team_size = 0) {
+    long index = -1;
+    int steal_target = team_size>0?get_steal_target(team_size):get_steal_target();
+    while ( (steal_target != -1) && (index == -1)) {
+      index = m_pool[steal_target]->get_work_index_end();
+      if(index == -1)
+        steal_target = team_size>0?get_steal_target(team_size):get_steal_target();
+    }
+    return index;
+  }
+
+  // Get a work index. Claim from owned range until it is exhausted, then steal from other threads
+  inline long get_work_index (int team_size = 0) {
+    long work_index = -1;
+    if(!m_stealing) work_index = get_work_index_begin();
+
+    if( work_index == -1) {
+      memory_fence();
+      m_stealing = true;
+      work_index = steal_work_index(team_size);
+    }
+    m_team_work_index = work_index;
+    memory_fence();
+    return work_index;
+  }
+
+};
+
+} // namespace Impl
+} // namespace Kokkos
+
+//----------------------------------------------------------------------------
+//----------------------------------------------------------------------------
+
+namespace Kokkos {
+namespace Impl {
+/** \brief  Per-thread view of a team (rank, size, league position, shared scratch) for Kokkos::OpenMP */
+class OpenMPexecTeamMember {
+public:
+
+  enum { TEAM_REDUCE_SIZE = 512 };
+
+  /** \brief  Thread states for team synchronization */
+  enum { Active = 0 , Rendezvous = 1 };
+
+  typedef Kokkos::OpenMP                         execution_space ;
+  typedef execution_space::scratch_memory_space  scratch_memory_space ;
+
+  Impl::OpenMPexec    & m_exec ;
+  scratch_memory_space  m_team_shared ;
+  int                   m_team_scratch_size[2] ;
+  int                   m_team_base_rev ;
+  int                   m_team_rank_rev ;
+  int                   m_team_rank ;
+  int                   m_team_size ;
+  int                   m_league_rank ;
+  int                   m_league_end ;
+  int                   m_league_size ;
+
+  int                   m_chunk_size;
+  int                   m_league_chunk_end;
+  Impl::OpenMPexec    & m_team_lead_exec ;
+  int                   m_invalid_thread;
+  int                   m_team_alloc;
+
+  // Fan-in team threads, root of the fan-in which does not block returns true
+  inline
+  bool team_fan_in() const
+    {
+      memory_fence();
+      for ( int n = 1 , j ; ( ( j = m_team_rank_rev + n ) < m_team_size ) && ! ( m_team_rank_rev & n ) ; n <<= 1 ) {
+
+        m_exec.pool_rev( m_team_base_rev + j )->state_wait( Active );
+      }
+
+      if ( m_team_rank_rev ) {
+        m_exec.state_set( Rendezvous );
+        memory_fence();
+        m_exec.state_wait( Rendezvous );
+      }
+
+      return 0 == m_team_rank_rev ;
+    }
+
+  inline
+  void team_fan_out() const
+    {
+      memory_fence();
+      for ( int n = 1 , j ; ( ( j = m_team_rank_rev + n ) < m_team_size ) && ! ( m_team_rank_rev & n ) ; n <<= 1 ) {
+        m_exec.pool_rev( m_team_base_rev + j )->state_set( Active );
+        memory_fence();
+      }
+    }
+
+public:
+
+  KOKKOS_INLINE_FUNCTION
+  const execution_space::scratch_memory_space& team_shmem() const
+    { return m_team_shared.set_team_thread_mode(0,1,0) ; }
+
+  KOKKOS_INLINE_FUNCTION
+  const execution_space::scratch_memory_space& team_scratch(int) const
+    { return m_team_shared.set_team_thread_mode(0,1,0) ; }
+
+  KOKKOS_INLINE_FUNCTION
+  const execution_space::scratch_memory_space& thread_scratch(int) const
+    { return m_team_shared.set_team_thread_mode(0,team_size(),team_rank()) ; }
+
+  KOKKOS_INLINE_FUNCTION int league_rank() const { return m_league_rank ; }
+  KOKKOS_INLINE_FUNCTION int league_size() const { return m_league_size ; }
+  KOKKOS_INLINE_FUNCTION int team_rank() const { return m_team_rank ; }
+  KOKKOS_INLINE_FUNCTION int team_size() const { return m_team_size ; }
+
+  KOKKOS_INLINE_FUNCTION void team_barrier() const
+#if ! defined( KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST )
+    {}
+#else
+    {
+      if ( 1 < m_team_size && !m_invalid_thread) {
+        team_fan_in();
+        team_fan_out();
+      }
+    }
+#endif
+
+  template<class ValueType>
+  KOKKOS_INLINE_FUNCTION
+  void team_broadcast(ValueType& value, const int& thread_id) const
+  {
+#if ! defined( KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST )
+    { }
+#else
+    // Make sure there is enough scratch space:
+    typedef typename if_c< sizeof(ValueType) < TEAM_REDUCE_SIZE
+                         , ValueType , void >::type type ;
+    // Exchange through the team base member's scratch: a member's own scratch is never seen by its teammates.
+    type * const shared_value = ((type*) m_exec.pool_rev( m_team_base_rev )->scratch_thread());
+    if(team_rank() == thread_id) *shared_value = value;
+    memory_fence();
+    team_barrier(); // Wait for 'thread_id' to publish its value
+    value = *shared_value;
+    team_barrier(); // Wait for every member to read before the buffer is reused
+#endif
+  }
+
+#ifdef KOKKOS_HAVE_CXX11
+  template< class ValueType, class JoinOp >
+  KOKKOS_INLINE_FUNCTION ValueType
+    team_reduce( const ValueType & value
+               , const JoinOp & op_in ) const
+  #if ! defined( KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST )
+    { return ValueType(); }
+  #else
+    {
+      memory_fence();
+      typedef ValueType value_type;
+      const JoinLambdaAdapter<value_type,JoinOp> op(op_in);
+  #endif
+#else // KOKKOS_HAVE_CXX11
+  template< class JoinOp >
+  KOKKOS_INLINE_FUNCTION typename JoinOp::value_type
+    team_reduce( const typename JoinOp::value_type & value
+               , const JoinOp & op ) const
+  #if ! defined( KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST )
+    { return typename JoinOp::value_type(); }
+  #else
+    {
+      typedef typename JoinOp::value_type value_type;
+  #endif
+#endif // KOKKOS_HAVE_CXX11
+#if defined( KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST )
+      // Make sure there is enough scratch space:
+      typedef typename if_c< sizeof(value_type) < TEAM_REDUCE_SIZE
+                           , value_type , void >::type type ;
+
+      type * const local_value = ((type*) m_exec.scratch_thread());
+
+      // Set this thread's contribution
+      *local_value = value ;
+
+      // Fence to make sure the base team member has access:
+      memory_fence();
+
+      if ( team_fan_in() ) {
+        // The last thread to synchronize returns true, all other threads wait for team_fan_out()
+        type * const team_value  = ((type*) m_exec.pool_rev( m_team_base_rev )->scratch_thread());
+
+        // Join to the team value:
+        for ( int i = 1 ; i < m_team_size ; ++i ) {
+          op.join( *team_value , *((type*) m_exec.pool_rev( m_team_base_rev + i )->scratch_thread()) );
+        }
+        memory_fence();
+
+        // The base team member may "lap" the other team members,
+        // copy to their local value before proceeding.
+        for ( int i = 1 ; i < m_team_size ; ++i ) {
+          *((type*) m_exec.pool_rev( m_team_base_rev + i )->scratch_thread()) = *team_value ;
+        }
+
+        // Fence to make sure all team members have access
+        memory_fence();
+      }
+
+      team_fan_out();
+
+      return *((type volatile const *)local_value);
+    }
+#endif
+  /** \brief  Intra-team exclusive prefix sum with team_rank() ordering
+   *          with intra-team non-deterministic ordering accumulation.
+   *
+   *  The global inter-team accumulation value will, at the end of the
+   *  league's parallel execution, be the scan's total.
+   *  Parallel execution ordering of the league's teams is non-deterministic.
+   *  As such the base value for each team's scan operation is similarly
+   *  non-deterministic.
+   */
+  template< typename ArgType >
+  KOKKOS_INLINE_FUNCTION ArgType team_scan( const ArgType & value , ArgType * const global_accum ) const
+#if ! defined( KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST )
+    { return ArgType(); }
+#else
+    {
+      // Make sure there is enough scratch space:
+      typedef typename if_c< sizeof(ArgType) < TEAM_REDUCE_SIZE , ArgType , void >::type type ;
+
+      volatile type * const work_value  = ((type*) m_exec.scratch_thread());
+
+      *work_value = value ;
+
+      memory_fence();
+
+      if ( team_fan_in() ) {
+        // The last thread to synchronize returns true, all other threads wait for team_fan_out()
+        // m_team_base[0]                 == highest ranking team member
+        // m_team_base[ m_team_size - 1 ] == lowest ranking team member
+        //
+        // 1) copy from lower to higher rank, initialize lowest rank to zero
+        // 2) prefix sum from lowest to highest rank, skipping lowest rank
+
+        type accum = 0 ;
+
+        if ( global_accum ) {
+          for ( int i = m_team_size ; i-- ; ) {
+            type & val = *((type*) m_exec.pool_rev( m_team_base_rev + i )->scratch_thread());
+            accum += val ;
+          }
+          accum = atomic_fetch_add( global_accum , accum );
+        }
+
+        for ( int i = m_team_size ; i-- ; ) {
+          type & val = *((type*) m_exec.pool_rev( m_team_base_rev + i )->scratch_thread());
+          const type offset = accum ;
+          accum += val ;
+          val = offset ;
+        }
+
+        memory_fence();
+      }
+
+      team_fan_out();
+
+      return *work_value ;
+    }
+#endif
+
+  /** \brief  Intra-team exclusive prefix sum with team_rank() ordering.
+   *
+   *  The highest rank thread can compute the reduction total as
+   *    reduction_total = dev.team_scan( value ) + value ;
+   */
+  template< typename Type >
+  KOKKOS_INLINE_FUNCTION Type team_scan( const Type & value ) const
+    { return this-> template team_scan<Type>( value , 0 ); }
+
+  //----------------------------------------
+  // Private for the driver
+
+private:
+
+  typedef execution_space::scratch_memory_space space ;
+
+public:
+
+  template< class ... Properties >
+  inline
+  OpenMPexecTeamMember( Impl::OpenMPexec & exec
+                      , const TeamPolicyInternal< OpenMP, Properties ...> & team
+                      , const int shmem_size_L1
+                      , const int shmem_size_L2
+                      )
+    : m_exec( exec )
+    , m_team_shared(0,0)
+    , m_team_scratch_size{ shmem_size_L1 , shmem_size_L2 }
+    , m_team_base_rev(0)
+    , m_team_rank_rev(0)
+    , m_team_rank(0)
+    , m_team_size( team.team_size() )
+    , m_league_rank(0)
+    , m_league_end(0)
+    , m_league_size( team.league_size() )
+    , m_chunk_size( team.chunk_size()>0?team.chunk_size():team.team_iter() )
+    , m_league_chunk_end(0)
+    , m_team_lead_exec( *exec.pool_rev( team.team_alloc() * (m_exec.pool_rank_rev()/team.team_alloc()) ))
+    , m_team_alloc( team.team_alloc())
+    {
+      const int pool_rank_rev        = m_exec.pool_rank_rev();
+      const int pool_team_rank_rev   = pool_rank_rev % team.team_alloc();
+      const int pool_league_rank_rev = pool_rank_rev / team.team_alloc();
+      const int pool_num_teams       = OpenMP::thread_pool_size(0)/team.team_alloc();
+      const int chunks_per_team      = ( team.league_size() + m_chunk_size*pool_num_teams-1 ) / (m_chunk_size*pool_num_teams);
+            int league_iter_end      = team.league_size() - pool_league_rank_rev * chunks_per_team * m_chunk_size;
+            int league_iter_begin    = league_iter_end - chunks_per_team * m_chunk_size;
+      if (league_iter_begin < 0)     league_iter_begin = 0;
+      if (league_iter_end>team.league_size()) league_iter_end = team.league_size();
+      // Flag threads outside every team: surplus slots of an over-allocated team, or leftover pool threads.
+      if ((team.team_alloc()>m_team_size)?
+          (pool_team_rank_rev >= m_team_size):
+          (m_exec.pool_size() - pool_num_teams*m_team_size > m_exec.pool_rank())
+         )
+        m_invalid_thread = 1;
+      else
+        m_invalid_thread = 0;
+
+      m_team_rank_rev  = pool_team_rank_rev ;
+      if ( pool_team_rank_rev < m_team_size && !m_invalid_thread ) {
+        m_team_base_rev  = team.team_alloc() * pool_league_rank_rev ;
+        m_team_rank_rev  = pool_team_rank_rev ;
+        m_team_rank      = m_team_size - ( m_team_rank_rev + 1 );
+        m_league_end     = league_iter_end ;
+        m_league_rank    = league_iter_begin ;
+        new( (void*) &m_team_shared ) space( ( (char*) m_exec.pool_rev(m_team_base_rev)->scratch_thread() ) + TEAM_REDUCE_SIZE , m_team_scratch_size[0] ,
+                                             ( (char*) m_exec.pool_rev(m_team_base_rev)->scratch_thread() ) + TEAM_REDUCE_SIZE + m_team_scratch_size[0],
+                                               0 );
+      }
+
+      if ( (m_team_rank_rev == 0) && (m_invalid_thread == 0) ) {
+        m_exec.set_work_range(m_league_rank,m_league_end,m_chunk_size);
+        m_exec.reset_steal_target(m_team_size);
+      }
+    }
+
+  bool valid_static() const
+    {
+      return m_league_rank < m_league_end ;
+    }
+
+  void next_static()
+    {
+      if ( m_league_rank < m_league_end ) {
+        team_barrier();
+        new( (void*) &m_team_shared ) space( ( (char*) m_exec.pool_rev(m_team_base_rev)->scratch_thread() ) + TEAM_REDUCE_SIZE , m_team_scratch_size[0] ,
+                                             ( (char*) m_exec.pool_rev(m_team_base_rev)->scratch_thread() ) + TEAM_REDUCE_SIZE + m_team_scratch_size[0],
+                                               0);
+      }
+      m_league_rank++;
+    }
+
+  bool valid_dynamic() {
+    if(m_invalid_thread)
+      return false;
+    if ((m_league_rank < m_league_chunk_end) && (m_league_rank < m_league_size)) {
+      return true;
+    }
+    // Current chunk is used up: the team lead claims the next chunk index and the team reads it after the barrier.
+    if (  m_team_rank_rev == 0 ) {
+      m_team_lead_exec.get_work_index(m_team_alloc);
+    }
+    team_barrier();
+
+    long work_index = m_team_lead_exec.team_work_index();
+
+    m_league_rank = work_index * m_chunk_size;
+    m_league_chunk_end = (work_index +1 ) * m_chunk_size;
+
+    if(m_league_chunk_end > m_league_size) m_league_chunk_end = m_league_size;
+
+    if(m_league_rank>=0)
+      return true;
+    return false;
+  }
+
+  void next_dynamic() {
+    if(m_invalid_thread)
+      return;
+
+    if ( m_league_rank < m_league_chunk_end ) {
+      team_barrier();
+      new( (void*) &m_team_shared ) space( ( (char*) m_exec.pool_rev(m_team_base_rev)->scratch_thread() ) + TEAM_REDUCE_SIZE , m_team_scratch_size[0] ,
+                                           ( (char*) m_exec.pool_rev(m_team_base_rev)->scratch_thread() ) + TEAM_REDUCE_SIZE + m_team_scratch_size[0],
+                                             0);
+    }
+    m_league_rank++;
+  }
+
+  static inline int team_reduce_size() { return TEAM_REDUCE_SIZE ; }
+};
+
+
+
+template< class ... Properties >
+class TeamPolicyInternal< Kokkos::OpenMP, Properties ... >: public PolicyTraits<Properties ...>
+{
+public:
+
+  //! Tag this class as a kokkos execution policy
+  typedef TeamPolicyInternal      execution_policy ;
+
+  typedef PolicyTraits<Properties ... > traits;
+
+  TeamPolicyInternal& operator = (const TeamPolicyInternal& p) {
+    m_league_size = p.m_league_size;
+    m_team_size = p.m_team_size;
+    m_team_alloc = p.m_team_alloc;
+    m_team_iter = p.m_team_iter;
+    m_team_scratch_size[0] = p.m_team_scratch_size[0];
+    m_thread_scratch_size[0] = p.m_thread_scratch_size[0];
+    m_team_scratch_size[1] = p.m_team_scratch_size[1];
+    m_thread_scratch_size[1] = p.m_thread_scratch_size[1];
+    m_chunk_size = p.m_chunk_size;
+    return *this;
+  }
+
+  //----------------------------------------
+
+  template< class FunctorType >
+  inline static
+  int team_size_max( const FunctorType & )
+    { return traits::execution_space::thread_pool_size(1); }
+
+  template< class FunctorType >
+  inline static
+  int team_size_recommended( const FunctorType & )
+    { return traits::execution_space::thread_pool_size(2); }
+
+  template< class FunctorType >
+  inline static
+  int team_size_recommended( const FunctorType &, const int& )
+    { return traits::execution_space::thread_pool_size(2); }
+
+  //----------------------------------------
+
+private:
+
+  int m_league_size ;
+  int m_team_size ;
+  int m_team_alloc ;
+  int m_team_iter ;
+
+  size_t m_team_scratch_size[2];
+  size_t m_thread_scratch_size[2];
+
+  int m_chunk_size;
+
+  inline void init( const int league_size_request
+                  , const int team_size_request )
+    {
+      const int pool_size  = traits::execution_space::thread_pool_size(0);
+      const int team_max   = traits::execution_space::thread_pool_size(1);
+      const int team_grain = traits::execution_space::thread_pool_size(2);
+
+      m_league_size = league_size_request ;
+
+      m_team_size = team_size_request < team_max ?
+                    team_size_request : team_max ;
+
+      // Round team size up to a multiple of 'team_grain'
+      const int team_size_grain = team_grain * ( ( m_team_size + team_grain - 1 ) / team_grain );
+      const int team_count      = pool_size / team_size_grain ;
+
+      // Constraint : pool_size = m_team_alloc * team_count
+      m_team_alloc = pool_size / team_count ;
+
+      // Maximum number of iterations each team will take:
+      m_team_iter  = ( m_league_size + team_count - 1 ) / team_count ;
+
+      set_auto_chunk_size();
+    }
+
+public:
+
+  inline int team_size()   const { return m_team_size ; }
+  inline int league_size() const { return m_league_size ; }
+  inline size_t scratch_size(const int& level, int team_size_ = -1) const {
+    if(team_size_ < 0)
+      team_size_ = m_team_size;
+    return m_team_scratch_size[level] + team_size_*m_thread_scratch_size[level] ;
+  }
+
+  /** \brief  Specify league size, request team size */
+  TeamPolicyInternal( typename traits::execution_space &   // execution space argument is unnamed and unused
+            , int league_size_request
+            , int team_size_request
+            , int /* vector_length_request */ = 1 )        // vector length is accepted but ignored
+            : m_team_scratch_size { 0 , 0 }
+            , m_thread_scratch_size { 0 , 0 }
+            , m_chunk_size(0)
+    { init( league_size_request , team_size_request ); } // init() fills the league/team members and the chunk size
+
+  TeamPolicyInternal( typename traits::execution_space &   // execution space argument is unnamed and unused
+            , int league_size_request
+            , const Kokkos::AUTO_t & /* team_size_request */
+            , int /* vector_length_request */ = 1)         // vector length is accepted but ignored
+            : m_team_scratch_size { 0 , 0 }
+            , m_thread_scratch_size { 0 , 0 }
+            , m_chunk_size(0)
+    { init( league_size_request , traits::execution_space::thread_pool_size(2) ); } // AUTO: request thread_pool_size(2) as the team size
+
+  TeamPolicyInternal( int league_size_request
+            , int team_size_request
+            , int /* vector_length_request */ = 1 )        // vector length is accepted but ignored
+            : m_team_scratch_size { 0 , 0 }
+            , m_thread_scratch_size { 0 , 0 }
+            , m_chunk_size(0)
+    { init( league_size_request , team_size_request ); } // init() fills the league/team members and the chunk size
+
+  TeamPolicyInternal( int league_size_request
+            , const Kokkos::AUTO_t & /* team_size_request */
+            , int /* vector_length_request */ = 1 )        // vector length is accepted but ignored
+            : m_team_scratch_size { 0 , 0 }
+            , m_thread_scratch_size { 0 , 0 }
+            , m_chunk_size(0)
+    { init( league_size_request , traits::execution_space::thread_pool_size(2) ); } // AUTO: request thread_pool_size(2) as the team size
+
+  inline int team_alloc() const { return m_team_alloc ; } // pool_size / team_count as computed in init()
+  inline int team_iter()  const { return m_team_iter ; }  // league size divided by team_count, rounded up
+
+  inline int chunk_size() const { return m_chunk_size ; } // set by set_auto_chunk_size() or set_chunk_size()
+
+  /** \brief return a copy of this policy with chunk_size set to a discrete value*/
+  inline TeamPolicyInternal set_chunk_size(typename traits::index_type chunk_size_) const {
+    TeamPolicyInternal updated( *this ); // this policy itself is left unchanged
+    updated.m_chunk_size = chunk_size_;
+    return updated;
+  }
+
+  inline TeamPolicyInternal set_scratch_size(const int& level, const PerTeamValue& per_team) const {
+    TeamPolicyInternal updated( *this ); // returns a modified copy; only the per-team size at 'level' changes
+    updated.m_team_scratch_size[level] = per_team.value;
+    return updated;
+  }
+
+  inline TeamPolicyInternal set_scratch_size(const int& level, const PerThreadValue& per_thread) const {
+    TeamPolicyInternal updated( *this ); // returns a modified copy; only the per-thread size at 'level' changes
+    updated.m_thread_scratch_size[level] = per_thread.value;
+    return updated;
+  }
+
+  inline TeamPolicyInternal set_scratch_size(const int& level, const PerTeamValue& per_team, const PerThreadValue& per_thread) const {
+    TeamPolicyInternal updated( *this ); // returns a modified copy with both sizes at 'level' replaced
+    updated.m_thread_scratch_size[level] = per_thread.value;
+    updated.m_team_scratch_size[level]   = per_team.value;
+    return updated;
+  }
+
+private:
+  /** \brief choose m_chunk_size automatically from the league size and the number of concurrent teams*/
+  inline void set_auto_chunk_size() {
+    // Number of teams the pool can host at once (pool size / threads per team), at least one.
+    int concurrency = traits::execution_space::thread_pool_size(0)/m_team_alloc;
+    if( concurrency==0 ) concurrency=1;
+    if(m_chunk_size > 0) {
+      if(!Impl::is_integral_power_of_two( m_chunk_size ))
+        Kokkos::abort("TeamPolicy blocking granularity must be power of two" );
+    }
+    // Compare in 64 bits: 'chunk * 100 * concurrency' overflows int for very large league sizes.
+    const long long league_size = m_league_size ;
+    int new_chunk_size = 1;
+    while( static_cast<long long>(new_chunk_size)*100*concurrency < league_size )
+      new_chunk_size *= 2;
+    if(new_chunk_size < 128) {
+      new_chunk_size = 1; // small result: retry with the looser factor 40, capped at 128
+      while( (static_cast<long long>(new_chunk_size)*40*concurrency < league_size ) && (new_chunk_size<128) )
+        new_chunk_size*=2;
+    }
+    m_chunk_size = new_chunk_size;
+  }
+
+public:
+  typedef Impl::OpenMPexecTeamMember member_type ;
+};
+} // namespace Impl
+
+
+} // namespace Kokkos
+
+//----------------------------------------------------------------------------
+//----------------------------------------------------------------------------
+
+namespace Kokkos {
+
+inline
+int OpenMP::thread_pool_size( int depth )
+{ // Forwards to OpenMPexec::pool_size, which defines what each 'depth' value means.
+  return Impl::OpenMPexec::pool_size(depth);
+}
+
+KOKKOS_INLINE_FUNCTION
+int OpenMP::thread_pool_rank()
+{ // Rank of the calling OpenMP thread, looked up through OpenMPexec::m_map_rank.
+#if defined( KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST )
+  return Impl::OpenMPexec::m_map_rank[ omp_get_thread_num() ];
+#else
+  return -1 ; // host memory space is not the active execution memory space
+#endif
+}
+
+} // namespace Kokkos
+
+
+namespace Kokkos {
+
+template<typename iType>
+KOKKOS_INLINE_FUNCTION
+Impl::TeamThreadRangeBoundariesStruct<iType,Impl::OpenMPexecTeamMember>
+  TeamThreadRange(const Impl::OpenMPexecTeamMember& thread, const iType& count) { // range given by a count only
+  return Impl::TeamThreadRangeBoundariesStruct<iType,Impl::OpenMPexecTeamMember>(thread,count); // the struct's constructor derives the boundaries
+}
+
+template<typename iType>
+KOKKOS_INLINE_FUNCTION
+Impl::TeamThreadRangeBoundariesStruct<iType,Impl::OpenMPexecTeamMember>
+  TeamThreadRange(const Impl::OpenMPexecTeamMember& thread, const iType& begin, const iType& end) { // range given by explicit begin and end
+  return Impl::TeamThreadRangeBoundariesStruct<iType,Impl::OpenMPexecTeamMember>(thread,begin,end); // the struct's constructor derives the boundaries
+}
+
+template<typename iType>
+KOKKOS_INLINE_FUNCTION
+Impl::ThreadVectorRangeBoundariesStruct<iType,Impl::OpenMPexecTeamMember >
+  ThreadVectorRange(const Impl::OpenMPexecTeamMember& thread, const iType& count) { // range given by a count only
+  return Impl::ThreadVectorRangeBoundariesStruct<iType,Impl::OpenMPexecTeamMember >(thread,count); // the struct's constructor derives the boundaries
+}
+
+KOKKOS_INLINE_FUNCTION
+Impl::ThreadSingleStruct<Impl::OpenMPexecTeamMember> PerTeam(const Impl::OpenMPexecTeamMember& thread) {
+  return Impl::ThreadSingleStruct<Impl::OpenMPexecTeamMember>(thread); // tag consumed by the ThreadSingleStruct overloads of single()
+}
+
+KOKKOS_INLINE_FUNCTION
+Impl::VectorSingleStruct<Impl::OpenMPexecTeamMember> PerThread(const Impl::OpenMPexecTeamMember& thread) {
+  return Impl::VectorSingleStruct<Impl::OpenMPexecTeamMember>(thread); // tag consumed by the VectorSingleStruct overloads of single()
+}
+} // namespace Kokkos
+
+namespace Kokkos {
+
+  /** \brief  Inter-thread parallel_for. Executes lambda(iType i) for each i=0..N-1.
+   *
+   * The range i=0..N-1 is mapped to all threads of the calling thread team.
+   * This functionality requires C++11 support.*/
+template<typename iType, class Lambda>
+KOKKOS_INLINE_FUNCTION
+void parallel_for(const Impl::TeamThreadRangeBoundariesStruct<iType,Impl::OpenMPexecTeamMember>& loop_boundaries, const Lambda& lambda) {
+  for( iType i = loop_boundaries.start; i < loop_boundaries.end; i+=loop_boundaries.increment) // indices start, start+increment, ... below end
+    lambda(i);
+}
+
+/** \brief  Inter-thread parallel_reduce. Executes lambda(iType i, ValueType & val) for each i=0..N-1.
+ *
+ * The range i=0..N-1 is mapped to all threads of the calling thread team and a summation of
+ * val is performed and put into result. This functionality requires C++11 support.*/
+template< typename iType, class Lambda, typename ValueType >
+KOKKOS_INLINE_FUNCTION
+void parallel_reduce(const Impl::TeamThreadRangeBoundariesStruct<iType,Impl::OpenMPexecTeamMember>& loop_boundaries,
+                     const Lambda & lambda, ValueType& result) {
+
+  result = ValueType(); // any incoming value of result is discarded
+
+  for( iType i = loop_boundaries.start; i < loop_boundaries.end; i+=loop_boundaries.increment) {
+    ValueType tmp = ValueType();
+    lambda(i,tmp);
+    result+=tmp;
+  }
+  // Hand this thread's partial sum to the team reduction, using an additive join
+  result = loop_boundaries.thread.team_reduce(result,Impl::JoinAdd<ValueType>());
+}
+
+/** \brief  Inter-thread parallel_reduce with a custom join. Executes lambda(iType i, ValueType & val) for each i=0..N-1.
+ *
+ * The range i=0..N-1 is mapped to all threads of the calling thread team and a reduction of
+ * val is performed using JoinType(ValueType& val, const ValueType& update) and put into init_result.
+ * The input value of init_result seeds each thread's partial result, so it should be the neutral
+ * element with respect to the join operation (e.g. '0 for +-' or '1 for *'). The per-iteration
+ * temporary handed to lambda is initialized with ValueType(). This functionality requires C++11 support.*/
+template< typename iType, class Lambda, typename ValueType, class JoinType >
+KOKKOS_INLINE_FUNCTION
+void parallel_reduce(const Impl::TeamThreadRangeBoundariesStruct<iType,Impl::OpenMPexecTeamMember>& loop_boundaries,
+                     const Lambda & lambda, const JoinType& join, ValueType& init_result) {
+
+  ValueType result = init_result;
+
+  for( iType i = loop_boundaries.start; i < loop_boundaries.end; i+=loop_boundaries.increment) {
+    ValueType tmp = ValueType();
+    lambda(i,tmp);
+    join(result,tmp);
+  }
+  // Hand this thread's partial result to the team reduction, using the caller's join
+  init_result = loop_boundaries.thread.team_reduce(result,join);
+}
+
+} //namespace Kokkos
+
+
+namespace Kokkos {
+/** \brief  Intra-thread vector parallel_for. Executes lambda(iType i) for each i=0..N-1.
+ *
+ * The range i=0..N-1 is mapped to all vector lanes of the calling thread.
+ * This functionality requires C++11 support.*/
+template<typename iType, class Lambda>
+KOKKOS_INLINE_FUNCTION
+void parallel_for(const Impl::ThreadVectorRangeBoundariesStruct<iType,Impl::OpenMPexecTeamMember >&
+    loop_boundaries, const Lambda& lambda) {
+  #ifdef KOKKOS_HAVE_PRAGMA_IVDEP
+  #pragma ivdep
+  #endif
+  for( iType i = loop_boundaries.start; i < loop_boundaries.end; i+=loop_boundaries.increment) // plain loop in the calling thread; the pragma above is the only vectorization hint
+    lambda(i);
+}
+
+/** \brief  Intra-thread vector parallel_reduce. Executes lambda(iType i, ValueType & val) for each i=0..N-1.
+ *
+ * The range i=0..N-1 is mapped to all vector lanes of the calling thread and a summation of
+ * val is performed and put into result. This functionality requires C++11 support.*/
+template< typename iType, class Lambda, typename ValueType >
+KOKKOS_INLINE_FUNCTION
+void parallel_reduce(const Impl::ThreadVectorRangeBoundariesStruct<iType,Impl::OpenMPexecTeamMember >&
+      loop_boundaries, const Lambda & lambda, ValueType& result) {
+  result = ValueType(); // any incoming value of result is discarded
+#ifdef KOKKOS_HAVE_PRAGMA_IVDEP
+#pragma ivdep
+#endif
+  for( iType i = loop_boundaries.start; i < loop_boundaries.end; i+=loop_boundaries.increment) {
+    ValueType tmp = ValueType();
+    lambda(i,tmp);
+    result+=tmp; // accumulated locally; no team-level reduction is performed here
+  }
+}
+
+/** \brief  Intra-thread vector parallel_reduce with a custom join. Executes lambda(iType i, ValueType & val) for each i=0..N-1.
+ *
+ * The range i=0..N-1 is mapped to all vector lanes of the calling thread and a reduction of
+ * val is performed using JoinType(ValueType& val, const ValueType& update) and put into init_result.
+ * The input value of init_result seeds the running result, so it should be the neutral
+ * element with respect to the join operation (e.g. '0 for +-' or '1 for *'). The per-iteration
+ * temporary handed to lambda is initialized with ValueType(). This functionality requires C++11 support.*/
+template< typename iType, class Lambda, typename ValueType, class JoinType >
+KOKKOS_INLINE_FUNCTION
+void parallel_reduce(const Impl::ThreadVectorRangeBoundariesStruct<iType,Impl::OpenMPexecTeamMember >&
+      loop_boundaries, const Lambda & lambda, const JoinType& join, ValueType& init_result) {
+
+  ValueType result = init_result;
+#ifdef KOKKOS_HAVE_PRAGMA_IVDEP
+#pragma ivdep
+#endif
+  for( iType i = loop_boundaries.start; i < loop_boundaries.end; i+=loop_boundaries.increment) {
+    ValueType tmp = ValueType();
+    lambda(i,tmp);
+    join(result,tmp);
+  }
+  init_result = result; // written back directly; no team-level reduction is performed here
+}
+
+/** \brief  Intra-thread vector parallel exclusive prefix sum. Executes lambda(iType i, ValueType & val, bool final)
+ *          for each i=0..N-1.
+ *
+ * The range i=0..N-1 is mapped to all vector lanes in the thread and a scan operation is performed.
+ * Depending on the target execution space the operator might be called twice: once with final=false
+ * and once with final=true. When final==true val contains the prefix sum value. The contribution of this
+ * "i" needs to be added to val no matter whether final==true or not. In this implementation the
+ * operator is called exactly once per index, with final==true, and the running value is kept in a
+ * local variable that is not returned to the caller.
+ * This functionality requires C++11 support.*/
+template< typename iType, class FunctorType >
+KOKKOS_INLINE_FUNCTION
+void parallel_scan(const Impl::ThreadVectorRangeBoundariesStruct<iType,Impl::OpenMPexecTeamMember >&
+      loop_boundaries, const FunctorType & lambda) {
+
+  typedef Kokkos::Impl::FunctorValueTraits< FunctorType , void > ValueTraits ;
+  typedef typename ValueTraits::value_type value_type ;
+
+  value_type scan_val = value_type(); // running prefix value, updated by lambda through its reference parameter
+
+#ifdef KOKKOS_HAVE_PRAGMA_IVDEP
+#pragma ivdep
+#endif
+  for( iType i = loop_boundaries.start; i < loop_boundaries.end; i+=loop_boundaries.increment) {
+    lambda(i,scan_val,true);
+  }
+}
+
+} // namespace Kokkos
+
+namespace Kokkos {
+
+template<class FunctorType>
+KOKKOS_INLINE_FUNCTION
+void single(const Impl::VectorSingleStruct<Impl::OpenMPexecTeamMember>& single_struct, const FunctorType& lambda) {
+  lambda(); // called unconditionally by the calling thread; single_struct is not consulted
+}
+
+template<class FunctorType>
+KOKKOS_INLINE_FUNCTION
+void single(const Impl::ThreadSingleStruct<Impl::OpenMPexecTeamMember>& single_struct, const FunctorType& lambda) {
+  if(single_struct.team_member.team_rank()==0) lambda(); // only team rank 0 runs the functor
+}
+
+template<class FunctorType, class ValueType>
+KOKKOS_INLINE_FUNCTION
+void single(const Impl::VectorSingleStruct<Impl::OpenMPexecTeamMember>& single_struct, const FunctorType& lambda, ValueType& val) {
+  lambda(val); // called unconditionally; val is passed straight through and no broadcast follows
+}
+
+template<class FunctorType, class ValueType>
+KOKKOS_INLINE_FUNCTION
+void single(const Impl::ThreadSingleStruct<Impl::OpenMPexecTeamMember>& single_struct, const FunctorType& lambda, ValueType& val) {
+  if(single_struct.team_member.team_rank()==0) { // only team rank 0 runs the functor
+    lambda(val);
+  }
+  single_struct.team_member.team_broadcast(val,0); // every caller takes part in the broadcast, with rank 0 as the source
+}
+}
+
+#endif /* #ifndef KOKKOS_OPENMPEXEC_HPP */
+
diff --git a/lib/kokkos/core/src/Qthread/Kokkos_QthreadExec.cpp b/lib/kokkos/core/src/Qthread/Kokkos_QthreadExec.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..3123a297c4478a3ec1f48525048945055311f032
--- /dev/null
+++ b/lib/kokkos/core/src/Qthread/Kokkos_QthreadExec.cpp
@@ -0,0 +1,511 @@
+/*
+//@HEADER
+// ************************************************************************
+// 
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+// 
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+// 
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact  H. Carter Edwards (hcedwar@sandia.gov)
+// 
+// ************************************************************************
+//@HEADER
+*/
+
+#include <Kokkos_Core_fwd.hpp>
+
+#if defined( KOKKOS_HAVE_QTHREAD )
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <iostream>
+#include <sstream>
+#include <utility>
+#include <Kokkos_Qthread.hpp>
+#include <Kokkos_Atomic.hpp>
+#include <impl/Kokkos_Error.hpp>
+
+// Defines to enable experimental Qthread functionality
+
+#define QTHREAD_LOCAL_PRIORITY
+#define CLONED_TASKS
+
+#include <qthread/qthread.h>
+
+//----------------------------------------------------------------------------
+
+namespace Kokkos {
+namespace Impl {
+namespace {
+
+enum { MAXIMUM_QTHREAD_WORKERS = 1024 }; // capacity of the s_exec table below
+
+/** s_exec is indexed by the reverse rank of the workers
+ *  for faster fan-in / fan-out lookups
+ *  [ n - 1 , n - 2 , ... , 0 ]
+ */
+QthreadExec * s_exec[ MAXIMUM_QTHREAD_WORKERS ];
+
+int  s_number_shepherds            = 0 ; // set by Qthread::initialize, reset to 0 by Qthread::finalize
+int  s_number_workers_per_shepherd = 0 ; // set by Qthread::initialize, reset to 0 by Qthread::finalize
+int  s_number_workers              = 0 ; // shepherds * workers per shepherd; 0 means "not initialized"
+
+inline
+QthreadExec ** worker_exec()
+{ // s_exec slot of the calling worker: rank = shepherd * workers_per_shepherd + local worker, stored in reverse order.
+  return s_exec + s_number_workers - ( qthread_shep() * s_number_workers_per_shepherd + qthread_worker_local(NULL) + 1 );
+}
+
+const int s_base_size = QthreadExec::align_alloc( sizeof(QthreadExec) ); // offset from a QthreadExec object to its scratch memory
+
+int s_worker_reduce_end   = 0 ; /* End of worker reduction memory    */
+int s_worker_shared_end   = 0 ; /* Total of worker scratch memory    */
+int s_worker_shared_begin = 0 ; /* Beginning of worker shared memory */
+
+QthreadExecFunctionPointer volatile s_active_function = 0 ; // set by exec_all for the duration of a dispatch, 0 otherwise
+const void               * volatile s_active_function_arg = 0 ; // argument handed to s_active_function
+
+} /* namespace */
+} /* namespace Impl */
+} /* namespace Kokkos */
+
+//----------------------------------------------------------------------------
+
+namespace Kokkos {
+
+int Qthread::is_initialized()
+{ // The worker count is nonzero only between a successful initialize() and finalize().
+  return Impl::s_number_workers != 0 ;
+}
+
+int Qthread::concurrency()
+{ // Reports the workers per shepherd, not the total worker count.
+  return Impl::s_number_workers_per_shepherd ;
+}
+
+int Qthread::in_parallel()
+{ // True while exec_all has an active function installed.
+  return Impl::s_active_function != 0 ;
+}
+
+void Qthread::initialize( int thread_count )
+{ // Start the qthread runtime sized for thread_count workers; throws if the resulting layout does not match.
+  // Environment variable: QTHREAD_NUM_SHEPHERDS
+  // Environment variable: QTHREAD_NUM_WORKERS_PER_SHEP
+  // Environment variable: QTHREAD_HWPAR
+  // putenv() stores the pointer it is given rather than a copy, so the buffer must have static storage.
+  {
+    static char buffer[256];
+    snprintf(buffer,sizeof(buffer),"QTHREAD_HWPAR=%d",thread_count);
+    putenv(buffer);
+  }
+
+  const bool ok_init = ( QTHREAD_SUCCESS == qthread_initialize() ) &&
+                       ( thread_count    == qthread_num_shepherds() * qthread_num_workers_local(NO_SHEPHERD) ) &&
+                       ( thread_count    == qthread_num_workers() );
+
+  bool ok_symmetry = true ;
+
+  if ( ok_init ) {
+    Impl::s_number_shepherds            = qthread_num_shepherds();
+    Impl::s_number_workers_per_shepherd = qthread_num_workers_local(NO_SHEPHERD);
+    Impl::s_number_workers              = Impl::s_number_shepherds * Impl::s_number_workers_per_shepherd ;
+    // Every shepherd must report the same number of workers
+    for ( int i = 0 ; ok_symmetry && i < Impl::s_number_shepherds ; ++i ) {
+      ok_symmetry = ( Impl::s_number_workers_per_shepherd == qthread_num_workers_local(i) );
+    }
+  }
+
+  if ( ! ok_init || ! ok_symmetry ) {
+    std::ostringstream msg ;
+
+    msg << "Kokkos::Qthread::initialize(" << thread_count << ") FAILED" ;
+    msg << " : qthread_num_shepherds = " << qthread_num_shepherds();
+    msg << " : qthread_num_workers_per_shepherd = " << qthread_num_workers_local(NO_SHEPHERD);
+    msg << " : qthread_num_workers = " << qthread_num_workers();
+
+    if ( ! ok_symmetry ) {
+      msg << " : qthread_num_workers_local = {" ;
+      for ( int i = 0 ; i < Impl::s_number_shepherds ; ++i ) {
+        msg << " " << qthread_num_workers_local(i) ;
+      }
+      msg << " }" ;
+    }
+    // Put the globals back into the "not initialized" state before reporting the failure
+    Impl::s_number_workers   = 0 ;
+    Impl::s_number_shepherds = 0 ;
+    Impl::s_number_workers_per_shepherd = 0 ;
+
+    if ( ok_init ) { qthread_finalize(); }
+
+    Kokkos::Impl::throw_runtime_exception( msg.str() );
+  }
+
+  Impl::QthreadExec::resize_worker_scratch( 256 , 256 );
+
+  // Init the array used for arbitrarily sized atomics
+  Impl::init_lock_array_host_space();
+
+}
+
+void Qthread::finalize()
+{
+  // Release per-worker state first: clear_workers() walks s_number_workers slots.
+  Impl::QthreadExec::clear_workers();
+
+  // A nonzero worker count means the runtime is up and has to be shut down.
+  if ( 0 != Impl::s_number_workers ) { qthread_finalize(); }
+
+  Impl::s_number_workers_per_shepherd = 0 ;
+  Impl::s_number_shepherds            = 0 ;
+  Impl::s_number_workers              = 0 ;
+}
+
+void Qthread::print_configuration( std::ostream & s , const bool detail )
+{ // Writes the shepherd and per-shepherd worker counts on one line; 'detail' is not used.
+  s << "Kokkos::Qthread {"
+    << " num_shepherds(" << Impl::s_number_shepherds << ")"
+    << " num_workers_per_shepherd(" << Impl::s_number_workers_per_shepherd << ")"
+    << " }" << std::endl ;
+}
+
+Qthread & Qthread::instance( int )
+{ // Returns the one function-local static instance; the integer argument is ignored.
+  static Qthread q ;
+  return q ;
+}
+
+void Qthread::fence()
+{ // Intentionally empty: this implementation performs no work in fence().
+}
+
+int Qthread::shepherd_size() const { return Impl::s_number_shepherds ; } // number of shepherds recorded by initialize()
+int Qthread::shepherd_worker_size() const { return Impl::s_number_workers_per_shepherd ; } // workers per shepherd recorded by initialize()
+
+} /* namespace Kokkos */
+
+//----------------------------------------------------------------------------
+
+namespace Kokkos {
+namespace Impl {
+namespace {
+
+aligned_t driver_exec_all( void * arg )
+{ // Task body used by exec_all: run the active function on the calling worker's QthreadExec; 'arg' is unused.
+  QthreadExec & exec = **worker_exec();
+
+  (*s_active_function)( exec , s_active_function_arg );
+
+/*
+  fprintf( stdout
+         , "QthreadExec driver worker(%d:%d) shepherd(%d:%d) shepherd_worker(%d:%d) done\n"
+         , exec.worker_rank()
+         , exec.worker_size()
+         , exec.shepherd_rank()
+         , exec.shepherd_size()
+         , exec.shepherd_worker_rank()
+         , exec.shepherd_worker_size()
+         );
+  fflush(stdout);
+*/
+
+  return 0 ;
+}
+
+aligned_t driver_resize_worker_scratch( void * arg )
+{ // Task body used by resize_worker_scratch: allocate and construct the calling worker's QthreadExec; 'arg' is unused.
+  static volatile int lock_begin = 0 ;
+  static volatile int lock_end   = 0 ;
+
+  QthreadExec ** const exec = worker_exec();
+
+  //----------------------------------------
+  // Serialize allocation for thread safety
+
+  while ( ! atomic_compare_exchange_strong( & lock_begin , 0 , 1 ) ); // Spin wait to claim lock
+
+  const bool vacant = 0 == *exec ; // a populated slot means this worker's entry was already created
+  if ( vacant ) { *exec = (QthreadExec *) malloc( s_base_size + s_worker_shared_end ); }
+  // Success requires a vacant slot AND a non-null allocation; never construct into a null pointer.
+  const bool ok = vacant && 0 != *exec ;
+  lock_begin = 0 ; // release lock
+
+  if ( ok ) { new( *exec ) QthreadExec(); }
+
+  //----------------------------------------
+  // Wait for all calls to complete to insure that each worker has executed.
+
+  if ( s_number_workers == 1 + atomic_fetch_add( & lock_end , 1 ) ) { lock_end = 0 ; }
+
+  while ( lock_end );
+
+/*
+  fprintf( stdout
+         , "QthreadExec resize worker(%d:%d) shepherd(%d:%d) shepherd_worker(%d:%d) done\n"
+         , (**exec).worker_rank()
+         , (**exec).worker_size()
+         , (**exec).shepherd_rank()
+         , (**exec).shepherd_size()
+         , (**exec).shepherd_worker_rank()
+         , (**exec).shepherd_worker_size()
+         );
+  fflush(stdout);
+*/
+
+  //----------------------------------------
+  // Report the failure here; a slot left null is also caught by the caller's verification.
+  if ( ! ok ) {
+    fprintf( stderr , "Kokkos::QthreadExec resize failed\n" );
+    fflush( stderr );
+  }
+
+  return 0 ;
+}
+
+void verify_is_process( const char * const label , bool not_active = false )
+{ // Throws unless called from shepherd 0 / local worker 0 and, when not_active is set, outside any dispatch.
+  const bool on_main   = 0 == qthread_shep() && 0 == qthread_worker_local(NULL);
+  const bool is_active = not_active && ( s_active_function || s_active_function_arg );
+
+  if ( on_main && ! is_active ) return ;
+
+  std::string msg( label );
+  msg.append( " : FAILED" );
+  if ( ! on_main ) msg.append(" : not called by main process");
+  if ( is_active ) msg.append(" : parallel execution in progress");
+  Kokkos::Impl::throw_runtime_exception( msg );
+}
+
+}
+
+int QthreadExec::worker_per_shepherd()
+{ // Exposes the file-local per-shepherd worker count.
+  return s_number_workers_per_shepherd ;
+}
+
+QthreadExec::QthreadExec()
+{ // Records the calling worker's ranks and sizes; must therefore run on the worker it describes.
+  const int shepherd_rank        = qthread_shep();
+  const int shepherd_worker_rank = qthread_worker_local(NULL);
+  const int worker_rank          = shepherd_rank * s_number_workers_per_shepherd + shepherd_worker_rank ;
+  // m_shepherd_base: start of this shepherd's run of slots in the reverse-ordered s_exec table
+  m_worker_base          = s_exec ;
+  m_shepherd_base        = s_exec + s_number_workers_per_shepherd * ( ( s_number_shepherds - ( shepherd_rank + 1 ) ) );
+  m_scratch_alloc        = ( (unsigned char *) this ) + s_base_size ; // scratch memory follows this object in the same allocation
+  m_reduce_end           = s_worker_reduce_end ;
+  m_shepherd_rank        = shepherd_rank ;
+  m_shepherd_size        = s_number_shepherds ;
+  m_shepherd_worker_rank = shepherd_worker_rank ;
+  m_shepherd_worker_size = s_number_workers_per_shepherd ;
+  m_worker_rank          = worker_rank ;
+  m_worker_size          = s_number_workers ;
+  m_worker_state         = QthreadExec::Active ;
+}
+
+void QthreadExec::clear_workers()
+{ // Empty the first s_number_workers slots of s_exec, freeing whatever each one held.
+  for ( QthreadExec ** slot = s_exec ; slot < s_exec + s_number_workers ; ++slot ) {
+    QthreadExec * const stale = *slot ;
+    *slot = 0 ;
+    free( stale );
+  }
+}
+
+void QthreadExec::shared_reset( Qthread::scratch_memory_space & space )
+{ // Re-construct 'space' in place over the shared region of the scratch owned by the entry at m_shepherd_base.
+  new( & space )
+    Qthread::scratch_memory_space(
+      ((unsigned char *) (**m_shepherd_base).m_scratch_alloc ) + s_worker_shared_begin ,
+      s_worker_shared_end - s_worker_shared_begin
+    );
+}
+
+void QthreadExec::resize_worker_scratch( const int reduce_size , const int shared_size )
+{ // Grow (never shrink) every worker's scratch so it holds reduce_size and shared_size bytes; throws if a worker is missing afterwards.
+  const int exec_all_reduce_alloc = align_alloc( reduce_size );
+  const int shepherd_scan_alloc   = align_alloc( 8 );
+  const int shepherd_shared_end   = exec_all_reduce_alloc + shepherd_scan_alloc + align_alloc( shared_size );
+  // Scratch layout per worker: [ reduce | shepherd scan | shared ]
+  if ( s_worker_reduce_end < exec_all_reduce_alloc ||
+       s_worker_shared_end < shepherd_shared_end ) {
+
+/*
+  fprintf( stdout , "QthreadExec::resize\n");
+  fflush(stdout);
+*/
+
+    // Clear current worker memory before allocating new worker memory
+    clear_workers();
+
+    // Increase the buffers to an aligned allocation
+    s_worker_reduce_end   = exec_all_reduce_alloc ;
+    s_worker_shared_begin = exec_all_reduce_alloc + shepherd_scan_alloc ;
+    s_worker_shared_end   = shepherd_shared_end ;
+
+    // Need to query which shepherd this main 'process' is running...
+ 
+    const int main_shep = qthread_shep();
+
+    // Have each worker resize its memory for proper first-touch
+#if 0
+    for ( int jshep = 0 ; jshep < s_number_shepherds ; ++jshep ) {
+    for ( int i = jshep != main_shep ? 0 : 1 ; i < s_number_workers_per_shepherd ; ++i ) {
+      qthread_fork_to( driver_resize_worker_scratch , NULL , NULL , jshep );
+    }}
+#else
+    // If this function is used before the 'qthread.task_policy' unit test
+    // the 'qthread.task_policy' unit test fails with a seg-fault within libqthread.so.
+    for ( int jshep = 0 ; jshep < s_number_shepherds ; ++jshep ) {
+      const int num_clone = jshep != main_shep ? s_number_workers_per_shepherd : s_number_workers_per_shepherd - 1 ;
+      // The main shepherd gets one task fewer because this thread runs the driver itself below
+      if ( num_clone ) {
+        const int ret = qthread_fork_clones_to_local_priority
+          ( driver_resize_worker_scratch   /* function */
+          , NULL                           /* function data block */
+          , NULL                           /* pointer to return value feb */
+          , jshep                          /* shepherd number */
+          , num_clone - 1                  /* number of instances - 1 */
+          );
+        // NOTE(review): the fork result is only checked through assert, so presumably unchecked in NDEBUG builds — confirm
+        assert(ret == QTHREAD_SUCCESS);
+      }
+    }
+#endif
+    // Run the driver on the calling worker as well; it returns once all s_number_workers calls have arrived
+    driver_resize_worker_scratch( NULL );
+
+    // Verify all workers allocated
+
+    bool ok = true ;
+    for ( int iwork = 0 ; ok && iwork < s_number_workers ; ++iwork ) { ok = 0 != s_exec[iwork] ; }
+
+    if ( ! ok ) {
+      std::ostringstream msg ;
+      msg << "Kokkos::Impl::QthreadExec::resize : FAILED for workers {" ;
+      for ( int iwork = 0 ; iwork < s_number_workers ; ++iwork ) {
+         if ( 0 == s_exec[iwork] ) { msg << " " << ( s_number_workers - ( iwork + 1 ) ); } // convert the reverse index back to a worker rank
+      }
+      msg << " }" ;
+      Kokkos::Impl::throw_runtime_exception( msg.str() );
+    }
+  }
+}
+
+void QthreadExec::exec_all( Qthread & , QthreadExecFunctionPointer func , const void * arg )
+{ // Run func(exec,arg) once per worker: forked tasks for the other workers, a direct call for the caller.
+  verify_is_process("QthreadExec::exec_all(...)",true);
+
+/*
+  fprintf( stdout , "QthreadExec::exec_all\n");
+  fflush(stdout);
+*/
+  // Publish the function and argument that driver_exec_all reads
+  s_active_function     = func ;
+  s_active_function_arg = arg ;
+
+  // Need to query which shepherd this main 'process' is running...
+ 
+  const int main_shep = qthread_shep();
+
+#if 0
+  for ( int jshep = 0 , iwork = 0 ; jshep < s_number_shepherds ; ++jshep ) {
+  for ( int i = jshep != main_shep ? 0 : 1 ; i < s_number_workers_per_shepherd ; ++i , ++iwork ) {
+    qthread_fork_to( driver_exec_all , NULL , NULL , jshep );
+  }}
+#else
+  // If this function is used before the 'qthread.task_policy' unit test
+  // the 'qthread.task_policy' unit test fails with a seg-fault within libqthread.so.
+  for ( int jshep = 0 ; jshep < s_number_shepherds ; ++jshep ) {
+    const int num_clone = jshep != main_shep ? s_number_workers_per_shepherd : s_number_workers_per_shepherd - 1 ;
+    // The main shepherd gets one task fewer because this thread runs the driver itself below
+    if ( num_clone ) {
+      const int ret = qthread_fork_clones_to_local_priority
+        ( driver_exec_all   /* function */
+        , NULL              /* function data block */
+        , NULL              /* pointer to return value feb */
+        , jshep             /* shepherd number */
+        , num_clone - 1     /* number of instances - 1 */
+        );
+      // NOTE(review): the fork result is only checked through assert, so presumably unchecked in NDEBUG builds — confirm
+      assert(ret == QTHREAD_SUCCESS);
+    }
+  }
+#endif
+
+  driver_exec_all( NULL );
+  // NOTE(review): nothing here waits for the forked tasks; presumably func synchronizes the workers before returning — confirm
+  s_active_function     = 0 ;
+  s_active_function_arg = 0 ;
+}
+
+void * QthreadExec::exec_all_reduce_result()
+{ // Start of the scratch memory of the entry stored in s_exec[0].
+  return s_exec[0]->m_scratch_alloc ;
+}
+
+} /* namespace Impl */
+} /* namespace Kokkos */
+
+namespace Kokkos {
+namespace Impl {
+
+QthreadTeamPolicyMember::QthreadTeamPolicyMember()
+  : m_exec( **worker_exec() )   // QthreadExec of the calling worker
+  , m_team_shared(0,0)          // placeholder; re-constructed by shared_reset() in the body
+  , m_team_size( 1 )
+  , m_team_rank( 0 )
+  , m_league_size(1)
+  , m_league_end(1)
+  , m_league_rank(0)
+{ // Describes a team of one member in a league of one.
+  m_exec.shared_reset( m_team_shared );
+}
+
+QthreadTeamPolicyMember::QthreadTeamPolicyMember( const QthreadTeamPolicyMember::TaskTeam & )
+  : m_exec( **worker_exec() )   // QthreadExec of the calling worker
+  , m_team_shared(0,0)          // placeholder; re-constructed by shared_reset() in the body
+  , m_team_size( s_number_workers_per_shepherd )
+  , m_team_rank( m_exec.shepherd_worker_rank() )
+  , m_league_size(1)
+  , m_league_end(1)
+  , m_league_rank(0)
+{ // Describes a team spanning one shepherd's workers, in a league of one; the TaskTeam tag only selects this overload.
+  m_exec.shared_reset( m_team_shared );
+}
+
+} /* namespace Impl */
+} /* namespace Kokkos */
+
+//----------------------------------------------------------------------------
+
+#endif /* #if defined( KOKKOS_HAVE_QTHREAD ) */
+
diff --git a/lib/kokkos/core/src/Qthread/Kokkos_QthreadExec.hpp b/lib/kokkos/core/src/Qthread/Kokkos_QthreadExec.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..f948eb2903b631e82727e670e84339383d5891c9
--- /dev/null
+++ b/lib/kokkos/core/src/Qthread/Kokkos_QthreadExec.hpp
@@ -0,0 +1,620 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+//
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact  H. Carter Edwards (hcedwar@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#ifndef KOKKOS_QTHREADEXEC_HPP
+#define KOKKOS_QTHREADEXEC_HPP
+
+#include <impl/Kokkos_spinwait.hpp>
+
+//----------------------------------------------------------------------------
+
+namespace Kokkos {
+namespace Impl {
+
+//----------------------------------------------------------------------------
+
+class QthreadExec ;
+
+typedef void (*QthreadExecFunctionPointer)( QthreadExec & , const void * );
+
+class QthreadExec {
+private:
+
+  enum { Inactive = 0 , Active = 1 };
+
+  const QthreadExec * const * m_worker_base ;
+  const QthreadExec * const * m_shepherd_base ;
+
+  void  * m_scratch_alloc ;  ///< Scratch memory [ reduce , team , shared ]
+  int     m_reduce_end ;     ///< End of scratch reduction memory
+
+  int     m_shepherd_rank ;
+  int     m_shepherd_size ;
+
+  int     m_shepherd_worker_rank ;
+  int     m_shepherd_worker_size ;
+
+  /*
+   *  m_worker_rank = m_shepherd_rank * m_shepherd_worker_size + m_shepherd_worker_rank
+   *  m_worker_size = m_shepherd_size * m_shepherd_worker_size
+   */
+  int     m_worker_rank ;
+  int     m_worker_size ;
+
+  int mutable volatile m_worker_state ;
+
+
+  friend class Kokkos::Qthread ;
+
+  ~QthreadExec();
+  QthreadExec( const QthreadExec & );
+  QthreadExec & operator = ( const QthreadExec & );
+
+public:
+
+  QthreadExec();
+
+  /** Execute the input function on all available Qthread workers */
+  static void exec_all( Qthread & , QthreadExecFunctionPointer , const void * );
+
+  //----------------------------------------
+  /** Barrier across all workers participating in the 'exec_all' */
+  void exec_all_barrier() const
+    {
+      const int rev_rank = m_worker_size - ( m_worker_rank + 1 );
+      // rev_rank counts down from the highest m_worker_rank, which gets rev_rank == 0.
+      int n , j ;
+      // Fan-in: for n = 1,2,4,... while bit n of rev_rank is clear, spinwait on entry rev_rank + n.
+      for ( n = 1 ; ( ! ( rev_rank & n ) ) && ( ( j = rev_rank + n ) < m_worker_size ) ; n <<= 1 ) {
+        Impl::spinwait( m_worker_base[j]->m_worker_state , QthreadExec::Active );
+      }
+      // Every worker except rev_rank == 0 marks itself Inactive and spinwaits on its own state.
+      if ( rev_rank ) {
+        m_worker_state = QthreadExec::Inactive ;
+        Impl::spinwait( m_worker_state , QthreadExec::Inactive );
+      }
+      // Fan-out: set the state of each fan-in partner back to Active.
+      for ( n = 1 ; ( ! ( rev_rank & n ) ) && ( ( j = rev_rank + n ) < m_worker_size ) ; n <<= 1 ) {
+        m_worker_base[j]->m_worker_state = QthreadExec::Active ;
+      }
+    }
+
+  /** Barrier across this shepherd's workers whose shepherd worker rank < team_size */
+  void shepherd_barrier( const int team_size ) const
+    {
+      if ( m_shepherd_worker_rank < team_size ) {
+        // Workers with m_shepherd_worker_rank >= team_size skip the barrier entirely.
+        const int rev_rank = team_size - ( m_shepherd_worker_rank + 1 );
+
+        int n , j ;
+        // Fan-in: for n = 1,2,4,... while bit n of rev_rank is clear, spinwait on entry rev_rank + n.
+        for ( n = 1 ; ( ! ( rev_rank & n ) ) && ( ( j = rev_rank + n ) < team_size ) ; n <<= 1 ) {
+          Impl::spinwait( m_shepherd_base[j]->m_worker_state , QthreadExec::Active );
+        }
+        // Every worker except rev_rank == 0 marks itself Inactive and spinwaits on its own state.
+        if ( rev_rank ) {
+          m_worker_state = QthreadExec::Inactive ;
+          Impl::spinwait( m_worker_state , QthreadExec::Inactive );
+        }
+        // Fan-out: set the state of each fan-in partner back to Active.
+        for ( n = 1 ; ( ! ( rev_rank & n ) ) && ( ( j = rev_rank + n ) < team_size ) ; n <<= 1 ) {
+          m_shepherd_base[j]->m_worker_state = QthreadExec::Active ;
+        }
+      }
+    }
+
+  //----------------------------------------
+  /** Reduce across all workers participating in the 'exec_all' */
+  template< class FunctorType , class ReducerType , class ArgTag >
+  inline
+  void exec_all_reduce( const FunctorType & func, const ReducerType & reduce ) const
+    {
+      typedef Kokkos::Impl::if_c< std::is_same<InvalidType, ReducerType>::value, FunctorType, ReducerType > ReducerConditional;
+      typedef typename ReducerConditional::type ReducerTypeFwd;
+      typedef Kokkos::Impl::FunctorValueJoin< ReducerTypeFwd, ArgTag > ValueJoin ;
+      // NOTE(review): ReducerConditional presumably selects 'func' when ReducerType is InvalidType, else 'reduce' -- confirm in if_c.
+      const int rev_rank = m_worker_size - ( m_worker_rank + 1 );
+
+      int n , j ;
+      // Fan-in: once a partner has been waited on, join its scratch value into this worker's scratch.
+      for ( n = 1 ; ( ! ( rev_rank & n ) ) && ( ( j = rev_rank + n ) < m_worker_size ) ; n <<= 1 ) {
+        const QthreadExec & fan = *m_worker_base[j];
+
+        Impl::spinwait( fan.m_worker_state , QthreadExec::Active );
+
+        ValueJoin::join( ReducerConditional::select(func , reduce) , m_scratch_alloc , fan.m_scratch_alloc );
+      }
+      // Every worker except rev_rank == 0 marks itself Inactive and spinwaits on its own state.
+      if ( rev_rank ) {
+        m_worker_state = QthreadExec::Inactive ;
+        Impl::spinwait( m_worker_state , QthreadExec::Inactive );
+      }
+      // Fan-out: set the state of each fan-in partner back to Active.
+      for ( n = 1 ; ( ! ( rev_rank & n ) ) && ( ( j = rev_rank + n ) < m_worker_size ) ; n <<= 1 ) {
+        m_worker_base[j]->m_worker_state = QthreadExec::Active ;
+      }
+    }
+
+  //----------------------------------------
+  /** Scan across all workers participating in the 'exec_all' */
+  template< class FunctorType , class ArgTag >
+  inline
+  void exec_all_scan( const FunctorType & func ) const
+    {
+      typedef Kokkos::Impl::FunctorValueInit<   FunctorType , ArgTag > ValueInit ;
+      typedef Kokkos::Impl::FunctorValueJoin<   FunctorType , ArgTag > ValueJoin ;
+      typedef Kokkos::Impl::FunctorValueOps<    FunctorType , ArgTag > ValueOps ;
+
+      const int rev_rank = m_worker_size - ( m_worker_rank + 1 );
+
+      int n , j ;
+      // Fan-in: for n = 1,2,4,... while bit n of rev_rank is clear, spinwait on entry rev_rank + n.
+      for ( n = 1 ; ( ! ( rev_rank & n ) ) && ( ( j = rev_rank + n ) < m_worker_size ) ; n <<= 1 ) {
+        Impl::spinwait( m_worker_base[j]->m_worker_state , QthreadExec::Active );
+      }
+      // Non-root workers park themselves; the root (rev_rank == 0) performs the scan below.
+      if ( rev_rank ) {
+        m_worker_state = QthreadExec::Inactive ;
+        Impl::spinwait( m_worker_state , QthreadExec::Inactive );
+      }
+      else {
+        // Root thread scans across values before releasing threads
+        // Worker data is in reverse order, so m_worker_base[0] is the
+        // highest ranking thread.
+
+        // Copy from lower ranking to higher ranking worker.
+        for ( int i = 1 ; i < m_worker_size ; ++i ) {
+          ValueOps::copy( func
+                        , m_worker_base[i-1]->m_scratch_alloc
+                        , m_worker_base[i]->m_scratch_alloc
+                        );
+        }
+        // The last entry has no lower ranking neighbour, so it is re-initialized via ValueInit::init.
+        ValueInit::init( func , m_worker_base[m_worker_size-1]->m_scratch_alloc );
+
+        // Join from lower ranking to higher ranking worker.
+        // Entry [m_worker_size-1] was just re-initialized, so joining it into entry [m_worker_size-2] is skipped.
+        for ( int i = m_worker_size - 1 ; --i > 0 ; ) {
+          ValueJoin::join( func , m_worker_base[i-1]->m_scratch_alloc , m_worker_base[i]->m_scratch_alloc );
+        }
+      }
+      // Fan-out: set the state of each fan-in partner back to Active.
+      for ( n = 1 ; ( ! ( rev_rank & n ) ) && ( ( j = rev_rank + n ) < m_worker_size ) ; n <<= 1 ) {
+        m_worker_base[j]->m_worker_state = QthreadExec::Active ;
+      }
+    }
+
+  //----------------------------------------
+
+  template< class Type>
+  inline  // Typed pointer to this worker's scratch memory at byte offset m_reduce_end.
+  volatile Type * shepherd_team_scratch_value() const
+    { return reinterpret_cast< volatile Type * >( static_cast< unsigned char * >( m_scratch_alloc ) + m_reduce_end ); }
+
+  template< class Type >  // Broadcast 'value' from the worker whose shepherd worker rank == team_rank.
+  inline                  // Each caller reads the shared slot back into 'value'; no-op when m_shepherd_base is null.
+  void shepherd_broadcast( Type & value , const int team_size , const int team_rank ) const
+    {
+      if ( m_shepherd_base ) {
+        volatile Type * const shared_value = m_shepherd_base[0]->shepherd_team_scratch_value<Type>();  // accessor yields 'volatile Type *'; the qualifier must be kept
+        if ( m_shepherd_worker_rank == team_rank ) { *shared_value = value ; }  // only the source worker writes the slot
+        memory_fence();
+        shepherd_barrier( team_size );  // the slot is read back only after the team barrier
+        value = *shared_value ;
+      }
+    }
+
+  template< class Type >
+  inline  // Sum 'value' over the team_size workers of this shepherd; returns this worker's scratch slot afterwards.
+  Type shepherd_reduce( const int team_size , const Type & value ) const
+    {
+      *shepherd_team_scratch_value<Type>() = value ;
+      // Each worker first stores its contribution in its own scratch slot (above), then synchronizes.
+      memory_fence();
+      // rev_rank == 0 is the worker with the highest shepherd worker rank inside the team.
+      const int rev_rank = team_size - ( m_shepherd_worker_rank + 1 );
+
+      int n , j ;
+      // Fan-in: for n = 1,2,4,... while bit n of rev_rank is clear, spinwait on entry rev_rank + n.
+      for ( n = 1 ; ( ! ( rev_rank & n ) ) && ( ( j = rev_rank + n ) < team_size ) ; n <<= 1 ) {
+        Impl::spinwait( m_shepherd_base[j]->m_worker_state , QthreadExec::Active );
+      }
+      // Non-root workers park themselves; the root (rev_rank == 0) accumulates and redistributes.
+      if ( rev_rank ) {
+        m_worker_state = QthreadExec::Inactive ;
+        Impl::spinwait( m_worker_state , QthreadExec::Inactive );
+      }
+      else {
+        Type total = * m_shepherd_base[0]->shepherd_team_scratch_value<Type>();  // local copy: slots are volatile
+        for ( int i = 1 ; i < team_size ; ++i ) {  // bound is team_size: after fan-in 'n' is a power of two >= team_size
+          total += * m_shepherd_base[i]->shepherd_team_scratch_value<Type>();
+        }
+        for ( int i = 0 ; i < team_size ; ++i ) {  // publish the total to every team slot, the root's included
+          * m_shepherd_base[i]->shepherd_team_scratch_value<Type>() = total ;
+        }
+
+        memory_fence();
+      }
+      // Fan-out: set the state of each fan-in partner back to Active.
+      for ( n = 1 ; ( ! ( rev_rank & n ) ) && ( ( j = rev_rank + n ) < team_size ) ; n <<= 1 ) {
+        m_shepherd_base[j]->m_worker_state = QthreadExec::Active ;
+      }
+
+      return *shepherd_team_scratch_value<Type>();
+    }
+
+  template< class JoinOp >
+  inline  // Reduce 'value' over the team with op.join(); returns this worker's scratch slot afterwards.
+  typename JoinOp::value_type
+    shepherd_reduce( const int team_size
+                   , const typename JoinOp::value_type & value
+                   , const JoinOp & op ) const
+    {
+      typedef typename JoinOp::value_type Type ;
+      // Each worker first stores its contribution in its own scratch slot, then synchronizes.
+      *shepherd_team_scratch_value<Type>() = value ;
+
+      memory_fence();
+      // rev_rank == 0 is the worker with the highest shepherd worker rank inside the team.
+      const int rev_rank = team_size - ( m_shepherd_worker_rank + 1 );
+
+      int n , j ;
+      // Fan-in: for n = 1,2,4,... while bit n of rev_rank is clear, spinwait on entry rev_rank + n.
+      for ( n = 1 ; ( ! ( rev_rank & n ) ) && ( ( j = rev_rank + n ) < team_size ) ; n <<= 1 ) {
+        Impl::spinwait( m_shepherd_base[j]->m_worker_state , QthreadExec::Active );
+      }
+      // Non-root workers park themselves; the root (rev_rank == 0) joins and redistributes.
+      if ( rev_rank ) {
+        m_worker_state = QthreadExec::Inactive ;
+        Impl::spinwait( m_worker_state , QthreadExec::Inactive );
+      }
+      else {
+        volatile Type & accum = * m_shepherd_base[0]->shepherd_team_scratch_value<Type>();
+        for ( int i = 1 ; i < team_size ; ++i ) {
+          op.join( accum , * m_shepherd_base[i]->shepherd_team_scratch_value<Type>() );
+        }
+        for ( int i = 1 ; i < team_size ; ++i ) {
+          * m_shepherd_base[i]->shepherd_team_scratch_value<Type>() = accum ;
+        }
+
+        memory_fence();
+      }
+      // Fan-out: set the state of each fan-in partner back to Active.
+      for ( n = 1 ; ( ! ( rev_rank & n ) ) && ( ( j = rev_rank + n ) < team_size ) ; n <<= 1 ) {
+        m_shepherd_base[j]->m_worker_state = QthreadExec::Active ;
+      }
+
+      return *shepherd_team_scratch_value<Type>();
+    }
+
+  template< class Type >
+  inline  // Team scan of 'value'; an optional global_value supplies the base via atomic_fetch_add.
+  Type shepherd_scan( const int team_size
+                    , const Type & value
+                    ,       Type * const global_value = 0 ) const
+    {
+      *shepherd_team_scratch_value<Type>() = value ;
+      // Each worker first stores its contribution in its own scratch slot (above), then synchronizes.
+      memory_fence();
+
+      const int rev_rank = team_size - ( m_shepherd_worker_rank + 1 );
+
+      int n , j ;
+      // Fan-in: for n = 1,2,4,... while bit n of rev_rank is clear, spinwait on entry rev_rank + n.
+      for ( n = 1 ; ( ! ( rev_rank & n ) ) && ( ( j = rev_rank + n ) < team_size ) ; n <<= 1 ) {
+        Impl::spinwait( m_shepherd_base[j]->m_worker_state , QthreadExec::Active );
+      }
+
+      if ( rev_rank ) {
+        m_worker_state = QthreadExec::Inactive ;
+        Impl::spinwait( m_worker_state , QthreadExec::Inactive );
+      }
+      else {
+        // Root thread scans across values before releasing threads
+        // Worker data is in reverse order, so m_shepherd_base[0] is the
+        // highest ranking thread.
+
+        // Copy from lower ranking to higher ranking worker.
+        // 'accum' collects the sum of all team_size contributions while the slots are shifted.
+        Type accum = * m_shepherd_base[0]->shepherd_team_scratch_value<Type>();
+        for ( int i = 1 ; i < team_size ; ++i ) {
+          const Type tmp = * m_shepherd_base[i]->shepherd_team_scratch_value<Type>();
+          accum += tmp ;
+          * m_shepherd_base[i-1]->shepherd_team_scratch_value<Type>() = tmp ;
+        }
+        // Last slot gets the base: the previous *global_value (which is advanced by accum), or 0.
+        * m_shepherd_base[team_size-1]->shepherd_team_scratch_value<Type>() =
+          global_value ? atomic_fetch_add( global_value , accum ) : 0 ;
+
+        // Join from lower ranking to higher ranking worker.
+        for ( int i = team_size ; --i ; ) {
+          * m_shepherd_base[i-1]->shepherd_team_scratch_value<Type>() += * m_shepherd_base[i]->shepherd_team_scratch_value<Type>();
+        }
+
+        memory_fence();
+      }
+      // Fan-out: set the state of each fan-in partner back to Active.
+      for ( n = 1 ; ( ! ( rev_rank & n ) ) && ( ( j = rev_rank + n ) < team_size ) ; n <<= 1 ) {
+        m_shepherd_base[j]->m_worker_state = QthreadExec::Active ;
+      }
+
+      return *shepherd_team_scratch_value<Type>();
+    }
+
+  //----------------------------------------
+
+  // Round 'size' up to the next multiple of 64 (the allocation grain, a power of two).
+  static inline int align_alloc( int size )
+    {
+      enum { GRAIN_BYTES = 64 /* == 1 << 6 */ };
+      enum { LOW_BITS = GRAIN_BYTES - 1 };
+      return ( size + LOW_BITS ) & ~int( LOW_BITS ) ;
+    }
+
+  void shared_reset( Qthread::scratch_memory_space & );
+
+  void * exec_all_reduce_value() const { return m_scratch_alloc ; }
+
+  static void * exec_all_reduce_result();
+
+  static void resize_worker_scratch( const int reduce_size , const int shared_size );
+  static void clear_workers();
+
+  //----------------------------------------
+
+  inline int worker_rank() const { return m_worker_rank ; }
+  inline int worker_size() const { return m_worker_size ; }
+  inline int shepherd_worker_rank() const { return m_shepherd_worker_rank ; }
+  inline int shepherd_worker_size() const { return m_shepherd_worker_size ; }
+  inline int shepherd_rank() const { return m_shepherd_rank ; }
+  inline int shepherd_size() const { return m_shepherd_size ; }
+
+  static int worker_per_shepherd();
+};
+
+} /* namespace Impl */
+} /* namespace Kokkos */
+
+//----------------------------------------------------------------------------
+
+namespace Kokkos {
+namespace Impl {
+
+class QthreadTeamPolicyMember {
+private:
+
+  typedef Kokkos::Qthread                        execution_space ;
+  typedef execution_space::scratch_memory_space  scratch_memory_space ;
+
+
+        Impl::QthreadExec   & m_exec ;
+  scratch_memory_space        m_team_shared ;
+  const int                   m_team_size ;
+  const int                   m_team_rank ;
+  const int                   m_league_size ;
+  const int                   m_league_end ;
+        int                   m_league_rank ;
+
+public:
+
+  KOKKOS_INLINE_FUNCTION
+  const scratch_memory_space & team_shmem() const { return m_team_shared ; }
+
+  KOKKOS_INLINE_FUNCTION int league_rank() const { return m_league_rank ; }
+  KOKKOS_INLINE_FUNCTION int league_size() const { return m_league_size ; }
+  KOKKOS_INLINE_FUNCTION int team_rank() const { return m_team_rank ; }
+  KOKKOS_INLINE_FUNCTION int team_size() const { return m_team_size ; }
+
+  KOKKOS_INLINE_FUNCTION void team_barrier() const
+#if ! defined( KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST )
+    {}
+#else
+    { m_exec.shepherd_barrier( m_team_size ); }
+#endif
+
+  template< typename Type >  // Returns the 'value' supplied by the team member whose team rank == rank.
+  KOKKOS_INLINE_FUNCTION Type team_broadcast( const Type & value , int rank ) const
+#if ! defined( KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST )
+    { return Type(); }  // stub for the non-host branch: default-constructed value
+#else
+    { Type result( value ); /* shepherd_broadcast() is void and updates its argument in place */ m_exec.template shepherd_broadcast<Type>( result , m_team_size , rank ); return result ; }
+#endif
+
+  template< typename Type >
+  KOKKOS_INLINE_FUNCTION Type team_reduce( const Type & value ) const
+#if ! defined( KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST )
+    { return Type(); }
+#else
+    { return m_exec.template shepherd_reduce<Type>( m_team_size , value ); }
+#endif
+
+  template< typename JoinOp >
+  KOKKOS_INLINE_FUNCTION typename JoinOp::value_type
+    team_reduce( const typename JoinOp::value_type & value
+               , const JoinOp & op ) const
+#if ! defined( KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST )
+    { return typename JoinOp::value_type(); }
+#else
+    { return m_exec.template shepherd_reduce<JoinOp>( m_team_size , value , op ); }
+#endif
+
+  /** \brief  Intra-team exclusive prefix sum with team_rank() ordering.
+   *
+   *  The highest rank thread can compute the reduction total as
+   *    reduction_total = dev.team_scan( value ) + value ;
+   */
+  template< typename Type >
+  KOKKOS_INLINE_FUNCTION Type team_scan( const Type & value ) const
+#if ! defined( KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST )
+    { return Type(); }
+#else
+    { return m_exec.template shepherd_scan<Type>( m_team_size , value ); }
+#endif
+
+  /** \brief  Intra-team exclusive prefix sum with team_rank() ordering
+   *          with intra-team non-deterministic ordering accumulation.
+   *
+   *  The global inter-team accumulation value will, at the end of the
+   *  league's parallel execution, be the scan's total.
+   *  Parallel execution ordering of the league's teams is non-deterministic.
+   *  As such the base value for each team's scan operation is similarly
+   *  non-deterministic.
+   */
+  template< typename Type >
+  KOKKOS_INLINE_FUNCTION Type team_scan( const Type & value , Type * const global_accum ) const
+#if ! defined( KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST )
+    { return Type(); }
+#else
+    { return m_exec.template shepherd_scan<Type>( m_team_size , value , global_accum ); }
+#endif
+
+  //----------------------------------------
+  // Private driver for task-team parallel
+
+  struct TaskTeam {};
+
+  QthreadTeamPolicyMember();
+  explicit QthreadTeamPolicyMember( const TaskTeam & );
+
+  //----------------------------------------
+  // Private for the driver ( for ( member_type i(exec,team); i ; i.next_team() ) { ... }
+
+  // Initialize: this member iterates league ranks [ m_league_rank , m_league_end ) for the executor's shepherd.
+  template< class ... Properties >
+  QthreadTeamPolicyMember( Impl::QthreadExec & exec
+                         , const Kokkos::Impl::TeamPolicyInternal<Qthread,Properties...> & team )
+    : m_exec( exec )
+    , m_team_shared(0,0)
+    , m_team_size(   team.m_team_size )
+    , m_team_rank(   exec.shepherd_worker_rank() )
+    , m_league_size( team.m_league_size )  // next line: the last shepherd ends at league_size, each earlier one m_shepherd_iter sooner
+    , m_league_end(  team.m_league_size - team.m_shepherd_iter * ( exec.shepherd_size() - ( exec.shepherd_rank() + 1 ) ) )
+    , m_league_rank( m_league_end > team.m_shepherd_iter ? m_league_end - team.m_shepherd_iter : 0 )  // start, clamped at 0
+  {
+    m_exec.shared_reset( m_team_shared );  // pass m_team_shared to the executor's shared_reset()
+  }
+
+  // Continue
+  operator bool () const { return m_league_rank < m_league_end ; }
+
+  // iterate
+  void next_team() { ++m_league_rank ; m_exec.shared_reset( m_team_shared ); }
+};
+
+
+template< class ... Properties >
+class TeamPolicyInternal< Kokkos::Qthread , Properties ... >
+  : public PolicyTraits< Properties... >
+{
+private:
+  // Team policy specialization for the Qthread execution space.
+  const int m_league_size ;    // league size as passed to the constructor
+  const int m_team_size ;      // requested team size, capped at shepherd_worker_size()
+  const int m_shepherd_iter ;  // ceil( league_size / shepherd_size() )
+
+public:
+
+  //! Tag this class as a kokkos execution policy
+  typedef TeamPolicyInternal  execution_policy ;
+  typedef Qthread             execution_space ;
+  typedef PolicyTraits< Properties ... >  traits ;
+
+  //----------------------------------------
+  // Team size queries: the functor argument is not inspected; all return shepherd_worker_size().
+  template< class FunctorType >
+  inline static
+  int team_size_max( const FunctorType & )
+    { return Qthread::instance().shepherd_worker_size(); }
+
+  template< class FunctorType >
+  static int team_size_recommended( const FunctorType & f )
+    { return team_size_max( f ); }
+
+  template< class FunctorType >
+  inline static
+  int team_size_recommended( const FunctorType & f , const int& )
+    { return team_size_max( f ); }
+
+  //----------------------------------------
+
+  inline int team_size()   const { return m_team_size ; }
+  inline int league_size() const { return m_league_size ; }
+
+  // One active team per shepherd
+  TeamPolicyInternal( Kokkos::Qthread & q
+                    , const int league_size
+                    , const int team_size
+                    , const int /* vector_length */ = 0
+                    )
+    : m_league_size( league_size )
+    , m_team_size( team_size < q.shepherd_worker_size()
+                 ? team_size : q.shepherd_worker_size() )
+    , m_shepherd_iter( ( league_size + q.shepherd_size() - 1 ) / q.shepherd_size() )
+    {
+    }
+
+  // One active team per shepherd (same computation, using Qthread::instance() instead of an argument)
+  TeamPolicyInternal( const int league_size
+                    , const int team_size
+                    , const int /* vector_length */ = 0
+                    )
+    : m_league_size( league_size )
+    , m_team_size( team_size < Qthread::instance().shepherd_worker_size()
+                 ? team_size : Qthread::instance().shepherd_worker_size() )
+    , m_shepherd_iter( ( league_size + Qthread::instance().shepherd_size() - 1 ) / Qthread::instance().shepherd_size() )
+    {
+    }
+
+  typedef Impl::QthreadTeamPolicyMember member_type ;
+  // The member type reads m_league_size, m_team_size and m_shepherd_iter directly.
+  friend class Impl::QthreadTeamPolicyMember ;
+};
+
+} /* namespace Impl */
+} /* namespace Kokkos */
+
+//----------------------------------------------------------------------------
+//----------------------------------------------------------------------------
+
+#endif /* #define KOKKOS_QTHREADEXEC_HPP */
+
diff --git a/lib/kokkos/core/src/Qthread/Kokkos_Qthread_Parallel.hpp b/lib/kokkos/core/src/Qthread/Kokkos_Qthread_Parallel.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..5b6419289fc4874f1d97034aa7decd9be0eca147
--- /dev/null
+++ b/lib/kokkos/core/src/Qthread/Kokkos_Qthread_Parallel.hpp
@@ -0,0 +1,745 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+//
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact  H. Carter Edwards (hcedwar@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#ifndef KOKKOS_QTHREAD_PARALLEL_HPP
+#define KOKKOS_QTHREAD_PARALLEL_HPP
+
+#include <vector>
+
+#include <Kokkos_Parallel.hpp>
+
+#include <impl/Kokkos_StaticAssert.hpp>
+#include <impl/Kokkos_FunctorAdapter.hpp>
+
+#include <Qthread/Kokkos_QthreadExec.hpp>
+
+//----------------------------------------------------------------------------
+
+namespace Kokkos {
+namespace Impl {
+
+//----------------------------------------------------------------------------
+
+template< class FunctorType , class ... Traits >
+class ParallelFor< FunctorType
+                 , Kokkos::RangePolicy< Traits ... >
+                 , Kokkos::Qthread
+                 >
+{
+private:
+  // parallel_for over a RangePolicy on the Qthread execution space.
+  typedef Kokkos::RangePolicy< Traits ... >  Policy ;
+
+  typedef typename Policy::work_tag     WorkTag ;
+  typedef typename Policy::member_type  Member ;
+  typedef typename Policy::WorkRange    WorkRange ;
+
+  const FunctorType  m_functor ;
+  const Policy       m_policy ;
+
+  // Untagged functor: call functor( index ) for every index in [ibeg,iend).
+  template< class TagType >
+  inline static
+  typename std::enable_if< std::is_same< TagType , void >::value >::type
+  exec_range( const FunctorType & functor , const Member ibeg , const Member iend )
+    {
+      Member idx = ibeg ;
+      while ( idx < iend ) { functor( idx ); ++idx ; }
+    }
+
+  // Tagged functor: call functor( TagType{} , index ) for every index in [ibeg,iend).
+  template< class TagType >
+  inline static
+  typename std::enable_if< ! std::is_same< TagType , void >::value >::type
+  exec_range( const FunctorType & functor , const Member ibeg , const Member iend )
+    {
+      const TagType tag{} ;
+      Member idx = ibeg ;
+      while ( idx < iend ) { functor( tag , idx ); ++idx ; }
+    }
+
+  // Per-worker entry point handed to QthreadExec::exec_all(); 'arg' is the ParallelFor object.
+  static void exec( QthreadExec & exec , const void * arg )
+  {
+    const ParallelFor * const closure = static_cast< const ParallelFor * >( arg );
+
+    // WorkRange built from the policy plus this worker's rank and the worker count.
+    const WorkRange my_range( closure->m_policy , exec.worker_rank() , exec.worker_size() );
+
+    ParallelFor::template exec_range< WorkTag >( closure->m_functor , my_range.begin() , my_range.end() );
+
+    exec.exec_all_barrier();  // all threads wait for completion
+  }
+
+public:
+
+  // Run ParallelFor::exec through QthreadExec::exec_all with this object as the argument.
+  inline void execute() const
+    {
+      const void * const closure = this ;
+      Impl::QthreadExec::exec_all( Qthread::instance() , & ParallelFor::exec , closure );
+    }
+
+  // Stores copies of the functor and of the policy.
+  ParallelFor( const FunctorType & arg_functor , const Policy & arg_policy )
+    : m_functor( arg_functor )
+    , m_policy( arg_policy )
+    {
+    }
+};
+
+//----------------------------------------------------------------------------
+
+template< class FunctorType , class ReducerType , class ... Traits >
+class ParallelReduce< FunctorType
+                    , Kokkos::RangePolicy< Traits ... >
+                    , ReducerType
+                    , Kokkos::Qthread
+                    >
+{
+private:
+  // parallel_reduce over a RangePolicy on the Qthread execution space.
+  typedef Kokkos::RangePolicy< Traits ... >  Policy ;
+
+  typedef typename Policy::work_tag     WorkTag ;
+  typedef typename Policy::WorkRange    WorkRange ;
+  typedef typename Policy::member_type  Member ;
+
+  typedef Kokkos::Impl::if_c< std::is_same<InvalidType, ReducerType>::value, FunctorType, ReducerType > ReducerConditional;
+  typedef typename ReducerConditional::type ReducerTypeFwd;
+
+  // Static Assert WorkTag void if ReducerType not InvalidType
+
+  typedef Kokkos::Impl::FunctorValueTraits< ReducerTypeFwd, WorkTag > ValueTraits ;
+  typedef Kokkos::Impl::FunctorValueInit<   ReducerTypeFwd, WorkTag > ValueInit ;
+
+  typedef typename ValueTraits::pointer_type    pointer_type ;
+  typedef typename ValueTraits::reference_type  reference_type ;
+
+  const FunctorType   m_functor ;
+  const Policy        m_policy ;
+  const ReducerType   m_reducer ;     // InvalidType() when constructed from a result view
+  const pointer_type  m_result_ptr ;  // destination for the final value(s); skipped in execute() if null
+
+  template< class TagType >
+  inline static
+  typename std::enable_if< std::is_same< TagType , void >::value >::type
+  exec_range( const FunctorType & functor
+            , const Member ibeg , const Member iend
+            , reference_type update )
+    {
+      for ( Member i = ibeg ; i < iend ; ++i ) {
+        functor( i , update );
+      }
+    }
+
+  template< class TagType >
+  inline static
+  typename std::enable_if< ! std::is_same< TagType , void >::value >::type
+  exec_range( const FunctorType & functor
+            , const Member ibeg , const Member iend
+            , reference_type update )
+    {
+      const TagType t{} ;
+      for ( Member i = ibeg ; i < iend ; ++i ) {
+        functor( t , i , update );
+      }
+    }
+
+  static void exec( QthreadExec & exec , const void * arg )
+  {
+    const ParallelReduce & self = * ((const ParallelReduce *) arg );
+    // WorkRange built from the policy plus this worker's rank and the worker count.
+    const WorkRange range( self.m_policy, exec.worker_rank(), exec.worker_size() );
+    // The worker's scratch value is initialized in place and passed to the functor as 'update'.
+    ParallelReduce::template exec_range< WorkTag >(
+      self.m_functor, range.begin(), range.end(),
+      ValueInit::init( ReducerConditional::select(self.m_functor , self.m_reducer)
+                     , exec.exec_all_reduce_value() ) );
+    // Combine the per-worker values across all workers.
+    exec.template exec_all_reduce< FunctorType, ReducerType, WorkTag >( self.m_functor, self.m_reducer );
+  }
+
+public:
+
+  inline
+  void execute() const
+    {
+      QthreadExec::resize_worker_scratch( ValueTraits::value_size( ReducerConditional::select(m_functor , m_reducer) ) , 0 );
+      Impl::QthreadExec::exec_all( Qthread::instance() , & ParallelReduce::exec , this );
+      // Fetch the reduced value from QthreadExec and apply the functor/reducer 'final' step to it.
+      const pointer_type data = (pointer_type) QthreadExec::exec_all_reduce_result();
+
+      Kokkos::Impl::FunctorFinal< ReducerTypeFwd , WorkTag >::final( ReducerConditional::select(m_functor , m_reducer) , data );
+      // Copy value_count() entries to the caller's destination when one was provided.
+      if ( m_result_ptr ) {
+        const unsigned n = ValueTraits::value_count( ReducerConditional::select(m_functor , m_reducer) );
+        for ( unsigned i = 0 ; i < n ; ++i ) { m_result_ptr[i] = data[i]; }
+      }
+    }
+
+  template< class ViewType >
+  ParallelReduce( const FunctorType  & arg_functor
+                , const Policy       & arg_policy
+                , const ViewType & arg_result_view
+                , typename std::enable_if<Kokkos::is_view< ViewType >::value &&
+                                          !Kokkos::is_reducer_type< ReducerType >::value
+                                          , void*>::type = NULL)
+    : m_functor( arg_functor )
+    , m_policy( arg_policy )
+    , m_reducer( InvalidType() )
+    , m_result_ptr( arg_result_view.data() )
+    { }
+
+  ParallelReduce( const FunctorType & arg_functor
+                , Policy       arg_policy
+                , const ReducerType& reducer )
+    : m_functor( arg_functor )
+    , m_policy( arg_policy )
+    , m_reducer( reducer )
+    , m_result_ptr( reducer.result_view().data() )
+    { }
+};
+
+//----------------------------------------------------------------------------
+
+template< class FunctorType , class ... Properties >
+class ParallelFor< FunctorType
+                 , TeamPolicy< Properties ... >
+                 , Kokkos::Qthread >
+{
+private:
+  // parallel_for over a TeamPolicy on the Qthread execution space.
+  typedef Kokkos::Impl::TeamPolicyInternal< Kokkos::Qthread , Properties ... > Policy ;
+  typedef typename Policy::member_type  Member ;
+  typedef typename Policy::work_tag     WorkTag ;
+
+  const FunctorType  m_functor ;
+  const Policy       m_policy ;
+
+  template< class TagType >
+  inline static
+  typename std::enable_if< std::is_same< TagType , void >::value >::type
+  exec_team( const FunctorType & functor , Member member )
+    {
+      while ( member ) {  // true while the member's league rank is below its league end
+        functor( member );
+        member.team_barrier();
+        member.next_team();
+      }
+    }
+
+  template< class TagType >
+  inline static
+  typename std::enable_if< ! std::is_same< TagType , void >::value >::type
+  exec_team( const FunctorType & functor , Member member )
+    {
+      const TagType t{} ;
+      while ( member ) {  // true while the member's league rank is below its league end
+        functor( t , member );
+        member.team_barrier();
+        member.next_team();
+      }
+    }
+
+  static void exec( QthreadExec & exec , const void * arg )
+  {
+    const ParallelFor & self = * ((const ParallelFor *) arg );
+    // Build this worker's team member from the executor and the policy, then run its teams.
+    ParallelFor::template exec_team< WorkTag >
+      ( self.m_functor , Member( exec , self.m_policy ) );
+    // Barrier across all workers before returning.
+    exec.exec_all_barrier();
+  }
+
+public:
+
+  inline
+  void execute() const
+    {
+      QthreadExec::resize_worker_scratch
+        ( /* reduction   memory */ 0
+        , /* team shared memory */ FunctorTeamShmemSize< FunctorType >::value( m_functor , m_policy.team_size() ) );
+      Impl::QthreadExec::exec_all( Qthread::instance() , & ParallelFor::exec , this );
+    }
+
+  ParallelFor( const FunctorType & arg_functor ,
+               const Policy      & arg_policy )
+    : m_functor( arg_functor )
+    , m_policy( arg_policy )
+    { }
+};
+
+//----------------------------------------------------------------------------
+
+/** \brief  ParallelReduce specialization: TeamPolicy reduction dispatched on Kokkos::Qthread.
+ *
+ *  Value traits, init and final come from the functor when ReducerType is InvalidType, otherwise from the reducer.
+ */
+template< class FunctorType , class ReducerType , class ... Properties >
+class ParallelReduce< FunctorType , TeamPolicy< Properties... > , ReducerType , Kokkos::Qthread >
+{
+private:
+
+  typedef Kokkos::Impl::TeamPolicyInternal< Kokkos::Qthread , Properties ... > Policy ;
+  typedef typename Policy::work_tag     WorkTag ;
+  typedef typename Policy::member_type  Member ;
+
+  // Selects FunctorType when ReducerType is InvalidType, else ReducerType.
+  typedef Kokkos::Impl::if_c< std::is_same<InvalidType,ReducerType>::value, FunctorType, ReducerType> ReducerConditional;
+  typedef typename ReducerConditional::type ReducerTypeFwd;
+
+  typedef Kokkos::Impl::FunctorValueTraits< ReducerTypeFwd , WorkTag >  ValueTraits ;
+  typedef Kokkos::Impl::FunctorValueInit<   ReducerTypeFwd , WorkTag >  ValueInit ;
+
+  typedef typename ValueTraits::pointer_type    pointer_type ;
+  typedef typename ValueTraits::reference_type  reference_type ;
+
+  const FunctorType  m_functor ;
+  const Policy       m_policy ;
+  const ReducerType  m_reducer ;
+  const pointer_type m_result_ptr ;
+
+  // Untagged functor: call functor( member , update ), then a team barrier, until 'member' evaluates false.
+  template< class TagType >
+  inline static typename std::enable_if< std::is_same< TagType , void >::value >::type
+  exec_team( const FunctorType & functor , Member member , reference_type update )
+    {
+      while ( member ) {
+        functor( member , update );
+        member.team_barrier();
+        member.next_team();
+      }
+    }
+
+  // Tagged functor: same loop, passing a default-constructed work tag as the first argument.
+  template< class TagType >
+  inline static typename std::enable_if< ! std::is_same< TagType , void >::value >::type
+  exec_team( const FunctorType & functor , Member member , reference_type update )
+    {
+      const TagType t{} ;
+      while ( member ) {
+        functor( t , member , update );
+        member.team_barrier();
+        member.next_team();
+      }
+    }
+
+  // Worker entry point handed to QthreadExec::exec_all; 'arg' is the ParallelReduce object.
+  static void exec( QthreadExec & exec , const void * arg )
+  {
+    const ParallelReduce & self = * ((const ParallelReduce *) arg );
+    // Initialize the value at exec.exec_all_reduce_value() and accumulate this worker's teams into it.
+    ParallelReduce::template exec_team< WorkTag >
+      ( self.m_functor , Member( exec , self.m_policy )
+      , ValueInit::init( ReducerConditional::select( self.m_functor , self.m_reducer )
+                       , exec.exec_all_reduce_value() ) );
+
+    exec.template exec_all_reduce< FunctorType, ReducerType, WorkTag >( self.m_functor, self.m_reducer );
+  }
+
+public:
+
+  // Run the reduction on all workers, apply the final step, and copy the values to m_result_ptr when it is non-null.
+  inline void execute() const
+    {
+      QthreadExec::resize_worker_scratch
+        ( /* reduction   memory */ ValueTraits::value_size( ReducerConditional::select(m_functor , m_reducer) )
+        , /* team shared memory */ FunctorTeamShmemSize< FunctorType >::value( m_functor , m_policy.team_size() ) );
+
+      Impl::QthreadExec::exec_all( Qthread::instance() , & ParallelReduce::exec , this );
+
+      const pointer_type data = (pointer_type) QthreadExec::exec_all_reduce_result();
+
+      Kokkos::Impl::FunctorFinal< ReducerTypeFwd , WorkTag >::final( ReducerConditional::select(m_functor , m_reducer), data );
+
+      if ( m_result_ptr ) {
+        const unsigned n = ValueTraits::value_count( ReducerConditional::select(m_functor , m_reducer) );
+        for ( unsigned i = 0 ; i < n ; ++i ) { m_result_ptr[i] = data[i]; }
+      }
+    }
+
+  // Construct with a result View; enabled only when ReducerType is not a reducer type.
+  template< class ViewType >
+  ParallelReduce( const FunctorType & arg_functor
+                , const Policy      & arg_policy
+                , const ViewType    & arg_result
+                , typename std::enable_if<Kokkos::is_view< ViewType >::value &&
+                                          !Kokkos::is_reducer_type< ReducerType >::value
+                                          , void*>::type = NULL)
+    : m_functor( arg_functor ) , m_policy( arg_policy )
+    , m_reducer( InvalidType() )
+    , m_result_ptr( arg_result.data() )  // data(), the same accessor the reducer constructor below uses
+    { }
+
+  // Construct with a reducer object; the result pointer is taken from the reducer's result view.
+  inline ParallelReduce( const FunctorType & arg_functor
+                , Policy       arg_policy
+                , const ReducerType& reducer )
+  : m_functor( arg_functor )
+  , m_policy( arg_policy )
+  , m_reducer( reducer )
+  , m_result_ptr( reducer.result_view().data() )
+  { }
+};
+
+//----------------------------------------------------------------------------
+//----------------------------------------------------------------------------
+
+/** \brief  ParallelScan specialization: RangePolicy scan dispatched on Kokkos::Qthread.
+ *
+ *  Each worker visits its sub-range twice: with final == false, then (after exec_all_scan) with final == true.
+ */
+template< class FunctorType , class ... Traits >
+class ParallelScan< FunctorType , Kokkos::RangePolicy< Traits ... > , Kokkos::Qthread >
+{
+private:
+
+  using Policy    = Kokkos::RangePolicy< Traits ... > ;
+  using WorkTag   = typename Policy::work_tag ;
+  using WorkRange = typename Policy::WorkRange ;
+  using Member    = typename Policy::member_type ;
+
+  using ValueTraits = Kokkos::Impl::FunctorValueTraits< FunctorType, WorkTag > ;
+  using ValueInit   = Kokkos::Impl::FunctorValueInit<   FunctorType, WorkTag > ;
+
+  using pointer_type   = typename ValueTraits::pointer_type ;
+  using reference_type = typename ValueTraits::reference_type ;
+
+  const FunctorType  m_functor ;
+  const Policy       m_policy ;
+
+  // Untagged functor: functor( i , update , final ) for every i in [ibeg,iend).
+  template< class TagType >
+  inline static typename std::enable_if< std::is_same< TagType , void >::value >::type
+  exec_range( const FunctorType & functor
+            , const Member ibeg , const Member iend
+            , reference_type update , const bool final )
+    {
+      for ( Member idx = ibeg ; idx < iend ; ++idx ) {
+        functor( idx , update , final );
+      }
+    }
+
+  // Tagged functor: same loop, passing a default-constructed work tag as the first argument.
+  template< class TagType >
+  inline static typename std::enable_if< ! std::is_same< TagType , void >::value >::type
+  exec_range( const FunctorType & functor
+            , const Member ibeg , const Member iend
+            , reference_type update , const bool final )
+    {
+      const TagType tag{} ;
+      for ( Member idx = ibeg ; idx < iend ; ++idx ) {
+        functor( tag , idx , update , final );
+      }
+    }
+
+  // Worker entry point handed to QthreadExec::exec_all; 'arg' is the ParallelScan object.
+  static void exec( QthreadExec & exec , const void * arg )
+  {
+    const ParallelScan & self = * static_cast< const ParallelScan * >( arg );
+
+    const WorkRange range( self.m_policy , exec.worker_rank() , exec.worker_size() );
+
+    // Initialize the value at exec.exec_all_reduce_value(); both passes accumulate into it.
+    reference_type update = ValueInit::init( self.m_functor , exec.exec_all_reduce_value() );
+
+    // Pass 1 (final == false), the cross-worker exec_all_scan step, then pass 2 (final == true).
+    ParallelScan::template exec_range< WorkTag >( self.m_functor , range.begin() , range.end() , update , false );
+    exec.template exec_all_scan< FunctorType , WorkTag >( self.m_functor );
+    ParallelScan::template exec_range< WorkTag >( self.m_functor , range.begin() , range.end() , update , true );
+
+    exec.exec_all_barrier();
+  }
+
+public:
+
+  // Size the worker scratch for one value (no team shared memory) and run exec on all workers.
+  inline void execute() const
+    {
+      QthreadExec::resize_worker_scratch( ValueTraits::value_size( m_functor ) , 0 );
+      Impl::QthreadExec::exec_all( Qthread::instance() , & ParallelScan::exec , this );
+    }
+
+  // Stores copies of the functor and the policy.
+  ParallelScan( const FunctorType & arg_functor ,
+                const Policy      & arg_policy )
+    : m_functor( arg_functor )
+    , m_policy( arg_policy )
+    {
+    }
+};
+
+} // namespace Impl
+} // namespace Kokkos
+
+//----------------------------------------------------------------------------
+//----------------------------------------------------------------------------
+
+namespace Kokkos {
+
+// Build the team-thread range descriptor from 'thread' and an iteration count.
+template<typename iType> KOKKOS_INLINE_FUNCTION
+Impl::TeamThreadRangeBoundariesStruct<iType,Impl::QthreadTeamPolicyMember>
+TeamThreadRange( const Impl::QthreadTeamPolicyMember & thread , const iType & count )
+{
+  typedef Impl::TeamThreadRangeBoundariesStruct<iType,Impl::QthreadTeamPolicyMember> range_type ;
+  return range_type( thread , count );
+}
+
+// Build the team-thread range descriptor from 'thread' and explicit begin / end bounds.
+template<typename iType>
+KOKKOS_INLINE_FUNCTION
+Impl::TeamThreadRangeBoundariesStruct<iType,Impl::QthreadTeamPolicyMember>
+TeamThreadRange( const Impl::QthreadTeamPolicyMember & thread ,
+                 const iType & begin ,
+                 const iType & end ) {
+  typedef Impl::TeamThreadRangeBoundariesStruct<iType,Impl::QthreadTeamPolicyMember> range_type ;
+  return range_type( thread , begin , end );
+}
+
+
+// Build the thread-vector range descriptor from 'thread' and an iteration count.
+template<typename iType> KOKKOS_INLINE_FUNCTION
+Impl::ThreadVectorRangeBoundariesStruct<iType,Impl::QthreadTeamPolicyMember >
+ThreadVectorRange( const Impl::QthreadTeamPolicyMember & thread , const iType & count ) {
+  return Impl::ThreadVectorRangeBoundariesStruct<iType,Impl::QthreadTeamPolicyMember >( thread , count );
+}
+
+
+// Wrap 'thread' in the tag type accepted by the ThreadSingleStruct overloads of single().
+KOKKOS_INLINE_FUNCTION Impl::ThreadSingleStruct<Impl::QthreadTeamPolicyMember>
+PerTeam( const Impl::QthreadTeamPolicyMember & thread )
+{ return Impl::ThreadSingleStruct<Impl::QthreadTeamPolicyMember>( thread ); }
+
+// Wrap 'thread' in the tag type accepted by the VectorSingleStruct overloads of single().
+KOKKOS_INLINE_FUNCTION Impl::VectorSingleStruct<Impl::QthreadTeamPolicyMember>
+PerThread( const Impl::QthreadTeamPolicyMember & thread )
+{ return Impl::VectorSingleStruct<Impl::QthreadTeamPolicyMember>( thread ); }
+
+} // namespace Kokkos
+
+namespace Kokkos {
+
+  /** \brief  Inter-thread parallel_for.  Calls lambda(i) for i = start, start+increment, ... while i < end.
+   *
+   * start, end and increment are read from loop_boundaries (a TeamThreadRange descriptor).
+   * This functionality requires C++11 support.*/
+template<typename iType, class Lambda>
+KOKKOS_INLINE_FUNCTION
+void parallel_for(const Impl::TeamThreadRangeBoundariesStruct<iType,Impl::QthreadTeamPolicyMember>& loop_boundaries, const Lambda& lambda) {
+  iType idx = loop_boundaries.start ;
+  while ( idx < loop_boundaries.end ) { lambda(idx); idx += loop_boundaries.increment ; }
+}
+
+/** \brief  Inter-thread parallel_reduce (sum).  Calls lambda(iType i, ValueType & val) for each index of the range.
+ *
+ * Each call receives a value-initialized ValueType; the contributions are added with += and the local
+ * sum is combined with team_reduce / JoinAdd and stored in result.  This functionality requires C++11 support.*/
+template< typename iType, class Lambda, typename ValueType >
+KOKKOS_INLINE_FUNCTION
+void parallel_reduce(const Impl::TeamThreadRangeBoundariesStruct<iType,Impl::QthreadTeamPolicyMember>& loop_boundaries,
+                     const Lambda & lambda, ValueType& result) {
+
+  // Any incoming value of 'result' is discarded.
+  result = ValueType();
+
+  for( iType idx = loop_boundaries.start; idx < loop_boundaries.end; idx += loop_boundaries.increment ) {
+    ValueType contribution = ValueType();
+    lambda(idx,contribution);
+    result += contribution;
+  }
+  result = loop_boundaries.thread.team_reduce(result,Impl::JoinAdd<ValueType>());
+}
+
+#if defined( KOKKOS_HAVE_CXX11 )
+
+/** \brief  Inter-thread parallel_reduce with a custom join. Calls lambda(iType i, ValueType & val) for each index.
+ *
+ * Each call receives a value-initialized ValueType, and its contribution is folded into a local
+ * accumulator using JoinType(ValueType& val, const ValueType& update).  The accumulator starts from
+ * the input value of init_result, so that value should be the neutral element with respect to the
+ * join operation (e.g. '0 for +-' or '1 for *').  The result of team_reduce is stored in init_result.
+ * This functionality requires C++11 support.*/
+template< typename iType, class Lambda, typename ValueType, class JoinType >
+KOKKOS_INLINE_FUNCTION
+void parallel_reduce(const Impl::TeamThreadRangeBoundariesStruct<iType,Impl::QthreadTeamPolicyMember>& loop_boundaries,
+                     const Lambda & lambda, const JoinType& join, ValueType& init_result) {
+
+  ValueType accum = init_result;
+
+  for( iType idx = loop_boundaries.start; idx < loop_boundaries.end; idx += loop_boundaries.increment ) {
+    ValueType contribution = ValueType();
+    lambda(idx,contribution);
+    join(accum,contribution);
+  }
+
+  init_result = loop_boundaries.thread.team_reduce(accum,Impl::JoinLambdaAdapter<ValueType,JoinType>(join));
+}
+
+#endif /* #if defined( KOKKOS_HAVE_CXX11 ) */
+
+} // namespace Kokkos
+
+namespace Kokkos {
+/** \brief  Intra-thread vector parallel_for.  Calls lambda(i) for i = start, start+increment, ... while i < end.
+ *
+ * start, end and increment are read from loop_boundaries (a ThreadVectorRange descriptor).
+ * This functionality requires C++11 support.*/
+template<typename iType, class Lambda>
+KOKKOS_INLINE_FUNCTION
+void parallel_for( const Impl::ThreadVectorRangeBoundariesStruct<iType,Impl::QthreadTeamPolicyMember > & loop_boundaries
+                 , const Lambda & lambda ) {
+#ifdef KOKKOS_HAVE_PRAGMA_IVDEP
+#pragma ivdep
+#endif
+  for( iType idx = loop_boundaries.start; idx < loop_boundaries.end; idx += loop_boundaries.increment )
+    lambda(idx);
+}
+
+/** \brief  Intra-thread vector parallel_reduce (sum). Calls lambda(iType i, ValueType & val) for each index.
+ *
+ * Each call receives a value-initialized ValueType; the contributions are added with += and the total
+ * is stored in result (its incoming value is discarded).  This functionality requires C++11 support.*/
+template< typename iType, class Lambda, typename ValueType >
+KOKKOS_INLINE_FUNCTION
+void parallel_reduce(const Impl::ThreadVectorRangeBoundariesStruct<iType,Impl::QthreadTeamPolicyMember >&
+      loop_boundaries, const Lambda & lambda, ValueType& result) {
+  result = ValueType();
+#ifdef KOKKOS_HAVE_PRAGMA_IVDEP
+#pragma ivdep
+#endif
+  for( iType idx = loop_boundaries.start; idx < loop_boundaries.end; idx += loop_boundaries.increment ) {
+    ValueType contribution = ValueType();
+    lambda(idx,contribution);
+    result += contribution;
+  }
+}
+
+/** \brief  Intra-thread vector parallel_reduce with a custom join. Calls lambda(iType i, ValueType & val) per index.
+ *
+ * Each call receives a value-initialized ValueType, and its contribution is folded into a local
+ * accumulator using JoinType(ValueType& val, const ValueType& update).  The accumulator starts from
+ * the input value of init_result, so that value should be the neutral element with respect to the
+ * join operation (e.g. '0 for +-' or '1 for *').  The accumulated value is stored in init_result.
+ * This functionality requires C++11 support.*/
+template< typename iType, class Lambda, typename ValueType, class JoinType >
+KOKKOS_INLINE_FUNCTION
+void parallel_reduce(const Impl::ThreadVectorRangeBoundariesStruct<iType,Impl::QthreadTeamPolicyMember >&
+      loop_boundaries, const Lambda & lambda, const JoinType& join, ValueType& init_result) {
+
+  ValueType accum = init_result;
+#ifdef KOKKOS_HAVE_PRAGMA_IVDEP
+#pragma ivdep
+#endif
+  for( iType idx = loop_boundaries.start; idx < loop_boundaries.end; idx += loop_boundaries.increment ) {
+    ValueType contribution = ValueType();
+    lambda(idx,contribution);
+    join(accum,contribution);
+  }
+  init_result = accum;
+}
+
+/** \brief  Intra-thread vector parallel exclusive prefix sum. Calls lambda(iType i, ValueType & val, bool final)
+ *          for each index of the range.
+ *
+ * In general a scan operator may be called twice per index, once with final==false and once with
+ * final==true, and must add the contribution of "i" to val in both cases; when final==true val holds
+ * the prefix sum value.  This implementation walks the range once, always passing final==true and a
+ * single running value that starts value-initialized.  The running value is local to this function:
+ * the total over the range is not handed back to the caller.
+ * The value type is taken from FunctorValueTraits< FunctorType , void >.
+ * This functionality requires C++11 support.*/
+template< typename iType, class FunctorType >
+KOKKOS_INLINE_FUNCTION
+void parallel_scan(const Impl::ThreadVectorRangeBoundariesStruct<iType,Impl::QthreadTeamPolicyMember >&
+      loop_boundaries, const FunctorType & lambda) {
+
+  // Value type the functor scans over.
+  typedef typename Kokkos::Impl::FunctorValueTraits< FunctorType , void >::value_type value_type ;
+
+  value_type running = value_type();
+
+#ifdef KOKKOS_HAVE_PRAGMA_IVDEP
+#pragma ivdep
+#endif
+  for( iType idx = loop_boundaries.start; idx < loop_boundaries.end; idx += loop_boundaries.increment ) {
+    lambda(idx,running,true);
+  }
+}
+
+} // namespace Kokkos
+
+namespace Kokkos {
+
+// PerThread single: invokes lambda() unconditionally; the tag argument is not inspected.
+template<class FunctorType> KOKKOS_INLINE_FUNCTION
+void single( const Impl::VectorSingleStruct<Impl::QthreadTeamPolicyMember> & single_struct
+           , const FunctorType & lambda )
+{ lambda(); }
+
+// PerTeam single: invokes lambda() only on the team member whose team_rank() is 0.
+template<class FunctorType> KOKKOS_INLINE_FUNCTION
+void single( const Impl::ThreadSingleStruct<Impl::QthreadTeamPolicyMember> & single_struct
+           , const FunctorType & lambda )
+{ if ( 0 == single_struct.team_member.team_rank() ) { lambda(); } }
+
+// PerThread single with a value: invokes lambda(val) unconditionally; the tag argument is not inspected.
+template<class FunctorType, class ValueType> KOKKOS_INLINE_FUNCTION
+void single( const Impl::VectorSingleStruct<Impl::QthreadTeamPolicyMember> & single_struct
+           , const FunctorType & lambda , ValueType & val )
+{ lambda(val); }
+
+// PerTeam single with a value: rank 0 runs lambda(val); every member then calls team_broadcast(val,0).
+template<class FunctorType, class ValueType> KOKKOS_INLINE_FUNCTION
+void single( const Impl::ThreadSingleStruct<Impl::QthreadTeamPolicyMember> & single_struct
+           , const FunctorType & lambda , ValueType & val )
+{
+  if ( 0 == single_struct.team_member.team_rank() ) { lambda(val); }
+  single_struct.team_member.team_broadcast(val,0);
+}
+
+} // namespace Kokkos
+
+
+#endif /* #define KOKKOS_QTHREAD_PARALLEL_HPP */
+
diff --git a/lib/kokkos/core/src/Qthread/Kokkos_Qthread_TaskPolicy.cpp b/lib/kokkos/core/src/Qthread/Kokkos_Qthread_TaskPolicy.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..8cc39d277c1949dc7f9587c09b77d5a71ffddeba
--- /dev/null
+++ b/lib/kokkos/core/src/Qthread/Kokkos_Qthread_TaskPolicy.cpp
@@ -0,0 +1,491 @@
+/*
+//@HEADER
+// ************************************************************************
+// 
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+// 
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+// 
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact  H. Carter Edwards (hcedwar@sandia.gov)
+// 
+// ************************************************************************
+//@HEADER
+*/
+
+// Experimental unified task-data parallel manycore LDRD
+
+#include <Kokkos_Core_fwd.hpp>
+
+#if defined( KOKKOS_HAVE_QTHREAD )
+
+#include <stdio.h>
+
+#include <stdlib.h>
+#include <stdexcept>
+#include <iostream>
+#include <sstream>
+#include <string>
+
+#include <Kokkos_Atomic.hpp>
+#include <Qthread/Kokkos_Qthread_TaskPolicy.hpp>
+
+#if defined( KOKKOS_ENABLE_TASKPOLICY )
+
+//----------------------------------------------------------------------------
+
+namespace Kokkos {
+namespace Experimental {
+namespace Impl {
+
+typedef TaskMember< Kokkos::Qthread , void , void > Task ;
+
+namespace {
+
+// Round 'sizeof_derived' up to the next multiple of sizeof(Task*).
+inline unsigned padded_sizeof_derived( unsigned sizeof_derived )
+{
+  const size_t remainder = sizeof_derived % sizeof(Task*) ;
+  return remainder ? sizeof_derived + ( sizeof(Task*) - remainder ) : sizeof_derived ;
+}
+
+// int lock_alloc_dealloc = 0 ;
+
+} // namespace
+
+// Release a block of task storage.
+//
+// 'ptr' is passed straight to free().
+void Task::deallocate( void * ptr )
+{
+  // No lock is taken here: 'free' is relied upon to be thread safe.
+  // The call is isolated in this function so that a lock/unlock pair
+  // could be added in one place should that ever become necessary:
+  //
+  //   lock   : while ( ! Kokkos::atomic_compare_exchange_strong( & lock_alloc_dealloc , 0 , 1 ) );
+  //   unlock : Kokkos::atomic_compare_exchange_strong( & lock_alloc_dealloc , 1 , 0 );
+  // (lock_alloc_dealloc itself is currently commented out in the anonymous namespace above.)
+
+  free( ptr );
+}
+
+// Allocate raw storage for a task: the derived object padded to a multiple of
+// sizeof(Task*), followed by room for 'arg_dependence_capacity' Task pointers.
+// Returns whatever malloc returns; the result is not checked here.
+void * Task::allocate( const unsigned arg_sizeof_derived
+                     , const unsigned arg_dependence_capacity )
+{
+  // No lock is taken here: 'malloc' is relied upon to be thread safe.
+  // The call is isolated in this function so that a lock/unlock pair
+  // could be added in one place should that ever become necessary:
+  //
+  //   lock   : while ( ! Kokkos::atomic_compare_exchange_strong( & lock_alloc_dealloc , 0 , 1 ) );
+  //   unlock : Kokkos::atomic_compare_exchange_strong( & lock_alloc_dealloc , 1 , 0 );
+
+  const size_t derived_bytes    = padded_sizeof_derived( arg_sizeof_derived );
+  const size_t dependence_bytes = arg_dependence_capacity * sizeof(Task*) ;
+
+  return malloc( derived_bytes + dependence_bytes );
+}
+
+// Destructor with an empty body.
+Task::~TaskMember()
+{
+}
+
+
+// Construct with an explicit verify function; m_dep starts padded_sizeof_derived( arg_sizeof_derived ) bytes past 'this'.
+Task::TaskMember( const function_verify_type   arg_verify
+                , const function_dealloc_type  arg_dealloc
+                , const function_single_type   arg_apply_single
+                , const function_team_type     arg_apply_team
+                , volatile int &               arg_active_count
+                , const unsigned               arg_sizeof_derived
+                , const unsigned               arg_dependence_capacity )
+  : m_dealloc( arg_dealloc )
+  , m_verify(  arg_verify )
+  , m_apply_single( arg_apply_single )
+  , m_apply_team( arg_apply_team )
+  , m_active_count( & arg_active_count )
+  , m_qfeb(0)
+  , m_dep( reinterpret_cast< Task ** >( reinterpret_cast< unsigned char * >( this ) + padded_sizeof_derived( arg_sizeof_derived ) ) )
+  , m_dep_capacity( arg_dependence_capacity )
+  , m_dep_size( 0 )
+  , m_ref_count( 0 )
+  , m_state( Kokkos::Experimental::TASK_STATE_CONSTRUCTING )
+{
+  qthread_empty( & m_qfeb ); // Set to full when complete
+  for ( unsigned slot = 0 ; slot < arg_dependence_capacity ; ++slot ) { m_dep[ slot ] = 0 ; }
+}
+
+// Construct using Task::verify_type<void> as the verify function; otherwise identical to the overload taking one.
+Task::TaskMember( const function_dealloc_type  arg_dealloc
+                , const function_single_type   arg_apply_single
+                , const function_team_type     arg_apply_team
+                , volatile int &               arg_active_count
+                , const unsigned               arg_sizeof_derived
+                , const unsigned               arg_dependence_capacity )
+  : m_dealloc( arg_dealloc )
+  , m_verify(  & Task::verify_type<void> )
+  , m_apply_single( arg_apply_single )
+  , m_apply_team( arg_apply_team )
+  , m_active_count( & arg_active_count )
+  , m_qfeb(0)
+  , m_dep( reinterpret_cast< Task ** >( reinterpret_cast< unsigned char * >( this ) + padded_sizeof_derived( arg_sizeof_derived ) ) )
+  , m_dep_capacity( arg_dependence_capacity )
+  , m_dep_size( 0 )
+  , m_ref_count( 0 )
+  , m_state( Kokkos::Experimental::TASK_STATE_CONSTRUCTING )
+{
+  qthread_empty( & m_qfeb ); // Set to full when complete
+  for ( unsigned slot = 0 ; slot < arg_dependence_capacity ; ++slot ) { m_dep[ slot ] = 0 ; }
+}
+
+//----------------------------------------------------------------------------
+
+// Print the task's state and dependence count to std::cerr, then throw std::runtime_error.
+void Task::throw_error_add_dependence() const
+{
+  static const char what[] = "TaskMember< Qthread >::add_dependence ERROR" ;
+  std::cerr << what << " state(" << m_state << ")"
+            << " dep_size(" << m_dep_size << ")" << std::endl ;
+  throw std::runtime_error( what );
+}
+
+void Task::throw_error_verify_type()
+{
+  throw std::runtime_error("TaskMember< Qthread >::verify_type ERROR");
+}
+
+//----------------------------------------------------------------------------
+
+#if defined( KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST )
+// Reference-counted pointer assignment: stores rhs in *lhs and releases the task *lhs previously pointed to.
+void Task::assign( Task ** const lhs , Task * rhs , const bool no_throw )
+{
+  static const char msg_error_header[]      = "Kokkos::Impl::TaskManager<Kokkos::Qthread>::assign ERROR" ;
+  static const char msg_error_count[]       = ": negative reference count" ;
+  static const char msg_error_complete[]    = ": destroy task that is not complete" ;
+  static const char msg_error_dependences[] = ": destroy task that has dependences" ;
+  static const char msg_error_exception[]   = ": caught internal exception" ;
+  // Take a reference on the incoming task before it becomes visible through *lhs.
+  if ( rhs ) { Kokkos::atomic_fetch_add( & (*rhs).m_ref_count , 1 ); }
+
+  Task * const lhs_val = Kokkos::atomic_exchange( lhs , rhs );
+
+  if ( lhs_val ) {
+    // Drop the reference that *lhs held; 'count' is the value before the decrement.
+    const int count = Kokkos::atomic_fetch_add( & (*lhs_val).m_ref_count , -1 );
+
+    const char * msg_error = 0 ;
+
+    try {
+      if ( 1 == count ) {
+
+        // Reference count at zero, delete it
+
+        // Should only be deallocating a completed task
+        if ( (*lhs_val).m_state == Kokkos::Experimental::TASK_STATE_COMPLETE ) {
+
+          // A completed task should not have dependences...
+          for ( int i = 0 ; i < (*lhs_val).m_dep_size && 0 == msg_error ; ++i ) {
+            if ( (*lhs_val).m_dep[i] ) msg_error = msg_error_dependences ;
+          }
+        }
+        else {
+          msg_error = msg_error_complete ;
+        }
+
+        if ( 0 == msg_error ) {
+          // Get deletion function and apply it
+          const Task::function_dealloc_type d = (*lhs_val).m_dealloc ;
+
+          (*d)( lhs_val );
+        }
+      }
+      else if ( count <= 0 ) {
+        msg_error = msg_error_count ;
+      }
+    }
+    catch( ... ) {
+      if ( 0 == msg_error ) msg_error = msg_error_exception ;
+    }
+    // Report a failure: print to std::cerr when no_throw is set, otherwise throw std::runtime_error.
+    if ( 0 != msg_error ) {
+      if ( no_throw ) {
+        std::cerr << msg_error_header << msg_error << std::endl ;
+        std::cerr.flush();
+      }
+      else {
+        std::string msg(msg_error_header);
+        msg.append(msg_error);
+        throw std::runtime_error( msg );
+      }
+    }
+  }
+}
+#endif
+
+
+//----------------------------------------------------------------------------
+
+// Finish one execution of this task: reschedule it if a respawn was requested, otherwise mark it complete.
+void Task::closeout()
+{
+  enum { RESPAWN = int( Kokkos::Experimental::TASK_STATE_WAITING ) |
+                   int( Kokkos::Experimental::TASK_STATE_EXECUTING ) };
+  // RESPAWN is the combined ( waiting | executing ) state that respawn() writes.
+#if 0
+fprintf( stdout
+       , "worker(%d.%d) task 0x%.12lx %s\n"
+       , qthread_shep()
+       , qthread_worker_local(NULL)
+       , reinterpret_cast<unsigned long>(this)
+       , ( m_state == RESPAWN ? "respawn" : "complete" )
+       );
+fflush(stdout);
+#endif
+
+  // Copy the active-count pointer now.  Once dependent tasks are
+  // allowed to run this task may be destroyed, so the pointer must
+  // not be read from 'this' after that point.
+  int volatile * const active_count = m_active_count ;
+
+  if ( m_state == RESPAWN ) {
+    // Task requests respawn, set state to waiting and reschedule the task
+    m_state = Kokkos::Experimental::TASK_STATE_WAITING ;
+    schedule();
+  }
+  else {
+    // Task did not respawn, is complete
+    m_state = Kokkos::Experimental::TASK_STATE_COMPLETE ;
+
+    // Release dependences before allowing dependent tasks to run.
+    // Otherwise there is a thread race condition for removing dependences.
+    for ( int i = 0 ; i < m_dep_size ; ++i ) {
+      assign( & m_dep[i] , 0 );
+    }
+
+    // Set qthread FEB to full so that dependent tasks are allowed to execute.
+    // This 'task' may be deleted immediately following this function call.
+    qthread_fill( & m_qfeb );
+
+    // The dependent task could now complete and destroy 'this' task
+    // before the call to 'qthread_fill' returns.  Therefore, for
+    // thread safety assume that 'this' task has now been destroyed.
+  }
+
+  // Decrement the active task count through the saved pointer, not through 'this'.
+  Kokkos::atomic_decrement( active_count );
+}
+
+// Function spawned by schedule(): 'arg' is the Task to run.  Runs the team or single function, then closeout().
+aligned_t Task::qthread_func( void * arg )
+{
+  Task * const task = reinterpret_cast< Task * >(arg);
+  // First member of the team change state to executing.
+  // Use compare-exchange to avoid race condition with a respawn.
+  Kokkos::atomic_compare_exchange_strong( & task->m_state
+                                        , int(Kokkos::Experimental::TASK_STATE_WAITING)
+                                        , int(Kokkos::Experimental::TASK_STATE_EXECUTING)
+                                        );
+
+  if ( task->m_apply_team && ! task->m_apply_single ) {
+    Kokkos::Impl::QthreadTeamPolicyMember::TaskTeam task_team_tag ;
+
+    // Initialize team size and rank with shepherd info
+    Kokkos::Impl::QthreadTeamPolicyMember member( task_team_tag );
+
+    (*task->m_apply_team)( task , member );
+
+#if 0
+fprintf( stdout
+       , "worker(%d.%d) task 0x%.12lx executed by member(%d:%d)\n"
+       , qthread_shep()
+       , qthread_worker_local(NULL)
+       , reinterpret_cast<unsigned long>(task)
+       , member.team_rank()
+       , member.team_size()
+       );
+fflush(stdout);
+#endif
+    // Barrier before rank 0 closes the task out, and again after closeout() returns.
+    member.team_barrier();
+    if ( member.team_rank() == 0 ) task->closeout();
+    member.team_barrier();
+  }
+  else if ( task->m_apply_team && task->m_apply_single == reinterpret_cast<function_single_type>(1) ) {
+    // Team hard-wired to one, no cloning (m_apply_single holds the sentinel value 1 here)
+    Kokkos::Impl::QthreadTeamPolicyMember member ;
+    (*task->m_apply_team)( task , member );
+    task->closeout();
+  }
+  else {
+    (*task->m_apply_single)( task );
+    task->closeout();
+  }
+
+#if 0
+fprintf( stdout
+       , "worker(%d.%d) task 0x%.12lx return\n"
+       , qthread_shep()
+       , qthread_worker_local(NULL)
+       , reinterpret_cast<unsigned long>(task)
+       );
+fflush(stdout);
+#endif
+
+  return 0 ;
+}
+
+// Request a respawn by moving the state from EXECUTING to ( WAITING | EXECUTING ).
+void Task::respawn()
+{
+  // The combined state keeps a respawn request distinguishable from simply waiting.
+  // The exchange only takes effect while the state is still purely EXECUTING.
+  const int executing       = int(Kokkos::Experimental::TASK_STATE_EXECUTING) ;
+  const int respawn_request = int(Kokkos::Experimental::TASK_STATE_WAITING | Kokkos::Experimental::TASK_STATE_EXECUTING) ;
+
+  Kokkos::atomic_compare_exchange_strong( & m_state , executing , respawn_request );
+}
+
+// Spawn this task as a qthread whose preconditions are the FEB words (m_qfeb) of its dependences.
+void Task::schedule()
+{
+  // The precondition array must be malloc'ed and handed to qthread, which eventually frees it.
+  // Allocate it first so that a failed allocation leaves the active task count untouched.
+  aligned_t ** qprecon = (aligned_t **) malloc( ( m_dep_size + 1 ) * sizeof(aligned_t *) );
+  if ( 0 == qprecon ) {
+    throw std::runtime_error("TaskMember< Qthread >::schedule ERROR: precondition array allocation failed");
+  }
+
+  // Increment active task count before spawning.
+  Kokkos::atomic_increment( m_active_count );
+
+  qprecon[0] = reinterpret_cast<aligned_t *>( uintptr_t(m_dep_size) ); // slot 0 carries the dependence count
+
+  for ( int i = 0 ; i < m_dep_size ; ++i ) {
+    qprecon[i+1] = & m_dep[i]->m_qfeb ; // Qthread precondition flag
+  }
+
+  if ( m_apply_team && ! m_apply_single ) {
+    // If more than one shepherd spawn on a shepherd other than this shepherd
+    const int num_shepherd            = qthread_num_shepherds();
+    const int num_worker_per_shepherd = qthread_num_workers_local(NO_SHEPHERD);
+    const int this_shepherd           = qthread_shep();
+
+    int spawn_shepherd = ( this_shepherd + 1 ) % num_shepherd ;
+
+#if 0
+fprintf( stdout
+       , "worker(%d.%d) task 0x%.12lx spawning on shepherd(%d) clone(%d)\n"
+       , qthread_shep()
+       , qthread_worker_local(NULL)
+       , reinterpret_cast<unsigned long>(this)
+       , spawn_shepherd
+       , num_worker_per_shepherd - 1
+       );
+fflush(stdout);
+#endif
+
+    qthread_spawn_cloneable
+      ( & Task::qthread_func
+      , this
+      , 0
+      , NULL
+      , m_dep_size , qprecon /* dependences */
+      , spawn_shepherd
+      , unsigned( QTHREAD_SPAWN_SIMPLE | QTHREAD_SPAWN_LOCAL_PRIORITY )
+      , num_worker_per_shepherd - 1
+      );
+  }
+  else {
+    qthread_spawn( & Task::qthread_func /* function */
+                 , this                 /* function argument */
+                 , 0
+                 , NULL
+                 , m_dep_size , qprecon /* dependences */
+                 , NO_SHEPHERD
+                 , QTHREAD_SPAWN_SIMPLE /* allows optimization for non-blocking task */
+                 );
+  }
+}
+
+} // namespace Impl
+} // namespace Experimental
+} // namespace Kokkos
+
+namespace Kokkos {
+namespace Experimental {
+
+// Construct the policy; the team size must be 1 or the worker count per shepherd (0 selects the latter).
+TaskPolicy< Kokkos::Qthread >::TaskPolicy
+  ( const unsigned /* arg_task_max_count */
+  , const unsigned /* arg_task_max_size */
+  , const unsigned arg_task_default_dependence_capacity 
+  , const unsigned arg_task_team_size
+  )
+  : m_default_dependence_capacity( arg_task_default_dependence_capacity )
+  , m_team_size( arg_task_team_size != 0 ? arg_task_team_size : unsigned(qthread_num_workers_local(NO_SHEPHERD)) )
+  , m_active_count_root(0)
+  , m_active_count( m_active_count_root )
+{
+  const unsigned workers_per_shepherd = unsigned( qthread_num_workers_local(NO_SHEPHERD) );
+  const bool team_size_ok = ( m_team_size == 1 ) || ( m_team_size == workers_per_shepherd );
+  if ( ! team_size_ok ) {
+    std::ostringstream msg ;
+    msg << "Kokkos::Experimental::TaskPolicy< Kokkos::Qthread >( "
+        << "default_depedence = " << arg_task_default_dependence_capacity
+        << " , team_size = " << arg_task_team_size
+        << " ) ERROR, valid team_size arguments are { (omitted) , 1 , " << workers_per_shepherd << " }" ;
+    Kokkos::Impl::throw_runtime_exception(msg.str());
+  }
+}
+
+// Returns a reference to one function-local static, default-constructed member_type.
+TaskPolicy< Kokkos::Qthread >::member_type &
+TaskPolicy< Kokkos::Qthread >::member_single() {
+  static member_type single_member ;
+  return single_member ;
+}
+
+// Block until the policy's active task count reads zero, calling qthread_yield() between checks.
+void wait( Kokkos::Experimental::TaskPolicy< Kokkos::Qthread > & policy ) {
+  volatile int * const pending = & policy.m_active_count ;
+  while ( 0 != *pending ) { qthread_yield(); }
+}
+
+} // namespace Experimental
+} // namespace Kokkos
+
+#endif /* #if defined( KOKKOS_ENABLE_TASKPOLICY ) */
+#endif /* #if defined( KOKKOS_HAVE_QTHREAD ) */
+
diff --git a/lib/kokkos/core/src/Qthread/Kokkos_Qthread_TaskPolicy.hpp b/lib/kokkos/core/src/Qthread/Kokkos_Qthread_TaskPolicy.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..22a565503dd59626057bae12ef01cb9abdb994f9
--- /dev/null
+++ b/lib/kokkos/core/src/Qthread/Kokkos_Qthread_TaskPolicy.hpp
@@ -0,0 +1,664 @@
+/*
+//@HEADER
+// ************************************************************************
+// 
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+// 
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+// 
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact  H. Carter Edwards (hcedwar@sandia.gov)
+// 
+// ************************************************************************
+//@HEADER
+*/
+
+// Experimental unified task-data parallel manycore LDRD
+
+#ifndef KOKKOS_QTHREAD_TASKPOLICY_HPP
+#define KOKKOS_QTHREAD_TASKPOLICY_HPP
+
+#include <string>
+#include <typeinfo>
+#include <stdexcept>
+
+//----------------------------------------------------------------------------
+// Defines to enable experimental Qthread functionality
+
+#define QTHREAD_LOCAL_PRIORITY
+#define CLONED_TASKS
+
+#include <qthread.h>
+
+#undef QTHREAD_LOCAL_PRIORITY
+#undef CLONED_TASKS
+
+//----------------------------------------------------------------------------
+
+#include <Kokkos_Qthread.hpp>
+#include <Kokkos_TaskPolicy.hpp>
+#include <Kokkos_View.hpp>
+
+#include <impl/Kokkos_FunctorAdapter.hpp>
+
+#if defined( KOKKOS_ENABLE_TASKPOLICY )
+
+//----------------------------------------------------------------------------
+
+namespace Kokkos {
+namespace Experimental {
+namespace Impl {
+
+template<>
+class TaskMember< Kokkos::Qthread , void , void >
+{
+public:
+
+  typedef TaskMember * (* function_verify_type) ( TaskMember * );
+  typedef void         (* function_single_type) ( TaskMember * );
+  typedef void         (* function_team_type)   ( TaskMember * , Kokkos::Impl::QthreadTeamPolicyMember & );
+  typedef void         (* function_dealloc_type)( TaskMember * );
+
+private:
+
+  const function_dealloc_type  m_dealloc ;       ///< Deallocation function
+  const function_verify_type   m_verify ;        ///< Result type verification function
+  const function_single_type   m_apply_single ;  ///< Apply function taking only the task; create_team() stores a 0/1 flag here
+  const function_team_type     m_apply_team ;    ///< Apply function taking the task and a team member
+  int volatile * const         m_active_count ;  ///< Count of active tasks on this policy
+  aligned_t                    m_qfeb ;          ///< Qthread full/empty bit
+  TaskMember ** const          m_dep ;           ///< Array of pointers to dependences
+  const int                    m_dep_capacity ;  ///< Capacity of dependences
+  int                          m_dep_size ;      ///< Actual count of dependences
+  int                          m_ref_count ;     ///< Reference count
+  int                          m_state ;         ///< State of the task, a TaskState value stored as int
+
+  TaskMember() /* = delete */ ;
+  TaskMember( const TaskMember & ) /* = delete */ ;
+  TaskMember & operator = ( const TaskMember & ) /* = delete */ ;
+
+  static aligned_t qthread_func( void * arg );
+
+  static void * allocate( const unsigned arg_sizeof_derived , const unsigned arg_dependence_capacity );
+  static void   deallocate( void * );
+
+  void throw_error_add_dependence() const ;
+  static void throw_error_verify_type();
+
+  /** \brief  Destroy a task as its derived type, then release its storage.
+   *
+   *  Runs the DerivedTaskType destructor in place and passes the same
+   *  address to the untyped deallocate( void * ).
+   */
+  template < class DerivedTaskType >
+  static
+  void deallocate( TaskMember * t )
+    {
+      DerivedTaskType * const derived = static_cast< DerivedTaskType * >(t);
+
+      derived->~DerivedTaskType();
+
+      void * const raw = static_cast< void * >( derived );
+
+      deallocate( raw );
+    }
+
+  void schedule();
+  void closeout();
+
+protected :
+
+  ~TaskMember();
+
+  // Used by TaskMember< Qthread , ResultType , void >
+  TaskMember( const function_verify_type   arg_verify
+            , const function_dealloc_type  arg_dealloc
+            , const function_single_type   arg_apply_single
+            , const function_team_type     arg_apply_team
+            , volatile int &               arg_active_count
+            , const unsigned               arg_sizeof_derived
+            , const unsigned               arg_dependence_capacity
+            );
+
+  // Used for TaskMember< Qthread , void , void >
+  TaskMember( const function_dealloc_type  arg_dealloc
+            , const function_single_type   arg_apply_single
+            , const function_team_type     arg_apply_team
+            , volatile int &               arg_active_count
+            , const unsigned               arg_sizeof_derived
+            , const unsigned               arg_dependence_capacity
+            );
+
+public:
+
+  /** \brief  Check that a task was constructed for result type ResultType.
+   *
+   *  A void ResultType or a null task is passed through unchecked.
+   *  Otherwise the task's stored verification function must be this very
+   *  instantiation; on mismatch throw_error_verify_type() is called when
+   *  compiling for the host, and null is returned if control continues.
+   */
+  template< typename ResultType >
+  KOKKOS_FUNCTION static
+  TaskMember * verify_type( TaskMember * t )
+    {
+      enum { result_is_void = Kokkos::Impl::is_same< ResultType , void >::value };
+
+      // Nothing to verify
+      if ( result_is_void || 0 == t ) return t ;
+
+      // The task is valid only if its m_verify is this function
+      const function_verify_type expected = & TaskMember::template verify_type< ResultType > ;
+
+      if ( expected == t->m_verify ) return t ;
+
+#if defined( KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST )
+      throw_error_verify_type();
+#endif
+
+      return 0 ;
+    }
+
+  //----------------------------------------
+  /*  Inheritance requirements on task types:
+   *    typedef  FunctorType::value_type  value_type ;
+   *    class DerivedTaskType
+   *      : public TaskMember< Qthread , value_type , FunctorType >
+   *      { ... };
+   *    class TaskMember< Qthread , value_type , FunctorType >
+   *      : public TaskMember< Qthread , value_type , void >
+   *      , public FunctorType
+   *      { ... };
+   *  If value_type != void
+   *    class TaskMember< Qthread , value_type , void >
+   *      : public TaskMember< Qthread , void , void >
+   *
+   *  Allocate space for DerivedTaskType followed by TaskMember*[ dependence_capacity ]
+   *
+   */
+
+  /** \brief  Allocate and construct a single-thread task.
+   *
+   *  The task is placement-constructed in storage obtained from allocate()
+   *  with the single-thread apply function and a null team apply function.
+   *
+   *  \return the new task as a pointer to the root TaskMember type
+   */
+  template< class DerivedTaskType >
+  static
+  TaskMember * create_single( const typename DerivedTaskType::functor_type &  arg_functor
+                            , volatile int &                                  arg_active_count
+                            , const unsigned                                  arg_dependence_capacity )
+    {
+      typedef typename DerivedTaskType::functor_type  functor_type ;
+      typedef typename functor_type::value_type       value_type ;
+
+      void * const storage =
+        allocate( sizeof(DerivedTaskType) , arg_dependence_capacity );
+
+      DerivedTaskType * const task = new( storage )
+        DerivedTaskType( & TaskMember::template deallocate< DerivedTaskType >
+                       , & TaskMember::template apply_single< functor_type , value_type >
+                       , 0 /* no team apply function */
+                       , arg_active_count
+                       , sizeof(DerivedTaskType)
+                       , arg_dependence_capacity
+                       , arg_functor );
+
+      return static_cast< TaskMember * >( task );
+    }
+
+  /** \brief  Allocate and construct a team-thread task.
+   *
+   *  The single-thread apply slot does not receive a function here:
+   *  it carries an integer flag cast to the function pointer type,
+   *  0 when arg_is_team is true and 1 otherwise.
+   *
+   *  \return the new task as a pointer to the root TaskMember type
+   */
+  template< class DerivedTaskType >
+  static
+  TaskMember * create_team( const typename DerivedTaskType::functor_type &  arg_functor
+                          , volatile int &                                  arg_active_count
+                          , const unsigned                                  arg_dependence_capacity
+                          , const bool                                      arg_is_team )
+    {
+      typedef typename DerivedTaskType::functor_type  functor_type ;
+      typedef typename functor_type::value_type       value_type ;
+
+      const function_single_type team_flag =
+        reinterpret_cast<function_single_type>( arg_is_team ? 0 : 1 );
+
+      void * const storage =
+        allocate( sizeof(DerivedTaskType) , arg_dependence_capacity );
+
+      DerivedTaskType * const task = new( storage )
+        DerivedTaskType( & TaskMember::template deallocate< DerivedTaskType >
+                       , team_flag
+                       , & TaskMember::template apply_team< functor_type , value_type >
+                       , arg_active_count
+                       , sizeof(DerivedTaskType)
+                       , arg_dependence_capacity
+                       , arg_functor );
+
+      return static_cast< TaskMember * >( task );
+    }
+
+  void respawn();
+  /** \brief  Set the task state to waiting, then call schedule(). */
+  void spawn()
+    {
+      this->m_state = Kokkos::Experimental::TASK_STATE_WAITING ;
+
+      this->schedule();
+    }
+
+  //----------------------------------------
+
+  /** \brief  Result type of get() for the root task type, which holds no result value. */
+  typedef FutureValueTypeIsVoidError get_result_type ;
+
+  /** \brief  Returns a value-initialized get_result_type object. */
+  KOKKOS_INLINE_FUNCTION
+  get_result_type get() const
+    {
+      return get_result_type() ;
+    }
+
+  /** \brief  The stored integer task state converted to the TaskState enumeration. */
+  KOKKOS_INLINE_FUNCTION
+  Kokkos::Experimental::TaskState get_state() const
+    {
+      typedef Kokkos::Experimental::TaskState state_type ;
+
+      return static_cast< state_type >( m_state );
+    }
+
+  //----------------------------------------
+
+#if defined( KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST )
+  // Host compilation: only declared here, the definition is not in this header.
+  static
+  void assign( TaskMember ** const lhs , TaskMember * const rhs , const bool no_throw = false );
+#else
+  // Otherwise: an inline function with the same signature and an empty body.
+  KOKKOS_INLINE_FUNCTION static
+  void assign( TaskMember ** const lhs , TaskMember * const rhs , const bool no_throw = false ) {}
+#endif
+
+  /** \brief  Query the i-th dependence of this task.
+   *
+   *  \return m_dep[i] when the task state is TASK_STATE_EXECUTING and
+   *          0 <= i < m_dep_size, otherwise a null pointer
+   */
+  KOKKOS_INLINE_FUNCTION
+  TaskMember * get_dependence( int i ) const
+    {
+      const bool executing = Kokkos::Experimental::TASK_STATE_EXECUTING == m_state ;
+      const bool in_range  = 0 <= i && i < m_dep_size ;
+
+      if ( executing && in_range ) return m_dep[i] ;
+
+      return (TaskMember*) 0 ;
+    }
+
+  /** \brief  Actual count of dependences of this task. */
+  KOKKOS_INLINE_FUNCTION
+  int get_dependence() const
+    {
+      const int count = m_dep_size ;
+
+      return count ;
+    }
+
+  /** \brief  Assign null to every occupied dependence slot, then reset the count to zero. */
+  KOKKOS_INLINE_FUNCTION
+  void clear_dependence()
+    {
+      int slot = 0 ;
+
+      while ( slot < m_dep_size ) {
+        assign( & m_dep[ slot ] , 0 );
+        ++slot ;
+      }
+
+      m_dep_size = 0 ;
+    }
+
+  /** \brief  Append a dependence of this task on 'before'.
+   *
+   *  Accepted only while the task state is TASK_STATE_CONSTRUCTING or
+   *  TASK_STATE_EXECUTING and a free dependence slot remains;
+   *  otherwise throw_error_add_dependence() is called.
+   */
+  KOKKOS_INLINE_FUNCTION
+  void add_dependence( TaskMember * before )
+    {
+      const bool state_allows =
+        Kokkos::Experimental::TASK_STATE_CONSTRUCTING == m_state ||
+        Kokkos::Experimental::TASK_STATE_EXECUTING    == m_state ;
+
+      if ( ! state_allows || m_dep_capacity <= m_dep_size ) {
+        throw_error_add_dependence();
+        return ;
+      }
+
+      assign( m_dep + m_dep_size , before );
+      ++m_dep_size ;
+    }
+
+  //----------------------------------------
+
+  /** \brief  Apply a single-thread task functor that produces a result.
+   *
+   *  Selected when ResultType is not void.  The task is viewed as its
+   *  derived type
+   *    TaskMember< Kokkos::Qthread , ResultType , FunctorType >
+   *      : public TaskMember< Kokkos::Qthread , ResultType , void >
+   *      , public FunctorType
+   *  and the functor is applied with the address of the task's m_result.
+   */
+  template< class FunctorType , class ResultType >
+  KOKKOS_INLINE_FUNCTION static
+  void apply_single( typename Kokkos::Impl::enable_if< ! Kokkos::Impl::is_same< ResultType , void >::value , TaskMember * >::type t )
+    {
+      typedef TaskMember< Kokkos::Qthread , ResultType , FunctorType >         task_type ;
+      typedef Kokkos::Impl::FunctorApply< FunctorType , void , ResultType & >  apply_type ;
+
+      task_type * const task = static_cast< task_type * >( t );
+
+      apply_type::apply( (FunctorType &) *task , & task->m_result );
+    }
+
+  /** \brief  Apply a single-thread task functor that produces no result.
+   *
+   *  Selected when ResultType is void.  The task is viewed as its
+   *  derived type
+   *    TaskMember< Kokkos::Qthread , ResultType , FunctorType >
+   *      : public TaskMember< Kokkos::Qthread , ResultType , void >
+   *      , public FunctorType
+   *  and the functor is applied without a result argument.
+   */
+  template< class FunctorType , class ResultType >
+  KOKKOS_INLINE_FUNCTION static
+  void apply_single( typename Kokkos::Impl::enable_if< Kokkos::Impl::is_same< ResultType , void >::value , TaskMember * >::type t )
+    {
+      typedef TaskMember< Kokkos::Qthread , ResultType , FunctorType >  task_type ;
+      typedef Kokkos::Impl::FunctorApply< FunctorType , void , void >   apply_type ;
+
+      task_type * const task = static_cast< task_type * >( t );
+
+      apply_type::apply( (FunctorType &) *task );
+    }
+
+  //----------------------------------------
+
+  /** \brief  Apply a team task functor that produces a result.
+   *
+   *  Selected when ResultType is not void.  Calls the functor's
+   *  apply( member , result ) with the task's own m_result.
+   */
+  template< class FunctorType , class ResultType >
+  KOKKOS_INLINE_FUNCTION static
+  void apply_team( typename Kokkos::Impl::enable_if< ! Kokkos::Impl::is_same< ResultType , void >::value , TaskMember * >::type t
+                 , Kokkos::Impl::QthreadTeamPolicyMember & member )
+    {
+      typedef TaskMember< Kokkos::Qthread , ResultType , FunctorType >  task_type ;
+
+      task_type * const task = static_cast< task_type * >( t );
+
+      task->FunctorType::apply( member , task->m_result );
+    }
+
+  /** \brief  Apply a team task functor that produces no result.
+   *
+   *  Selected when ResultType is void.  Calls the functor's
+   *  apply( member ).
+   */
+  template< class FunctorType , class ResultType >
+  KOKKOS_INLINE_FUNCTION static
+  void apply_team( typename Kokkos::Impl::enable_if< Kokkos::Impl::is_same< ResultType , void >::value , TaskMember * >::type t
+                 , Kokkos::Impl::QthreadTeamPolicyMember & member )
+    {
+      typedef TaskMember< Kokkos::Qthread , ResultType , FunctorType >  task_type ;
+
+      task_type * const task = static_cast< task_type * >( t );
+
+      task->FunctorType::apply( member );
+    }
+};
+
+//----------------------------------------------------------------------------
+/** \brief  Base class for tasks with a result value in the Qthread execution space.
+ *
+ *  The FunctorType parameter is void because the Future class accesses
+ *  the task and its result value through this class.
+ *
+ *  Derives from the TaskMember<S,void,void> 'root class' so that the Future
+ *  class can correctly static_cast from the 'root class' to this class.
+ */
+template < class ResultType >
+class TaskMember< Kokkos::Qthread , ResultType , void >
+  : public TaskMember< Kokkos::Qthread , void , void >
+{
+protected:
+
+  typedef TaskMember< Kokkos::Qthread , void , void >  task_root_type ;
+  typedef task_root_type::function_dealloc_type        function_dealloc_type ;
+  typedef task_root_type::function_single_type         function_single_type ;
+  typedef task_root_type::function_team_type           function_team_type ;
+
+public:
+
+  typedef const ResultType & get_result_type ;
+
+  /// Result value of the task, value-initialized by the constructor
+  ResultType  m_result ;
+
+  /** \brief  Read-only access to the stored result value. */
+  KOKKOS_INLINE_FUNCTION
+  get_result_type get() const
+    {
+      return m_result ;
+    }
+
+protected:
+
+  /** \brief  Forward all arguments to the root class, registering
+   *          verify_type< ResultType > as the verification function.
+   */
+  inline
+  TaskMember( const function_dealloc_type  arg_dealloc
+            , const function_single_type   arg_apply_single
+            , const function_team_type     arg_apply_team
+            , volatile int &               arg_active_count
+            , const unsigned               arg_sizeof_derived
+            , const unsigned               arg_dependence_capacity
+            )
+    : task_root_type( & task_root_type::template verify_type< ResultType >
+                    , arg_dealloc
+                    , arg_apply_single
+                    , arg_apply_team
+                    , arg_active_count
+                    , arg_sizeof_derived
+                    , arg_dependence_capacity )
+    , m_result()
+    { }
+};
+
+/** \brief  Task type combining the result-holding base class with the functor.
+ *
+ *  The functor is a public base class and is copy-constructed from the
+ *  constructor's arg_functor argument.
+ */
+template< class ResultType , class FunctorType >
+class TaskMember< Kokkos::Qthread , ResultType , FunctorType >
+  : public TaskMember< Kokkos::Qthread , ResultType , void >
+  , public FunctorType
+{
+public:
+
+  typedef TaskMember< Kokkos::Qthread , void , void >        task_root_type ;
+  typedef TaskMember< Kokkos::Qthread , ResultType , void >  task_base_type ;
+  typedef FunctorType                                        functor_type ;
+
+  typedef task_root_type::function_dealloc_type  function_dealloc_type ;
+  typedef task_root_type::function_single_type   function_single_type ;
+  typedef task_root_type::function_team_type     function_team_type ;
+
+  /** \brief  Forward the task arguments to the base class and copy the functor. */
+  inline
+  TaskMember( const function_dealloc_type  arg_dealloc
+            , const function_single_type   arg_apply_single
+            , const function_team_type     arg_apply_team
+            , volatile int &               arg_active_count
+            , const unsigned               arg_sizeof_derived
+            , const unsigned               arg_dependence_capacity
+            , const functor_type &         arg_functor
+            )
+    : task_base_type( arg_dealloc
+                    , arg_apply_single
+                    , arg_apply_team
+                    , arg_active_count
+                    , arg_sizeof_derived
+                    , arg_dependence_capacity )
+    , functor_type( arg_functor )
+    { }
+};
+
+} /* namespace Impl */
+} /* namespace Experimental */
+} /* namespace Kokkos */
+
+//----------------------------------------------------------------------------
+//----------------------------------------------------------------------------
+
+namespace Kokkos {
+namespace Experimental {
+
+void wait( TaskPolicy< Kokkos::Qthread > & );
+
+template<>
+class TaskPolicy< Kokkos::Qthread >
+{
+public:
+
+  typedef Kokkos::Qthread                        execution_space ;
+  typedef TaskPolicy                             execution_policy ;
+  typedef Kokkos::Impl::QthreadTeamPolicyMember  member_type ;
+
+private:
+
+  typedef Impl::TaskMember< execution_space , void , void > task_root_type ;
+
+  /** \brief  Convert a const functor pointer to a pointer to the root of its task.
+   *
+   *  The functor is cast to the task type that derives from it,
+   *  and that task is then cast to the root task type.
+   */
+  template< class FunctorType >
+  static inline
+  const task_root_type * get_task_root( const FunctorType * f )
+    {
+      typedef typename FunctorType::value_type  value_type ;
+      typedef Impl::TaskMember< execution_space , value_type , FunctorType >  task_type ;
+
+      const task_type * const task = static_cast< const task_type * >( f );
+
+      return static_cast< const task_root_type * >( task );
+    }
+
+  /** \brief  Convert a functor pointer to a pointer to the root of its task.
+   *
+   *  The functor is cast to the task type that derives from it,
+   *  and that task is then cast to the root task type.
+   */
+  template< class FunctorType >
+  static inline
+  task_root_type * get_task_root( FunctorType * f )
+    {
+      typedef typename FunctorType::value_type  value_type ;
+      typedef Impl::TaskMember< execution_space , value_type , FunctorType >  task_type ;
+
+      task_type * const task = static_cast< task_type * >( f );
+
+      return static_cast< task_root_type * >( task );
+    }
+
+  unsigned        m_default_dependence_capacity ;  ///< Capacity used when a create call passes ~0u
+  unsigned        m_team_size ;                    ///< Team size; task_create_team() tests 1 < m_team_size
+  volatile int    m_active_count_root ;            ///< Counter storage owned by this policy object
+  volatile int &  m_active_count ;                 ///< Reference through which the active task count is read and passed to tasks
+
+public:
+
+  TaskPolicy
+    ( const unsigned arg_task_max_count
+    , const unsigned arg_task_max_size
+    , const unsigned arg_task_default_dependence_capacity = 4
+    , const unsigned arg_task_team_size = 0 /* choose default */
+    );
+
+  KOKKOS_FUNCTION TaskPolicy() = default ;
+  KOKKOS_FUNCTION TaskPolicy( TaskPolicy && rhs ) = default ;
+  KOKKOS_FUNCTION TaskPolicy( const TaskPolicy & rhs ) = default ;
+  KOKKOS_FUNCTION TaskPolicy & operator = ( TaskPolicy && rhs ) = default ;
+  KOKKOS_FUNCTION TaskPolicy & operator = ( const TaskPolicy & rhs ) = default ;
+
+  //----------------------------------------
+
+  /** \brief  Current value of the policy's active task counter. */
+  KOKKOS_INLINE_FUNCTION
+  int allocated_task_count() const
+    {
+      const int count = m_active_count ;
+
+      return count ;
+    }
+
+  /** \brief  Spawn the task referenced by a future.
+   *
+   *  When compiling for the host, spawn() is called on the future's task.
+   *  The 'priority' argument is not used by this body.
+   *
+   *  \return the input future
+   */
+  template< class ValueType >
+  const Future< ValueType , execution_space > &
+    spawn( const Future< ValueType , execution_space > & f
+         , const bool priority = false ) const
+      {
+#if defined( KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST )
+        f.m_task->spawn();
+#endif
+
+        return f ;
+      }
+
+  // Create single-thread task
+
+  /** \brief  Create a single-thread task holding a copy of the functor.
+   *
+   *  A dependence_capacity of ~0u selects the policy's default capacity.
+   *  When not compiling for the host a value-initialized future is returned.
+   */
+  template< class FunctorType >
+  KOKKOS_INLINE_FUNCTION
+  Future< typename FunctorType::value_type , execution_space >
+  task_create( const FunctorType & functor
+             , const unsigned dependence_capacity = ~0u ) const
+    {
+      typedef typename FunctorType::value_type        value_type ;
+      typedef Future< value_type , execution_space >  future_type ;
+
+#if defined( KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST )
+      typedef Impl::TaskMember< execution_space , value_type , FunctorType >  task_type ;
+
+      const unsigned capacity =
+        ( ~0u == dependence_capacity ) ? m_default_dependence_capacity : dependence_capacity ;
+
+      return future_type( task_root_type::create_single< task_type >( functor , m_active_count , capacity ) );
+#else
+      return future_type();
+#endif
+    }
+
+  /** \brief  Same as task_create(): forwards both arguments unchanged.
+   *
+   *  Annotated with KOKKOS_INLINE_FUNCTION to match task_create() and
+   *  the parallel proc_create_team().
+   */
+  template< class FunctorType >
+  KOKKOS_INLINE_FUNCTION
+  Future< typename FunctorType::value_type , execution_space >
+  proc_create( const FunctorType & functor
+             , const unsigned dependence_capacity = ~0u ) const
+    { return task_create( functor , dependence_capacity ); }
+
+  // Create thread-team task
+
+  /** \brief  Create a thread-team task holding a copy of the functor.
+   *
+   *  A dependence_capacity of ~0u selects the policy's default capacity.
+   *  The task is flagged as a team task when the policy's team size
+   *  is greater than one.
+   *  When not compiling for the host a value-initialized future is returned.
+   */
+  template< class FunctorType >
+  KOKKOS_INLINE_FUNCTION
+  Future< typename FunctorType::value_type , execution_space >
+  task_create_team( const FunctorType & functor
+                  , const unsigned dependence_capacity = ~0u ) const
+    {
+      typedef typename FunctorType::value_type        value_type ;
+      typedef Future< value_type , execution_space >  future_type ;
+
+#if defined( KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST )
+      typedef Impl::TaskMember< execution_space , value_type , FunctorType >  task_type ;
+
+      const unsigned capacity =
+        ( ~0u == dependence_capacity ) ? m_default_dependence_capacity : dependence_capacity ;
+
+      const bool is_team = 1 < m_team_size ;
+
+      return future_type( task_root_type::create_team< task_type >( functor , m_active_count , capacity , is_team ) );
+#else
+      return future_type();
+#endif
+    }
+
+  /** \brief  Same as task_create_team(): forwards both arguments unchanged. */
+  template< class FunctorType >
+  KOKKOS_INLINE_FUNCTION
+  Future< typename FunctorType::value_type , execution_space >
+  proc_create_team( const FunctorType & functor
+                  , const unsigned dependence_capacity = ~0u ) const
+    {
+      return this->task_create_team( functor , dependence_capacity );
+    }
+
+  // Add dependence
+
+  /** \brief  Make the task of future 'after' depend on the task of future 'before'.
+   *
+   *  Participates in overload resolution only when both futures use this
+   *  policy's execution space.  Does nothing when not compiling for the host.
+   */
+  template< class A1 , class A2 , class A3 , class A4 >
+  void add_dependence( const Future<A1,A2> & after
+                     , const Future<A3,A4> & before
+                     , typename Kokkos::Impl::enable_if
+                        < Kokkos::Impl::is_same< typename Future<A1,A2>::execution_space , execution_space >::value
+                          &&
+                          Kokkos::Impl::is_same< typename Future<A3,A4>::execution_space , execution_space >::value
+                        >::type * = 0
+                      )
+    {
+#if defined( KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST )
+
+      after.m_task->add_dependence( before.m_task );
+
+#endif
+    }
+
+  //----------------------------------------
+  // Functions for an executing task functor to query dependences,
+  // set new dependences, and respawn itself.
+
+  /** \brief  Future for the i-th dependence of the task that owns task_functor.
+   *
+   *  When not compiling for the host a value-initialized future is returned.
+   */
+  template< class FunctorType >
+  Future< void , execution_space >
+  get_dependence( const FunctorType * task_functor , int i ) const
+    {
+#if defined( KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST )
+      return Future<void,execution_space>( get_task_root(task_functor)->get_dependence(i) );
+#else
+      return Future<void,execution_space>();
+#endif
+    }
+
+  /** \brief  Count of dependences of the task that owns task_functor;
+   *          zero when not compiling for the host.
+   */
+  template< class FunctorType >
+  int get_dependence( const FunctorType * task_functor ) const
+    {
+#if defined( KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST )
+      return get_task_root(task_functor)->get_dependence();
+#else
+      return 0 ;
+#endif
+    }
+
+  /** \brief  Clear the dependences of the task that owns task_functor.
+   *
+   *  Does nothing when not compiling for the host.
+   */
+  template< class FunctorType >
+  void clear_dependence( FunctorType * task_functor ) const
+    {
+#if defined( KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST )
+
+      task_root_type * const root = get_task_root( task_functor );
+
+      root->clear_dependence();
+
+#endif
+    }
+
+  /** \brief  Make the task that owns task_functor depend on the task of future 'before'.
+   *
+   *  Participates in overload resolution only when the future uses this
+   *  policy's execution space.  Does nothing when not compiling for the host.
+   */
+  template< class FunctorType , class A3 , class A4 >
+  void add_dependence( FunctorType * task_functor
+                     , const Future<A3,A4> & before
+                     , typename Kokkos::Impl::enable_if
+                        < Kokkos::Impl::is_same< typename Future<A3,A4>::execution_space , execution_space >::value
+                        >::type * = 0
+                      )
+    {
+#if defined( KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST )
+
+      task_root_type * const root = get_task_root( task_functor );
+
+      root->add_dependence( before.m_task );
+
+#endif
+    }
+
+  /** \brief  Respawn the task that owns task_functor.
+   *
+   *  The 'priority' argument is not used by this body.
+   *  Does nothing when not compiling for the host.
+   */
+  template< class FunctorType >
+  void respawn( FunctorType * task_functor
+              , const bool priority = false ) const
+    {
+#if defined( KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST )
+
+      task_root_type * const root = get_task_root( task_functor );
+
+      root->respawn();
+
+#endif
+    }
+
+  /** \brief  Respawn the task that owns task_functor.
+   *
+   *  This body performs the same root-task respawn() call as respawn().
+   *  Does nothing when not compiling for the host.
+   */
+  template< class FunctorType >
+  void respawn_needing_memory( FunctorType * task_functor ) const
+    {
+#if defined( KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST )
+
+      task_root_type * const root = get_task_root( task_functor );
+
+      root->respawn();
+
+#endif
+    }
+
+  static member_type & member_single();
+
+  friend void wait( TaskPolicy< Kokkos::Qthread > & );
+};
+
+} /* namespace Experimental */
+} // namespace Kokkos
+
+//----------------------------------------------------------------------------
+//----------------------------------------------------------------------------
+
+#endif /* #if defined( KOKKOS_ENABLE_TASKPOLICY ) */
+#endif /* #ifndef KOKKOS_QTHREAD_TASKPOLICY_HPP */
+
diff --git a/lib/kokkos/core/src/Qthread/README b/lib/kokkos/core/src/Qthread/README
new file mode 100644
index 0000000000000000000000000000000000000000..6e6c86a9efc2680916e2556bda28914833e6749d
--- /dev/null
+++ b/lib/kokkos/core/src/Qthread/README
@@ -0,0 +1,25 @@
+
+# This Qthreads back-end requires the experimental 'dev-kokkos' branch of the Qthreads repository, built with the special #define options shown below.
+
+# Cloning repository and branch:
+
+git clone git@github.com:Qthreads/qthreads.git qthreads
+
+cd qthreads
+
+# checkout branch with "cloned tasks"
+
+git checkout dev-kokkos
+
+# Configure/autogen
+
+sh autogen.sh
+
+# configure with 'hwloc' installation:
+
+./configure CFLAGS="-DCLONED_TASKS -DQTHREAD_LOCAL_PRIORITY" --with-hwloc=${HWLOCDIR} --prefix=${INSTALLDIR}
+
+# install
+
+make install
+
diff --git a/lib/kokkos/core/src/Threads/Kokkos_ThreadsExec.cpp b/lib/kokkos/core/src/Threads/Kokkos_ThreadsExec.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..5f0b8f70cd8ef36dd153b7bcbb84c42300f4fa6e
--- /dev/null
+++ b/lib/kokkos/core/src/Threads/Kokkos_ThreadsExec.cpp
@@ -0,0 +1,826 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+//
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact  H. Carter Edwards (hcedwar@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#include <Kokkos_Core_fwd.hpp>
+
+#if defined( KOKKOS_HAVE_PTHREAD ) || defined( KOKKOS_HAVE_WINTHREAD )
+
+#include <stdint.h>
+#include <limits>
+#include <utility>
+#include <iostream>
+#include <sstream>
+#include <Kokkos_Core.hpp>
+#include <impl/Kokkos_Error.hpp>
+#include <impl/Kokkos_CPUDiscovery.hpp>
+#include <impl/Kokkos_Profiling_Interface.hpp>
+
+
+//----------------------------------------------------------------------------
+//----------------------------------------------------------------------------
+
+namespace Kokkos {
+namespace Impl {
+namespace {
+
+ThreadsExec                  s_threads_process ;
+ThreadsExec                * s_threads_exec[  ThreadsExec::MAX_THREAD_COUNT ] = { 0 };
+pthread_t                    s_threads_pid[   ThreadsExec::MAX_THREAD_COUNT ] = { 0 };
+std::pair<unsigned,unsigned> s_threads_coord[ ThreadsExec::MAX_THREAD_COUNT ];
+// Pool sizes; print_configuration() labels them threads, threads_per_numa and threads_per_core.
+int s_thread_pool_size[3] = { 0 , 0 , 0 };
+// Read by ~Sentinel() and print_configuration(); never assigned in this translation unit.
+unsigned s_current_reduce_size = 0 ;
+unsigned s_current_shared_size = 0 ;
+// Work function and argument handed to every pool thread by driver(); fence() resets both to 0.
+void (* volatile s_current_function)( ThreadsExec & , const void * );
+const void * volatile s_current_function_arg = 0 ;
+
+struct Sentinel { // Function-local static in initialize(); reports a missing shutdown at destruction.
+  Sentinel()
+  {
+    HostSpace::register_in_parallel( ThreadsExec::in_parallel );
+  }
+  // Any pool size, work function, argument or table entry 0 still set means the pool was not shut down.
+  ~Sentinel()
+  {
+    if ( s_thread_pool_size[0] ||
+         s_thread_pool_size[1] ||
+         s_thread_pool_size[2] ||
+         s_current_reduce_size ||
+         s_current_shared_size ||
+         s_current_function ||
+         s_current_function_arg ||
+         s_threads_exec[0] ) {
+      std::cerr << "ERROR : Process exiting without calling Kokkos::Threads::terminate()" << std::endl ;
+    }
+  }
+};
+
+inline // Number of fan-in partners of 'rank' in a pool of 'size' threads.
+unsigned fan_size( const unsigned rank , const unsigned size )
+{
+  const unsigned reversed = size - ( rank + 1 ); // position counted from the other end of the pool
+  unsigned levels = 0 , bit = 1 ;
+  while ( 0 == ( reversed & bit ) && ( reversed + bit < size ) ) { bit <<= 1 ; ++levels ; }
+  return levels ;
+}
+
+} // namespace
+} // namespace Impl
+} // namespace Kokkos
+
+//----------------------------------------------------------------------------
+//----------------------------------------------------------------------------
+
+namespace Kokkos {
+namespace Impl {
+
+void execute_function_noop( ThreadsExec & , const void * ) {} // empty work function; initialize() installs it while spawning
+
+void ThreadsExec::driver(void) // Body of a spawned thread: run published work until no longer reactivated.
+{
+  ThreadsExec this_thread ;
+  // The constructor leaves the state Active only when this thread claimed a pool table entry.
+  while ( ThreadsExec::Active == this_thread.m_pool_state ) {
+    // Run the currently published work function with its argument.
+    (*s_current_function)( this_thread , s_current_function_arg );
+
+    // Deactivate thread and wait for reactivation
+    this_thread.m_pool_state = ThreadsExec::Inactive ;
+
+    wait_yield( this_thread.m_pool_state , ThreadsExec::Inactive );
+  }
+}
+
+ThreadsExec::ThreadsExec() // Registers a spawned thread in the pool table, or sets up the solo master process.
+  : m_pool_base(0)
+  , m_scratch(0)
+  , m_scratch_reduce_end(0)
+  , m_scratch_thread_end(0)
+  , m_numa_rank(0)
+  , m_numa_core_rank(0)
+  , m_pool_rank(0)
+  , m_pool_size(0)
+  , m_pool_fan_size(0)
+  , m_pool_state( ThreadsExec::Terminating )
+{
+  if ( & s_threads_process != this ) {
+
+    // A spawned thread
+
+    ThreadsExec * const nil = 0 ;
+
+    // Which entry in 's_threads_exec', possibly determined from hwloc binding
+    const int entry = ((size_t)s_current_function_arg) < size_t(s_thread_pool_size[0])
+                    ? ((size_t)s_current_function_arg)
+                    : size_t(Kokkos::hwloc::bind_this_thread( s_thread_pool_size[0] , s_threads_coord ));
+    // An out-of-range value narrowed to 'int' may be negative, so both bounds are checked below.
+    // Given a good entry set this thread in the 's_threads_exec' array
+    if ( 0 <= entry && entry < s_thread_pool_size[0] &&
+         nil == atomic_compare_exchange( s_threads_exec + entry , nil , this ) ) {
+
+      const std::pair<unsigned,unsigned> coord = Kokkos::hwloc::get_this_thread_coordinate();
+
+      m_numa_rank       = coord.first ;
+      m_numa_core_rank  = coord.second ;
+      m_pool_base       = s_threads_exec ;
+      m_pool_rank       = s_thread_pool_size[0] - ( entry + 1 );
+      m_pool_rank_rev   = s_thread_pool_size[0] - ( pool_rank() + 1 );
+      m_pool_size       = s_thread_pool_size[0] ;
+      m_pool_fan_size   = fan_size( m_pool_rank , m_pool_size );
+      m_pool_state      = ThreadsExec::Active ;
+
+      s_threads_pid[ m_pool_rank ] = pthread_self();
+
+      // Inform spawning process that the threads_exec entry has been set.
+      s_threads_process.m_pool_state = ThreadsExec::Active ;
+    }
+    else {
+      // Inform spawning process that the threads_exec entry could not be set.
+      s_threads_process.m_pool_state = ThreadsExec::Terminating ;
+    }
+  }
+  else {
+    // Enables 'parallel_for' to execute on uninitialized Threads device
+    m_pool_rank  = 0 ;
+    m_pool_size  = 1 ;
+    m_pool_state = ThreadsExec::Inactive ;
+
+    s_threads_pid[ m_pool_rank ] = pthread_self();
+  }
+}
+
+ThreadsExec::~ThreadsExec() // Releases scratch memory, resets the members and vacates the pool table slot.
+{
+  const unsigned entry = m_pool_size - ( m_pool_rank + 1 );
+  // 'entry' has to be computed here, before the rank and size are zeroed further down.
+  typedef Kokkos::Experimental::Impl::SharedAllocationRecord< Kokkos::HostSpace , void > Record ;
+  // Give back this thread's scratch allocation, if it has one.
+  if ( m_scratch ) {
+    Record * const r = Record::get_record( m_scratch );
+
+    m_scratch = 0 ;
+
+    Record::decrement( r );
+  }
+
+  m_pool_base   = 0 ;
+  m_scratch_reduce_end = 0 ;
+  m_scratch_thread_end = 0 ;
+  m_numa_rank      = 0 ;
+  m_numa_core_rank = 0 ;
+  m_pool_rank      = 0 ;
+  m_pool_size      = 0 ;
+  m_pool_fan_size  = 0 ;
+
+  m_pool_state  = ThreadsExec::Terminating ;
+  // Only a non-process instance touches the table: its slot is swapped back to null if it still holds 'this'.
+  if ( & s_threads_process != this && entry < MAX_THREAD_COUNT ) {
+    ThreadsExec * const nil = 0 ;
+
+    atomic_compare_exchange( s_threads_exec + entry , this , nil );
+    // finalize() waits on this state change of the master process instance.
+    s_threads_process.m_pool_state = ThreadsExec::Terminating ;
+  }
+}
+
+
+int ThreadsExec::get_thread_count() // total pool size; initialize() sets it and finalize() resets it to 0
+{
+  return s_thread_pool_size[0] ;
+}
+
+ThreadsExec * ThreadsExec::get_thread( const int init_thread_rank ) // look up a pool thread by rank; throws if absent
+{
+  ThreadsExec * const th =
+    ( 0 <= init_thread_rank && init_thread_rank < s_thread_pool_size[0] )
+    ? s_threads_exec[ s_thread_pool_size[0] - ( init_thread_rank + 1 ) ] : 0 ;
+  // A negative or too-large rank leaves 'th' null, so it is reported as "does not exist" below.
+  if ( 0 == th || th->m_pool_rank != init_thread_rank ) {
+    std::ostringstream msg ;
+    msg << "Kokkos::Impl::ThreadsExec::get_thread ERROR : "
+        << "thread " << init_thread_rank << " of " << s_thread_pool_size[0] ;
+    if ( 0 == th ) {
+      msg << " does not exist" ;
+    }
+    else {
+      msg << " has wrong thread_rank " << th->m_pool_rank ;
+    }
+    Kokkos::Impl::throw_runtime_exception( msg.str() );
+  }
+
+  return th ;
+}
+
+//----------------------------------------------------------------------------
+
+void ThreadsExec::execute_sleep( ThreadsExec & exec , const void * ) // work function published by sleep()
+{
+  ThreadsExec::global_lock();
+  ThreadsExec::global_unlock();
+  // sleep() holds the global lock and wake() releases it, so the lock/unlock pair above blocks in between.
+  const int n = exec.m_pool_fan_size ;
+  const int rank_rev = exec.m_pool_size - ( exec.m_pool_rank + 1 );
+  // Wait on each fan-in partner; presumably spinwait() returns once the state is no longer Active.
+  for ( int i = 0 ; i < n ; ++i ) {
+    Impl::spinwait( exec.m_pool_base[ rank_rev + (1<<i) ]->m_pool_state , ThreadsExec::Active );
+  }
+
+  exec.m_pool_state = ThreadsExec::Inactive ;
+}
+
+}
+}
+
+//----------------------------------------------------------------------------
+
+namespace Kokkos {
+namespace Impl {
+
+void ThreadsExec::verify_is_process( const std::string & name , const bool initialized )
+{
+  // Only the master process may call the guarded function.
+  const bool on_master = is_process();
+  if ( ! on_master ) {
+    Kokkos::Impl::throw_runtime_exception(
+      name + " FAILED : Called by a worker thread, can only be called by the master process." );
+  }
+  // Optionally require that the thread pool has already been initialized.
+  const bool pool_empty = ( 0 == s_thread_pool_size[0] );
+  if ( initialized && pool_empty ) {
+    Kokkos::Impl::throw_runtime_exception( name + " FAILED : Threads not initialized." );
+  }
+}
+
+int ThreadsExec::in_parallel()
+{
+  // Not in parallel unless a work function is currently published.
+  if ( ! s_current_function ) return 0 ;
+  // The marker argument used by execute_serial() does not count as parallel execution.
+  if ( & s_threads_process == s_current_function_arg ) return 0 ;
+  // Parallel when the master process is part of the pool or the caller is not the master process.
+  return ( s_threads_process.m_pool_base || ! is_process() ) ? 1 : 0 ;
+}
+
+// Wait for root thread to become inactive
+void ThreadsExec::fence()
+{
+  if ( s_thread_pool_size[0] ) {
+    // Wait for the root thread to complete:
+    Impl::spinwait( s_threads_exec[0]->m_pool_state , ThreadsExec::Active );
+  }
+  // start() refuses to launch while either of these is set, so clear both.
+  s_current_function     = 0 ;
+  s_current_function_arg = 0 ;
+
+  // Make sure function and arguments are cleared before
+  // potentially re-activating threads with a subsequent launch.
+  memory_fence();
+}
+
+/** \brief  Begin execution of the asynchronous functor */
+void ThreadsExec::start( void (*func)( ThreadsExec & , const void * ) , const void * arg )
+{
+  verify_is_process("ThreadsExec::start" , true );
+  // The previous launch must have been cleared (fence() does that) before a new one is accepted.
+  if ( s_current_function || s_current_function_arg ) {
+    Kokkos::Impl::throw_runtime_exception( std::string( "ThreadsExec::start() FAILED : already executing" ) );
+  }
+  // Publish the work function and its argument for driver() to pick up.
+  s_current_function     = func ;
+  s_current_function_arg = arg ;
+
+  // Make sure function and arguments are written before activating threads.
+  memory_fence();
+
+  // Activate threads, from the highest table index down to entry 0:
+  for ( int i = s_thread_pool_size[0] ; 0 < i-- ; ) {
+    s_threads_exec[i]->m_pool_state = ThreadsExec::Active ;
+  }
+  // initialize() gives the master a non-zero pool size only when it is included in the pool.
+  if ( s_threads_process.m_pool_size ) {
+    // Master process is the root thread, run it:
+    (*func)( s_threads_process , arg );
+    s_threads_process.m_pool_state = ThreadsExec::Inactive ;
+  }
+}
+
+//----------------------------------------------------------------------------
+
+bool ThreadsExec::sleep() // Parks the pool threads in execute_sleep(); returns false if already asleep.
+{
+  verify_is_process("ThreadsExec::sleep", true );
+  // The sleep function is already published: nothing to do.
+  if ( & execute_sleep == s_current_function ) return false ;
+
+  fence();
+  // Take the global lock so that execute_sleep() blocks in every thread until wake() releases it.
+  ThreadsExec::global_lock();
+
+  s_current_function = & execute_sleep ;
+  // NOTE(review): unlike start(), no memory_fence() precedes the activation below - confirm intended.
+  // Activate threads:
+  for ( unsigned i = s_thread_pool_size[0] ; 0 < i ; ) {
+    s_threads_exec[--i]->m_pool_state = ThreadsExec::Active ;
+  }
+
+  return true ;
+}
+
+bool ThreadsExec::wake() // Releases the threads parked by sleep(); returns false if they are not asleep.
+{
+  verify_is_process("ThreadsExec::wake", true );
+  // The sleep function is not the published work: nothing to wake.
+  if ( & execute_sleep != s_current_function ) return false ;
+  // Release the lock taken by sleep() so the blocked threads can continue.
+  ThreadsExec::global_unlock();
+  // When the master process is part of the pool it runs its own share of the sleep function.
+  if ( s_threads_process.m_pool_base ) {
+    execute_sleep( s_threads_process , 0 );
+    s_threads_process.m_pool_state = ThreadsExec::Inactive ;
+  }
+
+  fence();
+
+  return true ;
+}
+
+//----------------------------------------------------------------------------
+
+void ThreadsExec::execute_serial( void (*func)( ThreadsExec & , const void * ) )
+{
+  s_current_function = func ;
+  s_current_function_arg = & s_threads_process ;
+
+  // Make sure function and arguments are written before activating threads.
+  memory_fence();
+  // Table entry 0 is the master process when it is part of the pool; it is handled after the loop.
+  const unsigned begin = s_threads_process.m_pool_base ? 1 : 0 ;
+  // One thread at a time: activate it, then wait (presumably until its state is no longer Active).
+  for ( unsigned i = s_thread_pool_size[0] ; begin < i ; ) {
+    ThreadsExec & th = * s_threads_exec[ --i ];
+
+    th.m_pool_state = ThreadsExec::Active ;
+
+    wait_yield( th.m_pool_state , ThreadsExec::Active );
+  }
+  // The master runs 'func' directly, with a null argument instead of the published one.
+  if ( s_threads_process.m_pool_base ) {
+    s_threads_process.m_pool_state = ThreadsExec::Active ;
+    (*func)( s_threads_process , 0 );
+    s_threads_process.m_pool_state = ThreadsExec::Inactive ;
+  }
+
+  s_current_function_arg = 0 ;
+  s_current_function = 0 ;
+
+  // Make sure function and arguments are cleared before proceeding.
+  memory_fence();
+}
+
+//----------------------------------------------------------------------------
+
+void * ThreadsExec::root_reduce_scratch() // reduce_memory() of the master process instance
+{
+  return s_threads_process.reduce_memory();
+}
+
+void ThreadsExec::execute_resize_scratch( ThreadsExec & exec , const void * )
+{
+  typedef Kokkos::Experimental::Impl::SharedAllocationRecord< Kokkos::HostSpace , void > Record ;
+  // Release the previous scratch allocation of this thread, if there is one.
+  if ( exec.m_scratch ) {
+    Record * const r = Record::get_record( exec.m_scratch );
+
+    exec.m_scratch = 0 ;
+
+    Record::decrement( r );
+  }
+  // Adopt the sizes that resize_scratch() stored on the master process instance.
+  exec.m_scratch_reduce_end = s_threads_process.m_scratch_reduce_end ;
+  exec.m_scratch_thread_end = s_threads_process.m_scratch_thread_end ;
+
+  if ( s_threads_process.m_scratch_thread_end ) {
+
+    // Allocate tracked memory:
+    {
+      Record * const r = Record::allocate( Kokkos::HostSpace() , "thread_scratch" , s_threads_process.m_scratch_thread_end );
+
+      Record::increment( r );
+
+      exec.m_scratch = r->data();
+    }
+
+    unsigned * ptr = reinterpret_cast<unsigned *>( exec.m_scratch );
+
+    unsigned * const end = ptr + s_threads_process.m_scratch_thread_end / sizeof(unsigned);
+
+    // Zero-fill the buffer from the thread that executes this function.
+    while ( ptr < end ) *ptr++ = 0 ;
+  }
+}
+
+void * ThreadsExec::resize_scratch( size_t reduce_size , size_t thread_size )
+{
+  enum { ALIGN_MASK = Kokkos::Impl::MEMORY_ALIGNMENT - 1 };
+  // Wait for the root thread before reading or changing the scratch sizes.
+  fence();
+
+  const size_t old_reduce_size = s_threads_process.m_scratch_reduce_end ;
+  const size_t old_thread_size = s_threads_process.m_scratch_thread_end - s_threads_process.m_scratch_reduce_end ;
+  // Round each size up to the alignment (the mask form assumes MEMORY_ALIGNMENT is a power of two).
+  reduce_size = ( reduce_size + ALIGN_MASK ) & ~ALIGN_MASK ;
+  thread_size = ( thread_size + ALIGN_MASK ) & ~ALIGN_MASK ;
+
+  // Increase size or deallocate completely.
+
+  if ( ( old_reduce_size < reduce_size ) ||
+       ( old_thread_size < thread_size ) ||
+       ( ( reduce_size == 0 && thread_size == 0 ) &&
+         ( old_reduce_size != 0 || old_thread_size != 0 ) ) ) {
+
+    verify_is_process( "ThreadsExec::resize_scratch" , true );
+
+    s_threads_process.m_scratch_reduce_end = reduce_size ;
+    s_threads_process.m_scratch_thread_end = reduce_size + thread_size ;
+    // Each pool thread reallocates its own scratch, one thread at a time.
+    execute_serial( & execute_resize_scratch );
+    // The master keeps the scratch pointer held by table entry 0.
+    s_threads_process.m_scratch = s_threads_exec[0]->m_scratch ;
+  }
+
+  return s_threads_process.m_scratch ;
+}
+
+//----------------------------------------------------------------------------
+
+void ThreadsExec::print_configuration( std::ostream & s , const bool detail )
+{
+  verify_is_process("ThreadsExec::print_configuration",false);
+  // Wait for the root thread before reading the pool state.
+  fence();
+
+  const unsigned numa_count       = Kokkos::hwloc::get_available_numa_count();
+  const unsigned cores_per_numa   = Kokkos::hwloc::get_available_cores_per_numa();
+  const unsigned threads_per_core = Kokkos::hwloc::get_available_threads_per_core();
+
+  // Forestall compiler warnings for unused variables.
+  (void) numa_count;
+  (void) cores_per_numa;
+  (void) threads_per_core;
+
+  s << "Kokkos::Threads" ;
+
+#if defined( KOKKOS_HAVE_PTHREAD )
+  s << " KOKKOS_HAVE_PTHREAD" ;
+#endif
+#if defined( KOKKOS_HAVE_HWLOC )
+  s << " hwloc[" << numa_count << "x" << cores_per_numa << "x" << threads_per_core << "]" ;
+#endif
+
+  if ( s_thread_pool_size[0] ) {
+    s << " threads[" << s_thread_pool_size[0] << "]"
+      << " threads_per_numa[" << s_thread_pool_size[1] << "]"
+      << " threads_per_core[" << s_thread_pool_size[2] << "]"
+      ;
+    if ( 0 == s_threads_process.m_pool_base ) { s << " Asynchronous" ; }
+    s << " ReduceScratch[" << s_current_reduce_size << "]"
+      << " SharedScratch[" << s_current_shared_size << "]" ;
+    s << std::endl ;
+
+    if ( detail ) {
+      // One output line per pool table entry; an empty entry yields an empty line.
+      for ( int i = 0 ; i < s_thread_pool_size[0] ; ++i ) {
+
+        ThreadsExec * const th = s_threads_exec[i] ;
+
+        if ( th ) {
+          // Index of this thread in the pool table (the reverse of its rank).
+          const int rank_rev = th->m_pool_size - ( th->m_pool_rank + 1 );
+
+          s << " Thread[ " << th->m_pool_rank << " : "
+            << th->m_numa_rank << "." << th->m_numa_core_rank << " ]" ;
+          // List the table entries at offsets 1, 2, 4, ... that form this thread's fan-in.
+          s << " Fan{" ;
+          for ( int j = 0 ; j < th->m_pool_fan_size ; ++j ) {
+            ThreadsExec * const thfan = th->m_pool_base[rank_rev+(1<<j)] ;
+            s << " [ " << thfan->m_pool_rank << " : "
+              << thfan->m_numa_rank << "." << thfan->m_numa_core_rank << " ]" ;
+          }
+          s << " }" ;
+
+          if ( th == & s_threads_process ) {
+            s << " is_process" ;
+          }
+        }
+        s << std::endl ;
+      }
+    }
+  }
+  else {
+    s << " not initialized" << std::endl ;
+  }
+}
+
+//----------------------------------------------------------------------------
+
+int ThreadsExec::is_initialized() // non-zero while entry 0 of the pool table is occupied
+{ return 0 != s_threads_exec[0] ; }
+
+void ThreadsExec::initialize( unsigned thread_count ,
+                              unsigned use_numa_count ,
+                              unsigned use_cores_per_numa ,
+                              bool allow_asynchronous_threadpool )
+{
+  static const Sentinel sentinel ;
+  // A non-zero total pool size means a previous initialize() is still in effect.
+  const bool is_initialized = 0 != s_thread_pool_size[0] ;
+
+  unsigned thread_spawn_failed = 0 ;
+
+  if ( ! is_initialized ) {
+    // Reset the pool table only on a fresh start, so a repeated call cannot wipe a live pool.
+    for ( int i = 0; i < ThreadsExec::MAX_THREAD_COUNT ; i++)
+      s_threads_exec[i] = NULL;
+
+    // If thread_count, use_numa_count, or use_cores_per_numa are zero
+    // then they will be given default values based upon hwloc detection
+    // and allowed asynchronous execution.
+
+    const bool hwloc_avail = Kokkos::hwloc::available();
+    const bool hwloc_can_bind = hwloc_avail && Kokkos::hwloc::can_bind_threads();
+
+    if ( thread_count == 0 ) {
+      thread_count = hwloc_avail
+      ? Kokkos::hwloc::get_available_numa_count() *
+        Kokkos::hwloc::get_available_cores_per_numa() *
+        Kokkos::hwloc::get_available_threads_per_core()
+      : 1 ;
+    }
+
+    const unsigned thread_spawn_begin =
+      hwloc::thread_mapping( "Kokkos::Threads::initialize" ,
+                             allow_asynchronous_threadpool ,
+                             thread_count ,
+                             use_numa_count ,
+                             use_cores_per_numa ,
+                             s_threads_coord );
+
+    const std::pair<unsigned,unsigned> proc_coord = s_threads_coord[0] ;
+
+    if ( thread_spawn_begin ) {
+      // Synchronous with s_threads_coord[0] as the process core
+      // Claim entry #0 for binding the process core.
+      s_threads_coord[0] = std::pair<unsigned,unsigned>(~0u,~0u);
+    }
+
+    s_thread_pool_size[0] = thread_count ;
+    s_thread_pool_size[1] = s_thread_pool_size[0] / use_numa_count ;
+    s_thread_pool_size[2] = s_thread_pool_size[1] / use_cores_per_numa ;
+    s_current_function = & execute_function_noop ; // Initialization work function
+
+    for ( unsigned ith = thread_spawn_begin ; ith < thread_count ; ++ith ) {
+
+      s_threads_process.m_pool_state = ThreadsExec::Inactive ;
+
+      // If hwloc available then spawned thread will
+      // choose its own entry in 's_threads_coord'
+      // otherwise specify the entry.
+      s_current_function_arg = (void*)static_cast<uintptr_t>( hwloc_can_bind ? ~0u : ith );
+
+      // Make sure all outstanding memory writes are complete
+      // before spawning the new thread.
+      memory_fence();
+
+      // Spawn thread executing the 'driver()' function.
+      // Wait until spawned thread has attempted to initialize.
+      // If spawning and initialization is successful then
+      // an entry in 's_threads_exec' will be assigned.
+      if ( ThreadsExec::spawn() ) {
+        wait_yield( s_threads_process.m_pool_state , ThreadsExec::Inactive );
+      }
+      if ( s_threads_process.m_pool_state == ThreadsExec::Terminating ) break ;
+    }
+
+    // Wait for all spawned threads to deactivate before zeroing the function.
+
+    for ( unsigned ith = thread_spawn_begin ; ith < thread_count ; ++ith ) {
+      // Try to protect against cache coherency failure by casting to volatile.
+      ThreadsExec * const th = ((ThreadsExec * volatile *)s_threads_exec)[ith] ;
+      if ( th ) {
+        wait_yield( th->m_pool_state , ThreadsExec::Active );
+      }
+      else {
+        ++thread_spawn_failed ;
+      }
+    }
+
+    s_current_function     = 0 ;
+    s_current_function_arg = 0 ;
+    s_threads_process.m_pool_state = ThreadsExec::Inactive ;
+
+    memory_fence();
+
+    if ( ! thread_spawn_failed ) {
+      // Bind process to the core on which it was located before spawning occurred
+      if (hwloc_can_bind) {
+        Kokkos::hwloc::bind_this_thread( proc_coord );
+      }
+
+      if ( thread_spawn_begin ) { // Include process in pool.
+        const std::pair<unsigned,unsigned> coord = Kokkos::hwloc::get_this_thread_coordinate();
+
+        s_threads_exec[0]                   = & s_threads_process ;
+        s_threads_process.m_numa_rank       = coord.first ;
+        s_threads_process.m_numa_core_rank  = coord.second ;
+        s_threads_process.m_pool_base       = s_threads_exec ;
+        s_threads_process.m_pool_rank       = thread_count - 1 ; // Reversed for scan-compatible reductions
+        s_threads_process.m_pool_size       = thread_count ;
+        s_threads_process.m_pool_fan_size   = fan_size( s_threads_process.m_pool_rank , s_threads_process.m_pool_size );
+        s_threads_pid[ s_threads_process.m_pool_rank ] = pthread_self();
+      }
+      else {
+        s_threads_process.m_pool_base = 0 ;
+        s_threads_process.m_pool_rank = 0 ;
+        s_threads_process.m_pool_size = 0 ;
+        s_threads_process.m_pool_fan_size = 0 ;
+      }
+
+      // Initial allocations:
+      ThreadsExec::resize_scratch( 1024 , 1024 );
+    }
+    else {
+      s_thread_pool_size[0] = 0 ;
+      s_thread_pool_size[1] = 0 ;
+      s_thread_pool_size[2] = 0 ;
+    }
+  }
+
+  if ( is_initialized || thread_spawn_failed ) {
+
+    std::ostringstream msg ;
+
+    msg << "Kokkos::Threads::initialize ERROR" ;
+
+    if ( is_initialized ) {
+      msg << " : already initialized" ;
+    }
+    if ( thread_spawn_failed ) {
+      msg << " : failed to spawn " << thread_spawn_failed << " threads" ;
+    }
+
+    Kokkos::Impl::throw_runtime_exception( msg.str() );
+  }
+
+  // Check for over-subscription
+  if( Impl::mpi_ranks_per_node() * long(thread_count) > Impl::processors_per_node() ) {
+    std::cout << "Kokkos::Threads::initialize WARNING: You are likely oversubscribing your CPU cores." << std::endl;
+    std::cout << "                                    Detected: " << Impl::processors_per_node() << " cores per node." << std::endl;
+    std::cout << "                                    Detected: " << Impl::mpi_ranks_per_node() << " MPI_ranks per node." << std::endl;
+    std::cout << "                                    Requested: " << thread_count << " threads per process." << std::endl;
+  }
+
+  // Initialize the lock array used for arbitrarily sized atomics
+  Impl::init_lock_array_host_space();
+
+  #if (KOKKOS_ENABLE_PROFILING)
+    Kokkos::Profiling::initialize();
+  #endif
+}
+
+//----------------------------------------------------------------------------
+
+void ThreadsExec::finalize() // Terminates the spawned threads and returns the master process to solo mode.
+{
+  verify_is_process("ThreadsExec::finalize",false);
+
+  fence();
+  // Requesting zero sizes makes every pool thread release its scratch allocation.
+  resize_scratch(0,0);
+  // Table entry 0 is the master process when it is part of the pool; the loop below leaves it alone.
+  const unsigned begin = s_threads_process.m_pool_base ? 1 : 0 ;
+
+  for ( unsigned i = s_thread_pool_size[0] ; begin < i-- ; ) {
+
+    if ( s_threads_exec[i] ) {
+
+      s_threads_exec[i]->m_pool_state = ThreadsExec::Terminating ;
+      // ~ThreadsExec() of the exiting thread changes the master state, which ends this wait.
+      wait_yield( s_threads_process.m_pool_state , ThreadsExec::Inactive );
+
+      s_threads_process.m_pool_state = ThreadsExec::Inactive ;
+    }
+
+    s_threads_pid[i] = 0 ;
+  }
+
+  if ( s_threads_process.m_pool_base ) {
+    ( & s_threads_process )->~ThreadsExec();
+    s_threads_exec[0] = 0 ;
+  }
+
+  if (Kokkos::hwloc::can_bind_threads() ) {
+    Kokkos::hwloc::unbind_this_thread();
+  }
+
+  s_thread_pool_size[0] = 0 ;
+  s_thread_pool_size[1] = 0 ;
+  s_thread_pool_size[2] = 0 ;
+
+  // Reset master thread to run solo.
+  s_threads_process.m_numa_rank       = 0 ;
+  s_threads_process.m_numa_core_rank  = 0 ;
+  s_threads_process.m_pool_base       = 0 ;
+  s_threads_process.m_pool_rank       = 0 ;
+  s_threads_process.m_pool_size       = 1 ;
+  s_threads_process.m_pool_fan_size   = 0 ;
+  s_threads_process.m_pool_state = ThreadsExec::Inactive ;
+
+  #if (KOKKOS_ENABLE_PROFILING)
+    Kokkos::Profiling::finalize();
+  #endif
+}
+
+//----------------------------------------------------------------------------
+
+} /* namespace Impl */
+} /* namespace Kokkos */
+
+//----------------------------------------------------------------------------
+//----------------------------------------------------------------------------
+
+namespace Kokkos {
+
+int Threads::concurrency() { // same value as thread_pool_size(0)
+  return thread_pool_size(0);
+}
+
+Threads & Threads::instance(int) // the integer argument is ignored
+{
+  static Threads t ; // one function-local instance is returned to every caller
+  return t ;
+}
+
+int Threads::thread_pool_size( int depth ) // pool size at level 'depth' (the size table has 3 levels)
+{
+  return ( 0 <= depth && depth < 3 ) ? Impl::s_thread_pool_size[depth] : 0 ; // other depths report 0 instead of reading out of bounds
+}
+
+#if defined( KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST )
+int Threads::thread_pool_rank()
+{
+  const pthread_t self = pthread_self();
+  const int pool_size = Impl::s_thread_pool_size[0] ;
+  for ( int r = 0 ; r < pool_size ; ++r ) { if ( self == Impl::s_threads_pid[r] ) return r ; }
+  return pool_size ; // no recorded id matched: the result equals the pool size
+}
+#endif
+
+} /* namespace Kokkos */
+
+//----------------------------------------------------------------------------
+//----------------------------------------------------------------------------
+
+#endif /* #if defined( KOKKOS_HAVE_PTHREAD ) || defined( KOKKOS_HAVE_WINTHREAD ) */
+
diff --git a/lib/kokkos/core/src/Threads/Kokkos_ThreadsExec.hpp b/lib/kokkos/core/src/Threads/Kokkos_ThreadsExec.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..4ec1450d0f8270c735c83e194d6d6243aa2aa415
--- /dev/null
+++ b/lib/kokkos/core/src/Threads/Kokkos_ThreadsExec.hpp
@@ -0,0 +1,631 @@
+/*
+//@HEADER
+// ************************************************************************
+// 
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+// 
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+// 
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact  H. Carter Edwards (hcedwar@sandia.gov)
+// 
+// ************************************************************************
+//@HEADER
+*/
+
+#ifndef KOKKOS_THREADSEXEC_HPP
+#define KOKKOS_THREADSEXEC_HPP
+
+#include <stdio.h>
+
+#include <utility>
+#include <impl/Kokkos_spinwait.hpp>
+#include <impl/Kokkos_FunctorAdapter.hpp>
+
+#include <Kokkos_Atomic.hpp>
+
+//----------------------------------------------------------------------------
+
+namespace Kokkos {
+namespace Impl {
+
+class ThreadsExec {
+public:
+
+  // Fan array has log_2(NT) reduction threads plus 2 scan threads
+  // Currently limited to 16k threads.
+  enum { MAX_FAN_COUNT    = 16 };
+  enum { MAX_THREAD_COUNT = 1 << ( MAX_FAN_COUNT - 2 ) };
+  enum { VECTOR_LENGTH    = 8 };
+
+  /** \brief States of a worker thread */
+  enum { Terminating ///<  Termination in progress
+       , Inactive    ///<  Exists, waiting for work
+       , Active      ///<  Exists, performing work
+       , Rendezvous  ///<  Exists, waiting in a barrier or reduce
+
+       , ScanCompleted
+       , ScanAvailable
+       , ReductionAvailable
+       };
+
+private:
+
+  friend class Kokkos::Threads ;
+
+  // Fan-in operations' root is the highest ranking thread
+  // to place the 'scan' reduction intermediate values on
+  // the threads that need them.
+  // For a simple reduction the thread location is arbitrary.
+
+  ThreadsExec * const * m_pool_base ; ///< Base for pool fan-in
+
+  void *        m_scratch ;
+  int           m_scratch_reduce_end ;
+  int           m_scratch_thread_end ;
+  int           m_numa_rank ;
+  int           m_numa_core_rank ;
+  int           m_pool_rank ;
+  int           m_pool_rank_rev ;
+  int           m_pool_size ;
+  int           m_pool_fan_size ;
+  int volatile  m_pool_state ;  ///< State for global synchronizations
+
+  // Members for dynamic scheduling
+  // Which thread am I stealing from currently
+  int m_current_steal_target;
+  // This thread's owned work_range
+  Kokkos::pair<long,long> m_work_range KOKKOS_ALIGN_16;
+  // Team Offset if one thread determines work_range for others
+  long m_team_work_index;
+
+  // Is this thread stealing (i.e. its owned work_range is exhausted)?
+  bool m_stealing;
+
+  static void global_lock();
+  static void global_unlock();
+  static bool spawn();
+
+  static void execute_resize_scratch( ThreadsExec & , const void * );
+  static void execute_sleep(          ThreadsExec & , const void * );
+
+  ThreadsExec( const ThreadsExec & );
+  ThreadsExec & operator = ( const ThreadsExec & );
+
+  static void execute_serial( void (*)( ThreadsExec & , const void * ) );
+
+public:
+
+  KOKKOS_INLINE_FUNCTION int pool_size() const { return m_pool_size ; } // number of ranks iterated by the pool-wide loops below
+  KOKKOS_INLINE_FUNCTION int pool_rank() const { return m_pool_rank ; } // this thread's rank; the highest rank is the fan-in root
+  KOKKOS_INLINE_FUNCTION int numa_rank() const { return m_numa_rank ; }
+  KOKKOS_INLINE_FUNCTION int numa_core_rank() const { return m_numa_core_rank ; }
+  inline long team_work_index() const { return m_team_work_index ; } // last value stored by get_work_index()
+
+  static int get_thread_count();
+  static ThreadsExec * get_thread( const int init_thread_rank );
+
+  inline void * reduce_memory() const { return m_scratch ; } // start of this thread's scratch buffer
+  KOKKOS_INLINE_FUNCTION  void * scratch_memory() const // scratch buffer advanced by m_scratch_reduce_end bytes
+    { return reinterpret_cast<unsigned char *>(m_scratch) + m_scratch_reduce_end ; }
+
+  KOKKOS_INLINE_FUNCTION  int volatile & state() { return m_pool_state ; } // writable reference to the synchronization state
+  KOKKOS_INLINE_FUNCTION  ThreadsExec * const * pool_base() const { return m_pool_base ; } // array indexed by reversed rank in the fan-in loops
+
+  static void driver(void);
+
+  ~ThreadsExec();
+  ThreadsExec();
+
+  static void * resize_scratch( size_t reduce_size , size_t thread_size );
+
+  static void * root_reduce_scratch();
+
+  static bool is_process();
+
+  static void verify_is_process( const std::string & , const bool initialized );
+
+  static int is_initialized();
+
+  static void initialize( unsigned thread_count ,
+                          unsigned use_numa_count ,
+                          unsigned use_cores_per_numa ,
+                          bool allow_asynchronous_threadpool );
+
+  static void finalize();
+
+  /* Given a requested team size, return valid team size */
+  static unsigned team_size_valid( unsigned );
+
+  static void print_configuration( std::ostream & , const bool detail = false );
+
+  //------------------------------------
+
+  static void wait_yield( volatile int & , const int );
+
+  //------------------------------------
+  // All-thread functions:
+
+  inline
+  int all_reduce( const int value )
+    { // Sums 'value' over all pool threads (root adds every thread's slot) and returns the total stored back in this thread's slot.
+      // Reversed rank: the highest pool rank maps to 0 and acts as the root below.
+      const int rev_rank = m_pool_size - ( m_pool_rank + 1 );
+
+      *((volatile int*) reduce_memory()) = value ;
+
+      memory_fence();
+
+      // Fan-in reduction with highest ranking thread as the root
+      for ( int i = 0 ; i < m_pool_fan_size ; ++i ) {
+        // Wait: Active -> Rendezvous
+        Impl::spinwait( m_pool_base[ rev_rank + (1<<i) ]->m_pool_state , ThreadsExec::Active );
+      }
+
+      if ( rev_rank ) {
+        m_pool_state = ThreadsExec::Rendezvous ;
+        // Wait: Rendezvous -> Active
+        Impl::spinwait( m_pool_state , ThreadsExec::Rendezvous );
+      }
+      else {
+        // Root thread does the reduction and broadcast
+
+        int accum = 0 ;
+
+        for ( int rank = 0 ; rank < m_pool_size ; ++rank ) {
+          accum += *((volatile int *) get_thread( rank )->reduce_memory());
+        }
+
+        for ( int rank = 0 ; rank < m_pool_size ; ++rank ) {
+          *((volatile int *) get_thread( rank )->reduce_memory()) = accum ;
+        }
+
+        memory_fence();
+
+        for ( int rank = 0 ; rank < m_pool_size ; ++rank ) {
+          get_thread( rank )->m_pool_state = ThreadsExec::Active ;
+        }
+      }
+
+      return *((volatile int*) reduce_memory());
+    }
+
+  inline
+  void barrier( )
+    { // Pool-wide barrier: non-root threads wait in Rendezvous until the root sets every state back to Active.
+      // Reversed rank: the highest pool rank maps to 0 and acts as the root below.
+      const int rev_rank = m_pool_size - ( m_pool_rank + 1 );
+
+      memory_fence();
+
+      // Fan-in reduction with highest ranking thread as the root
+      for ( int i = 0 ; i < m_pool_fan_size ; ++i ) {
+        // Wait: Active -> Rendezvous
+        Impl::spinwait( m_pool_base[ rev_rank + (1<<i) ]->m_pool_state , ThreadsExec::Active );
+      }
+
+      if ( rev_rank ) {
+        m_pool_state = ThreadsExec::Rendezvous ;
+        // Wait: Rendezvous -> Active
+        Impl::spinwait( m_pool_state , ThreadsExec::Rendezvous );
+      }
+      else {
+        // Root thread releases all threads (a barrier has no value to reduce or broadcast)
+
+        memory_fence();
+
+        for ( int rank = 0 ; rank < m_pool_size ; ++rank ) {
+          get_thread( rank )->m_pool_state = ThreadsExec::Active ;
+        }
+      }
+    }
+
+  //------------------------------------
+  // All-thread functions:
+
+  template< class FunctorType , class ArgTag >
+  inline
+  void fan_in_reduce( const FunctorType & f ) const
+    { // Joins the fan-in partners' reduce_memory() into this thread's; the root (reversed rank 0) then applies Final.
+      typedef Kokkos::Impl::FunctorValueJoin< FunctorType , ArgTag > Join ;
+      typedef Kokkos::Impl::FunctorFinal<     FunctorType , ArgTag > Final ;
+
+      const int rev_rank  = m_pool_size - ( m_pool_rank + 1 );
+
+      for ( int i = 0 ; i < m_pool_fan_size ; ++i ) {
+
+        ThreadsExec & fan = *m_pool_base[ rev_rank + ( 1 << i ) ] ;
+
+        Impl::spinwait( fan.m_pool_state , ThreadsExec::Active ); // partner must have left Active before its value is read
+
+        Join::join( f , reduce_memory() , fan.reduce_memory() );
+      }
+
+      if ( ! rev_rank ) {
+        Final::final( f , reduce_memory() );
+      }
+    }
+
+  inline
+  void fan_in() const
+    { // Spin-waits on each fan-in partner of this thread while that partner's state is still Active.
+      const int base = m_pool_size - ( m_pool_rank + 1 ); // reversed rank
+      int stride = 1 ;
+      for ( int n = 0 ; n < m_pool_fan_size ; ++n , stride <<= 1 ) {
+        Impl::spinwait( m_pool_base[ base + stride ]->m_pool_state , ThreadsExec::Active );
+      }
+    }
+
+  template< class FunctorType , class ArgTag >
+  inline
+  void scan_large( const FunctorType & f )
+    { // Cross-thread scan; reduce_memory() is used as 2*count values: [0,count) reduction, [count,2*count) scan.
+      // Sequence of states:
+      //  0) Active             : entry and exit state
+      //  1) ReductionAvailable : reduction value available
+      //  2) ScanAvailable      : inclusive scan value available
+      //  3) Rendezvous         : All threads inclusive scan value are available
+      //  4) ScanCompleted      : exclusive scan value copied
+
+      typedef Kokkos::Impl::FunctorValueTraits< FunctorType , ArgTag > Traits ;
+      typedef Kokkos::Impl::FunctorValueJoin<   FunctorType , ArgTag > Join ;
+      typedef Kokkos::Impl::FunctorValueInit<   FunctorType , ArgTag > Init ;
+
+      typedef typename Traits::value_type scalar_type ;
+
+      const int      rev_rank = m_pool_size - ( m_pool_rank + 1 );
+      const unsigned count    = Traits::value_count( f );
+
+      scalar_type * const work_value = (scalar_type *) reduce_memory();
+
+      //--------------------------------
+      // Fan-in reduction with highest ranking thread as the root
+      for ( int i = 0 ; i < m_pool_fan_size ; ++i ) {
+        ThreadsExec & fan = *m_pool_base[ rev_rank + (1<<i) ];
+
+        // Wait: Active -> ReductionAvailable (or ScanAvailable)
+        Impl::spinwait( fan.m_pool_state , ThreadsExec::Active );
+        Join::join( f , work_value , fan.reduce_memory() );
+      }
+
+      // Copy reduction value to scan value before releasing from this phase.
+      for ( unsigned i = 0 ; i < count ; ++i ) { work_value[i+count] = work_value[i] ; }
+
+      if ( rev_rank ) {
+
+        // Set: Active -> ReductionAvailable
+        m_pool_state = ThreadsExec::ReductionAvailable ;
+
+        // Wait for contributing threads' scan value to be available.
+        if ( ( 1 << m_pool_fan_size ) < ( m_pool_rank + 1 ) ) {
+          ThreadsExec & th = *m_pool_base[ rev_rank + ( 1 << m_pool_fan_size ) ] ;
+
+          // Wait: Active             -> ReductionAvailable
+          // Wait: ReductionAvailable -> ScanAvailable
+          Impl::spinwait( th.m_pool_state , ThreadsExec::Active );
+          Impl::spinwait( th.m_pool_state , ThreadsExec::ReductionAvailable );
+
+          Join::join( f , work_value + count , ((scalar_type *)th.reduce_memory()) + count );
+        }
+
+        // This thread has completed inclusive scan
+        // Set: ReductionAvailable -> ScanAvailable
+        m_pool_state = ThreadsExec::ScanAvailable ;
+
+        // Wait for all threads to complete inclusive scan
+        // Wait: ScanAvailable -> Rendezvous
+        Impl::spinwait( m_pool_state , ThreadsExec::ScanAvailable );
+      }
+
+      //--------------------------------
+
+      for ( int i = 0 ; i < m_pool_fan_size ; ++i ) {
+        ThreadsExec & fan = *m_pool_base[ rev_rank + (1<<i) ];
+        // Wait: ReductionAvailable -> ScanAvailable
+        Impl::spinwait( fan.m_pool_state , ThreadsExec::ReductionAvailable );
+        // Set: ScanAvailable -> Rendezvous
+        fan.m_pool_state = ThreadsExec::Rendezvous ;
+      }
+
+      // All threads have completed the inclusive scan.
+      // All non-root threads are in the Rendezvous state.
+      // Threads are free to overwrite their reduction value.
+      //--------------------------------
+
+      if ( ( rev_rank + 1 ) < m_pool_size ) {
+        // Exclusive scan: copy the previous thread's inclusive scan value
+
+        ThreadsExec & th = *m_pool_base[ rev_rank + 1 ] ; // Not the root thread
+
+        const scalar_type * const src_value = ((scalar_type *)th.reduce_memory()) + count ;
+
+        for ( unsigned j = 0 ; j < count ; ++j ) { work_value[j] = src_value[j]; }
+      }
+      else {
+        (void) Init::init( f , work_value );
+      }
+
+      //--------------------------------
+      // Wait for all threads to copy previous thread's inclusive scan value
+      // Wait for all threads: Rendezvous -> ScanCompleted
+      for ( int i = 0 ; i < m_pool_fan_size ; ++i ) {
+        Impl::spinwait( m_pool_base[ rev_rank + (1<<i) ]->m_pool_state , ThreadsExec::Rendezvous );
+      }
+      if ( rev_rank ) {
+        // Set: Rendezvous -> ScanCompleted (this thread was moved to Rendezvous by its fan-in parent above)
+        m_pool_state = ThreadsExec::ScanCompleted ;
+        // Wait: ScanCompleted -> Active
+        Impl::spinwait( m_pool_state , ThreadsExec::ScanCompleted );
+      }
+      // Set: ScanCompleted -> Active
+      for ( int i = 0 ; i < m_pool_fan_size ; ++i ) {
+        m_pool_base[ rev_rank + (1<<i) ]->m_pool_state = ThreadsExec::Active ;
+      }
+    }
+
+  template< class FunctorType , class ArgTag >
+  inline
+  void scan_small( const FunctorType & f )
+    { // Scan variant in which the root thread serially computes every thread's result while the others wait.
+      typedef Kokkos::Impl::FunctorValueTraits< FunctorType , ArgTag > Traits ;
+      typedef Kokkos::Impl::FunctorValueJoin<   FunctorType , ArgTag > Join ;
+      typedef Kokkos::Impl::FunctorValueInit<   FunctorType , ArgTag > Init ;
+
+      typedef typename Traits::value_type scalar_type ;
+
+      const int      rev_rank = m_pool_size - ( m_pool_rank + 1 );
+      const unsigned count    = Traits::value_count( f );
+
+      scalar_type * const work_value = (scalar_type *) reduce_memory();
+
+      //--------------------------------
+      // Fan-in reduction with highest ranking thread as the root
+      for ( int i = 0 ; i < m_pool_fan_size ; ++i ) {
+        // Wait: Active -> Rendezvous
+        Impl::spinwait( m_pool_base[ rev_rank + (1<<i) ]->m_pool_state , ThreadsExec::Active );
+      }
+
+      for ( unsigned i = 0 ; i < count ; ++i ) { work_value[i+count] = work_value[i]; } // keep own value in the upper half
+
+      if ( rev_rank ) {
+        m_pool_state = ThreadsExec::Rendezvous ;
+        // Wait: Rendezvous -> Active
+        Impl::spinwait( m_pool_state , ThreadsExec::Rendezvous );
+      }
+      else {
+        // Root thread does the thread-scan before releasing threads
+
+        scalar_type * ptr_prev = 0 ;
+
+        for ( int rank = 0 ; rank < m_pool_size ; ++rank ) {
+          scalar_type * const ptr = (scalar_type *) get_thread( rank )->reduce_memory();
+          if ( rank ) {
+            for ( unsigned i = 0 ; i < count ; ++i ) { ptr[i] = ptr_prev[ i + count ]; } // lower half: previous thread's running value
+            Join::join( f , ptr + count , ptr ); // upper half: running value including this thread
+          }
+          else {
+            (void) Init::init( f , ptr );
+          }
+          ptr_prev = ptr ;
+        }
+      }
+
+      for ( int i = 0 ; i < m_pool_fan_size ; ++i ) {
+        m_pool_base[ rev_rank + (1<<i) ]->m_pool_state = ThreadsExec::Active ;
+      }
+    }
+
+  //------------------------------------
+  /** \brief  Wait for previous asynchronous functor to
+   *          complete and release the Threads device.
+   *          Acquire the Threads device and start this functor.
+   */
+  static void start( void (*)( ThreadsExec & , const void * ) , const void * );
+
+  static int  in_parallel();
+  static void fence();
+  static bool sleep();
+  static bool wake();
+
+  /* Dynamic Scheduling related functionality */
+  // Initialize the work range for this thread
+  inline void set_work_range(const long& begin, const long& end, const long& chunk_size) {
+    m_work_range.first = (begin+chunk_size-1)/chunk_size; // begin expressed in chunks (chunk_size-1 is added before dividing)
+    m_work_range.second = end>0?(end+chunk_size-1)/chunk_size:m_work_range.first; // end<=0 gives an empty range
+  }
+
+  // Claim an index from the beginning of this thread's range
+  inline long get_work_index_begin () {
+    // 'expected' is the range the exchange must still find; 'desired' is that range with its front advanced.
+    Kokkos::pair<long,long> desired  = m_work_range;
+    Kokkos::pair<long,long> expected = desired;
+    if ( expected.second <= expected.first ) { return -1; } // owned range already empty
+
+    desired.first += 1;
+
+    for (;;) {
+      // The exchange result is compared with 'expected': equality means this thread's update was applied.
+      const Kokkos::pair<long,long> observed =
+        Kokkos::atomic_compare_exchange(&m_work_range,expected,desired);
+      const bool finished = ( observed == expected ) || ( observed.first >= observed.second );
+      expected = observed;
+      desired  = observed;
+      desired.first += 1;
+      if ( finished ) { break; }
+    }
+
+    return ( expected.first < expected.second ) ? expected.first : -1;
+  }
+
+  // Claim an index from the end of this thread's range
+  inline long get_work_index_end () {
+    // Retry loop on the whole (first,second) pair; here the back of the range is pulled in by one.
+    Kokkos::pair<long,long> desired  = m_work_range;
+    Kokkos::pair<long,long> expected = desired;
+    if ( expected.second <= expected.first ) { return -1; } // nothing left in this range
+    desired.second -= 1;
+    for (;;) {
+      const Kokkos::pair<long,long> observed =
+        Kokkos::atomic_compare_exchange(&m_work_range,expected,desired);
+      const bool finished = ( observed == expected ) || ( observed.first >= observed.second );
+      expected = observed;
+      desired  = observed;
+      desired.second -= 1;
+      if ( finished ) { break; }
+    }
+
+    // The claimed index is the last element of the range seen by the final exchange.
+    return ( expected.first < expected.second ) ? ( expected.second - 1 ) : -1;
+  }
+
+  // Reset the steal target
+  inline void reset_steal_target() {
+    m_stealing = false; // get_work_index() will try the owned range first again
+    m_current_steal_target = ( m_pool_rank + 1 ) % m_pool_size ; // next rank, wrapping to 0
+  }
+
+  // Reset the steal target (team variant: starts team_size past this thread's reversed rank)
+  inline void reset_steal_target(int team_size) {
+    // Start team_size slots past this thread's reversed rank; fall back to slot 0 beyond the pool end.
+    const int candidate = m_pool_rank_rev + team_size ;
+    m_current_steal_target = ( candidate < m_pool_size ) ? candidate : 0 ;
+    m_stealing = false;
+  }
+
+  // Get a steal target; start with my-rank + 1 and go round robin until arriving at this thread's rank.
+  // Returns -1 if no active steal target is available.
+  inline int get_steal_target() {
+    // Step round-robin past candidates whose range is empty; arriving at this
+    // thread's own rank ends the search and is reported as -1.
+    for (;;) {
+      if ( m_current_steal_target == m_pool_rank ) { break; }
+      const ThreadsExec & candidate = *m_pool_base[m_current_steal_target];
+      if ( candidate.m_work_range.first < candidate.m_work_range.second ) { break; }
+      m_current_steal_target = ( m_current_steal_target + 1 ) % m_pool_size ;
+    }
+    return ( m_current_steal_target == m_pool_rank ) ? -1 : m_current_steal_target ;
+  }
+
+  inline int get_steal_target(int team_size) {
+    // Team variant: candidates are visited in strides of team_size, wrapping to
+    // slot 0; arriving at this thread's reversed rank is reported as -1.
+    for (;;) {
+      if ( m_current_steal_target == m_pool_rank_rev ) { break; }
+
+      const ThreadsExec & candidate = *m_pool_base[m_current_steal_target];
+      if ( candidate.m_work_range.first < candidate.m_work_range.second ) { break; }
+
+      const int next = m_current_steal_target + team_size ;
+      m_current_steal_target = ( next < m_pool_size ) ? next : 0 ;
+    }
+
+    const bool none_left = ( m_current_steal_target == m_pool_rank_rev );
+    return none_left ? -1 : m_current_steal_target ;
+  }
+
+  inline long steal_work_index (int team_size = 0) {
+    // Ask successive steal targets for an index from the end of their range;
+    // returns -1 once no target is left. team_size > 0 selects the team search.
+    for (;;) {
+      const int victim = ( team_size > 0 ) ? get_steal_target(team_size) : get_steal_target();
+      if ( victim == -1 ) { return -1; }
+      const long stolen = m_pool_base[victim]->get_work_index_end();
+      if ( stolen != -1 ) { return stolen; }
+    }
+  }
+
+  // Get a work index. Claim from the owned range until it is exhausted, then steal from other threads
+  inline long get_work_index (int team_size = 0) {
+    // The owned range is tried first unless this thread has already switched to stealing.
+    long claimed = m_stealing ? -1 : get_work_index_begin();
+
+    if ( claimed == -1 ) {
+      // Nothing claimed from the owned range: record that and take work from another thread.
+      memory_fence();
+      m_stealing = true;
+      claimed = steal_work_index(team_size);
+    }
+    m_team_work_index = claimed; // value handed out by team_work_index()
+    memory_fence();
+    return claimed;
+  }
+
+};
+
+} /* namespace Impl */
+} /* namespace Kokkos */
+
+//----------------------------------------------------------------------------
+//----------------------------------------------------------------------------
+
+namespace Kokkos {
+
+inline int Threads::in_parallel() // each Threads:: function below forwards unchanged to Impl::ThreadsExec
+{ return Impl::ThreadsExec::in_parallel(); }
+
+inline int Threads::is_initialized()
+{ return Impl::ThreadsExec::is_initialized(); }
+
+inline void Threads::initialize(
+  unsigned threads_count ,
+  unsigned use_numa_count ,
+  unsigned use_cores_per_numa ,
+  bool allow_asynchronous_threadpool )
+{ // arguments are passed through in the same order
+  Impl::ThreadsExec::initialize( threads_count , use_numa_count , use_cores_per_numa , allow_asynchronous_threadpool );
+}
+
+inline void Threads::finalize()
+{
+  Impl::ThreadsExec::finalize();
+}
+
+inline void Threads::print_configuration( std::ostream & s , const bool detail )
+{
+  Impl::ThreadsExec::print_configuration( s , detail );
+}
+
+inline bool Threads::sleep()
+{ return Impl::ThreadsExec::sleep() ; }
+
+inline bool Threads::wake()
+{ return Impl::ThreadsExec::wake() ; }
+
+inline void Threads::fence()
+{ Impl::ThreadsExec::fence() ; }
+
+} /* namespace Kokkos */
+
+//----------------------------------------------------------------------------
+//----------------------------------------------------------------------------
+
+#endif /* #define KOKKOS_THREADSEXEC_HPP */
+
diff --git a/lib/kokkos/core/src/Threads/Kokkos_ThreadsExec_base.cpp b/lib/kokkos/core/src/Threads/Kokkos_ThreadsExec_base.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..ce09248678de618495f3e3e4a9fc75a0ce48e28c
--- /dev/null
+++ b/lib/kokkos/core/src/Threads/Kokkos_ThreadsExec_base.cpp
@@ -0,0 +1,255 @@
+/*
+//@HEADER
+// ************************************************************************
+// 
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+// 
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+// 
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact  H. Carter Edwards (hcedwar@sandia.gov)
+// 
+// ************************************************************************
+//@HEADER
+*/
+
+#include <Kokkos_Core_fwd.hpp>
+
+//----------------------------------------------------------------------------
+//----------------------------------------------------------------------------
+
+#if defined( KOKKOS_HAVE_PTHREAD )
+
+/* Standard 'C' Linux libraries */
+
+#include <pthread.h>
+#include <sched.h>
+#include <errno.h>
+
+/* Standard C++ libraries */
+
+#include <cstdlib>
+#include <string>
+#include <iostream>
+#include <stdexcept>
+
+#include <Kokkos_Threads.hpp>
+
+//----------------------------------------------------------------------------
+
+namespace Kokkos {
+namespace Impl {
+namespace {
+
+pthread_mutex_t host_internal_pthread_mutex = PTHREAD_MUTEX_INITIALIZER ;
+
+// Pthreads compatible driver.
+// Recovery from an exception would require constant intra-thread health
+// verification; which would negatively impact runtime.  As such simply
+// abort the process.
+
+void * internal_pthread_driver( void * )
+{
+  // An exception escaping the driver is reported on stderr and the process is aborted.
+  try { ThreadsExec::driver(); }
+  catch( const std::exception & failure ) {
+    std::cerr << "Exception thrown from worker thread: " << failure.what() << std::endl ;
+    std::cerr.flush();
+    std::abort();
+  }
+  catch( ... ) {
+    std::cerr << "Exception thrown from worker thread" << std::endl ;
+    std::cerr.flush();
+    std::abort();
+  }
+
+  return NULL ; // the thread argument and result are unused
+}
+
+} // namespace
+
+//----------------------------------------------------------------------------
+// Spawn a thread
+
+bool ThreadsExec::spawn()
+{
+  // Returns true only when a detached thread running internal_pthread_driver was created.
+  pthread_attr_t attr ;
+  if ( 0 != pthread_attr_init( & attr ) ) { return false ; } // nothing was initialized, so nothing to destroy
+
+  // These calls must not be chained with '||' after the init: a successful init would
+  // short-circuit them and leave the thread joinable although 'pt' is discarded below.
+  (void) pthread_attr_setscope( & attr, PTHREAD_SCOPE_SYSTEM ); // contention scope is a hint: best effort
+  bool result = 0 == pthread_attr_setdetachstate( & attr, PTHREAD_CREATE_DETACHED );
+
+  if ( result ) {
+    pthread_t pt ;
+    result = 0 == pthread_create( & pt, & attr, internal_pthread_driver, 0 );
+  }
+
+  pthread_attr_destroy( & attr );
+  return result ;
+}
+
+//----------------------------------------------------------------------------
+
+bool ThreadsExec::is_process()
+{
+  // Function-local static: the thread that makes the first call is recorded as the process thread.
+  static const pthread_t process_thread = pthread_self();
+  return 0 != pthread_equal( process_thread , pthread_self() );
+}
+
+void ThreadsExec::global_lock()
+{ // Locks the file-local mutex; the pthread return code is not examined.
+  (void) pthread_mutex_lock( & host_internal_pthread_mutex );
+}
+
+void ThreadsExec::global_unlock()
+{ // Unlocks the file-local mutex; the pthread return code is not examined.
+  (void) pthread_mutex_unlock( & host_internal_pthread_mutex );
+}
+
+//----------------------------------------------------------------------------
+
+void ThreadsExec::wait_yield( volatile int & flag , const int value )
+{ // Yields the processor repeatedly for as long as 'flag' still reads as 'value'.
+  for ( ; flag == value ; ) { sched_yield(); }
+}
+
+} // namespace Impl
+} // namespace Kokkos
+
+/* end #if defined( KOKKOS_HAVE_PTHREAD ) */
+//----------------------------------------------------------------------------
+//----------------------------------------------------------------------------
+
+#elif defined( KOKKOS_HAVE_WINTHREAD )
+
+/* Windows libraries */
+#include <winsock2.h>
+#include <windows.h>
+#include <process.h>
+
+/* Standard C++ libraries */
+
+#include <cstdlib>
+#include <string>
+#include <iostream>
+#include <stdexcept>
+
+#include <Kokkos_Threads.hpp>
+
+//----------------------------------------------------------------------------
+// Driver for each created Windows thread
+
+namespace Kokkos {
+namespace Impl {
+namespace {
+
+unsigned WINAPI internal_winthread_driver( void * )
+{
+  // Thread entry point handed to _beginthreadex; the argument is unused.
+  ThreadsExec::driver();
+  return 0 ;
+}
+
+class ThreadLockWindows { // Lock wrapping a Win32 CRITICAL_SECTION; instances are only created through singleton().
+private:
+  CRITICAL_SECTION  m_handle ;
+
+  ~ThreadLockWindows()
+  { DeleteCriticalSection( & m_handle ); }
+
+  ThreadLockWindows() // no ';' may follow the declarator, or the body below is detached from it
+  { InitializeCriticalSection( & m_handle ); }
+
+  ThreadLockWindows( const ThreadLockWindows & ); // declared private and not defined here: copying is not available
+  ThreadLockWindows & operator = ( const ThreadLockWindows & );
+
+public:
+
+  static ThreadLockWindows & singleton();
+
+  void lock()
+  { EnterCriticalSection( & m_handle ); }
+
+  void unlock()
+  { LeaveCriticalSection( & m_handle ); }
+};
+
+ThreadLockWindows & ThreadLockWindows::singleton()
+{ static ThreadLockWindows the_lock ; return the_lock ; } // one function-local instance shared by all callers
+
+} // namespace <anonymous>
+} // namespace Impl
+} // namespace Kokkos
+
+//----------------------------------------------------------------------------
+//----------------------------------------------------------------------------
+
+namespace Kokkos {
+namespace Impl {
+
+// Spawn this thread
+
+bool ThreadsExec::spawn()
+{ // Returns true when the thread was started, the same convention as the pthread implementation.
+  unsigned Win32ThreadID = 0 ;
+  // _beginthreadex returns a uintptr_t that is 0 on failure; it has to be cast to HANDLE.
+  const HANDLE handle = reinterpret_cast<HANDLE>(
+    _beginthreadex(0,0,internal_winthread_driver,0,0, & Win32ThreadID ) );
+  if ( handle ) { CloseHandle( handle ); } // the thread keeps running; only this reference is released
+  return 0 != handle ; // '! handle' would report success only when thread creation failed
+}
+
+bool ThreadsExec::is_process() { return true ; } // NOTE: unlike the pthread build, no thread-identity test is made here
+
+void ThreadsExec::global_lock()
+{ ThreadLockWindows::singleton().lock(); } // enters the critical section owned by the shared ThreadLockWindows
+
+void ThreadsExec::global_unlock()
+{ ThreadLockWindows::singleton().unlock(); } // leaves the critical section owned by the shared ThreadLockWindows
+
+void ThreadsExec::wait_yield( volatile int & flag , const int value )
+{ // The function body must follow the parameter list directly (no empty '{}' in between).
+  while ( value == flag ) { Sleep(0); } // give up the time slice for as long as 'flag' still equals 'value'
+}
+
+} // namespace Impl
+} // namespace Kokkos
+
+#endif /* end #elif defined( KOKKOS_HAVE_WINTHREAD ) */
+//----------------------------------------------------------------------------
+//----------------------------------------------------------------------------
+
+
+
diff --git a/lib/kokkos/core/src/Threads/Kokkos_ThreadsTeam.hpp b/lib/kokkos/core/src/Threads/Kokkos_ThreadsTeam.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..3407ffaa54149499d5046ae887a3b415627287b6
--- /dev/null
+++ b/lib/kokkos/core/src/Threads/Kokkos_ThreadsTeam.hpp
@@ -0,0 +1,932 @@
+/*
+//@HEADER
+// ************************************************************************
+// 
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+// 
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+// 
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact  H. Carter Edwards (hcedwar@sandia.gov)
+// 
+// ************************************************************************
+//@HEADER
+*/
+
+#ifndef KOKKOS_THREADSTEAM_HPP
+#define KOKKOS_THREADSTEAM_HPP
+
+#include <stdio.h>
+
+#include <utility>
+#include <impl/Kokkos_spinwait.hpp>
+#include <impl/Kokkos_FunctorAdapter.hpp>
+
+#include <Kokkos_Atomic.hpp>
+
+//----------------------------------------------------------------------------
+
+namespace Kokkos {
+namespace Impl {
+
+//----------------------------------------------------------------------------
+
+template< class > struct ThreadsExecAdapter ;
+
+//----------------------------------------------------------------------------
+
+class ThreadsExecTeamMember {
+private:
+
+  enum { TEAM_REDUCE_SIZE = 512 };
+
+  typedef Kokkos::Threads execution_space ;
+  typedef execution_space::scratch_memory_space space ;
+
+  ThreadsExec * const   m_exec ;
+  ThreadsExec * const * m_team_base ; ///< Base for team fan-in
+  space                 m_team_shared ;
+  int                   m_team_shared_size ;
+  int                   m_team_size ;
+  int                   m_team_rank ;
+  int                   m_team_rank_rev ;
+  int                   m_league_size ;
+  int                   m_league_end ;
+  int                   m_league_rank ;
+
+  int                   m_chunk_size;
+  int                   m_league_chunk_end;
+
+  int                   m_invalid_thread;
+  int                   m_team_alloc;
+
+  // Re-construct m_team_shared in place over the scratch memory of the team
+  // base thread (*m_team_base), starting TEAM_REDUCE_SIZE bytes in and
+  // passing m_team_shared_size as the size argument.
+  inline
+  void set_team_shared()
+    {
+      char * const scratch_begin = (char *) (*m_team_base)->scratch_memory() ;
+      new( & m_team_shared ) space( scratch_begin + TEAM_REDUCE_SIZE , m_team_shared_size );
+    }
+  
+public:
+
+  // Fan-in half of the team barrier.
+  // First spinwait on the state of every fan-in partner against Active.
+  // A non-root thread then sets its own state to Rendezvous and spinwaits
+  // on that state until the matching fan-out changes it.
+  // Returns true only on the root (m_team_rank_rev == 0), which does not
+  // wait for a release; all other threads return false.
+  KOKKOS_INLINE_FUNCTION bool team_fan_in() const
+    {
+      const bool is_root = ( 0 == m_team_rank_rev );
+
+      // Partners sit at power-of-two offsets above this thread; stop at the
+      // lowest set bit of m_team_rank_rev or at the end of the team.
+      for ( int stride = 1 ; 0 == ( m_team_rank_rev & stride ) ; stride <<= 1 ) {
+        const int partner = m_team_rank_rev + stride ;
+        if ( m_team_size <= partner ) break ;
+        Impl::spinwait( m_team_base[partner]->state() , ThreadsExec::Active );
+      }
+
+      // If not root then wait for release
+      if ( ! is_root ) {
+        m_exec->state() = ThreadsExec::Rendezvous ;
+        Impl::spinwait( m_exec->state() , ThreadsExec::Rendezvous );
+      }
+
+      return is_root ;
+    }
+
+  // Fan-out half of the team barrier: set the state of every fan-in partner
+  // (the same power-of-two offsets visited by team_fan_in) to Active.
+  KOKKOS_INLINE_FUNCTION void team_fan_out() const
+    {
+      for ( int stride = 1 ; 0 == ( m_team_rank_rev & stride ) ; stride <<= 1 ) {
+        const int partner = m_team_rank_rev + stride ;
+        if ( m_team_size <= partner ) break ;
+        m_team_base[partner]->state() = ThreadsExec::Active ;
+      }
+    }
+
+public:
+
+  // Number of bytes at the start of each thread's scratch memory that the
+  // team reduce / scan / broadcast operations use (TEAM_REDUCE_SIZE).
+  KOKKOS_INLINE_FUNCTION static int team_reduce_size()
+    {
+      return TEAM_REDUCE_SIZE ;
+    }
+
+  // Team scratch space: m_team_shared after set_team_thread_mode(0,1,0).
+  KOKKOS_INLINE_FUNCTION
+  const execution_space::scratch_memory_space & team_shmem() const
+    {
+      return m_team_shared.set_team_thread_mode( 0 , 1 , 0 );
+    }
+
+  // Team scratch space; the level argument is ignored and the result is the
+  // same as team_shmem(): m_team_shared after set_team_thread_mode(0,1,0).
+  KOKKOS_INLINE_FUNCTION
+  const execution_space::scratch_memory_space & team_scratch(int) const
+    {
+      return m_team_shared.set_team_thread_mode( 0 , 1 , 0 );
+    }
+
+  // Per-thread scratch space; the level argument is ignored.  Returns
+  // m_team_shared after set_team_thread_mode(0, team_size(), team_rank()).
+  KOKKOS_INLINE_FUNCTION
+  const execution_space::scratch_memory_space & thread_scratch(int) const
+    {
+      return m_team_shared.set_team_thread_mode( 0 , team_size() , team_rank() );
+    }
+
+  // Plain accessors for the league / team coordinates of this member.
+  KOKKOS_INLINE_FUNCTION int league_rank() const
+    { return m_league_rank ; }
+
+  KOKKOS_INLINE_FUNCTION int league_size() const
+    { return m_league_size ; }
+
+  KOKKOS_INLINE_FUNCTION int team_rank() const
+    { return m_team_rank ; }
+
+  KOKKOS_INLINE_FUNCTION int team_size() const
+    { return m_team_size ; }
+
+  // Team barrier: a fan-in immediately followed by the matching fan-out.
+  // The fan-in's return value is not needed here.
+  KOKKOS_INLINE_FUNCTION void team_barrier() const
+    {
+      (void) team_fan_in();
+      team_fan_out();
+    }
+
+  // Broadcast 'value' from the member whose team_rank() equals 'thread_id'
+  // to all members, staging it in the scratch memory of m_team_base[0].
+  // Does nothing when m_team_base is null, or when not compiling for the
+  // host memory space.
+  template<class ValueType>
+  KOKKOS_INLINE_FUNCTION
+  void team_broadcast(ValueType& value, const int& thread_id) const
+  {
+#if defined( KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST )
+    // Make sure there is enough scratch space: 'type' is selected by if_c
+    // from ValueType and void based on the size test below.
+    typedef typename if_c< sizeof(ValueType) < TEAM_REDUCE_SIZE
+                         , ValueType , void >::type type ;
+
+    if ( 0 == m_team_base ) return ;
+
+    type * const staged = ((type*) m_team_base[0]->scratch_memory());
+
+    // Only the source member writes the staged value.
+    if ( thread_id == team_rank() ) { *staged = value ; }
+
+    memory_fence();
+    team_barrier();
+
+    // After the barrier every member reads the staged value back.
+    value = *staged ;
+#else
+    { }
+#endif
+  }
+
+  // Team-wide sum of 'value' using operator+= of the value type.
+  // Each member writes its contribution to the start of its own scratch
+  // memory.  The thread for which team_fan_in() returns true adds the
+  // contributions of m_team_base[1 .. m_team_size-1] into the value held in
+  // the scratch memory of m_team_base[0].  After team_fan_out() every member
+  // returns the value read through that same reference.
+  // Returns 'value' unchanged when m_exec is null, and Type() when not
+  // compiling for the host memory space.
+  template< typename Type >
+  KOKKOS_INLINE_FUNCTION Type team_reduce( const Type & value ) const
+#if ! defined( KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST )
+    { return Type(); }
+#else
+    {
+      // Make sure there is enough scratch space:
+      typedef typename if_c< sizeof(Type) < TEAM_REDUCE_SIZE , Type , void >::type type ;
+
+      // No pool thread attached: nothing to reduce with.
+      if ( 0 == m_exec ) return value ;
+
+      // Publish this thread's contribution in its own scratch memory.
+      *((volatile type*) m_exec->scratch_memory() ) = value ;
+
+      memory_fence();
+
+      // The team total is accumulated in the scratch memory of m_team_base[0].
+      type & accum = *((type *) m_team_base[0]->scratch_memory() );
+
+      if ( team_fan_in() ) {
+        // Only the thread that gets 'true' from the fan-in performs the sum.
+        for ( int i = 1 ; i < m_team_size ; ++i ) {
+          accum += *((type *) m_team_base[i]->scratch_memory() );
+        }
+        memory_fence();
+      }
+
+      team_fan_out();
+
+      return accum ;
+    }
+#endif
+
+  // Team-wide reduction of 'value' with a caller-supplied join operation.
+  // The preprocessor selects one of two signatures that share the body below:
+  //   - with KOKKOS_HAVE_CXX11: (ValueType, JoinOp), where the join is wrapped
+  //     in a JoinLambdaAdapter named 'op';
+  //   - otherwise: JoinOp must provide value_type and is used directly as 'op'.
+  // When not compiling for the host memory space the function only returns a
+  // value-initialized result.
+#ifdef KOKKOS_HAVE_CXX11
+  template< class ValueType, class JoinOp >
+  KOKKOS_INLINE_FUNCTION ValueType
+    team_reduce( const ValueType & value
+               , const JoinOp & op_in ) const
+  #if ! defined( KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST )
+    { return ValueType(); }
+  #else
+    {
+      typedef ValueType value_type;
+      const JoinLambdaAdapter<value_type,JoinOp> op(op_in);
+  #endif
+#else // KOKKOS_HAVE_CXX11
+  template< class JoinOp >
+  KOKKOS_INLINE_FUNCTION typename JoinOp::value_type
+    team_reduce( const typename JoinOp::value_type & value
+               , const JoinOp & op ) const
+  #if ! defined( KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST )
+    { return typename JoinOp::value_type(); }
+  #else
+    {
+      typedef typename JoinOp::value_type value_type;
+  #endif
+#endif // KOKKOS_HAVE_CXX11
+#if defined( KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST )
+      // Shared host-side body: continues the function opened by whichever
+      // signature was selected above.
+      // Make sure there is enough scratch space:
+      typedef typename if_c< sizeof(value_type) < TEAM_REDUCE_SIZE
+                           , value_type , void >::type type ;
+
+      // No pool thread attached: return the input unchanged.
+      if ( 0 == m_exec ) return value ;
+
+      type * const local_value = ((type*) m_exec->scratch_memory());
+
+      // Set this thread's contribution
+      *local_value = value ;
+
+      // Fence to make sure the base team member has access:
+      memory_fence();
+
+      if ( team_fan_in() ) {
+        // The last thread to synchronize returns true, all other threads wait for team_fan_out()
+        type * const team_value = ((type*) m_team_base[0]->scratch_memory());
+
+        // Join to the team value:
+        for ( int i = 1 ; i < m_team_size ; ++i ) {
+          op.join( *team_value , *((type*) m_team_base[i]->scratch_memory()) );
+        }
+
+        // Team base thread may "lap" member threads so copy out to their local value.
+        for ( int i = 1 ; i < m_team_size ; ++i ) {
+          *((type*) m_team_base[i]->scratch_memory()) = *team_value ;
+        }
+
+        // Fence to make sure all team members have access
+        memory_fence();
+      }
+
+      team_fan_out();
+
+      // Value was changed by the team base
+      // (read through a volatile pointer so the stored result is re-read).
+      return *((type volatile const *) local_value);
+    }
+#endif
+
+  /** \brief  Intra-team exclusive prefix sum with team_rank() ordering
+   *          with intra-team non-deterministic ordering accumulation.
+   *
+   *  The global inter-team accumulation value will, at the end of the
+   *  league's parallel execution, be the scan's total.
+   *  Parallel execution ordering of the league's teams is non-deterministic.
+   *  As such the base value for each team's scan operation is similarly
+   *  non-deterministic.
+   *
+   *  \param value         this member's contribution
+   *  \param global_accum  optional inter-team accumulator; when null the
+   *                       scan starts from zero
+   *  \return this member's scan value as read back from its own scratch
+   *          memory; type(0) when m_exec is null; ArgType() when not
+   *          compiling for the host memory space.
+   */
+  template< typename ArgType >
+  KOKKOS_INLINE_FUNCTION ArgType team_scan( const ArgType & value , ArgType * const global_accum ) const
+#if ! defined( KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST )
+    { return ArgType(); }
+#else
+    {
+      // Make sure there is enough scratch space:
+      typedef typename if_c< sizeof(ArgType) < TEAM_REDUCE_SIZE , ArgType , void >::type type ;
+
+      if ( 0 == m_exec ) return type(0);
+
+      // This thread's slot: holds the input before the scan and the result after.
+      volatile type * const work_value  = ((type*) m_exec->scratch_memory());
+
+      *work_value = value ;
+
+      memory_fence();
+
+      if ( team_fan_in() ) {
+        // The last thread to synchronize returns true, all other threads wait for team_fan_out()
+        // m_team_base[0]                 == highest ranking team member
+        // m_team_base[ m_team_size - 1 ] == lowest ranking team member
+        //
+        // 1) copy from lower to higher rank, initialize lowest rank to zero
+        // 2) prefix sum from lowest to highest rank, skipping lowest rank
+
+        type accum = 0 ;
+
+        if ( global_accum ) {
+          // Sum the whole team's contributions, add that total to
+          // *global_accum, and start this team's scan from the value that
+          // atomic_fetch_add returns (presumably the previous value of
+          // *global_accum -- confirm against Kokkos_Atomic.hpp).
+          for ( int i = m_team_size ; i-- ; ) {
+            type & val = *((type*) m_team_base[i]->scratch_memory());
+            accum += val ;
+          }
+          accum = atomic_fetch_add( global_accum , accum );
+        }
+
+        // Walk from index m_team_size-1 down to 0, replacing each
+        // contribution with the running sum that precedes it.
+        for ( int i = m_team_size ; i-- ; ) {
+          type & val = *((type*) m_team_base[i]->scratch_memory());
+          const type offset = accum ;
+          accum += val ;
+          val = offset ;
+        }
+
+        memory_fence();
+      }
+
+      team_fan_out();
+
+      return *work_value ;
+    }
+#endif
+
+  /** \brief  Intra-team exclusive prefix sum with team_rank() ordering.
+   *
+   *  Forwards to the two-argument team_scan with a null global accumulator.
+   *  The highest rank thread can compute the reduction total as
+   *    reduction_total = dev.team_scan( value ) + value ;
+   */
+  template< typename ArgType >
+  KOKKOS_INLINE_FUNCTION ArgType team_scan( const ArgType & value ) const
+    {
+      ArgType * const no_global_accum = 0 ;
+      return this-> template team_scan<ArgType>( value , no_global_accum );
+    }
+
+
+  //----------------------------------------
+  // Private for the driver
+
+  /** \brief  Set up the team member handle of pool thread 'exec' for 'team'.
+   *
+   *  \param exec         pool thread this handle belongs to
+   *  \param team         policy supplying league size, team size,
+   *                      team allocation and chunk size
+   *  \param shared_size  size stored in m_team_shared_size and later passed
+   *                      to the team scratch space
+   *
+   *  When team.league_size() is zero only the initializer list runs, so
+   *  every member must be initialized there.  That includes
+   *  m_invalid_thread, which valid_dynamic() and next_dynamic() read.
+   */
+  template< class ... Properties >
+  ThreadsExecTeamMember( Impl::ThreadsExec * exec
+                       , const TeamPolicyInternal< Kokkos::Threads , Properties ... > & team
+                       , const int shared_size )
+    : m_exec( exec )
+    , m_team_base(0)
+    , m_team_shared(0,0)
+    , m_team_shared_size( shared_size )
+    , m_team_size(team.team_size())
+    , m_team_rank(0)
+    , m_team_rank_rev(0)
+    , m_league_size(0)
+    , m_league_end(0)
+    , m_league_rank(0)
+    , m_chunk_size( team.chunk_size() )
+    , m_league_chunk_end(0)
+    , m_invalid_thread(0)
+    , m_team_alloc( team.team_alloc())
+   {
+      if ( team.league_size() ) {
+        // Execution is using device-team interface:
+
+        // Position of this thread counted from the end of the pool, split
+        // into a position within its team and the team's position in the pool.
+        const int pool_rank_rev = m_exec->pool_size() - ( m_exec->pool_rank() + 1 );
+        const int team_rank_rev = pool_rank_rev % team.team_alloc();
+        const size_t pool_league_size     = m_exec->pool_size() / team.team_alloc() ;
+        const size_t pool_league_rank_rev = pool_rank_rev / team.team_alloc() ;
+        const size_t pool_league_rank     = pool_league_size - ( pool_league_rank_rev + 1 );
+
+        const int pool_num_teams       = m_exec->pool_size()/team.team_alloc();
+
+        // Threads that fall outside the requested team size take no part.
+        if ((team.team_alloc()>m_team_size)?
+            (team_rank_rev >= m_team_size):
+            (m_exec->pool_size() - pool_num_teams*m_team_size > m_exec->pool_rank())
+           )
+          m_invalid_thread = 1;
+        else
+          m_invalid_thread = 0;
+
+        // May be using fewer threads per team than a multiple of threads per core,
+        // some threads will idle.
+
+        if ( team_rank_rev < team.team_size() && !m_invalid_thread) {
+
+          m_team_base        = m_exec->pool_base() + team.team_alloc() * pool_league_rank_rev ;
+          m_team_size        = team.team_size() ;
+          m_team_rank        = team.team_size() - ( team_rank_rev + 1 );
+          m_team_rank_rev    = team_rank_rev ;
+          m_league_size      = team.league_size();
+
+          // Contiguous slice [m_league_rank, m_league_end) of the league
+          // assigned to this team.
+          m_league_rank      = ( team.league_size() *  pool_league_rank    ) / pool_league_size ;
+          m_league_end       = ( team.league_size() * (pool_league_rank+1) ) / pool_league_size ;
+
+          set_team_shared();
+        }
+
+        // One valid thread per team (m_team_rank_rev == 0) registers the
+        // team's work range and resets its steal target.
+        if ( (m_team_rank_rev == 0) && (m_invalid_thread == 0) ) {
+          m_exec->set_work_range(m_league_rank,m_league_end,m_chunk_size);
+          m_exec->reset_steal_target(m_team_size);
+        }
+
+        // With a Dynamic schedule every thread calls the pool barrier here.
+        if(std::is_same<typename TeamPolicyInternal<Kokkos::Threads, Properties ...>::schedule_type::type,Kokkos::Dynamic>::value) {
+          m_exec->barrier();
+        }
+      }
+    }
+
+  // Default member: no pool thread and no team base (both null), a team of
+  // size 1 in a league of size 1, and every other field zero.
+  ThreadsExecTeamMember()
+    : m_exec( 0 )
+    , m_team_base( 0 )
+    , m_team_shared( 0 , 0 )
+    , m_team_shared_size( 0 )
+    , m_team_size( 1 )
+    , m_team_rank( 0 )
+    , m_team_rank_rev( 0 )
+    , m_league_size( 1 )
+    , m_league_end( 0 )
+    , m_league_rank( 0 )
+    , m_chunk_size( 0 )
+    , m_league_chunk_end( 0 )
+    , m_invalid_thread( 0 )
+    , m_team_alloc( 0 )
+    {
+    }
+
+  // The team base thread (*m_team_base[0]) when a team base is set,
+  // otherwise this member's own pool thread.
+  inline
+  ThreadsExec & threads_exec_team_base() const
+    {
+      if ( m_team_base ) { return **m_team_base ; }
+      return *m_exec ;
+    }
+
+  // True while the current league rank is still before m_league_end.
+  bool valid_static() const
+    {
+      return m_league_rank < m_league_end ;
+    }
+
+  // Advance to the next league rank.  While still inside the assigned range
+  // the team first synchronizes and re-creates its shared scratch space.
+  void next_static()
+    {
+      const bool in_range = m_league_rank < m_league_end ;
+
+      if ( in_range ) {
+        team_barrier();
+        set_team_shared();
+      }
+
+      ++m_league_rank ;
+    }
+
+  // Decide whether this member has another league rank to execute.
+  // Invalid threads never do.  If the current chunk is exhausted, the member
+  // with m_team_rank_rev == 0 asks the team base for a new work index, the
+  // team synchronizes, and every member derives its league rank and chunk end
+  // from that index.  Returns true when the resulting league rank is >= 0.
+  bool valid_dynamic() {
+
+    if ( m_invalid_thread ) { return false ; }
+
+    const bool chunk_has_work =
+      ( m_league_rank < m_league_chunk_end ) && ( m_league_rank < m_league_size );
+
+    if ( chunk_has_work ) { return true ; }
+
+    if ( 0 == m_team_rank_rev ) {
+      m_team_base[0]->get_work_index(m_team_alloc);
+    }
+    team_barrier();
+
+    const long work_index = m_team_base[0]->team_work_index();
+
+    m_league_rank      = work_index * m_chunk_size ;
+    m_league_chunk_end = ( work_index + 1 ) * m_chunk_size ;
+
+    // Clamp the chunk to the end of the league.
+    if ( m_league_size < m_league_chunk_end ) { m_league_chunk_end = m_league_size ; }
+
+    return m_league_rank >= 0 ;
+  }
+
+  // Advance to the next league rank of the current chunk; a no-op on invalid
+  // threads.  While still inside the chunk the team first synchronizes and
+  // re-creates its shared scratch space.
+  void next_dynamic() {
+    if ( m_invalid_thread ) { return ; }
+
+    const bool in_chunk = m_league_rank < m_league_chunk_end ;
+
+    if ( in_chunk ) {
+      team_barrier();
+      set_team_shared();
+    }
+
+    ++m_league_rank ;
+  }
+
+  // Overwrite the league rank, league size and shared scratch size, then
+  // re-create the team scratch space with the new size.
+  void set_league_shmem( const int arg_league_rank
+                       , const int arg_league_size
+                       , const int arg_shmem_size
+                       )
+    {
+      m_team_shared_size = arg_shmem_size ;
+      m_league_size      = arg_league_size ;
+      m_league_rank      = arg_league_rank ;
+
+      set_team_shared();
+    }
+};
+
+} /* namespace Impl */
+} /* namespace Kokkos */
+
+//----------------------------------------------------------------------------
+//----------------------------------------------------------------------------
+
+namespace Kokkos {
+namespace Impl {
+template< class ... Properties >
+class TeamPolicyInternal< Kokkos::Threads , Properties ... >: public PolicyTraits<Properties ...>
+{
+private:
+
+  int m_league_size ;
+  int m_team_size ;
+  int m_team_alloc ;
+  int m_team_iter ;
+
+  size_t m_team_scratch_size[2];
+  size_t m_thread_scratch_size[2];
+
+  int m_chunk_size;
+
+  // Derive league size, team size, team allocation, iterations per team and
+  // chunk size from the requested sizes and the thread pool sizes reported
+  // by the execution space.
+  inline
+  void init( const int league_size_request
+           , const int team_size_request )
+   {
+      const int pool_size  = traits::execution_space::thread_pool_size(0);
+      const int team_max   = traits::execution_space::thread_pool_size(1);
+      const int team_grain = traits::execution_space::thread_pool_size(2);
+
+      m_league_size = league_size_request ;
+
+      // The team size is the request capped at 'team_max'.
+      m_team_size = ( team_max <= team_size_request ) ? team_max : team_size_request ;
+
+      // Round team size up to a multiple of 'team_grain'
+      const int grains_per_team = ( m_team_size + team_grain - 1 ) / team_grain ;
+      const int team_size_grain = team_grain * grains_per_team ;
+      const int team_count      = pool_size / team_size_grain ;
+
+      // Constraint : pool_size = m_team_alloc * team_count
+      m_team_alloc = pool_size / team_count ;
+
+      // Maximum number of iterations each team will take:
+      m_team_iter  = ( m_league_size + team_count - 1 ) / team_count ;
+
+      set_auto_chunk_size();
+   }
+
+
+public:
+
+  //! Tag this class as a kokkos execution policy
+  typedef TeamPolicyInternal      execution_policy ;
+
+  typedef PolicyTraits<Properties ... > traits;
+
+  // Member-wise copy assignment, including both scratch levels.
+  TeamPolicyInternal& operator = (const TeamPolicyInternal& p) {
+    m_league_size = p.m_league_size;
+    m_team_size   = p.m_team_size;
+    m_team_alloc  = p.m_team_alloc;
+    m_team_iter   = p.m_team_iter;
+
+    for ( int level = 0 ; level < 2 ; ++level ) {
+      m_team_scratch_size[level]   = p.m_team_scratch_size[level];
+      m_thread_scratch_size[level] = p.m_thread_scratch_size[level];
+    }
+
+    m_chunk_size = p.m_chunk_size;
+
+    return *this;
+  }
+
+  //----------------------------------------
+
+  // Largest team size: thread_pool_size(1) of the execution space.
+  // The functor argument is not inspected.
+  template< class FunctorType >
+  inline static
+  int team_size_max( const FunctorType & )
+    {
+      return traits::execution_space::thread_pool_size(1);
+    }
+
+  // Recommended team size: thread_pool_size(2) of the execution space.
+  // The functor argument is not inspected.
+  template< class FunctorType >
+  static int team_size_recommended( const FunctorType & )
+    {
+      return traits::execution_space::thread_pool_size(2);
+    }
+
+
+  // Recommended team size: thread_pool_size(2) of the execution space.
+  // Neither the functor nor the integer argument is inspected.
+  template< class FunctorType >
+  inline static
+  int team_size_recommended( const FunctorType &, const int& )
+    {
+      return traits::execution_space::thread_pool_size(2);
+    }
+
+  //----------------------------------------
+
+  // Plain accessors for the sizes computed by init().
+  inline int team_size() const
+    { return m_team_size ; }
+
+  inline int team_alloc() const
+    { return m_team_alloc ; }
+
+  inline int league_size() const
+    { return m_league_size ; }
+  // Scratch size at 'level': the per-team size plus the per-thread size
+  // times the number of threads.  A negative team_size_ (the default)
+  // selects the policy's own team size.
+  inline size_t scratch_size(const int& level, int team_size_ = -1 ) const {
+    const int thread_count = ( team_size_ < 0 ) ? m_team_size : team_size_ ;
+    return m_team_scratch_size[level] + thread_count*m_thread_scratch_size[level] ;
+  }
+
+  // Number of iterations per team as computed by init().
+  inline int team_iter() const
+    {
+      return m_team_iter ;
+    }
+
+  /** \brief  Specify league size, request team size.
+   *
+   *  The execution space instance and the vector length request are unused.
+   */
+  TeamPolicyInternal( typename traits::execution_space &
+            , int league_size_request
+            , int team_size_request
+            , int vector_length_request = 1 )
+    : m_league_size( 0 )
+    , m_team_size( 0 )
+    , m_team_alloc( 0 )
+    , m_team_scratch_size { 0 , 0 }
+    , m_thread_scratch_size { 0 , 0 }
+    , m_chunk_size( 0 )
+    {
+      (void) vector_length_request ;
+      init( league_size_request , team_size_request );
+    }
+
+  /** \brief  Specify league size, automatic team size.
+   *
+   *  The team size request is thread_pool_size(2) of the execution space;
+   *  the execution space instance and the vector length request are unused.
+   */
+  TeamPolicyInternal( typename traits::execution_space &
+            , int league_size_request
+            , const Kokkos::AUTO_t & /* team_size_request */
+            , int /* vector_length_request */ = 1 )
+    : m_league_size( 0 )
+    , m_team_size( 0 )
+    , m_team_alloc( 0 )
+    , m_team_scratch_size { 0 , 0 }
+    , m_thread_scratch_size { 0 , 0 }
+    , m_chunk_size( 0 )
+    {
+      const int auto_team_size = traits::execution_space::thread_pool_size(2);
+      init( league_size_request , auto_team_size );
+    }
+
+  /** \brief  Specify league size, request team size; the vector length
+   *          request is unused. */
+  TeamPolicyInternal( int league_size_request
+            , int team_size_request
+            , int /* vector_length_request */ = 1 )
+    : m_league_size( 0 )
+    , m_team_size( 0 )
+    , m_team_alloc( 0 )
+    , m_team_scratch_size { 0 , 0 }
+    , m_thread_scratch_size { 0 , 0 }
+    , m_chunk_size( 0 )
+    {
+      init( league_size_request , team_size_request );
+    }
+
+  /** \brief  Specify league size, automatic team size.
+   *
+   *  The team size request is thread_pool_size(2) of the execution space;
+   *  the vector length request is unused.
+   */
+  TeamPolicyInternal( int league_size_request
+            , const Kokkos::AUTO_t & /* team_size_request */
+            , int /* vector_length_request */ = 1 )
+    : m_league_size( 0 )
+    , m_team_size( 0 )
+    , m_team_alloc( 0 )
+    , m_team_scratch_size { 0 , 0 }
+    , m_thread_scratch_size { 0 , 0 }
+    , m_chunk_size( 0 )
+    {
+      const int auto_team_size = traits::execution_space::thread_pool_size(2);
+      init( league_size_request , auto_team_size );
+    }
+
+  // Current chunk size of this policy.
+  inline int chunk_size() const
+    {
+      return m_chunk_size ;
+    }
+
+  /** \brief  Return a copy of this policy whose chunk size is chunk_size_;
+   *          this policy itself is not modified. */
+  inline TeamPolicyInternal set_chunk_size(typename traits::index_type chunk_size_) const {
+    TeamPolicyInternal modified( *this );
+    modified.m_chunk_size = chunk_size_;
+    return modified;
+  }
+
+  /** \brief  Return a copy of this policy with the per-team scratch size of
+   *          scratch level 'level' set to per_team.value. */
+  inline TeamPolicyInternal set_scratch_size(const int& level, const PerTeamValue& per_team) const {
+    TeamPolicyInternal modified( *this );
+    modified.m_team_scratch_size[level] = per_team.value;
+    return modified;
+  }
+
+  /** \brief  Return a copy of this policy with the per-thread scratch size of
+   *          scratch level 'level' set to per_thread.value. */
+  inline TeamPolicyInternal set_scratch_size(const int& level, const PerThreadValue& per_thread) const {
+    TeamPolicyInternal modified( *this );
+    modified.m_thread_scratch_size[level] = per_thread.value;
+    return modified;
+  }
+
+  /** \brief  Return a copy of this policy with both the per-team and the
+   *          per-thread scratch size of scratch level 'level' replaced. */
+  inline TeamPolicyInternal set_scratch_size(const int& level, const PerTeamValue& per_team, const PerThreadValue& per_thread) const {
+    TeamPolicyInternal modified( *this );
+    modified.m_team_scratch_size[level]   = per_team.value;
+    modified.m_thread_scratch_size[level] = per_thread.value;
+    return modified;
+  }
+
+private:
+  /** \brief  Choose a power-of-two chunk size from the league size and the
+   *          number of teams that fit in the thread pool.
+   *
+   *  Aborts if a positive chunk size is already set and is not a power of
+   *  two.  The computed value always replaces m_chunk_size.
+   */
+  inline void set_auto_chunk_size() {
+
+    const int teams_in_pool = traits::execution_space::thread_pool_size(0)/m_team_alloc;
+    const int concurrency   = ( 0 == teams_in_pool ) ? 1 : teams_in_pool ;
+
+    if ( ( m_chunk_size > 0 ) && ! Impl::is_integral_power_of_two( m_chunk_size ) ) {
+      Kokkos::abort("TeamPolicy blocking granularity must be power of two" );
+    }
+
+    // First attempt: double until 100 chunks per team cover the league.
+    int candidate = 1 ;
+    while ( candidate*100*concurrency < m_league_size ) {
+      candidate *= 2 ;
+    }
+
+    // A small result is recomputed with a factor of 40 and capped at 128.
+    if ( candidate < 128 ) {
+      candidate = 1 ;
+      while ( ( candidate*40*concurrency < m_league_size ) && ( candidate < 128 ) ) {
+        candidate *= 2 ;
+      }
+    }
+
+    m_chunk_size = candidate ;
+  }
+
+public:
+
+  typedef Impl::ThreadsExecTeamMember member_type ;
+
+  friend class Impl::ThreadsExecTeamMember ;
+};
+
+} /*namespace Impl */
+} /* namespace Kokkos */
+
+
+namespace Kokkos {
+
+// Build the team-thread range boundaries for 'thread' from a count.
+template<typename iType>
+KOKKOS_INLINE_FUNCTION
+Impl::TeamThreadRangeBoundariesStruct<iType,Impl::ThreadsExecTeamMember>
+TeamThreadRange(const Impl::ThreadsExecTeamMember& thread, const iType& count)
+{
+  typedef Impl::TeamThreadRangeBoundariesStruct<iType,Impl::ThreadsExecTeamMember> range_type ;
+  return range_type( thread , count );
+}
+
+// Build the team-thread range boundaries for 'thread' from begin and end.
+template<typename iType>
+KOKKOS_INLINE_FUNCTION
+Impl::TeamThreadRangeBoundariesStruct<iType,Impl::ThreadsExecTeamMember>
+TeamThreadRange( const Impl::ThreadsExecTeamMember& thread
+               , const iType & begin
+               , const iType & end
+               )
+{
+  typedef Impl::TeamThreadRangeBoundariesStruct<iType,Impl::ThreadsExecTeamMember> range_type ;
+  return range_type( thread , begin , end );
+}
+
+
+// Build the thread-vector range boundaries for 'thread' from a count.
+template<typename iType>
+KOKKOS_INLINE_FUNCTION
+Impl::ThreadVectorRangeBoundariesStruct<iType,Impl::ThreadsExecTeamMember >
+  ThreadVectorRange(const Impl::ThreadsExecTeamMember& thread, const iType& count) {
+  typedef Impl::ThreadVectorRangeBoundariesStruct<iType,Impl::ThreadsExecTeamMember > range_type ;
+  return range_type( thread , count );
+}
+
+
+// Wrap 'thread' in the tag type consumed by the per-team single() overloads.
+KOKKOS_INLINE_FUNCTION
+Impl::ThreadSingleStruct<Impl::ThreadsExecTeamMember> PerTeam(const Impl::ThreadsExecTeamMember& thread) {
+  typedef Impl::ThreadSingleStruct<Impl::ThreadsExecTeamMember> single_type ;
+  return single_type( thread );
+}
+
+// Wrap 'thread' in the tag type consumed by the per-thread single() overloads.
+KOKKOS_INLINE_FUNCTION
+Impl::VectorSingleStruct<Impl::ThreadsExecTeamMember> PerThread(const Impl::ThreadsExecTeamMember& thread) {
+  typedef Impl::VectorSingleStruct<Impl::ThreadsExecTeamMember> single_type ;
+  return single_type( thread );
+}
+} // namespace Kokkos
+
+namespace Kokkos {
+
+  /** \brief  Inter-thread parallel_for over a TeamThreadRange.
+   *
+   * The calling thread executes lambda(i) for i = start, start+increment, ...
+   * while i < end, as given by loop_boundaries.
+   * This functionality requires C++11 support.*/
+template<typename iType, class Lambda>
+KOKKOS_INLINE_FUNCTION
+void parallel_for(const Impl::TeamThreadRangeBoundariesStruct<iType,Impl::ThreadsExecTeamMember>& loop_boundaries, const Lambda& lambda) {
+  iType i = loop_boundaries.start ;
+  while ( i < loop_boundaries.end ) {
+    lambda(i);
+    i += loop_boundaries.increment ;
+  }
+}
+
+/** \brief  Inter-thread parallel_reduce (summation) over a TeamThreadRange.
+ *
+ * The calling thread executes lambda(i, tmp) over its part of the range with
+ * a value-initialized tmp and adds each tmp to result.  The per-thread results
+ * are then combined with team_reduce using Impl::JoinAdd and written back to
+ * result.  This functionality requires C++11 support.*/
+template< typename iType, class Lambda, typename ValueType >
+KOKKOS_INLINE_FUNCTION
+void parallel_reduce(const Impl::TeamThreadRangeBoundariesStruct<iType,Impl::ThreadsExecTeamMember>& loop_boundaries,
+                     const Lambda & lambda, ValueType& result) {
+
+  result = ValueType();
+
+  iType i = loop_boundaries.start ;
+  while ( i < loop_boundaries.end ) {
+    ValueType contribution = ValueType();
+    lambda(i,contribution);
+    result += contribution ;
+    i += loop_boundaries.increment ;
+  }
+
+  result = loop_boundaries.thread.team_reduce(result,Impl::JoinAdd<ValueType>());
+}
+
+#if defined( KOKKOS_HAVE_CXX11 )
+
+/** \brief  Inter-thread parallel_reduce over a TeamThreadRange with a custom join.
+ *
+ * The calling thread starts from a copy of init_result, executes
+ * lambda(i, tmp) over its part of the range with a value-initialized tmp, and
+ * folds each tmp in with join(result, tmp).  The per-thread results are then
+ * combined with team_reduce through an Impl::JoinLambdaAdapter around join and
+ * written back to init_result.  The input value of init_result should
+ * therefore be the neutral element of the join operation (e.g. '0 for +-' or
+ * '1 for *').  This functionality requires C++11 support.*/
+template< typename iType, class Lambda, typename ValueType, class JoinType >
+KOKKOS_INLINE_FUNCTION
+void parallel_reduce(const Impl::TeamThreadRangeBoundariesStruct<iType,Impl::ThreadsExecTeamMember>& loop_boundaries,
+                     const Lambda & lambda, const JoinType& join, ValueType& init_result) {
+
+  ValueType thread_result = init_result;
+
+  iType i = loop_boundaries.start ;
+  while ( i < loop_boundaries.end ) {
+    ValueType contribution = ValueType();
+    lambda(i,contribution);
+    join(thread_result,contribution);
+    i += loop_boundaries.increment ;
+  }
+
+  init_result = loop_boundaries.thread.team_reduce(thread_result,Impl::JoinLambdaAdapter<ValueType,JoinType>(join));
+}
+
+#endif /* #if defined( KOKKOS_HAVE_CXX11 ) */
+
+} //namespace Kokkos
+
+
+namespace Kokkos {
+/** \brief  Intra-thread vector parallel_for over a ThreadVectorRange.
+ *
+ * Executes lambda(i) for i = start, start+increment, ... while i < end, as
+ * given by loop_boundaries.  The loop is marked with '#pragma ivdep' when
+ * KOKKOS_HAVE_PRAGMA_IVDEP is defined.
+ * This functionality requires C++11 support.*/
+template<typename iType, class Lambda>
+KOKKOS_INLINE_FUNCTION
+void parallel_for(const Impl::ThreadVectorRangeBoundariesStruct<iType,Impl::ThreadsExecTeamMember >&
+    loop_boundaries, const Lambda& lambda) {
+#ifdef KOKKOS_HAVE_PRAGMA_IVDEP
+#pragma ivdep
+#endif
+  for( iType i = loop_boundaries.start; i < loop_boundaries.end; i+=loop_boundaries.increment) {
+    lambda(i);
+  }
+}
+
+/** \brief  Intra-thread vector parallel_reduce (summation) over a ThreadVectorRange.
+ *
+ * Resets result to a value-initialized ValueType, then executes
+ * lambda(i, tmp) over the range with a value-initialized tmp and adds each tmp
+ * to result.  No team-level reduction is performed.
+ * This functionality requires C++11 support.*/
+template< typename iType, class Lambda, typename ValueType >
+KOKKOS_INLINE_FUNCTION
+void parallel_reduce(const Impl::ThreadVectorRangeBoundariesStruct<iType,Impl::ThreadsExecTeamMember >&
+      loop_boundaries, const Lambda & lambda, ValueType& result) {
+
+  result = ValueType();
+
+#ifdef KOKKOS_HAVE_PRAGMA_IVDEP
+#pragma ivdep
+#endif
+  for( iType i = loop_boundaries.start; i < loop_boundaries.end; i+=loop_boundaries.increment) {
+    ValueType contribution = ValueType();
+    lambda(i,contribution);
+    result += contribution ;
+  }
+}
+
+/** \brief  Intra-thread vector parallel_reduce over a ThreadVectorRange with a custom join.
+ *
+ * Starts from a copy of init_result, executes lambda(i, tmp) over the range
+ * with a value-initialized tmp, folds each tmp in with join(result, tmp), and
+ * stores the outcome back into init_result.  The input value of init_result
+ * should therefore be the neutral element of the join operation
+ * (e.g. '0 for +-' or '1 for *').  No team-level reduction is performed.
+ * This functionality requires C++11 support.*/
+template< typename iType, class Lambda, typename ValueType, class JoinType >
+KOKKOS_INLINE_FUNCTION
+void parallel_reduce(const Impl::ThreadVectorRangeBoundariesStruct<iType,Impl::ThreadsExecTeamMember >&
+      loop_boundaries, const Lambda & lambda, const JoinType& join, ValueType& init_result) {
+
+  ValueType accumulated = init_result;
+
+#ifdef KOKKOS_HAVE_PRAGMA_IVDEP
+#pragma ivdep
+#endif
+  for( iType i = loop_boundaries.start; i < loop_boundaries.end; i+=loop_boundaries.increment) {
+    ValueType contribution = ValueType();
+    lambda(i,contribution);
+    join(accumulated,contribution);
+  }
+
+  init_result = accumulated;
+}
+
+/** \brief  Intra-thread vector parallel exclusive prefix sum over a ThreadVectorRange.
+ *
+ * Executes lambda(i, val, final) once for each i in the range, always with
+ * final == true.  val starts as a value-initialized value_type (taken from
+ * FunctorValueTraits of the functor) and is passed by reference from one call
+ * to the next, so the functor must add the contribution of "i" to val.
+ * The running value is local to this function and is not returned.
+ * This functionality requires C++11 support.*/
+template< typename iType, class FunctorType >
+KOKKOS_INLINE_FUNCTION
+void parallel_scan(const Impl::ThreadVectorRangeBoundariesStruct<iType,Impl::ThreadsExecTeamMember >&
+      loop_boundaries, const FunctorType & lambda) {
+
+  typedef typename Kokkos::Impl::FunctorValueTraits< FunctorType , void >::value_type value_type ;
+
+  value_type running = value_type();
+
+#ifdef KOKKOS_HAVE_PRAGMA_IVDEP
+#pragma ivdep
+#endif
+  for( iType i = loop_boundaries.start; i < loop_boundaries.end; i+=loop_boundaries.increment) {
+    lambda(i,running,true);
+  }
+}
+
+} // namespace Kokkos
+
+namespace Kokkos {
+
+// Per-thread single: the calling thread always executes lambda().
+template<class FunctorType>
+KOKKOS_INLINE_FUNCTION
+void single(const Impl::VectorSingleStruct<Impl::ThreadsExecTeamMember>& single_struct, const FunctorType& lambda) {
+  (void) single_struct ;
+  lambda();
+}
+
+// Per-team single: only the member with team_rank() == 0 executes lambda().
+template<class FunctorType>
+KOKKOS_INLINE_FUNCTION
+void single(const Impl::ThreadSingleStruct<Impl::ThreadsExecTeamMember>& single_struct, const FunctorType& lambda) {
+  const bool is_rank_zero = ( 0 == single_struct.team_member.team_rank() );
+  if ( is_rank_zero ) {
+    lambda();
+  }
+}
+
+// Per-thread single with a value: the calling thread always executes lambda(val).
+template<class FunctorType, class ValueType>
+KOKKOS_INLINE_FUNCTION
+void single(const Impl::VectorSingleStruct<Impl::ThreadsExecTeamMember>& single_struct, const FunctorType& lambda, ValueType& val) {
+  (void) single_struct ;
+  lambda(val);
+}
+
+// Per-team single with a value: only the member with team_rank() == 0
+// executes lambda(val); every member then calls team_broadcast(val, 0).
+template<class FunctorType, class ValueType>
+KOKKOS_INLINE_FUNCTION
+void single(const Impl::ThreadSingleStruct<Impl::ThreadsExecTeamMember>& single_struct, const FunctorType& lambda, ValueType& val) {
+  const bool is_rank_zero = ( 0 == single_struct.team_member.team_rank() );
+
+  if ( is_rank_zero ) {
+    lambda(val);
+  }
+
+  single_struct.team_member.team_broadcast(val,0);
+}
+}
+
+//----------------------------------------------------------------------------
+//----------------------------------------------------------------------------
+
+#endif /* #define KOKKOS_THREADSTEAM_HPP */
+
diff --git a/lib/kokkos/core/src/Threads/Kokkos_Threads_Parallel.hpp b/lib/kokkos/core/src/Threads/Kokkos_Threads_Parallel.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..1aba00c94b0bffd95f1e09acc22337b96c87eedb
--- /dev/null
+++ b/lib/kokkos/core/src/Threads/Kokkos_Threads_Parallel.hpp
@@ -0,0 +1,658 @@
+/*
+//@HEADER
+// ************************************************************************
+// 
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+// 
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+// 
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact  H. Carter Edwards (hcedwar@sandia.gov)
+// 
+// ************************************************************************
+//@HEADER
+*/
+
+#ifndef KOKKOS_THREADS_PARALLEL_HPP
+#define KOKKOS_THREADS_PARALLEL_HPP
+
+#include <vector>
+#include <iostream> 
+
+#include <Kokkos_Parallel.hpp>
+
+#include <impl/Kokkos_StaticAssert.hpp>
+#include <impl/Kokkos_FunctorAdapter.hpp>
+
+//----------------------------------------------------------------------------
+
+namespace Kokkos {
+namespace Impl {
+
+//----------------------------------------------------------------------------
+//----------------------------------------------------------------------------
+/* ParallelFor Kokkos::Threads with RangePolicy: exec() is handed to ThreadsExec::start and walks a WorkRange per calling thread. */
+
+template< class FunctorType , class ... Traits >
+class ParallelFor< FunctorType
+                 , Kokkos::RangePolicy< Traits ... >
+                 , Kokkos::Threads
+                 >
+{
+private:
+
+  typedef Kokkos::RangePolicy< Traits ... > Policy ;
+  typedef typename Policy::work_tag    WorkTag ;
+  typedef typename Policy::WorkRange   WorkRange ;
+  typedef typename Policy::member_type Member ;
+
+  const FunctorType  m_functor ; // copy of the functor passed to the constructor
+  const Policy       m_policy ; // copy of the policy passed to the constructor
+  // Untagged overload (enabled when TagType is void): calls functor( i ) for each i in [ibeg,iend).
+  template< class TagType >
+  inline static
+  typename std::enable_if< std::is_same< TagType , void >::value >::type
+  exec_range( const FunctorType & functor
+            , const Member ibeg , const Member iend )
+    {
+      #if defined( KOKKOS_OPT_RANGE_AGGRESSIVE_VECTORIZATION ) && \
+          defined( KOKKOS_HAVE_PRAGMA_IVDEP )
+      #pragma ivdep
+      #endif
+      for ( Member i = ibeg ; i < iend ; ++i ) {
+        functor( i );
+      }
+    }
+  // Tagged overload (enabled when TagType is not void): calls functor( t , i ) with a default-constructed tag t.
+  template< class TagType >
+  inline static
+  typename std::enable_if< ! std::is_same< TagType , void >::value >::type
+  exec_range( const FunctorType & functor
+            , const Member ibeg , const Member iend )
+    {
+      const TagType t{} ;
+      #if defined( KOKKOS_OPT_RANGE_AGGRESSIVE_VECTORIZATION ) && \
+          defined( KOKKOS_HAVE_PRAGMA_IVDEP )
+      #pragma ivdep
+      #endif
+      for ( Member i = ibeg ; i < iend ; ++i ) {
+        functor( t , i );
+      }
+    }
+  // Callback given to ThreadsExec::start; arg is the ParallelFor object. Forwards to the overload for the policy's schedule type.
+  static void exec( ThreadsExec & exec , const void * arg )
+  {
+    exec_schedule<typename Policy::schedule_type::type>(exec,arg);
+  }
+  // Kokkos::Static schedule: one pass over the WorkRange built from this thread's pool rank, then fan_in().
+  template<class Schedule>
+  static
+  typename std::enable_if< std::is_same<Schedule,Kokkos::Static>::value >::type
+  exec_schedule( ThreadsExec & exec , const void * arg )
+  {
+    const ParallelFor & self = * ((const ParallelFor *) arg );
+
+    WorkRange range( self.m_policy , exec.pool_rank() , exec.pool_size() );
+
+    ParallelFor::template exec_range< WorkTag >
+      ( self.m_functor , range.begin() , range.end() );
+
+    exec.fan_in();
+  }
+  // Kokkos::Dynamic schedule: registers this thread's WorkRange with exec, then processes chunks until get_work_index() returns -1.
+  template<class Schedule>
+  static
+  typename std::enable_if< std::is_same<Schedule,Kokkos::Dynamic>::value >::type
+  exec_schedule( ThreadsExec & exec , const void * arg )
+  {
+    const ParallelFor & self = * ((const ParallelFor *) arg );
+
+    WorkRange range( self.m_policy , exec.pool_rank() , exec.pool_size() );
+
+    exec.set_work_range(range.begin(),range.end(),self.m_policy.chunk_size());
+    exec.reset_steal_target();
+    exec.barrier();
+
+    long work_index = exec.get_work_index();
+    // Each work index names one chunk of chunk_size() iterations; the chunk's end is clamped to m_policy.end().
+    while(work_index != -1) {
+      const Member begin = static_cast<Member>(work_index) * self.m_policy.chunk_size();
+      const Member end = begin + self.m_policy.chunk_size() < self.m_policy.end()?begin+self.m_policy.chunk_size():self.m_policy.end();
+      // NOTE(review): begin does not add m_policy.begin() - confirm get_work_index() yields absolute chunk indices.
+      ParallelFor::template exec_range< WorkTag >
+        ( self.m_functor , begin , end );
+      work_index = exec.get_work_index();
+    }
+
+    exec.fan_in();
+  }
+
+public:
+  // Passes exec and this object to ThreadsExec::start, then calls ThreadsExec::fence() before returning.
+  inline
+  void execute() const
+    {
+      ThreadsExec::start( & ParallelFor::exec , this );
+      ThreadsExec::fence();
+    }
+  // Stores copies of the functor and the policy; no work is started here.
+  ParallelFor( const FunctorType & arg_functor
+             , const Policy      & arg_policy )
+    : m_functor( arg_functor )
+    , m_policy( arg_policy )
+    {}
+};
+
+//----------------------------------------------------------------------------
+/* ParallelFor Kokkos::Threads with TeamPolicy: exec() builds a team Member per calling thread and loops the functor over it. */
+
+template< class FunctorType , class ... Properties >
+class ParallelFor< FunctorType
+                 , Kokkos::TeamPolicy< Properties ... >
+                 , Kokkos::Threads
+                 >
+{
+private:
+
+  typedef Kokkos::Impl::TeamPolicyInternal< Kokkos::Threads, Properties ... >  Policy ;
+  typedef typename Policy::work_tag                    WorkTag ;
+  typedef typename Policy::member_type                 Member ;
+
+  const FunctorType  m_functor ; // copy of the functor passed to the constructor
+  const Policy       m_policy ; // copy of the policy passed to the constructor
+  const int          m_shared ; // scratch_size(0) + scratch_size(1) + FunctorTeamShmemSize value, computed in the constructor
+  // Untagged, Kokkos::Static: calls functor( member ) while member.valid_static(), advancing with next_static().
+  template< class TagType , class Schedule>
+  inline static
+  typename std::enable_if< std::is_same< TagType , void >::value
+  && std::is_same<Schedule,Kokkos::Static>::value >::type
+  exec_team( const FunctorType & functor , Member member )
+    {
+      for ( ; member.valid_static() ; member.next_static() ) {
+        functor( member );
+      }
+    }
+  // Tagged, Kokkos::Static: same loop, calling functor( t , member ) with a default-constructed tag t.
+  template< class TagType , class Schedule>
+  inline static
+  typename std::enable_if< ! std::is_same< TagType , void >::value
+  && std::is_same<Schedule,Kokkos::Static>::value >::type
+  exec_team( const FunctorType & functor , Member member )
+    {
+      const TagType t{} ;
+      for ( ; member.valid_static() ; member.next_static() ) {
+        functor( t , member );
+      }
+    }
+  // Untagged, Kokkos::Dynamic: calls functor( member ) while member.valid_dynamic(), advancing with next_dynamic().
+  template< class TagType , class Schedule>
+  inline static
+  typename std::enable_if< std::is_same< TagType , void >::value
+  && std::is_same<Schedule,Kokkos::Dynamic>::value >::type
+  exec_team( const FunctorType & functor , Member member )
+    {
+      // member is a by-value copy, so advancing it here does not affect the caller's object.
+      for ( ; member.valid_dynamic() ; member.next_dynamic() ) {
+        functor( member );
+      }
+    }
+  // Tagged, Kokkos::Dynamic: same loop, calling functor( t , member ) with a default-constructed tag t.
+  template< class TagType , class Schedule>
+  inline static
+  typename std::enable_if< ! std::is_same< TagType , void >::value
+                          && std::is_same<Schedule,Kokkos::Dynamic>::value >::type
+  exec_team( const FunctorType & functor , Member member )
+    {
+      const TagType t{} ;
+      for ( ; member.valid_dynamic() ; member.next_dynamic() ) {
+        functor( t , member );
+      }
+    }
+  // Callback given to ThreadsExec::start; arg is the ParallelFor object.
+  static void exec( ThreadsExec & exec , const void * arg )
+  {
+    const ParallelFor & self = * ((const ParallelFor *) arg );
+    // The Member is constructed from this thread's exec, the policy and the shared-memory size m_shared.
+    ParallelFor::exec_team< WorkTag , typename Policy::schedule_type::type >
+      ( self.m_functor , Member( & exec , self.m_policy , self.m_shared ) );
+
+    exec.barrier();
+    exec.fan_in();
+  }
+
+public:
+  // Resizes the scratch (second argument: team_reduce_size() + m_shared), starts exec, then calls ThreadsExec::fence().
+  inline
+  void execute() const
+    {
+      ThreadsExec::resize_scratch( 0 , Policy::member_type::team_reduce_size() + m_shared );
+
+      ThreadsExec::start( & ParallelFor::exec , this );
+
+      ThreadsExec::fence();
+    }
+  // Stores copies of the functor and the policy and precomputes m_shared; no work is started here.
+  ParallelFor( const FunctorType & arg_functor
+             , const Policy      & arg_policy )
+    : m_functor( arg_functor )
+    , m_policy(  arg_policy )
+    , m_shared( arg_policy.scratch_size(0) + arg_policy.scratch_size(1) + FunctorTeamShmemSize< FunctorType >::value( arg_functor , arg_policy.team_size() ) )
+    { }
+};
+
+//----------------------------------------------------------------------------
+//----------------------------------------------------------------------------
+/* ParallelReduce with Kokkos::Threads and RangePolicy: per-thread partial results are combined through fan_in_reduce(). */
+
+template< class FunctorType , class ReducerType, class ... Traits >
+class ParallelReduce< FunctorType
+                    , Kokkos::RangePolicy< Traits ... >
+                    , ReducerType
+                    , Kokkos::Threads
+                    >
+{
+private:
+
+  typedef Kokkos::RangePolicy< Traits ... > Policy ;
+
+  typedef typename Policy::work_tag    WorkTag ;
+  typedef typename Policy::WorkRange   WorkRange ;
+  typedef typename Policy::member_type Member ;
+  // Selects FunctorType when ReducerType is InvalidType, else ReducerType (assuming if_c behaves like std::conditional - confirm).
+  typedef Kokkos::Impl::if_c< std::is_same<InvalidType,ReducerType>::value, FunctorType, ReducerType> ReducerConditional;
+  typedef typename ReducerConditional::type ReducerTypeFwd;
+
+  typedef Kokkos::Impl::FunctorValueTraits< ReducerTypeFwd, WorkTag > ValueTraits ;
+  typedef Kokkos::Impl::FunctorValueInit<   ReducerTypeFwd, WorkTag > ValueInit ;
+
+  typedef typename ValueTraits::pointer_type    pointer_type ;
+  typedef typename ValueTraits::reference_type  reference_type ;
+
+  const FunctorType  m_functor ; // copy of the functor passed to the constructor
+  const Policy       m_policy ; // copy of the policy passed to the constructor
+  const ReducerType   m_reducer ; // the reducer argument, or InvalidType() when constructed from a result view
+  const pointer_type m_result_ptr ; // destination for the final values; execute() skips the copy when this is null
+  // Untagged overload (enabled when TagType is void): calls functor( i , update ) for each i in [ibeg,iend).
+  template< class TagType >
+  inline static
+  typename std::enable_if< std::is_same< TagType , void >::value >::type
+  exec_range( const FunctorType & functor
+            , const Member & ibeg , const Member & iend
+            , reference_type update )
+    {
+      #if defined( KOKKOS_OPT_RANGE_AGGRESSIVE_VECTORIZATION ) && \
+          defined( KOKKOS_HAVE_PRAGMA_IVDEP )
+      #pragma ivdep
+      #endif
+      for ( Member i = ibeg ; i < iend ; ++i ) {
+        functor( i , update );
+      }
+    }
+  // Tagged overload (enabled when TagType is not void): calls functor( t , i , update ) with a default-constructed tag t.
+  template< class TagType >
+  inline static
+  typename std::enable_if< ! std::is_same< TagType , void >::value >::type
+  exec_range( const FunctorType & functor
+            , const Member & ibeg , const Member & iend
+            , reference_type update )
+    {
+      const TagType t{} ;
+      #if defined( KOKKOS_OPT_RANGE_AGGRESSIVE_VECTORIZATION ) && \
+          defined( KOKKOS_HAVE_PRAGMA_IVDEP )
+      #pragma ivdep
+      #endif
+      for ( Member i = ibeg ; i < iend ; ++i ) {
+        functor( t , i , update );
+      }
+    }
+  // Callback given to ThreadsExec::start; arg is the ParallelReduce object. Forwards to the overload for the schedule type.
+  static void
+  exec( ThreadsExec & exec , const void * arg ) {
+    exec_schedule<typename Policy::schedule_type::type>(exec, arg);
+  }
+  // Kokkos::Static schedule: reduces this thread's WorkRange into the value set up in exec.reduce_memory(), then fan_in_reduce().
+  template<class Schedule>
+  static
+  typename std::enable_if< std::is_same<Schedule,Kokkos::Static>::value >::type
+  exec_schedule( ThreadsExec & exec , const void * arg )
+  {
+    const ParallelReduce & self = * ((const ParallelReduce *) arg );
+    const WorkRange range( self.m_policy, exec.pool_rank(), exec.pool_size() );
+
+    ParallelReduce::template exec_range< WorkTag >
+      ( self.m_functor , range.begin() , range.end() 
+      , ValueInit::init( ReducerConditional::select(self.m_functor , self.m_reducer) , exec.reduce_memory() ) );
+
+    exec.template fan_in_reduce< ReducerTypeFwd , WorkTag >( ReducerConditional::select(self.m_functor , self.m_reducer) );
+  }
+  // Kokkos::Dynamic schedule: same reduction, but chunks are pulled from get_work_index() until it returns -1.
+  template<class Schedule>
+  static
+  typename std::enable_if< std::is_same<Schedule,Kokkos::Dynamic>::value >::type
+    exec_schedule( ThreadsExec & exec , const void * arg )
+  {
+    const ParallelReduce & self = * ((const ParallelReduce *) arg );
+    const WorkRange range( self.m_policy, exec.pool_rank(), exec.pool_size() );
+
+    exec.set_work_range(range.begin(),range.end(),self.m_policy.chunk_size());
+    exec.reset_steal_target();
+    exec.barrier();
+    // NOTE(review): begin below does not add m_policy.begin() - confirm get_work_index() yields absolute chunk indices.
+    long work_index = exec.get_work_index();
+    reference_type update = ValueInit::init( ReducerConditional::select(self.m_functor , self.m_reducer) , exec.reduce_memory() );
+    while(work_index != -1) {
+      const Member begin = static_cast<Member>(work_index) * self.m_policy.chunk_size();
+      const Member end = begin + self.m_policy.chunk_size() < self.m_policy.end()?begin+self.m_policy.chunk_size():self.m_policy.end();
+      ParallelReduce::template exec_range< WorkTag >
+        ( self.m_functor , begin , end
+        , update );
+      work_index = exec.get_work_index();
+    }
+
+    exec.template fan_in_reduce< ReducerTypeFwd , WorkTag >( ReducerConditional::select(self.m_functor , self.m_reducer) );
+  }
+
+public:
+  // Resizes the reduce scratch to value_size(), runs exec, fences, then copies value_count() entries to m_result_ptr if set.
+  inline
+  void execute() const
+    {
+      ThreadsExec::resize_scratch( ValueTraits::value_size( ReducerConditional::select(m_functor , m_reducer) ) , 0 );
+
+      ThreadsExec::start( & ParallelReduce::exec , this );
+
+      ThreadsExec::fence();
+
+      if ( m_result_ptr ) {
+        // The final values are read from ThreadsExec::root_reduce_scratch().
+        const pointer_type data =
+          (pointer_type) ThreadsExec::root_reduce_scratch();
+
+        const unsigned n = ValueTraits::value_count( ReducerConditional::select(m_functor , m_reducer) );
+        for ( unsigned i = 0 ; i < n ; ++i ) { m_result_ptr[i] = data[i]; }
+      }
+    }
+  // View-result constructor: enabled only for a View argument when ReducerType is not a reducer type.
+  template< class HostViewType >
+  ParallelReduce( const FunctorType  & arg_functor ,
+                  const Policy       & arg_policy ,
+                  const HostViewType & arg_result_view ,
+                  typename std::enable_if<
+                               Kokkos::is_view< HostViewType >::value &&
+                              !Kokkos::is_reducer_type<ReducerType>::value
+                  ,void*>::type = NULL)
+    : m_functor( arg_functor )
+    , m_policy( arg_policy )
+    , m_reducer( InvalidType() )
+    , m_result_ptr( arg_result_view.ptr_on_device() )
+    {
+      static_assert( Kokkos::is_view< HostViewType >::value
+        , "Kokkos::Threads reduce result must be a View" );
+
+      static_assert( std::is_same< typename HostViewType::memory_space , HostSpace >::value
+        , "Kokkos::Threads reduce result must be a View in HostSpace" );
+    }
+  // Reducer constructor: the result pointer is taken from reducer.result_view().data().
+  inline
+  ParallelReduce( const FunctorType & arg_functor
+                , Policy       arg_policy
+                , const ReducerType& reducer )
+    : m_functor( arg_functor )
+    , m_policy(  arg_policy )
+    , m_reducer( reducer )
+    , m_result_ptr(  reducer.result_view().data() )
+    {
+      /*static_assert( std::is_same< typename ViewType::memory_space
+                                      , Kokkos::HostSpace >::value
+        , "Reduction result on Kokkos::OpenMP must be a Kokkos::View in HostSpace" );*/
+    }
+
+};
+
+//----------------------------------------------------------------------------
+/* ParallelReduce with Kokkos::Threads and TeamPolicy: team members reduce into per-thread memory combined by fan_in_reduce(). */
+
+template< class FunctorType , class ReducerType, class ... Properties >
+class ParallelReduce< FunctorType
+                    , Kokkos::TeamPolicy< Properties ... >
+                    , ReducerType
+                    , Kokkos::Threads
+                    >
+{
+private:
+
+  typedef Kokkos::Impl::TeamPolicyInternal< Kokkos::Threads, Properties ... >              Policy ;
+  typedef typename Policy::work_tag                                WorkTag ;
+  typedef typename Policy::member_type                             Member ;
+  // Selects FunctorType when ReducerType is InvalidType, else ReducerType (assuming if_c behaves like std::conditional - confirm).
+  typedef Kokkos::Impl::if_c< std::is_same<InvalidType,ReducerType>::value, FunctorType, ReducerType> ReducerConditional;
+  typedef typename ReducerConditional::type ReducerTypeFwd;
+
+  typedef Kokkos::Impl::FunctorValueTraits< ReducerTypeFwd, WorkTag > ValueTraits ;
+  typedef Kokkos::Impl::FunctorValueInit<   ReducerTypeFwd, WorkTag > ValueInit ;
+
+  typedef typename ValueTraits::pointer_type    pointer_type ;
+  typedef typename ValueTraits::reference_type  reference_type ;
+
+  const FunctorType  m_functor ; // copy of the functor passed to the constructor
+  const Policy       m_policy ; // copy of the policy passed to the constructor
+  const ReducerType  m_reducer ; // the reducer argument, or InvalidType() when constructed from a result view
+  const pointer_type m_result_ptr ; // destination for the final values; execute() skips the copy when this is null
+  const int          m_shared ; // scratch_size(0) + scratch_size(1) + FunctorTeamShmemSize value, computed in the constructors
+  // Untagged overload: calls functor( member , update ) while member.valid_static(); only the static iteration is used here.
+  template< class TagType >
+  inline static
+  typename std::enable_if< std::is_same< TagType , void >::value >::type
+  exec_team( const FunctorType & functor , Member member , reference_type update )
+    {
+      for ( ; member.valid_static() ; member.next_static() ) {
+        functor( member , update );
+      }
+    }
+  // Tagged overload: same loop, calling functor( t , member , update ) with a default-constructed tag t.
+  template< class TagType >
+  inline static
+  typename std::enable_if< ! std::is_same< TagType , void >::value >::type
+  exec_team( const FunctorType & functor , Member member , reference_type update )
+    {
+      const TagType t{} ;
+      for ( ; member.valid_static() ; member.next_static() ) {
+        functor( t , member , update );
+      }
+    }
+  // Callback given to ThreadsExec::start; arg is the ParallelReduce object.
+  static void exec( ThreadsExec & exec , const void * arg )
+  {
+    const ParallelReduce & self = * ((const ParallelReduce *) arg );
+    // The reduction value is set up in exec.reduce_memory() by ValueInit::init and passed on as the update reference.
+    ParallelReduce::template exec_team< WorkTag >
+      ( self.m_functor , Member( & exec , self.m_policy , self.m_shared )
+      , ValueInit::init( ReducerConditional::select(self.m_functor , self.m_reducer) , exec.reduce_memory() ) );
+
+    exec.template fan_in_reduce< ReducerTypeFwd , WorkTag >( ReducerConditional::select(self.m_functor , self.m_reducer) );
+  }
+
+public:
+  // Resizes the scratch, runs exec, fences, then copies value_count() entries to m_result_ptr if it is set.
+  inline
+  void execute() const
+    {
+      ThreadsExec::resize_scratch( ValueTraits::value_size( ReducerConditional::select(m_functor , m_reducer) ) , Policy::member_type::team_reduce_size() + m_shared );
+
+      ThreadsExec::start( & ParallelReduce::exec , this );
+
+      ThreadsExec::fence();
+
+      if ( m_result_ptr ) {
+        // The final values are read from ThreadsExec::root_reduce_scratch().
+        const pointer_type data = (pointer_type) ThreadsExec::root_reduce_scratch();
+
+        const unsigned n = ValueTraits::value_count( ReducerConditional::select(m_functor , m_reducer) );
+        for ( unsigned i = 0 ; i < n ; ++i ) { m_result_ptr[i] = data[i]; }
+      }
+    }
+  // View-result constructor: enabled only for a View argument when ReducerType is not a reducer type.
+  template< class ViewType >
+  inline
+  ParallelReduce( const FunctorType  & arg_functor ,
+                  const Policy       & arg_policy ,
+                  const ViewType     & arg_result ,
+                  typename std::enable_if<
+                    Kokkos::is_view< ViewType >::value &&
+                    !Kokkos::is_reducer_type<ReducerType>::value
+                    ,void*>::type = NULL)
+    : m_functor( arg_functor )
+    , m_policy(  arg_policy )
+    , m_reducer( InvalidType() )
+    , m_result_ptr( arg_result.ptr_on_device() )
+    , m_shared( arg_policy.scratch_size(0) + arg_policy.scratch_size(1) + FunctorTeamShmemSize< FunctorType >::value( arg_functor , arg_policy.team_size() ) )
+    {}
+  // Reducer constructor: the result pointer is taken from reducer.result_view().data().
+  inline
+  ParallelReduce( const FunctorType & arg_functor
+    , Policy       arg_policy
+    , const ReducerType& reducer )
+  : m_functor( arg_functor )
+  , m_policy(  arg_policy )
+  , m_reducer( reducer )
+  , m_result_ptr(  reducer.result_view().data() )
+  , m_shared( arg_policy.scratch_size(0) + arg_policy.scratch_size(1) + FunctorTeamShmemSize< FunctorType >::value( arg_functor , arg_policy.team_size() ) )
+  {
+  /*static_assert( std::is_same< typename ViewType::memory_space
+                          , Kokkos::HostSpace >::value
+  , "Reduction result on Kokkos::OpenMP must be a Kokkos::View in HostSpace" );*/
+  }
+};
+
+//----------------------------------------------------------------------------
+//----------------------------------------------------------------------------
+/* ParallelScan with Kokkos::Threads and RangePolicy: two passes over each thread's WorkRange (final=false, then final=true). */
+
+template< class FunctorType , class ... Traits >
+class ParallelScan< FunctorType
+                  , Kokkos::RangePolicy< Traits ... >
+                  , Kokkos::Threads
+                  >
+{
+private:
+
+  typedef Kokkos::RangePolicy< Traits ... > Policy ;
+  typedef typename Policy::WorkRange                               WorkRange ;
+  typedef typename Policy::work_tag                                WorkTag ;
+  typedef typename Policy::member_type                             Member ;
+  typedef Kokkos::Impl::FunctorValueTraits< FunctorType, WorkTag > ValueTraits ;
+  typedef Kokkos::Impl::FunctorValueInit<   FunctorType, WorkTag > ValueInit ;
+
+  typedef typename ValueTraits::pointer_type    pointer_type ;
+  typedef typename ValueTraits::reference_type  reference_type ;
+
+  const FunctorType  m_functor ; // copy of the functor passed to the constructor
+  const Policy       m_policy ; // copy of the policy passed to the constructor
+  // Untagged overload (enabled when TagType is void): calls functor( i , update , final ) for each i in [ibeg,iend).
+  template< class TagType >
+  inline static
+  typename std::enable_if< std::is_same< TagType , void >::value >::type
+  exec_range( const FunctorType & functor
+            , const Member & ibeg , const Member & iend
+            , reference_type update , const bool final )
+    {
+      #if defined( KOKKOS_OPT_RANGE_AGGRESSIVE_VECTORIZATION ) && \
+          defined( KOKKOS_HAVE_PRAGMA_IVDEP )
+      #pragma ivdep
+      #endif
+      for ( Member i = ibeg ; i < iend ; ++i ) {
+        functor( i , update , final );
+      }
+    }
+  // Tagged overload (enabled when TagType is not void): calls functor( t , i , update , final ) with a default-constructed tag t.
+  template< class TagType >
+  inline static
+  typename std::enable_if< ! std::is_same< TagType , void >::value >::type
+  exec_range( const FunctorType & functor
+            , const Member & ibeg , const Member & iend
+            , reference_type update , const bool final )
+    {
+      const TagType t{} ;
+      #if defined( KOKKOS_OPT_RANGE_AGGRESSIVE_VECTORIZATION ) && \
+          defined( KOKKOS_HAVE_PRAGMA_IVDEP )
+      #pragma ivdep
+      #endif
+      for ( Member i = ibeg ; i < iend ; ++i ) {
+        functor( t , i , update , final );
+      }
+    }
+  // Callback given to ThreadsExec::start; arg is the ParallelScan object.
+  static void exec( ThreadsExec & exec , const void * arg )
+  {
+    const ParallelScan & self = * ((const ParallelScan *) arg );
+
+    const WorkRange range( self.m_policy, exec.pool_rank(), exec.pool_size() );
+
+    reference_type update =
+      ValueInit::init( self.m_functor , exec.reduce_memory() );
+    // First pass (final == false) accumulates into update over this thread's range.
+    ParallelScan::template exec_range< WorkTag >
+      ( self.m_functor , range.begin(), range.end(), update, false );
+    // NOTE(review): scan_small presumably turns the per-thread totals into this thread's starting offset in update - confirm.
+    //  exec.template scan_large<FunctorType,WorkTag>( self.m_functor );
+    exec.template scan_small<FunctorType,WorkTag>( self.m_functor );
+    // Second pass (final == true) revisits the same range with the same update reference.
+    ParallelScan::template exec_range< WorkTag >
+      ( self.m_functor , range.begin(), range.end(), update, true );
+
+    exec.fan_in();
+  }
+
+public:
+  // Resizes the reduce scratch to twice value_size( m_functor ), starts exec, then calls ThreadsExec::fence().
+  inline
+  void execute() const
+    {
+      ThreadsExec::resize_scratch( 2 * ValueTraits::value_size( m_functor ) , 0 );
+      ThreadsExec::start( & ParallelScan::exec , this );
+      ThreadsExec::fence();
+    }
+  // Stores copies of the functor and the policy; no work is started here.
+  ParallelScan( const FunctorType & arg_functor
+              , const Policy      & arg_policy )
+    : m_functor( arg_functor )
+    , m_policy( arg_policy )
+    { }
+};
+
+} // namespace Impl
+} // namespace Kokkos
+
+//----------------------------------------------------------------------------
+//----------------------------------------------------------------------------
+
+#endif /* #define KOKKOS_THREADS_PARALLEL_HPP */
+
diff --git a/lib/kokkos/core/src/Threads/Kokkos_Threads_TaskPolicy.cpp b/lib/kokkos/core/src/Threads/Kokkos_Threads_TaskPolicy.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..e1599284b297bee7a770d2a6ce87a429a9e5d08a
--- /dev/null
+++ b/lib/kokkos/core/src/Threads/Kokkos_Threads_TaskPolicy.cpp
@@ -0,0 +1,930 @@
+/*
+//@HEADER
+// ************************************************************************
+// 
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+// 
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+// 
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact  H. Carter Edwards (hcedwar@sandia.gov)
+// 
+// ************************************************************************
+//@HEADER
+*/
+
+// Experimental unified task-data parallel manycore LDRD
+
+#include <stdio.h>
+#include <iostream>
+#include <sstream>
+#include <Kokkos_Core.hpp>
+#include <Threads/Kokkos_Threads_TaskPolicy.hpp>
+
+#if defined( KOKKOS_HAVE_PTHREAD ) && defined( KOKKOS_ENABLE_TASKPOLICY )
+
+#define QLOCK   (reinterpret_cast<void*>( ~((uintptr_t)0) ))
+#define QDENIED (reinterpret_cast<void*>( ~((uintptr_t)0) - 1 ))
+
+namespace Kokkos {
+namespace Experimental {
+namespace Impl {
+
+void ThreadsTaskPolicyQueue::Destroy::destroy_shared_allocation() // aborts on a non-empty queue, else destroys *m_policy in place
+{
+  // Verify the queue is empty
+  // (a non-zero ready count or any non-null team/serial queue head is treated as a non-empty queue)
+  if ( m_policy->m_count_ready ||
+       m_policy->m_team[0] ||
+       m_policy->m_team[1] ||
+       m_policy->m_team[2] ||
+       m_policy->m_serial[0] ||
+       m_policy->m_serial[1] ||
+       m_policy->m_serial[2] ) {
+    Kokkos::abort("ThreadsTaskPolicyQueue ERROR : Attempt to destroy non-empty queue" );
+  }
+  // Explicit destructor call only; this function does not release the storage that m_policy points to.
+  m_policy->~ThreadsTaskPolicyQueue();
+}
+
+//----------------------------------------------------------------------------
+
+ThreadsTaskPolicyQueue::~ThreadsTaskPolicyQueue() // out-of-line destructor definition
+{ // Empty body: no statements run here; the emptiness check lives in Destroy::destroy_shared_allocation().
+}
+
+ThreadsTaskPolicyQueue::ThreadsTaskPolicyQueue // sizes the task memory pool, then chooses and validates the team size
+  ( const unsigned arg_task_max_count
+  , const unsigned arg_task_max_size
+  , const unsigned arg_task_default_dependence_capacity
+  , const unsigned arg_task_team_size
+  )
+  : m_space( Kokkos::Threads::memory_space()
+           , static_cast<double>( arg_task_max_size ) * arg_task_max_count * 1.2 // multiply in double: unsigned * unsigned can wrap
+           , 16 /* log2(superblock size) */
+           )
+  , m_team { 0 , 0 , 0 }
+  , m_serial { 0 , 0 , 0 }
+  , m_team_size( arg_task_team_size )
+  , m_default_dependence_capacity( arg_task_default_dependence_capacity )
+  , m_count_ready(0)
+  , m_count_alloc(0)
+{
+  const int threads_total    = Threads::thread_pool_size(0);
+  const int threads_per_numa = Threads::thread_pool_size(1);
+  const int threads_per_core = Threads::thread_pool_size(2);
+  // A requested team size of 0 means the size is chosen here from the thread pool sizes.
+  if ( 0 == m_team_size ) {
+    // If a team task then claim for execution until count is zero
+    // Issue: team collectives cannot assume which pool members are in the team.
+    // Issue: team must only span a single NUMA region.
+
+    // If more than one thread per core then map cores to work team,
+    // else  map numa to work team.
+
+    if      ( 1 < threads_per_core ) m_team_size = threads_per_core ;
+    else if ( 1 < threads_per_numa ) m_team_size = threads_per_numa ;
+    else                             m_team_size = 1 ;
+  }
+
+  // Verify a valid team size
+  const bool valid_team_size =
+    ( 0 < m_team_size && m_team_size <= threads_total ) &&
+    (
+      ( 1                == m_team_size ) ||
+      ( threads_per_core == m_team_size ) ||
+      ( threads_per_numa == m_team_size )
+    );
+  // Any other team size is rejected through throw_runtime_exception with the sizes spelled out in the message.
+  if ( ! valid_team_size ) {
+    std::ostringstream msg ;
+
+    msg << "Kokkos::Experimental::TaskPolicy< Kokkos::Threads > ERROR"
+        << " invalid team_size(" << m_team_size << ")"
+        << " threads_per_core(" << threads_per_core << ")"
+        << " threads_per_numa(" << threads_per_numa << ")"
+        << " threads_total(" << threads_total << ")"
+        ;
+
+    Kokkos::Impl::throw_runtime_exception( msg.str() );
+  }
+
+  Kokkos::memory_fence();
+}
+
+//----------------------------------------------------------------------------
+
+void ThreadsTaskPolicyQueue::driver( Kokkos::Impl::ThreadsExec & exec
+                                   , const void * arg )
+{
+  // Whole thread pool is calling this function
+  // arg is the ThreadsTaskPolicyQueue; teams loop until the team lead publishes that m_count_ready is zero.
+  typedef Kokkos::Impl::ThreadsExecTeamMember member_type ;
+
+  ThreadsTaskPolicyQueue & self =
+    * reinterpret_cast< ThreadsTaskPolicyQueue * >( const_cast<void*>(arg) );
+
+  // Create the thread team member with shared memory for the given task.
+
+  const TeamPolicy< Kokkos::Threads > team_policy( 1 , self.m_team_size );
+
+  member_type team_member( & exec , team_policy , 0 );
+
+  Kokkos::Impl::ThreadsExec & exec_team_base =
+    team_member.threads_exec_team_base();
+  // The team base's reduce memory is used as a mailbox: first a task pointer, then an int "keep working" flag.
+  task_root_type * volatile * const task_team_ptr =
+    reinterpret_cast<task_root_type**>( exec_team_base.reduce_memory() );
+
+  volatile int * const work_team_ptr =
+    reinterpret_cast<volatile int*>( task_team_ptr + 1 );
+
+  // Each team must iterate this loop synchronously
+  // to ensure team-execution of team-task.
+
+  const bool team_lead = team_member.team_fan_in();
+
+  bool work_team = true ;
+
+  while ( work_team ) {
+
+    task_root_type * task = 0 ;
+
+    // Start here with members in a fan_in state
+
+    if ( team_lead ) {
+      // Team lead queries the ready count for a team-consistent view.
+      *work_team_ptr = 0 != self.m_count_ready ;
+
+      // Only the team lead attempts to pop a team task from the queues
+      for ( int i = 0 ; i < int(NPRIORITY) && 0 == task ; ++i ) {
+        if ( ( i < 2 /* regular queue */ )
+             || ( ! self.m_space.is_empty() /* waiting for memory queue */ ) ) {
+          task = pop_ready_task( & self.m_team[i] );
+        }
+      }
+      // Publish the popped task (or null) through the mailbox for the other team members.
+      *task_team_ptr = task ;
+    }
+
+    Kokkos::memory_fence();
+
+    team_member.team_fan_out();
+    // After the fan-out every member reads the flag and task pointer that the lead wrote.
+    work_team = *work_team_ptr ;
+
+    // Query if team acquired a team task
+
+    if ( 0 != ( task = *task_team_ptr ) ) {
+      // Set shared memory
+      team_member.set_league_shmem( 0 , 1 , task->m_shmem_size );
+
+      (*task->m_team)( task , team_member );
+
+      // The team task called the functor,
+      // called the team_fan_in(), and
+      // if completed the team lead destroyed the task functor.
+
+      if ( team_lead ) {
+        self.complete_executed_task( task );
+      }
+    }
+    else {
+      // No team task acquired, each thread try a serial task
+      // Try the priority queue, then the regular queue.
+      for ( int i = 0 ; i < int(NPRIORITY) && 0 == task ; ++i ) {
+        if ( ( i < 2 /* regular queue */ )
+             || ( ! self.m_space.is_empty() /* waiting for memory queue */ ) ) {
+          task = pop_ready_task( & self.m_serial[i] );
+        }
+      }
+
+      if ( 0 != task ) {
+
+        (*task->m_serial)( task );
+
+        self.complete_executed_task( task );
+      }
+      // Re-enter the fan_in state that the top of the loop expects.
+      team_member.team_fan_in();
+    }
+  }
+
+  team_member.team_fan_out();
+
+  exec.fan_in();
+}
+
+//----------------------------------------------------------------------------
+
+ThreadsTaskPolicyQueue::task_root_type *  // the claimed task, or 0 if none was claimed
+ThreadsTaskPolicyQueue::pop_ready_task(
+  ThreadsTaskPolicyQueue::task_root_type * volatile * const queue )  // head pointer of a ready queue
+{
+  task_root_type * const q_lock = reinterpret_cast<task_root_type*>(QLOCK);
+  task_root_type * task = 0 ;
+  task_root_type * const task_claim = *queue ;
+  // One non-blocking attempt: returns 0 if the queue is empty, locked, or the claim race is lost.
+  if ( ( q_lock != task_claim ) && ( 0 != task_claim ) ) {
+
+    // Queue is not locked and not null, try to claim head of queue.
+    // Is a race among threads to claim the queue.
+
+    if ( task_claim == atomic_compare_exchange(queue,task_claim,q_lock) ) {
+
+      // Acquired the task which must be in the waiting state.
+
+      const int claim_state =
+        atomic_compare_exchange( & task_claim->m_state
+                               , int(TASK_STATE_WAITING)
+                               , int(TASK_STATE_EXECUTING) );
+
+      task_root_type * lock_verify = 0 ;
+
+      if ( claim_state == int(TASK_STATE_WAITING) ) {
+
+        // Transitioned this task from waiting to executing
+        // Update the queue to the next entry and release the lock
+
+        task_root_type * const next =
+          *((task_root_type * volatile *) & task_claim->m_next );
+
+        *((task_root_type * volatile *) & task_claim->m_next ) = 0 ;
+
+        lock_verify = atomic_compare_exchange( queue , q_lock , next );
+      }
+
+      if ( ( claim_state != int(TASK_STATE_WAITING) ) |
+           ( q_lock != lock_verify ) ) {
+        // Report the claimed task; 'task' is still null here and would always print 0x0.
+        fprintf(stderr,"ThreadsTaskPolicyQueue::pop_ready_task(0x%lx) task(0x%lx) state(%d) ERROR %s\n"
+                      , (unsigned long) queue
+                      , (unsigned long) task_claim
+                      , claim_state
+                      , ( claim_state != int(TASK_STATE_WAITING)
+                        ? "NOT WAITING"
+                        : "UNLOCK" ) );
+        fflush(stderr);
+        Kokkos::abort("ThreadsTaskPolicyQueue::pop_ready_task");
+      }
+
+      task = task_claim ;
+    }
+  }
+
+  return task ;
+}
+
+//----------------------------------------------------------------------------
+
+void ThreadsTaskPolicyQueue::complete_executed_task(
+  ThreadsTaskPolicyQueue::task_root_type * task )
+{
+  task_root_type * const q_denied = reinterpret_cast<task_root_type*>(QDENIED);
+  // Post-execution bookkeeping: reschedule a respawned task, or complete it and release its waiters.
+  // State is either executing or if respawned then waiting,
+  // try to transition from executing to complete.
+  // Reads the current value.
+
+  const int state_old =
+    atomic_compare_exchange( & task->m_state
+                           , int(Kokkos::Experimental::TASK_STATE_EXECUTING)
+                           , int(Kokkos::Experimental::TASK_STATE_COMPLETE) );
+
+  if ( int(Kokkos::Experimental::TASK_STATE_WAITING) == state_old ) {
+    // Task requested a respawn so reschedule it.
+    // schedule_task increments m_count_ready if the task lands in a ready queue.
+    schedule_task( task , false /* not the initial spawn */ );
+  }
+  else if ( int(Kokkos::Experimental::TASK_STATE_EXECUTING) == state_old ) {
+    /* Task is complete */
+
+    // Clear dependences of this task before locking wait queue
+
+    task->clear_dependence();
+
+    // Stop other tasks from adding themselves to this task's wait queue.
+    // The wait queue is updated concurrently so guard with an atomic.
+
+    task_root_type * wait_queue     = *((task_root_type * volatile *) & task->m_wait );
+    task_root_type * wait_queue_old = 0 ;
+
+    do {
+      wait_queue_old = wait_queue ;
+      wait_queue     = atomic_compare_exchange( & task->m_wait , wait_queue_old , q_denied );
+    } while ( wait_queue_old != wait_queue );
+    // On exit m_wait holds q_denied and 'wait_queue' is the list head that was swapped out.
+    // The task has been removed from ready queue and
+    // execution is complete so decrement the reference count.
+    // The reference count was incremented by the initial spawning.
+    // The task may be deleted if this was the last reference.
+    task_root_type::assign( & task , 0 );
+    // 'assign' also stored 0 into the local 'task' pointer, so it is not used below.
+    // Pop waiting tasks and schedule them
+    while ( wait_queue ) {
+      task_root_type * const x = wait_queue ; wait_queue = x->m_next ; x->m_next = 0 ;
+      schedule_task( x , false /* not the initial spawn */ );
+    }
+  }
+  else {
+    fprintf( stderr
+           , "ThreadsTaskPolicyQueue::complete_executed_task(0x%lx) ERROR state_old(%d) dep_size(%d)\n"
+           , (unsigned long)( task )
+           , int(state_old)
+           , task->m_dep_size
+           );
+    fflush( stderr );
+    Kokkos::abort("ThreadsTaskPolicyQueue::complete_executed_task" );
+  }
+
+  // If the task was respawned it may have already been
+  // put in a ready queue and the count incremented.
+  // By decrementing the count last it will never go to zero
+  // with a ready or executing task.
+
+  atomic_fetch_add( & m_count_ready , -1 );
+}
+
+//----------------------------------------------------------------------------
+
+void ThreadsTaskPolicyQueue::reschedule_task(
+  ThreadsTaskPolicyQueue::task_root_type * const task )
+{
+  // Respawn request: atomically flip the task from EXECUTING back to WAITING.
+  const int executing = int(TASK_STATE_EXECUTING);
+  const int observed  =
+    atomic_compare_exchange( & task->m_state , executing , int(TASK_STATE_WAITING) );
+
+  // The exchange took effect only if the task really was executing.
+  if ( observed == executing ) return ;
+
+  // Any other prior state is an error: report it and abort.
+  fprintf( stderr
+         , "ThreadsTaskPolicyQueue::reschedule_task(0x%lx) ERROR state(%d)\n"
+         , (unsigned long) task
+         , observed
+         );
+  fflush(stderr);
+  Kokkos::abort("ThreadsTaskPolicyQueue::reschedule" );
+}
+
+void ThreadsTaskPolicyQueue::schedule_task
+  ( ThreadsTaskPolicyQueue::task_root_type * const task 
+  , const bool initial_spawn )
+{
+  task_root_type * const q_lock = reinterpret_cast<task_root_type*>(QLOCK);
+  task_root_type * const q_denied = reinterpret_cast<task_root_type*>(QDENIED);
+  // Push 'task' on the wait queue of its first incomplete dependence, or on its ready queue if none.
+  //----------------------------------------
+  // State is either constructing or already waiting.
+  // If constructing then transition to waiting.
+
+  {
+    const int old_state = atomic_compare_exchange( & task->m_state
+                                                 , int(TASK_STATE_CONSTRUCTING)
+                                                 , int(TASK_STATE_WAITING) );
+
+    // Head of linked list of tasks waiting on this task
+    task_root_type * const waitTask =
+      *((task_root_type * volatile const *) & task->m_wait );
+
+    // Member of linked list of tasks waiting on some other task
+    task_root_type * const next =
+      *((task_root_type * volatile const *) & task->m_next );
+
+    // An incomplete and non-executing task has:
+    //   task->m_state == TASK_STATE_CONSTRUCTING or TASK_STATE_WAITING
+    //   task->m_wait  != q_denied
+    //   task->m_next  == 0
+    // Anything else is reported and aborts.
+    if ( ( q_denied == waitTask ) ||
+         ( 0 != next ) ||
+         ( old_state != int(TASK_STATE_CONSTRUCTING) &&
+           old_state != int(TASK_STATE_WAITING) ) ) {
+      fprintf(stderr,"ThreadsTaskPolicyQueue::schedule_task(0x%lx) STATE ERROR: state(%d) wait(0x%lx) next(0x%lx)\n"
+                    , (unsigned long) task
+                    , old_state
+                    , (unsigned long) waitTask
+                    , (unsigned long) next );
+      fflush(stderr);
+      Kokkos::abort("ThreadsTaskPolicyQueue::schedule" );
+    }
+  }
+
+  //----------------------------------------
+
+  if ( initial_spawn ) {
+    // The initial spawn of a task increments the reference count
+    // for the task's existence in either a waiting or ready queue
+    // until the task has completed.
+    // Completing the task's execution is the matching
+    // decrement of the reference count.
+
+    task_root_type::assign( 0 , task );
+  }
+
+  //----------------------------------------
+  // Insert this task into a dependence task that is not complete.
+  // Push on to that task's wait queue.
+
+  bool attempt_insert_in_queue = true ;
+  // 'queue' starts at the wait queue of dependence 0, or null when there are no dependences.
+  task_root_type * volatile * queue =
+    task->m_dep_size ? & task->m_dep[0]->m_wait : (task_root_type **) 0 ;
+  // 'i' indexes the dependence whose wait queue is currently being tried.
+  for ( int i = 0 ; attempt_insert_in_queue && ( 0 != queue ) ; ) {
+
+    task_root_type * const head_value_old = *queue ;
+
+    if ( q_denied == head_value_old ) {
+      // Wait queue is closed because task is complete,
+      // try again with the next dependence wait queue.
+      ++i ;
+      queue = i < task->m_dep_size ? & task->m_dep[i]->m_wait
+                                   : (task_root_type **) 0 ;
+    }
+    else {
+
+      // Wait queue is open and not denied.
+      // Have exclusive access to this task.
+      // Assign m_next assuming a successful insertion into the queue.
+      // Fence the memory assignment before attempting the CAS.
+
+      *((task_root_type * volatile *) & task->m_next ) = head_value_old ;
+
+      memory_fence();
+
+      // Attempt to insert this task into the queue.
+      // If fails then continue the attempt.
+
+      attempt_insert_in_queue =
+        head_value_old != atomic_compare_exchange(queue,head_value_old,task);
+    }
+  }
+
+  //----------------------------------------
+  // No open wait queue took the task (all dependences complete, or none): insert into the ready list
+
+  if ( attempt_insert_in_queue ) {
+
+    // Increment the count of ready tasks.
+    // Count will be decremented when task is complete.
+
+    atomic_fetch_add( & m_count_ready , 1 );
+
+    queue = task->m_queue ;
+
+    while ( attempt_insert_in_queue ) {
+
+      // A locked queue is being popped; keep re-reading the head until it is no longer q_lock.
+
+      task_root_type * const head_value_old = *queue ;
+
+      if ( q_lock != head_value_old ) {
+        // Queue is not locked: try to push this task as the new head.
+        // If the head changed in the meantime the CAS fails and the loop retries.
+
+        // Have exclusive access to this task,
+        // assign to head of queue, assuming successful insert
+        // Fence assignment before attempting insert.
+        *((task_root_type * volatile *) & task->m_next ) = head_value_old ;
+
+        memory_fence();
+
+        attempt_insert_in_queue =
+          head_value_old != atomic_compare_exchange(queue,head_value_old,task);
+      }
+    }
+  }
+}
+
+
+void TaskMember< Kokkos::Threads , void , void >::latch_add( const int k )
+{
+  typedef TaskMember< Kokkos::Threads , void , void > task_root_type ;
+  // Count the latch (m_dep_size) down by 'k'; at zero, complete it and schedule its waiters.
+  task_root_type * const q_denied = reinterpret_cast<task_root_type*>(QDENIED);
+
+  const bool ok_input = 0 < k ;
+  // With a non-positive 'k' the counter is left untouched and 'count' just echoes 'k'.
+  const int count = ok_input ? atomic_fetch_add( & m_dep_size , -k ) - k
+                             : k ;
+
+  const bool ok_count = 0 <= count ;
+  // Only a valid count-down that reached zero may attempt WAITING -> COMPLETE.
+  const int state = ( ! ok_input || 0 != count ) ? TASK_STATE_WAITING :
+    atomic_compare_exchange( & m_state
+                           , TASK_STATE_WAITING
+                           , TASK_STATE_COMPLETE );
+
+  const bool ok_state = state == TASK_STATE_WAITING ;
+  // k == 0 gives count == 0 and passes ok_count, so ok_input must be tested explicitly.
+  if ( ! ok_input || ! ok_count || ! ok_state ) {
+    printf( "ThreadsTaskPolicyQueue::latch_add[0x%lx](%d) ERROR %s %d\n"
+          , (unsigned long) this
+          , k
+          , ( ! ok_input ? "Non-positive input" :
+            ( ! ok_count ? "Negative count" : "Bad State" ) )
+          , ( ! ok_input ? k :
+            ( ! ok_count ? count : state ) )
+          );
+    Kokkos::abort( "ThreadsTaskPolicyQueue::latch_add ERROR" );
+  }
+  else if ( 0 == count ) {
+    // Stop other tasks from adding themselves to this latch's wait queue.
+    // The wait queue is updated concurrently so guard with an atomic.
+
+    ThreadsTaskPolicyQueue & policy = *m_policy ;
+    task_root_type * wait_queue     = *((task_root_type * volatile *) &m_wait);
+    task_root_type * wait_queue_old = 0 ;
+
+    do {
+      wait_queue_old = wait_queue ;
+      wait_queue     = atomic_compare_exchange( & m_wait , wait_queue_old , q_denied );
+    } while ( wait_queue_old != wait_queue );
+
+    // Pop waiting tasks and schedule them
+    while ( wait_queue ) {
+      task_root_type * const x = wait_queue ; wait_queue = x->m_next ; x->m_next = 0 ;
+      policy.schedule_task( x , false /* not initial spawn */ );
+    }
+  }
+}
+
+//----------------------------------------------------------------------------
+
+void ThreadsTaskPolicyQueue::deallocate_task( void * ptr , unsigned size_alloc )
+{
+/*
+  const int n = atomic_fetch_add( & alloc_count , -1 ) - 1 ;
+
+  fprintf( stderr
+         , "ThreadsTaskPolicyQueue::deallocate_task(0x%lx,%d) count(%d)\n"
+         , (unsigned long) ptr
+         , size_alloc
+         , n
+         );
+  fflush( stderr );
+*/
+  // Hand the task's memory ('size_alloc' at 'ptr') back to m_space.
+  m_space.deallocate( ptr , size_alloc );
+  // Keep the allocation counter in step; allocate_task increments it.
+  Kokkos::atomic_decrement( & m_count_alloc );
+}
+
+ThreadsTaskPolicyQueue::task_root_type *
+ThreadsTaskPolicyQueue::allocate_task
+  ( const unsigned arg_sizeof_task
+  , const unsigned arg_dep_capacity
+  , const unsigned arg_team_shmem
+  )
+{
+  // One allocation holds the task object, padded up to a multiple of the
+  // pointer size, followed by the array of dependence pointers.
+  const unsigned ptr_size = sizeof(task_root_type*);
+  const unsigned misalign = arg_sizeof_task % ptr_size ;
+  const unsigned base_size =
+    misalign ? arg_sizeof_task + ( ptr_size - misalign ) : arg_sizeof_task ;
+
+  // A requested capacity of ~0u selects the queue's default capacity.
+  unsigned dep_capacity = arg_dep_capacity ;
+  if ( ~0u == arg_dep_capacity ) { dep_capacity = m_default_dependence_capacity ; }
+
+  const unsigned size_alloc = base_size + ptr_size * dep_capacity ;
+
+#if 0
+  // User created task memory pool with an estimate,
+  // if estimate is too low then report and throw exception.
+
+  if ( m_space.get_min_block_size() < size_alloc ) {
+    fprintf(stderr,"TaskPolicy<Threads> task allocation requires %d bytes on memory pool with %d byte chunk size\n"
+           , int(size_alloc)
+           , int(m_space.get_min_block_size())
+           );
+    fflush(stderr);
+    Kokkos::Impl::throw_runtime_exception("TaskMember< Threads >::task_allocate");
+  }
+#endif
+
+  task_root_type * const task =
+    reinterpret_cast<task_root_type*>( m_space.allocate( size_alloc ) );
+
+  // Nothing to initialize when the memory space returned null.
+  if ( 0 == task ) { return task ; }
+
+  // Construct the root task in place.
+  // The calling function must copy construct the functor.
+  new( (void*) task ) task_root_type();
+
+  task->m_policy       = this ;
+  task->m_size_alloc   = size_alloc ;
+  task->m_dep_capacity = dep_capacity ;
+  task->m_shmem_size   = arg_team_shmem ;
+
+  if ( dep_capacity ) {
+    // The dependence array starts right after the padded task object.
+    task_root_type ** const dep_array =
+      reinterpret_cast<task_root_type**>(
+        reinterpret_cast<unsigned char*>(task) + base_size );
+
+    task->m_dep = dep_array ;
+    for ( unsigned i = 0 ; i < dep_capacity ; ++i ) { dep_array[i] = 0 ; }
+  }
+
+  Kokkos::atomic_increment( & m_count_alloc );
+  return task ;
+}
+
+
+//----------------------------------------------------------------------------
+
+void ThreadsTaskPolicyQueue::add_dependence
+  ( ThreadsTaskPolicyQueue::task_root_type * const after
+  , ThreadsTaskPolicyQueue::task_root_type * const before
+  )
+{
+  // Record that 'after' must wait for 'before'.
+  // A null task on either side makes the call a no-op.
+  if ( ( 0 == after ) || ( 0 == before ) ) { return ; }
+
+  const int after_state = *((volatile const int *) & after->m_state );
+
+  // Preconditions:
+  //   - 'after' is still being constructed or is currently executing,
+  //   - 'after' has a free dependence slot,
+  //   - both tasks belong to this queue.
+  // Dependence on non-full memory cannot be mixed with any other dependence.
+
+  const bool ok_state =
+    ( Kokkos::Experimental::TASK_STATE_CONSTRUCTING == after_state ) ||
+    ( Kokkos::Experimental::TASK_STATE_EXECUTING    == after_state ) ;
+
+  const bool ok_capacity = after->m_dep_size < after->m_dep_capacity ;
+
+  const bool ok_policy =
+    ( this == after->m_policy ) && ( this == before->m_policy ) ;
+
+  if ( ok_state && ok_capacity && ok_policy ) {
+    // Claim the next slot, store the counted reference, then fence.
+    ++after->m_dep_size ;
+    task_root_type::assign( after->m_dep + (after->m_dep_size-1) , before );
+    memory_fence();
+  }
+  else {
+    // Report the first violated precondition, then abort.
+    const char * const reason =
+      ! ok_state    ? "Task not constructing or executing" :
+      ! ok_capacity ? "Task Exceeded dependence capacity"
+                    : "Tasks from different policies" ;
+
+    fprintf( stderr
+           , "ThreadsTaskPolicyQueue::add_dependence( 0x%lx , 0x%lx ) ERROR %s\n"
+           , (unsigned long) after
+           , (unsigned long) before
+           , reason );
+    fflush( stderr );
+    Kokkos::abort("ThreadsTaskPolicyQueue::add_dependence ERROR");
+  }
+}
+
+} /* namespace Impl */
+} /* namespace Experimental */
+} /* namespace Kokkos */
+
+//----------------------------------------------------------------------------
+//----------------------------------------------------------------------------
+
+namespace Kokkos {
+namespace Experimental {
+
+TaskPolicy< Kokkos::Threads >::TaskPolicy
+  ( const unsigned arg_task_max_count
+  , const unsigned arg_task_max_size // Application's task size
+  , const unsigned arg_task_default_dependence_capacity
+  , const unsigned arg_task_team_size
+  )
+  : m_track()
+  , m_policy(0)
+{
+  typedef Impl::ThreadsTaskPolicyQueue queue_type ;
+  typedef Kokkos::Experimental::Impl::SharedAllocationRecord
+    < Kokkos::HostSpace , queue_type::Destroy > record_type ;
+
+  // Estimated size of one task allocation: the application's task,
+  // the root task object, and the default dependence pointer array.
+  const size_t per_task_estimate =
+    arg_task_max_size +
+    sizeof(task_root_type) +
+    sizeof(task_root_type*) * arg_task_default_dependence_capacity ;
+
+  // Host storage for the queue, held by an allocation record.
+  record_type * const rec =
+    record_type::allocate( Kokkos::HostSpace()
+                         , "Threads task queue"
+                         , sizeof(queue_type) );
+
+  // Construct the queue in place inside the record's data block.
+  m_policy = reinterpret_cast< queue_type * >( rec->data() );
+  new( m_policy ) queue_type( arg_task_max_count
+                            , per_task_estimate
+                            , arg_task_default_dependence_capacity
+                            , arg_task_team_size );
+
+  // Let the record's destroy functor know which queue it refers to.
+  rec->m_destroy.m_policy = m_policy ;
+
+  m_track.assign_allocated_record_to_uninitialized( rec );
+}
+
+
+TaskPolicy< Kokkos::Threads >::member_type &
+TaskPolicy< Kokkos::Threads >::member_single()
+{
+  static member_type single_member ;  // one function-local instance shared by every caller
+  return single_member ;
+}
+
+void wait( Kokkos::Experimental::TaskPolicy< Kokkos::Threads > & policy )
+{
+  typedef Kokkos::Impl::ThreadsExec           exec_type ;
+  typedef Kokkos::Impl::ThreadsExecTeamMember team_type ;
+  enum { BASE_SHMEM = 1024 };
+
+  // Scratch request: the team reduction size plus BASE_SHMEM.
+  exec_type::resize_scratch( 0 , team_type::team_reduce_size() + BASE_SHMEM );
+
+  // Start the queue driver with this policy's queue, then fence the Threads back end.
+  exec_type::start( & Impl::ThreadsTaskPolicyQueue::driver , policy.m_policy );
+  exec_type::fence();
+}
+
+} /* namespace Experimental */
+} /* namespace Kokkos */
+
+namespace Kokkos {
+namespace Experimental {
+namespace Impl {
+
+typedef TaskMember< Kokkos::Threads , void , void > Task ;
+
+//----------------------------------------------------------------------------
+
+Task::~TaskMember()
+{ // Empty body: no explicit cleanup is performed here.
+}
+
+//----------------------------------------------------------------------------
+
+#if defined( KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST )
+
+void Task::assign( Task ** const lhs_ptr , Task * rhs )
+{
+  Task * const q_denied = reinterpret_cast<Task*>(QDENIED);
+  // Reference-counted store of rhs into *lhs_ptr; with a null lhs_ptr only rhs gains a reference.
+  // Increment rhs reference count.
+  if ( rhs ) { atomic_fetch_add( & rhs->m_ref_count , 1 ); }
+
+  if ( 0 == lhs_ptr ) return ;
+
+  // Must have exclusive access to *lhs_ptr.
+  // Assign the pointer and retrieve the previous value.
+
+#if 1
+
+  Task * const old_lhs = *lhs_ptr ;
+
+  *lhs_ptr = rhs ;
+
+#elif 0
+
+  Task * const old_lhs = *((Task*volatile*)lhs_ptr);
+
+  *((Task*volatile*)lhs_ptr) = rhs ;
+
+  Kokkos::memory_fence();
+
+#else
+
+  Task * const old_lhs = atomic_exchange( lhs_ptr , rhs );
+
+#endif
+
+  if ( old_lhs && rhs && old_lhs->m_policy != rhs->m_policy ) {
+    Kokkos::abort( "Kokkos::Impl::TaskMember<Kokkos::Threads>::assign ERROR different queues");
+  }
+
+  if ( old_lhs ) {
+
+    // Decrement former lhs reference count.
+    // If reference count is zero task must be complete, then delete task.
+    // Task is ready for deletion when  wait == q_denied
+    int const count = atomic_fetch_add( & (old_lhs->m_ref_count) , -1 ) - 1 ;
+    int const state = old_lhs->m_state ;
+    Task * const wait = *((Task * const volatile *) & old_lhs->m_wait );
+
+    const bool ok_count = 0 <= count ;
+
+    // If count == 0 then will be deleting
+    // and must either be constructing or complete.
+    const bool ok_state = 0 < count ? true :
+      ( ( state == int(TASK_STATE_CONSTRUCTING) && wait == 0 ) ||
+        ( state == int(TASK_STATE_COMPLETE)     && wait == q_denied ) )
+      &&
+     old_lhs->m_next == 0 &&
+     old_lhs->m_dep_size == 0 ;
+
+    if ( ! ok_count || ! ok_state ) {
+      // Pointers are printed in hex via %lx to match the unsigned long casts.
+      fprintf( stderr , "Kokkos::Impl::TaskMember<Kokkos::Threads>::assign ERROR deleting task(0x%lx) m_ref_count(%d) , m_wait(0x%lx)\n"
+                      , (unsigned long) old_lhs
+                      , count
+                      , (unsigned long) wait );
+      fflush(stderr);
+      Kokkos::abort( "Kokkos::Impl::TaskMember<Kokkos::Threads>::assign ERROR deleting");
+    }
+
+    if ( count == 0 ) {
+      // When 'count == 0' this thread has exclusive access to 'old_lhs'
+
+      ThreadsTaskPolicyQueue & queue = *( old_lhs->m_policy );
+
+      queue.deallocate_task( old_lhs , old_lhs->m_size_alloc );
+    }
+  }
+}
+
+#endif
+
+//----------------------------------------------------------------------------
+
+Task * Task::get_dependence( int i ) const
+{ // Return dependence 'i' of an executing task; reports and throws on a bad state, index or entry.
+  Task * const t = ( 0 <= i && i < m_dep_size ) ? m_dep[i] : 0 ; // never index m_dep out of range
+
+  if ( Kokkos::Experimental::TASK_STATE_EXECUTING != m_state || i < 0 || m_dep_size <= i || 0 == t ) {
+
+    fprintf( stderr
+           , "TaskMember< Threads >::get_dependence ERROR : task[%lx]{ state(%d) dep_size(%d) dep[%d] = %lx }\n"
+           , (unsigned long) this
+           , m_state
+           , m_dep_size
+           , i
+           , (unsigned long) t
+           );
+    fflush( stderr );
+
+    Kokkos::Impl::throw_runtime_exception("TaskMember< Threads >::get_dependence ERROR");
+  }
+
+  return t ;
+}
+
+//----------------------------------------------------------------------------
+
+void Task::clear_dependence()
+{
+  // Drop the references from the last dependence slot down to the first.
+  int remaining = m_dep_size ;
+  while ( 0 < remaining ) { --remaining ; assign( m_dep + remaining , 0 ); }
+
+  // Reset the dependence count with a volatile store, then fence.
+  *((volatile int *) & m_dep_size ) = 0 ;
+  memory_fence();
+}
+
+//----------------------------------------------------------------------------
+
+} /* namespace Impl */
+} /* namespace Experimental */
+} /* namespace Kokkos */
+
+#endif /* #if defined( KOKKOS_HAVE_PTHREAD ) && defined( KOKKOS_ENABLE_TASKPOLICY ) */
+
diff --git a/lib/kokkos/core/src/Threads/Kokkos_Threads_TaskPolicy.hpp b/lib/kokkos/core/src/Threads/Kokkos_Threads_TaskPolicy.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..116d32e4fc4d6c6da2968518caacc133e7488ab4
--- /dev/null
+++ b/lib/kokkos/core/src/Threads/Kokkos_Threads_TaskPolicy.hpp
@@ -0,0 +1,745 @@
+/*
+//@HEADER
+// ************************************************************************
+// 
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+// 
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+// 
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact  H. Carter Edwards (hcedwar@sandia.gov)
+// 
+// ************************************************************************
+//@HEADER
+*/
+
+// Experimental unified task-data parallel manycore LDRD
+
+#ifndef KOKKOS_THREADS_TASKPOLICY_HPP
+#define KOKKOS_THREADS_TASKPOLICY_HPP
+
+
+#include <Kokkos_Threads.hpp>
+#include <Kokkos_TaskPolicy.hpp>
+
+#if defined( KOKKOS_HAVE_PTHREAD ) && defined( KOKKOS_ENABLE_TASKPOLICY )
+
+//----------------------------------------------------------------------------
+
+namespace Kokkos {
+namespace Experimental {
+namespace Impl {
+
+struct ThreadsTaskPolicyQueue ;
+
+/** \brief  Base class for all Kokkos::Threads tasks */
+template<>
+class TaskMember< Kokkos::Threads , void , void > {
+public:
+
+  template < class > friend class Kokkos::Experimental::TaskPolicy ;
+  friend struct ThreadsTaskPolicyQueue ;
+
+  typedef TaskMember * (* function_verify_type) ( TaskMember * );
+  typedef void         (* function_single_type) ( TaskMember * );
+  typedef void         (* function_team_type)   ( TaskMember * , Kokkos::Impl::ThreadsExecTeamMember & );
+
+private:
+
+
+  ThreadsTaskPolicyQueue * m_policy ;
+  TaskMember * volatile  * m_queue ;
+  function_verify_type     m_verify ;
+  function_team_type       m_team ;         ///< Apply function
+  function_single_type     m_serial ;       ///< Apply function
+  TaskMember **            m_dep ;          ///< Dependences
+  TaskMember *             m_wait ;         ///< Head of linked list of tasks waiting on this task
+  TaskMember *             m_next ;         ///< Member of linked list of tasks
+  int                      m_dep_capacity ; ///< Capacity of dependences
+  int                      m_dep_size ;     ///< Actual count of dependences
+  int                      m_size_alloc ;
+  int                      m_shmem_size ;
+  int                      m_ref_count ;    ///< Reference count
+  int                      m_state ;        ///< State of the task
+
+
+  TaskMember( TaskMember && ) = delete ;
+  TaskMember( const TaskMember & ) = delete ;
+  TaskMember & operator = ( TaskMember && ) = delete ;
+  TaskMember & operator = ( const TaskMember & ) = delete ;
+
+protected:
+
+  TaskMember()  // Root-task defaults: null pointers, zero counts, state CONSTRUCTING.
+    : m_policy(0)
+    , m_queue(0)  // explicit null instead of an indeterminate pointer
+    , m_verify(0)
+    , m_team(0)
+    , m_serial(0)
+    , m_dep(0)
+    , m_wait(0)
+    , m_next(0)
+    , m_dep_capacity(0)
+    , m_dep_size(0)
+    , m_size_alloc(0)
+    , m_shmem_size(0)
+    , m_ref_count(0)
+    , m_state( TASK_STATE_CONSTRUCTING ) {}
+
+public:
+
+  ~TaskMember();
+
+  KOKKOS_INLINE_FUNCTION
+  int reference_count() const
+    { return *((volatile int *) & m_ref_count ); }
+
+  template< typename ResultType >
+  KOKKOS_FUNCTION static
+  TaskMember * verify_type( TaskMember * t )
+    {
+      enum { check_type = ! std::is_same< ResultType , void >::value };
+
+      if ( check_type && t != 0 ) {
+
+        // Verify that t->m_verify is this function
+        const function_verify_type self = & TaskMember::template verify_type< ResultType > ;
+
+        if ( t->m_verify != self ) {
+          t = 0 ;
+          Kokkos::abort("TaskPolicy< Threads > verify_result_type" );
+        }
+      }
+      return t ;
+    }
+
+  //----------------------------------------
+  /*  Inheritance requirements on task types:
+   *
+   *    class TaskMember< Threads , DerivedType::value_type , FunctorType >
+   *      : public TaskMember< Threads , DerivedType::value_type , void >
+   *      , public Functor
+   *      { ... };
+   *
+   *  If value_type != void
+   *    class TaskMember< Threads , value_type , void >
+   *      : public TaskMember< Threads , void , void >
+   *
+   */
+  //----------------------------------------
+
+  template< class DerivedTaskType , class Tag >
+  KOKKOS_FUNCTION static
+  void apply_single(
+    typename std::enable_if
+      <( std::is_same<Tag,void>::value &&
+         std::is_same< typename DerivedTaskType::result_type , void >::value
+       ), TaskMember * >::type t )
+    { // Overload selected when Tag is void and the task's result_type is void.
+      {
+        typedef typename DerivedTaskType::functor_type  functor_type ;
+        // Cast the root task down to the derived task, then to its functor_type.
+        functor_type * const f = 
+          static_cast< functor_type * >( static_cast< DerivedTaskType * >(t) );
+
+        f->apply();
+        // Destroy the functor only if the task is still EXECUTING after apply() returns.
+        if ( t->m_state == int(Kokkos::Experimental::TASK_STATE_EXECUTING) ) {
+          f->~functor_type();
+        }
+      }
+    }
+
+  template< class DerivedTaskType , class Tag >
+  KOKKOS_FUNCTION static
+  void apply_single(
+    typename std::enable_if
+      <( std::is_same< Tag , void >::value &&
+         ! std::is_same< typename DerivedTaskType::result_type , void >::value
+       ), TaskMember * >::type t )
+    { // Overload selected when Tag is void and the task's result_type is not void.
+      {
+        typedef typename DerivedTaskType::functor_type  functor_type ;
+        // 'self' gives access to m_result; 'f' is the functor view of the same task.
+        DerivedTaskType * const self = static_cast< DerivedTaskType * >(t);
+        functor_type    * const f    = static_cast< functor_type * >( self );
+        // The functor receives the task's own m_result as its argument.
+        f->apply( self->m_result );
+        // Destroy the functor only if the task is still EXECUTING after apply() returns.
+        if ( t->m_state == int(Kokkos::Experimental::TASK_STATE_EXECUTING) ) {
+          f->~functor_type();
+        }
+      }
+    }
+
+  //----------------------------------------
+
+  template< class DerivedTaskType , class Tag >
+  KOKKOS_FUNCTION static
+  void apply_team(
+    typename std::enable_if
+      <( std::is_same<Tag,void>::value &&
+         std::is_same<typename DerivedTaskType::result_type,void>::value
+       ), TaskMember * >::type t
+    , Kokkos::Impl::ThreadsExecTeamMember & member
+    )
+    { // Team overload selected when Tag is void and the task's result_type is void.
+      typedef typename DerivedTaskType::functor_type  functor_type ;
+      // Cast the root task down to the derived task, then to its functor_type.
+      functor_type * const f =
+        static_cast< functor_type * >( static_cast< DerivedTaskType * >(t) );
+      // Every calling team member runs the functor with its own 'member' handle.
+      f->apply( member );
+
+      // Synchronize for possible functor destruction and
+      // completion of team task.
+      if ( member.team_fan_in() ) {
+        if ( t->m_state == int(Kokkos::Experimental::TASK_STATE_EXECUTING) ) {
+          f->~functor_type();
+        }
+      }
+    }
+
+  template< class DerivedTaskType , class Tag >
+  KOKKOS_FUNCTION static
+  void apply_team(
+    typename std::enable_if
+      <( std::is_same<Tag,void>::value &&
+         ! std::is_same<typename DerivedTaskType::result_type,void>::value
+       ), TaskMember * >::type t
+    , Kokkos::Impl::ThreadsExecTeamMember & member
+    )
+    { // Team overload selected when Tag is void and the task's result_type is not void.
+      typedef typename DerivedTaskType::functor_type  functor_type ;
+      // 'self' gives access to m_result; 'f' is the functor view of the same task.
+      DerivedTaskType * const self = static_cast< DerivedTaskType * >(t);
+      functor_type    * const f    = static_cast< functor_type * >( self );
+      // Every calling team member runs the functor with 'member' and the task's m_result.
+      f->apply( member , self->m_result );
+
+      // Synchronize for possible functor destruction and
+      // completion of team task.
+      if ( member.team_fan_in() ) {
+        if ( t->m_state == int(Kokkos::Experimental::TASK_STATE_EXECUTING) ) {
+          f->~functor_type();
+        }
+      }
+    }
+
+  //----------------------------------------
+
+#if defined( KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST )
+  static  // host build: declared here, defined out of line
+  void assign( TaskMember ** const lhs , TaskMember * const rhs );
+#else
+  KOKKOS_INLINE_FUNCTION static  // non-host build: assign() is an empty stub
+  void assign( TaskMember ** const lhs , TaskMember * const rhs ) {}
+#endif
+  // Dependence accessors: indexed lookup (declared only) and the count below.
+  TaskMember * get_dependence( int i ) const ;
+
+  KOKKOS_INLINE_FUNCTION
+  int get_dependence() const { return m_dep_size ; }  // returns m_dep_size
+  // The two members below are declared only; their behavior is not visible here.
+  void clear_dependence();
+
+  void latch_add( const int k );
+
+  //----------------------------------------
+
+  // The void-result root has no value: get() yields a FutureValueTypeIsVoidError.
+  using get_result_type = FutureValueTypeIsVoidError ;
+  KOKKOS_INLINE_FUNCTION
+  get_result_type get() const { return get_result_type() ; }
+  // Nothing to construct when there is no result member.
+  inline static
+  void construct_result( TaskMember * const ) {}
+  // Report m_state converted to the TaskState enumeration.
+  KOKKOS_INLINE_FUNCTION
+  Kokkos::Experimental::TaskState get_state() const { return static_cast< Kokkos::Experimental::TaskState >( m_state ); }
+
+};
+
+/** \brief  Result-carrying task layer for Kokkos::Threads.
+ *
+ *  A Future< Kokkos::Threads , ResultType > casts a
+ *  TaskMember< Kokkos::Threads , void , void > to this type to query m_result.
+ */
+template< class ResultType >
+class TaskMember< Kokkos::Threads , ResultType , void >
+  : public TaskMember< Kokkos::Threads , void , void >
+{
+public:
+  using result_type     = ResultType ;
+  using get_result_type = const result_type & ;
+
+  result_type  m_result ;
+
+  // Neither copyable nor movable.
+  TaskMember( const TaskMember & ) = delete ;
+  TaskMember( TaskMember && ) = delete ;
+  TaskMember & operator = ( const TaskMember & ) = delete ;
+  TaskMember & operator = ( TaskMember && ) = delete ;
+
+  // Default-construct the base and value-initialize m_result.
+  inline
+  TaskMember() : TaskMember< Kokkos::Threads , void , void >(), m_result() {}
+
+  // Read-only access to the stored result.
+  KOKKOS_INLINE_FUNCTION
+  get_result_type get() const { return m_result ; }
+
+  // Placement-construct a value-initialized result_type in ptr->m_result.
+  inline static
+  void construct_result( TaskMember * const ptr )
+    { new((void*)(& ptr->m_result)) result_type(); }
+};
+
+/** \brief  Complete task type: result layer plus the user functor.
+ *
+ *  Callback functions cast a TaskMember< Kokkos::Threads , void , void >
+ *  to this type in order to execute the functor's work functions.
+ */
+template< class ResultType , class FunctorType >
+class TaskMember< Kokkos::Threads , ResultType , FunctorType >
+  : public TaskMember< Kokkos::Threads , ResultType , void >
+  , public FunctorType
+{
+public:
+  using result_type  = ResultType ;
+  using functor_type = FunctorType ;
+  // Only construction from a functor is supported.
+  TaskMember() = delete ;
+  TaskMember( const TaskMember & ) = delete ;
+  TaskMember( TaskMember && ) = delete ;
+  TaskMember & operator = ( const TaskMember & ) = delete ;
+  TaskMember & operator = ( TaskMember && ) = delete ;
+
+  // Default-construct the result layer, then copy the functor into its base.
+  inline
+  TaskMember( const functor_type & arg_functor )
+    : TaskMember< Kokkos::Threads , ResultType , void >()
+    , functor_type( arg_functor )
+    {}
+  // In-place setup at ptr: copy-construct the functor sub-object, then construct the result.
+  inline static
+  void copy_construct( TaskMember * const ptr
+                     , const functor_type & arg_functor )
+    {
+      using result_layer = TaskMember< Kokkos::Threads , ResultType , void > ;
+      FunctorType * const functor_storage = static_cast<FunctorType*>(ptr) ;
+      new((void*)functor_storage) functor_type( arg_functor );
+      result_layer::construct_result( static_cast<result_layer*>( ptr ) );
+    }
+};
+
+//----------------------------------------------------------------------------
+
+struct ThreadsTaskPolicyQueue {
+  // State referenced by TaskPolicy< Kokkos::Threads > through its m_policy pointer.
+  enum { NPRIORITY = 3 };  // queue slots: TaskPolicy uses 0/1 for spawn/respawn priority and 2 for respawn_needing_memory
+
+  typedef Kokkos::Experimental::MemoryPool< Kokkos::Threads >
+    memory_space ;
+
+  typedef Kokkos::Experimental::Impl::TaskMember< Kokkos::Threads, void, void >
+    task_root_type ;
+  // m_team / m_serial: one queue per slot; TaskPolicy::spawn picks m_team for tasks with a team function.
+  memory_space     m_space ;
+  task_root_type * m_team[ NPRIORITY ];
+  task_root_type * m_serial[ NPRIORITY ];
+  int              m_team_size ;    ///< Fixed size of a task-team
+  int              m_default_dependence_capacity ;  ///< NOTE(review): presumably used when a create call passes ~0u -- confirm
+  int     volatile m_count_ready ;  ///< Ready plus executing tasks
+  int     volatile m_count_alloc ;  ///< Total allocated tasks
+
+  // Execute tasks until all non-waiting tasks are complete.
+  static void driver( Kokkos::Impl::ThreadsExec & exec
+                    , const void * arg );
+  // Returns a task root; callers in TaskPolicy test the result against 0 before using it.
+  task_root_type * allocate_task
+   ( const unsigned arg_sizeof_task
+   , const unsigned arg_dep_capacity
+   , const unsigned arg_team_shmem
+   );
+  // Scheduling and dependence bookkeeping; declared here, defined out of line.
+  void deallocate_task( void * , unsigned );
+  void schedule_task( task_root_type * const
+                    , const bool initial_spawn = true );
+  void reschedule_task( task_root_type * const );
+  void add_dependence( task_root_type * const after
+                     , task_root_type * const before );
+
+  // When a task finishes executing update its dependences
+  // and either deallocate the task if complete
+  // or reschedule the task if respawned.
+  void complete_executed_task( task_root_type * );
+
+  // Pop a task from a ready queue
+  static task_root_type *
+    pop_ready_task( task_root_type * volatile * const queue );
+  // Not default-constructible, copyable, or movable.
+  ThreadsTaskPolicyQueue() = delete ;
+  ThreadsTaskPolicyQueue( ThreadsTaskPolicyQueue && ) = delete ;
+  ThreadsTaskPolicyQueue( const ThreadsTaskPolicyQueue & ) = delete ;
+  ThreadsTaskPolicyQueue & operator = ( ThreadsTaskPolicyQueue && ) = delete ;
+  ThreadsTaskPolicyQueue & operator = ( const ThreadsTaskPolicyQueue & ) = delete ;
+
+  ~ThreadsTaskPolicyQueue();
+  // The only non-deleted constructor; its definition is not part of this struct body.
+  ThreadsTaskPolicyQueue
+    ( const unsigned arg_task_max_count
+    , const unsigned arg_task_max_size
+    , const unsigned arg_task_default_dependence_capacity
+    , const unsigned arg_task_team_size
+    );
+
+  // Callback to destroy the shared memory tracked queue.
+  struct Destroy {
+    ThreadsTaskPolicyQueue * m_policy ;
+    void destroy_shared_allocation();
+  };
+};
+
+} /* namespace Impl */
+} /* namespace Experimental */
+} /* namespace Kokkos */
+
+//----------------------------------------------------------------------------
+//----------------------------------------------------------------------------
+
+namespace Kokkos {
+namespace Experimental {
+
+void wait( TaskPolicy< Kokkos::Threads > & );
+// Task policy specialization for the Kokkos::Threads execution space.
+template<>
+class TaskPolicy< Kokkos::Threads >
+{
+public:
+  typedef Kokkos::Threads                      execution_space ;
+  typedef TaskPolicy                           execution_policy ;
+  typedef Kokkos::Impl::ThreadsExecTeamMember  member_type ;
+
+private:
+
+  typedef Impl::TaskMember< Kokkos::Threads , void , void >  task_root_type ;
+  typedef Kokkos::Experimental::MemoryPool< Kokkos::Threads > memory_space ;
+
+  typedef Kokkos::Experimental::Impl::SharedAllocationTracker track_type ;
+
+  track_type                      m_track ;
+  Impl::ThreadsTaskPolicyQueue  * m_policy ;
+  // Map a functor pointer to the root of the task object that derives from that functor.
+  template< class FunctorType >
+  static inline
+  const task_root_type * get_task_root( const FunctorType * f )
+    {
+      typedef Impl::TaskMember< execution_space , typename FunctorType::value_type , FunctorType > task_type ;
+      return static_cast< const task_root_type * >( static_cast< const task_type * >(f) );
+    }
+
+  template< class FunctorType >
+  static inline
+  task_root_type * get_task_root( FunctorType * f )
+    {
+      typedef Impl::TaskMember< execution_space , typename FunctorType::value_type , FunctorType > task_type ;
+      return static_cast< task_root_type * >( static_cast< task_type * >(f) );
+    }
+
+  /** \brief  Allocate and construct a task.
+   *
+   *  Allocate space for DerivedTaskType followed by TaskMember*[ dependence_capacity ]
+   */
+  template< class DerivedTaskType , class Tag >
+  task_root_type *
+  create( const typename DerivedTaskType::functor_type &  arg_functor
+        , const task_root_type::function_single_type      arg_apply_single
+        , const task_root_type::function_team_type        arg_apply_team
+        , const unsigned                                  arg_team_shmem
+        , const unsigned                                  arg_dependence_capacity
+        )
+    {
+      task_root_type * const t =
+        m_policy->allocate_task( sizeof(DerivedTaskType)
+                               , arg_dependence_capacity
+                               , arg_team_shmem
+                               );
+      if ( t != 0 ) {
+
+        DerivedTaskType * const task = static_cast<DerivedTaskType*>(t);
+
+        DerivedTaskType::copy_construct( task , arg_functor );
+
+        task->task_root_type::m_verify  = & task_root_type::template verify_type< typename DerivedTaskType::value_type > ;
+        task->task_root_type::m_team    = arg_apply_team ;
+        task->task_root_type::m_serial  = arg_apply_single ;
+
+        // Do not proceed until initialization is written to memory
+        Kokkos::memory_fence();
+      }
+      return t ;
+    }
+
+public:
+
+  // Valid team sizes are 1,
+  // Threads::pool_size(1) == threads per numa, or
+  // Threads::pool_size(2) == threads per core
+
+  TaskPolicy
+    ( const unsigned arg_task_max_count
+    , const unsigned arg_task_max_size
+    , const unsigned arg_task_default_dependence_capacity = 4
+    , const unsigned arg_task_team_size = 0 /* choose default */
+    );
+
+  KOKKOS_FUNCTION TaskPolicy() = default ;
+  KOKKOS_FUNCTION TaskPolicy( TaskPolicy && rhs ) = default ;
+  KOKKOS_FUNCTION TaskPolicy( const TaskPolicy & rhs ) = default ;
+  KOKKOS_FUNCTION TaskPolicy & operator = ( TaskPolicy && rhs ) = default ;
+  KOKKOS_FUNCTION TaskPolicy & operator = ( const TaskPolicy & rhs ) = default ;
+
+  //----------------------------------------
+  // Current value of the queue's m_count_alloc counter.
+  KOKKOS_INLINE_FUNCTION
+  int allocated_task_count() const { return m_policy->m_count_alloc ; }
+
+  //----------------------------------------
+  // Create serial-thread task
+
+  template< class FunctorType >
+  KOKKOS_INLINE_FUNCTION
+  Future< typename FunctorType::value_type , execution_space >
+  task_create( const FunctorType & functor
+             , const unsigned dependence_capacity = ~0u )
+    {
+      typedef typename FunctorType::value_type  value_type ;
+      typedef Impl::TaskMember< execution_space , value_type , FunctorType >  task_type ;
+
+      return Future< value_type , execution_space >(
+#if defined( KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST )
+        TaskPolicy::create< task_type , void >
+          ( functor
+          , & task_root_type::template apply_single< task_type , void >
+          , task_root_type::function_team_type(0)
+          , 0
+          , dependence_capacity
+          )
+#endif
+        );
+    }
+  // Same as task_create().
+  template< class FunctorType >
+  KOKKOS_INLINE_FUNCTION
+  Future< typename FunctorType::value_type , execution_space >
+  proc_create( const FunctorType & functor
+             , const unsigned dependence_capacity = ~0u )
+    { return task_create( functor , dependence_capacity ); }
+
+  // Create thread-team task
+
+  template< class FunctorType >
+  KOKKOS_INLINE_FUNCTION
+  Future< typename FunctorType::value_type , execution_space >
+  task_create_team( const FunctorType & functor
+                  , const unsigned dependence_capacity = ~0u )
+    {
+      typedef typename FunctorType::value_type  value_type ;
+      typedef Impl::TaskMember< execution_space , value_type , FunctorType >  task_type ;
+
+      return Future< value_type , execution_space >(
+#if defined( KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST )
+        TaskPolicy::create< task_type , void >
+          ( functor
+          , task_root_type::function_single_type(0)
+          , & task_root_type::template apply_team< task_type , void >
+          , Kokkos::Impl::FunctorTeamShmemSize< FunctorType >::
+              value( functor , m_policy->m_team_size )
+          , dependence_capacity
+          )
+#endif
+        );
+    }
+  // Same as task_create_team().
+  template< class FunctorType >
+  KOKKOS_INLINE_FUNCTION
+  Future< typename FunctorType::value_type , execution_space >
+  proc_create_team( const FunctorType & functor
+                  , const unsigned dependence_capacity = ~0u )
+    { return task_create_team( functor , dependence_capacity ); }
+  // Forward a dependence between two futures of this execution space to the queue (host only).
+  template< class A1 , class A2 , class A3 , class A4 >
+  KOKKOS_INLINE_FUNCTION
+  void add_dependence( const Future<A1,A2> & after
+                     , const Future<A3,A4> & before
+                     , typename std::enable_if
+                        < std::is_same< typename Future<A1,A2>::execution_space , execution_space >::value
+                          &&
+                          std::is_same< typename Future<A3,A4>::execution_space , execution_space >::value
+                        >::type * = 0
+                      ) const
+    {
+#if defined( KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST )
+      m_policy->add_dependence( after.m_task , before.m_task );
+#endif
+    }
+
+  //----------------------------------------
+  // Create a latch task whose counter starts at N; the Future is null when allocation fails.
+  Future< Latch , execution_space >
+  KOKKOS_INLINE_FUNCTION
+  create_latch( const int N ) const
+    {
+      task_root_type * const task = m_policy->allocate_task( sizeof(task_root_type) , 0 , 0 );
+      if ( task != 0 ) { // allocate_task can return null, exactly as create() expects
+        task->m_dep_size = N ; // Using m_dep_size for latch counter
+        task->m_state = TASK_STATE_WAITING ;
+      }
+      return Future< Latch , execution_space >( task );
+    }
+
+  //----------------------------------------
+  // Add a dependence of the task that owns task_functor on the 'before' future (host only).
+  template< class FunctorType , class A3 , class A4 >
+  KOKKOS_INLINE_FUNCTION
+  void add_dependence( FunctorType * task_functor
+                     , const Future<A3,A4> & before
+                     , typename std::enable_if
+                        < std::is_same< typename Future<A3,A4>::execution_space , execution_space >::value
+                        >::type * = 0
+                      ) const
+    {
+#if defined( KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST )
+      m_policy->add_dependence( get_task_root(task_functor) , before.m_task );
+#endif
+    }
+  // Pick the team or serial queue (slot 0 if priority, else 1) and schedule; a null future is returned as is.
+  template< class ValueType >
+  const Future< ValueType , execution_space > &
+    spawn( const Future< ValueType , execution_space > & f
+         , const bool priority = false ) const
+      {
+        if ( f.m_task ) {
+#if defined( KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST )
+          f.m_task->m_queue =
+            ( f.m_task->m_team != 0
+            ? & ( m_policy->m_team[   priority ? 0 : 1 ] )
+            : & ( m_policy->m_serial[ priority ? 0 : 1 ] ) );
+          m_policy->schedule_task( f.m_task );
+#endif
+        }
+        return f ;
+      }
+  // Same queue selection as spawn(), but the owning task is handed to reschedule_task().
+  template< class FunctorType >
+  KOKKOS_INLINE_FUNCTION
+  void respawn( FunctorType * task_functor
+              , const bool priority = false ) const
+    {
+#if defined( KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST )
+      task_root_type * const t = get_task_root(task_functor);
+      t->m_queue =
+        ( t->m_team != 0 ? & ( m_policy->m_team[   priority ? 0 : 1 ] )
+                         : & ( m_policy->m_serial[ priority ? 0 : 1 ] ) );
+      m_policy->reschedule_task( t );
+#endif
+    }
+
+  // When a create method fails by returning a null Future
+  // the task that called the create method may respawn
+  // with a dependence on memory becoming available.
+  // This is a race as more than one task may be respawned
+  // with this need.
+
+  template< class FunctorType >
+  KOKKOS_INLINE_FUNCTION
+  void respawn_needing_memory( FunctorType * task_functor ) const
+    {
+#if defined( KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST )
+      task_root_type * const t = get_task_root(task_functor);
+      t->m_queue =
+        ( t->m_team != 0 ? & ( m_policy->m_team[   2 ] )
+                         : & ( m_policy->m_serial[ 2 ] ) );
+      m_policy->reschedule_task( t );
+#endif
+    }
+
+  //----------------------------------------
+  // Functions for an executing task functor to query dependences,
+  // set new dependences, and respawn itself.
+
+  template< class FunctorType >
+  KOKKOS_INLINE_FUNCTION
+  Future< void , execution_space >
+  get_dependence( const FunctorType * task_functor , int i ) const
+    {
+      return Future<void,execution_space>(
+#if defined( KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST )
+        get_task_root(task_functor)->get_dependence(i)
+#endif
+        );
+    }
+  // Number of dependences of the owning task; always 0 outside the host memory space.
+  template< class FunctorType >
+  KOKKOS_INLINE_FUNCTION
+  int get_dependence( const FunctorType * task_functor ) const
+#if defined( KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST )
+    { return get_task_root(task_functor)->get_dependence(); }
+#else
+    { return 0 ; }
+#endif
+  // Forwards to the owning task's clear_dependence(); a no-op outside the host memory space.
+  template< class FunctorType >
+  KOKKOS_INLINE_FUNCTION
+  void clear_dependence( FunctorType * task_functor ) const
+#if defined( KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST )
+    { get_task_root(task_functor)->clear_dependence(); }
+#else
+    {}
+#endif
+
+  //----------------------------------------
+
+  static member_type & member_single();
+
+  friend void wait( TaskPolicy< Kokkos::Threads > & );
+};
+
+} /* namespace Experimental */
+} /* namespace Kokkos */
+
+//----------------------------------------------------------------------------
+
+#endif /* #if defined( KOKKOS_HAVE_PTHREAD ) && defined( KOKKOS_ENABLE_TASKPOLICY ) */
+#endif /* #ifndef KOKKOS_THREADS_TASKPOLICY_HPP */
+
+
diff --git a/lib/kokkos/core/src/impl/CMakeLists.txt b/lib/kokkos/core/src/impl/CMakeLists.txt
new file mode 100644
index 0000000000000000000000000000000000000000..c543194de3993015f6940506c0ff51da157f2084
--- /dev/null
+++ b/lib/kokkos/core/src/impl/CMakeLists.txt
@@ -0,0 +1,18 @@
+
+SET(HEADERS "")
+SET(SOURCES "")
+# Collect every header and source file of this directory.
+FILE(GLOB HEADERS *.hpp)
+FILE(GLOB SOURCES *.cpp)
+# Declare the kokkoscore_impl library; the headers are passed as NOINSTALLHEADERS.
+TRIBITS_ADD_LIBRARY(
+    kokkoscore_impl
+    NOINSTALLHEADERS ${HEADERS}
+    SOURCES ${SOURCES}
+    DEPLIBS 
+    )
+# Install the globbed headers into the impl/ subdirectory of the project include directory.
+SET(TRILINOS_INCDIR ${CMAKE_INSTALL_PREFIX}/${${PROJECT_NAME}_INSTALL_INCLUDE_DIR})
+
+INSTALL(FILES ${HEADERS} DESTINATION ${TRILINOS_INCDIR}/impl/)
+
diff --git a/lib/kokkos/core/src/impl/KokkosExp_SharedAlloc.cpp b/lib/kokkos/core/src/impl/KokkosExp_SharedAlloc.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..96b37043455e51d726e1d22e4f3e450986acae01
--- /dev/null
+++ b/lib/kokkos/core/src/impl/KokkosExp_SharedAlloc.cpp
@@ -0,0 +1,346 @@
+/*
+//@HEADER
+// ************************************************************************
+// 
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+// 
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+// 
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact  H. Carter Edwards (hcedwar@sandia.gov)
+// 
+// ************************************************************************
+//@HEADER
+*/
+
+#include <Kokkos_Core.hpp>
+
+namespace Kokkos {
+namespace Experimental {
+namespace Impl {
+
+// Tracking flag: starts at 1 (enabled); a claiming host thread swaps it to 0.
+int SharedAllocationRecord< void , void >::s_tracking_enabled = 1 ;
+void SharedAllocationRecord< void , void >::tracking_claim_and_disable()
+{
+  // Spin until this thread is the one that flips the flag from 1 to 0.
+  bool claimed = false ;
+  do { claimed = Kokkos::atomic_compare_exchange_strong( & s_tracking_enabled, 1, 0 ); } while ( ! claimed );
+}
+
+void SharedAllocationRecord< void , void >::tracking_release_and_enable()
+{
+  // Re-enable tracking by flipping the flag from 0 back to 1; when the
+  // exchange fails the flag was not 0 and the failure is reported.
+  const bool released = Kokkos::atomic_compare_exchange_strong( & s_tracking_enabled, 0, 1 );
+
+  if ( released ) { return ; }
+  Kokkos::Impl::throw_runtime_exception("Kokkos::Experimental::Impl::SharedAllocationRecord<>::tracking_release_and_enable FAILED, this host process thread did not hold the lock" );
+}
+
+//----------------------------------------------------------------------------
+
+bool
+SharedAllocationRecord< void , void >::
+is_sane( SharedAllocationRecord< void , void > * arg_record )
+{
+  // Check the tracking list that arg_record belongs to: the root must have a
+  // zero use count and every record's links must agree with its neighbours.
+  constexpr static SharedAllocationRecord * zero = 0 ;
+
+  SharedAllocationRecord * const root = arg_record ? arg_record->m_root : 0 ;
+
+  bool ok = root != 0 && root->use_count() == 0 ;
+
+  if ( ok ) {
+    SharedAllocationRecord * root_next = 0 ;
+
+    // Lock the list:
+    while ( ( root_next = Kokkos::atomic_exchange( & root->m_next , zero ) ) == zero );
+
+    for ( SharedAllocationRecord * rec = root_next ; ok && rec != root ; rec = rec->m_next ) {
+      const bool ok_non_null  = rec && rec->m_prev && ( rec == root || rec->m_next );
+      const bool ok_root      = ok_non_null && rec->m_root == root ;
+      const bool ok_prev_next = ok_non_null && ( rec->m_prev != root ? rec->m_prev->m_next == rec : root_next == rec );
+      const bool ok_next_prev = ok_non_null && rec->m_next->m_prev == rec ;
+      const bool ok_count     = ok_non_null && 0 <= rec->use_count() ;
+
+      ok = ok_root && ok_prev_next && ok_next_prev && ok_count ;
+      if ( ! ok ) {
+        // A record that failed validation may be null or have null links, so
+        // gather the values to report without dereferencing a null pointer.
+        SharedAllocationRecord * const r_root = rec ? rec->m_root : zero ;
+        SharedAllocationRecord * const r_next = rec ? rec->m_next : zero ;
+        SharedAllocationRecord * const r_prev = rec ? rec->m_prev : zero ;
+        SharedAllocationRecord * const r_next_prev = r_next ? r_next->m_prev : zero ;
+        SharedAllocationRecord * const r_prev_next = r_prev ? ( r_prev != r_root ? r_prev->m_next : root_next ) : zero ;
+        // Pointers are widened to unsigned long long so that a single format
+        // string is correct whatever the width of uintptr_t.
+        fprintf(stderr
+              , "Kokkos::Experimental::Impl::SharedAllocationRecord failed is_sane: rec(0x%.12llx){ m_count(%d) m_root(0x%.12llx) m_next(0x%.12llx) m_prev(0x%.12llx) m_next->m_prev(0x%.12llx) m_prev->m_next(0x%.12llx) }\n"
+              , static_cast< unsigned long long >( reinterpret_cast< uintptr_t >( rec ) )
+              , rec ? rec->use_count() : 0
+              , static_cast< unsigned long long >( reinterpret_cast< uintptr_t >( r_root ) )
+              , static_cast< unsigned long long >( reinterpret_cast< uintptr_t >( r_next ) )
+              , static_cast< unsigned long long >( reinterpret_cast< uintptr_t >( r_prev ) )
+              , static_cast< unsigned long long >( reinterpret_cast< uintptr_t >( r_next_prev ) )
+              , static_cast< unsigned long long >( reinterpret_cast< uintptr_t >( r_prev_next ) )
+              );
+        break ; // never advance through a record that failed validation
+      }
+    }
+
+    if ( zero != Kokkos::atomic_exchange( & root->m_next , root_next ) ) {
+      Kokkos::Impl::throw_runtime_exception("Kokkos::Experimental::Impl::SharedAllocationRecord failed is_sane unlocking");
+    }
+  }
+
+  return ok ;
+}
+
+SharedAllocationRecord<void,void> *
+SharedAllocationRecord<void,void>::find( SharedAllocationRecord<void,void> * const arg_root , void * const arg_data_ptr )
+{
+  // Search the list rooted at arg_root for the record whose data() equals
+  // arg_data_ptr; the result is null when the walk returns to the root.
+  constexpr static SharedAllocationRecord * zero = 0 ;
+
+  // Acquire the list: spin until the exchange yields a non-zero head.
+  SharedAllocationRecord * head = zero ;
+  do { head = Kokkos::atomic_exchange( & arg_root->m_next , zero ); } while ( head == zero );
+
+  SharedAllocationRecord * match = zero ;
+  for ( SharedAllocationRecord * cur = head ; cur != arg_root ; cur = cur->m_next ) {
+    // data() is read for each visited record until one matches.
+    if ( cur->data() == arg_data_ptr ) { match = cur ; break ; }
+  }
+
+  // Release the list by restoring the head; the slot must still hold zero.
+  if ( zero != Kokkos::atomic_exchange( & arg_root->m_next , head ) ) {
+    Kokkos::Impl::throw_runtime_exception("Kokkos::Experimental::Impl::SharedAllocationRecord failed locking/unlocking");
+  }
+
+  return match ;
+}
+
+
+/**\brief  Construct a record and insert it right after 'arg_root' in the tracking list.
+ *         The use count starts at zero; a NULL arg_alloc_ptr is reported through throw_runtime_exception.
+ */
+SharedAllocationRecord< void , void >::
+SharedAllocationRecord( SharedAllocationRecord<void,void> * arg_root
+                      , SharedAllocationHeader            * arg_alloc_ptr
+                      , size_t                              arg_alloc_size
+                      , SharedAllocationRecord< void , void >::function_type  arg_dealloc
+                      )
+  : m_alloc_ptr(  arg_alloc_ptr )
+  , m_alloc_size( arg_alloc_size )
+  , m_dealloc(    arg_dealloc )
+  , m_root( arg_root )
+  , m_prev( 0 )
+  , m_next( 0 )
+  , m_count( 0 )
+{
+  constexpr static SharedAllocationRecord * zero = 0 ;
+  // Only a record with a non-null allocation is linked into the list.
+  if ( 0 != arg_alloc_ptr ) {
+
+    // Insert into the root double-linked list for tracking
+    //
+    // before:  arg_root->m_next == next ; next->m_prev == arg_root
+    // after:   arg_root->m_next == this ; this->m_prev == arg_root ;
+    //              this->m_next == next ; next->m_prev == this
+
+    m_prev = m_root ;
+
+    // Read root->m_next and lock by setting to zero
+    while ( ( m_next = Kokkos::atomic_exchange( & m_root->m_next , zero ) ) == zero );
+    // The old first record now points back at this record.
+    m_next->m_prev = this ;
+
+    // memory fence before completing insertion into linked list
+    Kokkos::memory_fence();
+    // Publishing 'this' as root->m_next also unlocks; the slot must still hold zero.
+    if ( zero != Kokkos::atomic_exchange( & m_root->m_next , this ) ) {
+      Kokkos::Impl::throw_runtime_exception("Kokkos::Experimental::Impl::SharedAllocationRecord failed locking/unlocking");
+    }
+  }
+  else {
+    Kokkos::Impl::throw_runtime_exception("Kokkos::Experimental::Impl::SharedAllocationRecord given NULL allocation");
+  }
+}
+
+void
+SharedAllocationRecord< void , void >::
+increment( SharedAllocationRecord< void , void > * arg_record )
+{
+  // Atomically add one to the use count; a negative prior count is an error.
+  const int previous = Kokkos::atomic_fetch_add( & arg_record->m_count , 1 );
+
+  if ( 0 <= previous ) { return ; }
+  Kokkos::Impl::throw_runtime_exception("Kokkos::Experimental::Impl::SharedAllocationRecord failed increment");
+}
+
+SharedAllocationRecord< void , void > *
+SharedAllocationRecord< void , void >::
+decrement( SharedAllocationRecord< void , void > * arg_record )
+{
+  constexpr static SharedAllocationRecord * zero = 0 ;
+  // Atomically subtract one; old_count is the use count before this call.
+  const int old_count = Kokkos::atomic_fetch_add( & arg_record->m_count , -1 );
+
+#if 0
+  if ( old_count <= 1 ) {
+    fprintf(stderr,"Kokkos::Experimental::Impl::SharedAllocationRecord '%s' at 0x%lx delete count = %d\n", arg_record->m_alloc_ptr->m_label , (unsigned long) arg_record , old_count );
+    fflush(stderr);
+  }
+#endif
+
+  // Last reference released: unlink the record from the tracking list, then deallocate it.
+  if ( old_count == 1 ) {
+
+    // before:  arg_record->m_prev->m_next == arg_record  &&
+    //          arg_record->m_next->m_prev == arg_record
+    //
+    // after:   arg_record->m_prev->m_next == arg_record->m_next  &&
+    //          arg_record->m_next->m_prev == arg_record->m_prev
+
+    SharedAllocationRecord * root_next = 0 ;
+
+    // Lock the list:
+    while ( ( root_next = Kokkos::atomic_exchange( & arg_record->m_root->m_next , zero ) ) == zero );
+
+    arg_record->m_next->m_prev = arg_record->m_prev ;
+
+    if ( root_next != arg_record ) {
+      arg_record->m_prev->m_next = arg_record->m_next ;
+    }
+    else {
+      // before:  arg_record->m_root == arg_record->m_prev
+      // after:   arg_record->m_root == arg_record->m_next
+      root_next = arg_record->m_next ; 
+    }
+
+    // Unlock the list:
+    if ( zero != Kokkos::atomic_exchange( & arg_record->m_root->m_next , root_next ) ) {
+      Kokkos::Impl::throw_runtime_exception("Kokkos::Experimental::Impl::SharedAllocationRecord failed decrement unlocking");
+    }
+
+    arg_record->m_next = 0 ;
+    arg_record->m_prev = 0 ;
+    // Call the stored deallocation function; the caller then receives a null record.
+    function_type d = arg_record->m_dealloc ;
+    (*d)( arg_record );
+    arg_record = 0 ;
+  }
+  else if ( old_count < 1 ) { // Error
+    fprintf(stderr,"Kokkos::Experimental::Impl::SharedAllocationRecord '%s' failed decrement count = %d\n", arg_record->m_alloc_ptr->m_label , old_count );
+    fflush(stderr);
+    Kokkos::Impl::throw_runtime_exception("Kokkos::Experimental::Impl::SharedAllocationRecord failed decrement count");
+  }
+  // Returns the record while it is still referenced, or 0 once it has been deallocated.
+  return arg_record ;
+}
+
+/**\brief  Print every record of a tracking list to the stream 's'.
+ *
+ *  Each record is formatted into a 256 character buffer with snprintf, so an
+ *  over-long line is truncated rather than overflowing.
+ *
+ *  \param s           stream that receives the text
+ *  \param space_name  text printed at the start of every line
+ *  \param root        record at which the circular walk starts and ends
+ *  \param detail      true: also print list links, use count and the
+ *                     deallocation function; false: "[ address + size ] label"
+ */
+void
+SharedAllocationRecord< void , void >::
+print_host_accessible_records( std::ostream & s
+                             , const char * const space_name
+                             , const SharedAllocationRecord * const root
+                             , const bool detail )
+{
+  // Pointers are widened to unsigned long long and sizes converted to long so
+  // that each fixed format string matches its arguments whatever the widths
+  // of uintptr_t and size_t are.
+  typedef unsigned long long ull ;
+
+  const SharedAllocationRecord< void , void > * r = root ;
+
+  char buffer[256] ;
+
+  do {
+    // A record may have no allocation (the short form prints "[ 0 + 0 ]" for
+    // it), in which case there is no header to read a label from.
+    const char * const label = r->m_alloc_ptr ? r->m_alloc_ptr->m_label : "" ;
+
+    if ( detail ) {
+      // Long form: record address, list links, extent, use count, deallocator.
+      snprintf( buffer , 256
+              , "%s addr( 0x%.12llx ) list( 0x%.12llx 0x%.12llx ) extent[ 0x%.12llx + %.8ld ] count(%d) dealloc(0x%.12llx) %s\n"
+              , space_name
+              , ull( reinterpret_cast<uintptr_t>( r ) )
+              , ull( reinterpret_cast<uintptr_t>( r->m_prev ) )
+              , ull( reinterpret_cast<uintptr_t>( r->m_next ) )
+              , ull( reinterpret_cast<uintptr_t>( r->m_alloc_ptr ) )
+              , static_cast<long>( r->m_alloc_size )
+              , r->use_count()
+              , ull( reinterpret_cast<uintptr_t>( r->m_dealloc ) )
+              , label
+              );
+    }
+    else if ( r->m_alloc_ptr ) {
+      // Short form: data address, size and label.
+      snprintf( buffer , 256
+              , "%s [ 0x%.12llx + %ld ] %s\n"
+              , space_name
+              , ull( reinterpret_cast< uintptr_t >( r->data() ) )
+              , static_cast<long>( r->size() )
+              , label
+              );
+    }
+    else {
+      // Short form of a record without an allocation.
+      snprintf( buffer , 256 , "%s [ 0 + 0 ]\n" , space_name );
+    }
+
+    // Write to the caller-supplied stream rather than to std::cout.
+    s << buffer ;
+
+    r = r->m_next ;
+  } while ( r != root );
+}
+
+} /* namespace Impl */
+} /* namespace Experimental */
+} /* namespace Kokkos */
+
+
diff --git a/lib/kokkos/core/src/impl/KokkosExp_SharedAlloc.hpp b/lib/kokkos/core/src/impl/KokkosExp_SharedAlloc.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..1498eafb008ffa5d26a84094df9ba3f48126551e
--- /dev/null
+++ b/lib/kokkos/core/src/impl/KokkosExp_SharedAlloc.hpp
@@ -0,0 +1,400 @@
+/*
+//@HEADER
+// ************************************************************************
+// 
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+// 
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+// 
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact  H. Carter Edwards (hcedwar@sandia.gov)
+// 
+// ************************************************************************
+//@HEADER
+*/
+
+#ifndef KOKKOS_SHARED_ALLOC_HPP_
+#define KOKKOS_SHARED_ALLOC_HPP_
+
+#include <stdint.h>
+#include <string>
+
+namespace Kokkos {
+namespace Experimental {
+namespace Impl {
+
+template< class MemorySpace = void , class DestroyFunctor = void >
+class SharedAllocationRecord ;
+
+class SharedAllocationHeader {
+private:
+
+  using Record = SharedAllocationRecord<void,void> ;
+
+  // Label capacity: 128 bytes less the size of the record pointer stored
+  // alongside the label.
+  static constexpr unsigned maximum_label_length = 128u - sizeof(Record*);
+
+  template< class , class > friend class SharedAllocationRecord ;
+
+  Record * m_record ;
+  char     m_label[ maximum_label_length ];
+
+public:
+
+  /* Step back by one header from the start of the user memory
+   * to obtain the header that precedes it.
+   */
+  KOKKOS_INLINE_FUNCTION static
+  const SharedAllocationHeader * get_header( void * alloc_ptr )
+    {
+      char * const user_begin = reinterpret_cast<char*>( alloc_ptr );
+      return reinterpret_cast<SharedAllocationHeader*>( user_begin - sizeof(SharedAllocationHeader) );
+    }
+};
+
+/* Base record type shared by all memory-space-specific records.
+ * Holds the allocation pointer and size, the deallocation callback,
+ * the links of a circular doubly-linked list of records, and the use count.
+ */
+template<>
+class SharedAllocationRecord< void , void > {
+protected:
+
+  // data() and get_header() rely on the header having a fixed size.
+  static_assert( sizeof(SharedAllocationHeader) == ( 1u << 7 /* 128 */ ) , "sizeof(SharedAllocationHeader) != 128" );
+
+  template< class , class > friend class SharedAllocationRecord ;
+
+  /* Signature of the callback stored in 'm_dealloc' */
+  typedef void (* function_type )( SharedAllocationRecord<void,void> * );
+
+  /* Flag reported by tracking_enabled() */
+  static int s_tracking_enabled ;
+
+  SharedAllocationHeader * const m_alloc_ptr ;   // start of the allocation (header first)
+  size_t                   const m_alloc_size ;  // size in bytes, header included (see size())
+  function_type            const m_dealloc ;     // deallocation callback
+  SharedAllocationRecord * const m_root ;        // root record of the list this record belongs to
+  SharedAllocationRecord *       m_prev ;        // previous record in the list
+  SharedAllocationRecord *       m_next ;        // next record in the list
+  int                            m_count ;       // use count
+
+  /* Records are neither copyable nor movable */
+  SharedAllocationRecord( SharedAllocationRecord && ) = delete ;
+  SharedAllocationRecord( const SharedAllocationRecord & ) = delete ;
+  SharedAllocationRecord & operator = ( SharedAllocationRecord && ) = delete ;
+  SharedAllocationRecord & operator = ( const SharedAllocationRecord & ) = delete ;
+
+  /**\brief  Construct and insert into 'arg_root' tracking set.
+   *         use_count is zero.
+   */
+  SharedAllocationRecord( SharedAllocationRecord * arg_root
+                        , SharedAllocationHeader * arg_alloc_ptr
+                        , size_t                   arg_alloc_size
+                        , function_type            arg_dealloc
+                        );
+
+public:
+
+  /**\brief  Current value of the shared allocation tracking flag. */
+  static int tracking_enabled() { return s_tracking_enabled ; }
+
+  /**\brief A host process thread claims and disables the
+   *        shared allocation tracking flag.
+   */
+  static void tracking_claim_and_disable();
+
+  /**\brief A host process thread releases and enables the
+   *        shared allocation tracking flag.
+   */
+  static void tracking_release_and_enable();
+
+  ~SharedAllocationRecord() = default ;
+
+  /**\brief  Construct a record with no allocation and no deallocation
+   *         callback; it is its own root and its list links refer to itself.
+   */
+  SharedAllocationRecord()
+    : m_alloc_ptr( 0 )
+    , m_alloc_size( 0 )
+    , m_dealloc( 0 )
+    , m_root( this )
+    , m_prev( this )
+    , m_next( this )
+    , m_count( 0 )
+    {}
+
+  static constexpr unsigned maximum_label_length = SharedAllocationHeader::maximum_label_length ;
+
+  /* Header located at the start of the allocation */
+  KOKKOS_INLINE_FUNCTION
+  const SharedAllocationHeader * head() const { return m_alloc_ptr ; }
+
+  /* User's memory begins at the end of the header */
+  KOKKOS_INLINE_FUNCTION
+  void * data() const { return reinterpret_cast<void*>( m_alloc_ptr + 1 ); }
+
+  /* Size of the user's memory: the allocation size less the header.
+   * NOTE(review): for a record with m_alloc_size == 0 (e.g., default
+   * constructed) this unsigned subtraction wraps around.
+   */
+  size_t size() const { return m_alloc_size - sizeof(SharedAllocationHeader) ; }
+
+  /* Not 'constexpr': 'm_count' is read through a pointer to volatile */
+  int use_count() const { return *static_cast<const volatile int *>(&m_count); }
+
+  /* Increment use count */
+  static void increment( SharedAllocationRecord * );
+
+  /* Decrement use count. If 1->0 then remove from the tracking list and invoke m_dealloc */
+  static SharedAllocationRecord * decrement( SharedAllocationRecord * );
+
+  /* Given a root record and data pointer find the record */
+  static SharedAllocationRecord * find( SharedAllocationRecord * const , void * const );
+
+  /*  Sanity check for the whole set of records to which the input record belongs.
+   *  Locks the set's insert/erase operations until the sanity check is complete.
+   */
+  static bool is_sane( SharedAllocationRecord * );
+
+  /*  Print host-accessible records */
+  static void print_host_accessible_records( std::ostream &
+                                           , const char * const space_name
+                                           , const SharedAllocationRecord * const root
+                                           , const bool detail );
+};
+
+namespace {
+
+/* The address of this function template is taken, so make sure it is unique.
+ * It destroys the shared allocation through the record's functor and then
+ * deletes the record itself.
+ */
+template < class MemorySpace , class DestroyFunctor >
+void deallocate( SharedAllocationRecord<void,void> * record_ptr )
+{
+  using space_record   = SharedAllocationRecord< MemorySpace , void > ;
+  using functor_record = SharedAllocationRecord< MemorySpace , DestroyFunctor > ;
+
+  // Downcast in two steps: base record -> memory space record -> full record.
+  space_record   * const as_space = static_cast< space_record * >( record_ptr );
+  functor_record * const self     = static_cast< functor_record * >( as_space );
+
+  self->m_destroy.destroy_shared_allocation();
+
+  delete self ;
+}
+
+}
+
+/*
+ *  Memory space specialization of SharedAllocationRecord< Space , void > requires :
+ *
+ *  SharedAllocationRecord< Space , void > : public SharedAllocationRecord< void , void >
+ *  {
+ *    // delete allocated user memory via static_cast to this type.
+ *    static void deallocate( const SharedAllocationRecord<void,void> * );
+ *    Space m_space ;
+ *  }
+ */
+template< class MemorySpace , class DestroyFunctor >
+class SharedAllocationRecord : public SharedAllocationRecord< MemorySpace , void >
+{
+private:
+
+  // Forwards to the memory space record, which receives the space, label,
+  // size and the matching 'deallocate' instantiation as its callback.
+  // User memory is allocated as [ SharedAllocationHeader , user_memory ].
+  SharedAllocationRecord( const MemorySpace & arg_space
+                        , const std::string & arg_label
+                        , const size_t        arg_alloc
+                        )
+    : SharedAllocationRecord< MemorySpace , void >( arg_space , arg_label , arg_alloc , & Kokkos::Experimental::Impl::deallocate< MemorySpace , DestroyFunctor > )
+    , m_destroy()
+    {}
+
+  SharedAllocationRecord() = delete ;
+  SharedAllocationRecord( const SharedAllocationRecord & ) = delete ;
+  SharedAllocationRecord & operator = ( const SharedAllocationRecord & ) = delete ;
+
+public:
+
+  // Functor invoked by 'deallocate' before the record is deleted.
+  DestroyFunctor  m_destroy ;
+
+  // Allocate with a zero use count.  Incrementing the use count from zero to one
+  // inserts the record into the tracking list.  Decrementing the count from one to zero
+  // removes from the tracking list and deallocates.
+  // Returns a null pointer when not compiled for the host execution space.
+  KOKKOS_INLINE_FUNCTION static
+  SharedAllocationRecord * allocate( const MemorySpace & arg_space
+                                   , const std::string & arg_label
+                                   , const size_t        arg_alloc
+                                   )
+    {
+      SharedAllocationRecord * new_record = 0 ;
+#if defined( KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST )
+      new_record = new SharedAllocationRecord( arg_space , arg_label , arg_alloc );
+#endif
+      return new_record ;
+    }
+};
+
+/* Handle that optionally participates in reference counting of a
+ * SharedAllocationRecord.  The record pointer and an integer view of the same
+ * bits share storage; the lowest bit (DO_NOT_DEREF_FLAG) marks a handle whose
+ * record must not be incremented or decremented by this handle.
+ */
+union SharedAllocationTracker {
+private:
+
+  typedef SharedAllocationRecord<void,void>  Record ;
+
+  enum : uintptr_t { DO_NOT_DEREF_FLAG = 0x01ul };
+
+  // The allocation record resides in Host memory space
+  uintptr_t m_record_bits ;
+  Record  * m_record ;
+
+public:
+
+  // Use macros instead of inline functions to reduce
+  // pressure on compiler optimization by reducing
+  // number of symbols and inline functions.
+  //
+  //   ENABLED         : is tracking currently enabled
+  //   INCREMENT       : increment the record currently held, unless flagged
+  //   DECREMENT       : decrement the record currently held, unless flagged
+  //   RELEASE( BITS ) : decrement the record encoded in a previously saved
+  //                     value of m_record_bits, unless flagged
+
+#if defined( KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST )
+
+#define KOKKOS_SHARED_ALLOCATION_TRACKER_ENABLED \
+  Record::tracking_enabled()
+
+#define KOKKOS_SHARED_ALLOCATION_TRACKER_INCREMENT \
+  if ( ! ( m_record_bits & DO_NOT_DEREF_FLAG ) ) Record::increment( m_record );
+
+#define KOKKOS_SHARED_ALLOCATION_TRACKER_DECREMENT \
+  if ( ! ( m_record_bits & DO_NOT_DEREF_FLAG ) ) Record::decrement( m_record );
+
+#define KOKKOS_SHARED_ALLOCATION_TRACKER_RELEASE( BITS ) \
+  if ( ! ( ( BITS ) & DO_NOT_DEREF_FLAG ) ) Record::decrement( reinterpret_cast<Record*>( BITS ) );
+
+#else
+
+#define KOKKOS_SHARED_ALLOCATION_TRACKER_ENABLED  0
+
+#define KOKKOS_SHARED_ALLOCATION_TRACKER_INCREMENT /* */
+
+#define KOKKOS_SHARED_ALLOCATION_TRACKER_DECREMENT /* */
+
+#define KOKKOS_SHARED_ALLOCATION_TRACKER_RELEASE( BITS ) (void)( BITS );
+
+#endif
+
+  /** \brief  Assign a specialized record.
+   *
+   *  A non-null record is stored and its use count incremented; a null record
+   *  leaves this handle flagged as not tracking.  The previous content of
+   *  this handle is overwritten without being decremented.
+   */
+  inline
+  void assign_allocated_record_to_uninitialized( Record * arg_record )
+    {
+      if ( arg_record ) {
+        Record::increment( m_record = arg_record );
+      }
+      else {
+        m_record_bits = DO_NOT_DEREF_FLAG ;
+      }
+    }
+
+  /** \brief  Held record viewed as the record type of 'MemorySpace'.
+   *          The pointer is used as stored; the flag bit is not checked.
+   */
+  template< class MemorySpace >
+  constexpr
+  SharedAllocationRecord< MemorySpace , void > &
+  get_record() const
+    { return * static_cast< SharedAllocationRecord< MemorySpace , void > * >( m_record ); }
+
+  /** \brief  Label of the held record, or an empty string when flagged. */
+  template< class MemorySpace >
+  std::string get_label() const
+    {
+      return ( m_record_bits & DO_NOT_DEREF_FLAG )
+             ? std::string()
+             : static_cast< SharedAllocationRecord< MemorySpace , void > * >( m_record )->get_label()
+             ;
+    }
+
+  /** \brief  Use count of the record with the flag bit masked off;
+   *          zero if there is no record or when not compiled for the host.
+   */
+  KOKKOS_INLINE_FUNCTION
+  int use_count() const
+    {
+#if defined( KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST )
+      Record * const tmp = reinterpret_cast<Record*>( m_record_bits & ~DO_NOT_DEREF_FLAG );
+      return ( tmp ? tmp->use_count() : 0 );
+#else
+      return 0 ;
+#endif
+    }
+
+  /** \brief  Decrement the held record unless flagged. */
+  KOKKOS_FORCEINLINE_FUNCTION
+  ~SharedAllocationTracker()
+    { KOKKOS_SHARED_ALLOCATION_TRACKER_DECREMENT }
+
+  /** \brief  Default: no record, flagged as not tracking. */
+  KOKKOS_FORCEINLINE_FUNCTION
+  constexpr SharedAllocationTracker()
+    : m_record_bits( DO_NOT_DEREF_FLAG ) {}
+
+  // Move:
+
+  /** \brief  Take over the bits of 'rhs' and reset 'rhs' to the default value. */
+  KOKKOS_FORCEINLINE_FUNCTION
+  SharedAllocationTracker( SharedAllocationTracker && rhs )
+    : m_record_bits( rhs.m_record_bits )
+    { rhs.m_record_bits = DO_NOT_DEREF_FLAG ; }
+
+  KOKKOS_FORCEINLINE_FUNCTION
+  SharedAllocationTracker & operator = ( SharedAllocationTracker && rhs )
+    {
+      // If this is tracking then must decrement
+      KOKKOS_SHARED_ALLOCATION_TRACKER_DECREMENT
+      // Move and reset RHS to default constructed value.
+      m_record_bits = rhs.m_record_bits ;
+      rhs.m_record_bits = DO_NOT_DEREF_FLAG ;
+      return *this ;
+    }
+
+  // Copy:
+
+  /** \brief  Copy; the copy is flagged as not tracking when tracking is disabled. */
+  KOKKOS_FORCEINLINE_FUNCTION
+  SharedAllocationTracker( const SharedAllocationTracker & rhs )
+    : m_record_bits( KOKKOS_SHARED_ALLOCATION_TRACKER_ENABLED
+                   ? rhs.m_record_bits
+                   : rhs.m_record_bits | DO_NOT_DEREF_FLAG )
+    {
+      KOKKOS_SHARED_ALLOCATION_TRACKER_INCREMENT
+    }
+
+  /** \brief  Copy construction may disable tracking. */
+  KOKKOS_FORCEINLINE_FUNCTION
+  SharedAllocationTracker( const SharedAllocationTracker & rhs
+                         , const bool enable_tracking )
+    : m_record_bits( KOKKOS_SHARED_ALLOCATION_TRACKER_ENABLED
+                     && enable_tracking
+                   ? rhs.m_record_bits
+                   : rhs.m_record_bits | DO_NOT_DEREF_FLAG )
+    { KOKKOS_SHARED_ALLOCATION_TRACKER_INCREMENT }
+
+  /** \brief  Copy assignment.
+   *
+   *  The incoming record is acquired before the previously held record is
+   *  released.  Releasing first would, on self-assignment with a use count
+   *  of one, deallocate the record and then increment the freed record.
+   */
+  KOKKOS_FORCEINLINE_FUNCTION
+  SharedAllocationTracker & operator = ( const SharedAllocationTracker & rhs )
+    {
+      // Remember what this handle held so it can be released afterwards.
+      const uintptr_t old_bits = m_record_bits ;
+      m_record_bits = KOKKOS_SHARED_ALLOCATION_TRACKER_ENABLED
+                    ? rhs.m_record_bits
+                    : rhs.m_record_bits | DO_NOT_DEREF_FLAG ;
+      KOKKOS_SHARED_ALLOCATION_TRACKER_INCREMENT
+      // If this was tracking then must decrement
+      KOKKOS_SHARED_ALLOCATION_TRACKER_RELEASE( old_bits )
+      return *this ;
+    }
+
+  /** \brief  Copy assignment may disable tracking.
+   *
+   *  As for copy assignment, the incoming record is acquired before the
+   *  previously held record is released so that self-assignment is safe.
+   */
+  KOKKOS_FORCEINLINE_FUNCTION
+  void assign( const SharedAllocationTracker & rhs
+             , const bool enable_tracking )
+    {
+      // Remember what this handle held so it can be released afterwards.
+      const uintptr_t old_bits = m_record_bits ;
+      m_record_bits = KOKKOS_SHARED_ALLOCATION_TRACKER_ENABLED
+                      && enable_tracking
+                    ? rhs.m_record_bits
+                    : rhs.m_record_bits | DO_NOT_DEREF_FLAG ;
+      KOKKOS_SHARED_ALLOCATION_TRACKER_INCREMENT
+      // If this was tracking then must decrement
+      KOKKOS_SHARED_ALLOCATION_TRACKER_RELEASE( old_bits )
+    }
+
+#undef KOKKOS_SHARED_ALLOCATION_TRACKER_ENABLED
+#undef KOKKOS_SHARED_ALLOCATION_TRACKER_INCREMENT
+#undef KOKKOS_SHARED_ALLOCATION_TRACKER_DECREMENT
+#undef KOKKOS_SHARED_ALLOCATION_TRACKER_RELEASE
+
+};
+
+
+} /* namespace Impl */
+} /* namespace Experimental */
+} /* namespace Kokkos */
+
+#endif
diff --git a/lib/kokkos/core/src/impl/KokkosExp_ViewArray.hpp b/lib/kokkos/core/src/impl/KokkosExp_ViewArray.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..17d28ace4dae471accfa91ab52629aee357850e9
--- /dev/null
+++ b/lib/kokkos/core/src/impl/KokkosExp_ViewArray.hpp
@@ -0,0 +1,606 @@
+/*
+//@HEADER
+// ************************************************************************
+// 
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+// 
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+// 
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact  H. Carter Edwards (hcedwar@sandia.gov)
+// 
+// ************************************************************************
+//@HEADER
+*/
+
+#ifndef KOKKOS_EXPERIMENTAL_VIEW_ARRAY_MAPPING_HPP
+#define KOKKOS_EXPERIMENTAL_VIEW_ARRAY_MAPPING_HPP
+
+#include <Kokkos_Array.hpp>
+
+namespace Kokkos {
+namespace Experimental {
+namespace Impl {
+
+/* Data analysis for a View whose value type is Kokkos::Array<V,N,P>:
+ * besides the usual value types it exposes "scalar array" data types in which
+ * the array length N is appended to the view's dimensions.
+ */
+template< class DataType , class ArrayLayout , class V , size_t N , class P >
+struct ViewDataAnalysis< DataType , ArrayLayout , Kokkos::Array<V,N,P> >
+{
+private:
+
+  using array_analysis = ViewArrayAnalysis<DataType> ;
+
+  static_assert( std::is_same<P,void>::value , "" );
+  static_assert( std::is_same<typename array_analysis::non_const_value_type , Kokkos::Array<V,N,P> >::value , "" );
+  static_assert( std::is_scalar<V>::value , "View of Array type must be of a scalar type" );
+
+public:
+
+  // Tag selecting the Array specialization of the view mapping
+  using specialize = Kokkos::Array<> ;
+
+  using dimension = typename array_analysis::dimension ;
+
+private:
+
+  // True when the analyzed value type is already const qualified
+  enum { is_const = std::is_same< typename array_analysis::value_type
+                                , typename array_analysis::const_value_type
+                                >::value };
+
+  // View dimensions with the array length appended
+  using array_scalar_dimension = typename dimension::template append<N>::type ;
+
+  using scalar_type           = typename std::conditional< is_const , const V , V >::type ;
+  using non_const_scalar_type = V ;
+  using const_scalar_type     = const V ;
+
+public:
+
+  using value_type           = typename array_analysis::value_type ;
+  using const_value_type     = typename array_analysis::const_value_type ;
+  using non_const_value_type = typename array_analysis::non_const_value_type ;
+
+  using type           = typename ViewDataType<           value_type , dimension >::type ;
+  using const_type     = typename ViewDataType<     const_value_type , dimension >::type ;
+  using non_const_type = typename ViewDataType< non_const_value_type , dimension >::type ;
+
+  using scalar_array_type           = typename ViewDataType<           scalar_type , array_scalar_dimension >::type ;
+  using const_scalar_array_type     = typename ViewDataType<     const_scalar_type , array_scalar_dimension >::type ;
+  using non_const_scalar_array_type = typename ViewDataType< non_const_scalar_type , array_scalar_dimension >::type ;
+};
+
+}}} // namespace Kokkos::Experimental::Impl
+
+//----------------------------------------------------------------------------
+//----------------------------------------------------------------------------
+
+namespace Kokkos {
+namespace Experimental {
+namespace Impl {
+
+/** \brief  View mapping for the Kokkos::Array specialization and standard layouts.
+ *
+ *  Element access returns a Kokkos::Array reference object constructed from
+ *  a pointer into the view's memory, the array length, and a stride.
+ */
+template< class Traits >
+class ViewMapping< Traits ,
+  typename std::enable_if<(
+    std::is_same< typename Traits::specialize , Kokkos::Array<> >::value &&
+    ( std::is_same< typename Traits::array_layout , Kokkos::LayoutLeft >::value ||
+      std::is_same< typename Traits::array_layout , Kokkos::LayoutRight >::value ||
+      std::is_same< typename Traits::array_layout , Kokkos::LayoutStride >::value )
+  )>::type >
+{
+private:
+
+  template< class , class ... > friend class ViewMapping ;
+  template< class , class ... > friend class Kokkos::Experimental::View ;
+
+  typedef ViewOffset< typename Traits::dimension
+                    , typename Traits::array_layout
+                    , void
+                    >  offset_type ;
+
+  typedef typename Traits::value_type::pointer handle_type ;
+
+  handle_type  m_handle ;  // pointer to the first scalar
+  offset_type  m_offset ;  // multi-index to offset mapping
+  size_t       m_stride ;  // stride handed to the reference type
+
+  typedef typename Traits::value_type::value_type scalar_type ;
+
+  typedef Kokkos::Array< scalar_type , ~size_t(0) , Kokkos::Array<>::contiguous >  contiguous_reference ;
+  typedef Kokkos::Array< scalar_type , ~size_t(0) , Kokkos::Array<>::strided >     strided_reference ;
+
+  // Rank zero and LayoutRight views hand out contiguous references,
+  // all other cases hand out strided references.
+  enum { is_contiguous_reference =
+    ( Traits::rank == 0 ) || ( std::is_same< typename Traits::array_layout , Kokkos::LayoutRight >::value ) };
+
+  // Array_N : length of the array value type.
+  // Array_S : multiplier applied to the mapped offset of an element.
+  enum { Array_N = Traits::value_type::size() };
+  enum { Array_S = is_contiguous_reference ? Array_N : 1 };
+
+  KOKKOS_INLINE_FUNCTION
+  ViewMapping( const handle_type & arg_handle , const offset_type & arg_offset )
+    : m_handle( arg_handle )
+    , m_offset( arg_offset )
+    , m_stride( is_contiguous_reference ? 0 : arg_offset.span() )
+    {}
+
+public:
+
+  //----------------------------------------
+  // Domain dimensions
+
+  enum { Rank = Traits::dimension::rank };
+
+  template< typename iType >
+  KOKKOS_INLINE_FUNCTION constexpr size_t extent( const iType & r ) const
+    { return m_offset.m_dim.extent(r); }
+
+  KOKKOS_INLINE_FUNCTION constexpr
+  typename Traits::array_layout layout() const
+    { return m_offset.layout(); }
+
+
+  KOKKOS_INLINE_FUNCTION constexpr size_t dimension_0() const { return m_offset.dimension_0(); }
+  KOKKOS_INLINE_FUNCTION constexpr size_t dimension_1() const { return m_offset.dimension_1(); }
+  KOKKOS_INLINE_FUNCTION constexpr size_t dimension_2() const { return m_offset.dimension_2(); }
+  KOKKOS_INLINE_FUNCTION constexpr size_t dimension_3() const { return m_offset.dimension_3(); }
+  KOKKOS_INLINE_FUNCTION constexpr size_t dimension_4() const { return m_offset.dimension_4(); }
+  KOKKOS_INLINE_FUNCTION constexpr size_t dimension_5() const { return m_offset.dimension_5(); }
+  KOKKOS_INLINE_FUNCTION constexpr size_t dimension_6() const { return m_offset.dimension_6(); }
+  KOKKOS_INLINE_FUNCTION constexpr size_t dimension_7() const { return m_offset.dimension_7(); }
+
+  // Is a regular layout with uniform striding for each index.
+  using is_regular = typename offset_type::is_regular ;
+
+  KOKKOS_INLINE_FUNCTION constexpr size_t stride_0() const { return m_offset.stride_0(); }
+  KOKKOS_INLINE_FUNCTION constexpr size_t stride_1() const { return m_offset.stride_1(); }
+  KOKKOS_INLINE_FUNCTION constexpr size_t stride_2() const { return m_offset.stride_2(); }
+  KOKKOS_INLINE_FUNCTION constexpr size_t stride_3() const { return m_offset.stride_3(); }
+  KOKKOS_INLINE_FUNCTION constexpr size_t stride_4() const { return m_offset.stride_4(); }
+  KOKKOS_INLINE_FUNCTION constexpr size_t stride_5() const { return m_offset.stride_5(); }
+  KOKKOS_INLINE_FUNCTION constexpr size_t stride_6() const { return m_offset.stride_6(); }
+  KOKKOS_INLINE_FUNCTION constexpr size_t stride_7() const { return m_offset.stride_7(); }
+
+  //----------------------------------------
+  // Range span
+
+  /** \brief  Span of the mapped range, counted in scalars */
+  KOKKOS_INLINE_FUNCTION constexpr size_t span() const
+    { return m_offset.span() * Array_N ; }
+
+  /** \brief  Is the mapped range span contiguous */
+  KOKKOS_INLINE_FUNCTION constexpr bool span_is_contiguous() const
+    { return m_offset.span_is_contiguous(); }
+
+  typedef typename std::conditional< is_contiguous_reference , contiguous_reference , strided_reference >::type  reference_type ;
+
+  typedef handle_type pointer_type ;
+
+  /** \brief  If data references are lvalue_reference then can query pointer to memory */
+  KOKKOS_INLINE_FUNCTION constexpr pointer_type data() const
+    { return m_handle ; }
+
+  //----------------------------------------
+  // The View class performs all rank and bounds checking before
+  // calling these element reference methods.
+  //
+  // Each overload builds a reference from:
+  //   the handle advanced by the mapped offset times Array_S ,
+  //   the array length Array_N , and the stride m_stride.
+
+  KOKKOS_FORCEINLINE_FUNCTION
+  reference_type reference() const { return reference_type( m_handle + 0 , Array_N , 0 ); }
+
+  template< typename I0 >
+  KOKKOS_FORCEINLINE_FUNCTION
+  reference_type
+  reference( const I0 & i0 ) const
+    { return reference_type( m_handle + m_offset(i0) * Array_S , Array_N , m_stride ); }
+
+  template< typename I0 , typename I1 >
+  KOKKOS_FORCEINLINE_FUNCTION
+  reference_type reference( const I0 & i0 , const I1 & i1 ) const
+    { return reference_type( m_handle + m_offset(i0,i1) * Array_S , Array_N , m_stride ); }
+
+  template< typename I0 , typename I1 , typename I2 >
+  KOKKOS_FORCEINLINE_FUNCTION
+  reference_type reference( const I0 & i0 , const I1 & i1 , const I2 & i2 ) const
+    { return reference_type( m_handle + m_offset(i0,i1,i2) * Array_S , Array_N , m_stride ); }
+
+  template< typename I0 , typename I1 , typename I2 , typename I3 >
+  KOKKOS_FORCEINLINE_FUNCTION
+  reference_type reference( const I0 & i0 , const I1 & i1 , const I2 & i2 , const I3 & i3 ) const
+    { return reference_type( m_handle + m_offset(i0,i1,i2,i3) * Array_S , Array_N , m_stride ); }
+
+  template< typename I0 , typename I1 , typename I2 , typename I3
+          , typename I4 >
+  KOKKOS_FORCEINLINE_FUNCTION
+  reference_type reference( const I0 & i0 , const I1 & i1 , const I2 & i2 , const I3 & i3
+                          , const I4 & i4 ) const
+    { return reference_type( m_handle + m_offset(i0,i1,i2,i3,i4) * Array_S , Array_N , m_stride ); }
+
+  template< typename I0 , typename I1 , typename I2 , typename I3
+          , typename I4 , typename I5 >
+  KOKKOS_FORCEINLINE_FUNCTION
+  reference_type reference( const I0 & i0 , const I1 & i1 , const I2 & i2 , const I3 & i3
+                          , const I4 & i4 , const I5 & i5 ) const
+    { return reference_type( m_handle + m_offset(i0,i1,i2,i3,i4,i5) * Array_S , Array_N , m_stride ); }
+
+  template< typename I0 , typename I1 , typename I2 , typename I3
+          , typename I4 , typename I5 , typename I6 >
+  KOKKOS_FORCEINLINE_FUNCTION
+  reference_type reference( const I0 & i0 , const I1 & i1 , const I2 & i2 , const I3 & i3
+                          , const I4 & i4 , const I5 & i5 , const I6 & i6 ) const
+    { return reference_type( m_handle + m_offset(i0,i1,i2,i3,i4,i5,i6) * Array_S , Array_N , m_stride ); }
+
+  template< typename I0 , typename I1 , typename I2 , typename I3
+          , typename I4 , typename I5 , typename I6 , typename I7 >
+  KOKKOS_FORCEINLINE_FUNCTION
+  reference_type reference( const I0 & i0 , const I1 & i1 , const I2 & i2 , const I3 & i3
+                          , const I4 & i4 , const I5 & i5 , const I6 & i6 , const I7 & i7 ) const
+    { return reference_type( m_handle + m_offset(i0,i1,i2,i3,i4,i5,i6,i7) * Array_S , Array_N , m_stride ); }
+
+  //----------------------------------------
+
+private:
+
+  enum { MemorySpanMask = 8 - 1 /* Force alignment on 8 byte boundary */ };
+  enum { MemorySpanSize = sizeof(scalar_type) };
+
+public:
+
+  /** \brief  Span, in bytes, of the referenced memory (rounded up to a multiple of 8) */
+  KOKKOS_INLINE_FUNCTION constexpr size_t memory_span() const
+    {
+      return ( m_offset.span() * Array_N * MemorySpanSize + MemorySpanMask ) & ~size_t(MemorySpanMask);
+    }
+
+  //----------------------------------------
+
+  KOKKOS_INLINE_FUNCTION ~ViewMapping() {}
+  KOKKOS_INLINE_FUNCTION ViewMapping() : m_handle(), m_offset(), m_stride(0) {}
+  KOKKOS_INLINE_FUNCTION ViewMapping( const ViewMapping & rhs )
+    : m_handle( rhs.m_handle ), m_offset( rhs.m_offset ), m_stride( rhs.m_stride ) {}
+  KOKKOS_INLINE_FUNCTION ViewMapping & operator = ( const ViewMapping & rhs )
+    { m_handle = rhs.m_handle ; m_offset = rhs.m_offset ; m_stride = rhs.m_stride ; return *this ; }
+
+  KOKKOS_INLINE_FUNCTION ViewMapping( ViewMapping && rhs )
+    : m_handle( rhs.m_handle ), m_offset( rhs.m_offset ), m_stride( rhs.m_stride ) {}
+  KOKKOS_INLINE_FUNCTION ViewMapping & operator = ( ViewMapping && rhs )
+    { m_handle = rhs.m_handle ; m_offset = rhs.m_offset ; m_stride = rhs.m_stride ; return *this ; }
+
+  //----------------------------------------
+
+  /** \brief  Wrap existing memory; the offset is built without padding
+   *          from the remaining arguments.
+   */
+  template< class ... Args >
+  KOKKOS_INLINE_FUNCTION
+  ViewMapping( pointer_type ptr , Args ... args )
+    : m_handle( ptr )
+    , m_offset( std::integral_constant< unsigned , 0 >() , args... )
+    , m_stride( m_offset.span() )
+    {}
+
+  //----------------------------------------
+
+  /** \brief  Allocate memory for this mapping and return the tracking record.
+   *
+   *  Sets the offset from 'arg_layout' (padded if the properties allow it),
+   *  the stride from the new offset, and, for a non-empty allocation, the
+   *  handle.  When the properties request initialization the record's
+   *  functor is set up and the allocation is constructed.
+   */
+  template< class ... P >
+  SharedAllocationRecord<> *
+  allocate_shared( ViewCtorProp< P... > const & arg_prop
+                 , typename Traits::array_layout const & arg_layout
+                 )
+  {
+    typedef ViewCtorProp< P... > alloc_prop ;
+
+    typedef typename alloc_prop::execution_space  execution_space ;
+    typedef typename Traits::memory_space         memory_space ;
+    typedef ViewValueFunctor< execution_space , scalar_type > functor_type ;
+    typedef SharedAllocationRecord< memory_space , functor_type > record_type ;
+
+    // Query the mapping for byte-size of allocation.
+    typedef std::integral_constant< unsigned ,
+      alloc_prop::allow_padding ? sizeof(scalar_type) : 0 > padding ;
+
+    m_offset = offset_type( padding(), arg_layout );
+
+    // Keep the reference stride consistent with the new offset, as the
+    // pointer-wrapping constructor does.  Without this the stride retains
+    // its previous value (zero after default construction) and strided
+    // references would alias all of their entries.
+    m_stride = m_offset.span();
+
+    const size_t alloc_size =
+      ( m_offset.span() * Array_N * MemorySpanSize + MemorySpanMask ) & ~size_t(MemorySpanMask);
+
+    // Allocate memory from the memory space and create tracking record.
+    record_type * const record =
+      record_type::allocate( ((ViewCtorProp<void,memory_space> const &) arg_prop ).value
+                           , ((ViewCtorProp<void,std::string>  const &) arg_prop ).value
+                           , alloc_size );
+
+    if ( alloc_size ) {
+      m_handle =
+        handle_type( reinterpret_cast< pointer_type >( record->data() ) );
+
+      if ( alloc_prop::initialize ) {
+        // The functor constructs and destroys
+        record->m_destroy = functor_type( ((ViewCtorProp<void,execution_space> const & )arg_prop).value
+                                        , (pointer_type) m_handle
+                                        , m_offset.span() * Array_N
+                                        );
+
+        record->m_destroy.construct_shared_allocation();
+      }
+    }
+
+    return record ;
+  }
+};
+
+//----------------------------------------------------------------------------
+//----------------------------------------------------------------------------
+/** \brief  Assign compatible default mappings */
+
+template< class DstTraits , class SrcTraits >
+class ViewMapping< DstTraits , SrcTraits ,
+  typename std::enable_if<(
+    std::is_same< typename DstTraits::memory_space , typename SrcTraits::memory_space >::value
+    &&
+    std::is_same< typename DstTraits::specialize , Kokkos::Array<> >::value
+    &&
+    (
+      std::is_same< typename DstTraits::array_layout , Kokkos::LayoutLeft >::value ||
+      std::is_same< typename DstTraits::array_layout , Kokkos::LayoutRight >::value ||
+      std::is_same< typename DstTraits::array_layout , Kokkos::LayoutStride >::value
+    )
+    &&
+    std::is_same< typename SrcTraits::specialize , Kokkos::Array<> >::value
+    &&
+    (
+      std::is_same< typename SrcTraits::array_layout , Kokkos::LayoutLeft >::value ||
+      std::is_same< typename SrcTraits::array_layout , Kokkos::LayoutRight >::value ||
+      std::is_same< typename SrcTraits::array_layout , Kokkos::LayoutStride >::value
+    )
+  )>::type >
+{
+public:
+
+  enum { is_assignable = true };
+
+  using TrackType = Kokkos::Experimental::Impl::SharedAllocationTracker ;
+  using DstType   = ViewMapping< DstTraits , void > ;
+  using SrcType   = ViewMapping< SrcTraits , void > ;
+
+  // Copy handle, stride and (converted) offset from 'src' into 'dst' after
+  // verifying at compile time that value type, dimensions and layout are
+  // compatible.  'src_track' is not used.
+  KOKKOS_INLINE_FUNCTION
+  static void assign( DstType & dst , const SrcType & src , const TrackType & src_track )
+    {
+      using dst_value  = typename DstTraits::value_type ;
+      using dst_layout = typename DstTraits::array_layout ;
+      using dst_dim    = typename DstTraits::dimension ;
+
+      static_assert( std::is_same< dst_value , typename SrcTraits::value_type >::value ||
+                     std::is_same< dst_value , typename SrcTraits::const_value_type >::value
+                   , "View assignment must have same value type or const = non-const" );
+
+      static_assert( ViewDimensionAssignable< dst_dim , typename SrcTraits::dimension >::value
+                   , "View assignment must have compatible dimensions" );
+
+      static_assert( std::is_same< dst_layout , typename SrcTraits::array_layout >::value ||
+                     std::is_same< dst_layout , Kokkos::LayoutStride >::value ||
+                     ( dst_dim::rank == 0 ) ||
+                     ( dst_dim::rank == 1 && dst_dim::rank_dynamic == 1 )
+                   , "View assignment must have compatible layout or have rank <= 1" );
+
+      using dst_offset_type = typename DstType::offset_type ;
+
+      dst.m_handle = src.m_handle ;
+      dst.m_stride = src.m_stride ;
+      dst.m_offset = dst_offset_type( src.m_offset );
+    }
+};
+
+/** \brief Assign Array to non-Array */
+
+template< class DstTraits , class SrcTraits >
+class ViewMapping< DstTraits , SrcTraits ,
+  typename std::enable_if<(
+    // Selected when the views share a memory space, the destination is an
+    // unspecialized view, the source is a Kokkos::Array<> view, and both use a standard layout.
+    std::is_same< typename DstTraits::memory_space , typename SrcTraits::memory_space >::value
+    && std::is_same< typename DstTraits::specialize , void >::value
+    && ( std::is_same< typename DstTraits::array_layout , Kokkos::LayoutLeft >::value ||
+         std::is_same< typename DstTraits::array_layout , Kokkos::LayoutRight >::value ||
+         std::is_same< typename DstTraits::array_layout , Kokkos::LayoutStride >::value )
+    && std::is_same< typename SrcTraits::specialize , Kokkos::Array<> >::value
+    && ( std::is_same< typename SrcTraits::array_layout , Kokkos::LayoutLeft >::value ||
+         std::is_same< typename SrcTraits::array_layout , Kokkos::LayoutRight >::value ||
+         std::is_same< typename SrcTraits::array_layout , Kokkos::LayoutStride >::value )
+  )>::type >
+{
+public:
+
+  // Can only convert to View::array_type: the destination data type must be
+  // the source's scalar_array_type and the layout must be unchanged.
+  enum { is_assignable = std::is_same< typename DstTraits::data_type ,    typename SrcTraits::scalar_array_type >::value &&
+                         std::is_same< typename DstTraits::array_layout , typename SrcTraits::array_layout >::value };
+
+  typedef Kokkos::Experimental::Impl::SharedAllocationTracker  TrackType ;
+  typedef ViewMapping< DstTraits , void >  DstType ;
+  typedef ViewMapping< SrcTraits , void >  SrcType ;
+
+  /** \brief  Build the destination offset from the source extents plus the
+   *          array length, then copy the source handle.
+   *
+   *  Fails to compile (static_assert) unless \c is_assignable holds.
+   *  \c src_track is not referenced by this function.
+   */
+  KOKKOS_INLINE_FUNCTION
+  static void assign( DstType & dst , const SrcType & src , const TrackType & src_track )
+    {
+      static_assert( is_assignable , "Can only convert to array_type" );
+
+      typedef typename DstType::offset_type     dst_offset_type ;
+      typedef typename DstTraits::array_layout  dst_layout ;
+      typedef typename SrcTraits::value_type    src_array_type ;
+
+      // Tags selecting the offset constructor: zero when the source span is
+      // contiguous, otherwise sizeof the array's element type.
+      typedef std::integral_constant<unsigned,0>  not_padded ;
+      typedef std::integral_constant<unsigned,sizeof(typename src_array_type::value_type)>  padded ;
+
+      // Array dimension becomes the last dimension: source extents fill the
+      // leading slots and every slot at or beyond the source rank receives
+      // the array's size().
+      // Arguments beyond the destination rank are ignored.
+      const dst_layout layout
+        ( ( 0 < SrcType::Rank ? src.dimension_0() : src_array_type::size() )
+        , ( 1 < SrcType::Rank ? src.dimension_1() : src_array_type::size() )
+        , ( 2 < SrcType::Rank ? src.dimension_2() : src_array_type::size() )
+        , ( 3 < SrcType::Rank ? src.dimension_3() : src_array_type::size() )
+        , ( 4 < SrcType::Rank ? src.dimension_4() : src_array_type::size() )
+        , ( 5 < SrcType::Rank ? src.dimension_5() : src_array_type::size() )
+        , ( 6 < SrcType::Rank ? src.dimension_6() : src_array_type::size() )
+        , ( 7 < SrcType::Rank ? src.dimension_7() : src_array_type::size() )
+        );
+
+      // The extents are the same on both paths; only the padding tag passed
+      // to the offset constructor differs.
+      if ( src.span_is_contiguous() ) { // not padded
+        dst.m_offset = dst_offset_type( not_padded() , layout );
+      }
+      else { // is padded
+        dst.m_offset = dst_offset_type( padded() , layout );
+      }
+
+      dst.m_handle = src.m_handle ;
+    }
+};
+
+//----------------------------------------------------------------------------
+//----------------------------------------------------------------------------
+
+template< class SrcTraits , class ... Args >
+struct ViewMapping
+  < typename std::enable_if<(
+      std::is_same< typename SrcTraits::specialize , Kokkos::Array<> >::value
+      && ( std::is_same< typename SrcTraits::array_layout , Kokkos::LayoutLeft >::value ||
+           std::is_same< typename SrcTraits::array_layout , Kokkos::LayoutRight >::value ||
+           std::is_same< typename SrcTraits::array_layout , Kokkos::LayoutStride >::value )
+    )>::type
+  , SrcTraits
+  , Args ... >
+{
+private:
+
+  static_assert( SrcTraits::rank == sizeof...(Args) , "" );
+
+  // Rk is true when subview argument k is a range (is_integral_extent)
+  // and false when it is a single index or is absent.
+  enum : bool
+    { R0 = is_integral_extent<0,Args...>::value
+    , R1 = is_integral_extent<1,Args...>::value
+    , R2 = is_integral_extent<2,Args...>::value
+    , R3 = is_integral_extent<3,Args...>::value
+    , R4 = is_integral_extent<4,Args...>::value
+    , R5 = is_integral_extent<5,Args...>::value
+    , R6 = is_integral_extent<6,Args...>::value
+    , R7 = is_integral_extent<7,Args...>::value
+    };
+
+  // Rank of the subview: the number of range arguments.
+  enum { rank = ( R0 ? 1u : 0u ) + ( R1 ? 1u : 0u ) + ( R2 ? 1u : 0u ) + ( R3 ? 1u : 0u )
+              + ( R4 ? 1u : 0u ) + ( R5 ? 1u : 0u ) + ( R6 ? 1u : 0u ) + ( R7 ? 1u : 0u ) };
+
+  // Whether right-most rank is a range.
+  enum { R0_rev = ( 0 == SrcTraits::rank ? false :
+                    1 == SrcTraits::rank ? R0 :
+                    2 == SrcTraits::rank ? R1 :
+                    3 == SrcTraits::rank ? R2 :
+                    4 == SrcTraits::rank ? R3 :
+                    5 == SrcTraits::rank ? R4 :
+                    6 == SrcTraits::rank ? R5 :
+                    7 == SrcTraits::rank ? R6 : R7 ) };
+
+  enum : bool
+    { src_is_left  = std::is_same< typename SrcTraits::array_layout , Kokkos::LayoutLeft >::value
+    , src_is_right = std::is_same< typename SrcTraits::array_layout , Kokkos::LayoutRight >::value
+    };
+
+  // Subview's layout: same as the source layout IF
+  //   the output has rank zero, OR
+  //   the output has rank 1 or 2, the source is LayoutLeft, and argument 0
+  //   is a range, OR
+  //   the output has rank 1 or 2, the source is LayoutRight, and the
+  //   right-most argument is a range
+  //   (because single stride one or second index has a stride);
+  // otherwise the subview is LayoutStride.
+  typedef typename std::conditional<
+      ( ( rank == 0 ) ||
+        ( rank <= 2 && R0     && src_is_left  ) ||
+        ( rank <= 2 && R0_rev && src_is_right ) )
+      , typename SrcTraits::array_layout , Kokkos::LayoutStride
+      >::type array_layout ;
+
+  typedef typename SrcTraits::value_type  value_type ;
+
+  // Subview data type: value_type followed by one '*' per subview rank.
+  typedef typename std::conditional< rank == 0 , value_type ,
+          typename std::conditional< rank == 1 , value_type * ,
+          typename std::conditional< rank == 2 , value_type ** ,
+          typename std::conditional< rank == 3 , value_type *** ,
+          typename std::conditional< rank == 4 , value_type **** ,
+          typename std::conditional< rank == 5 , value_type ***** ,
+          typename std::conditional< rank == 6 , value_type ****** ,
+          typename std::conditional< rank == 7 , value_type ******* ,
+                                                 value_type ********
+          >::type >::type >::type >::type >::type >::type >::type >::type
+     data_type ;
+
+public:
+
+  typedef Kokkos::Experimental::ViewTraits
+    < data_type
+    , array_layout
+    , typename SrcTraits::device_type
+    , typename SrcTraits::memory_traits > traits_type ;
+
+  typedef Kokkos::Experimental::View
+    < data_type
+    , array_layout
+    , typename SrcTraits::device_type
+    , typename SrcTraits::memory_traits > type ;
+
+  KOKKOS_INLINE_FUNCTION
+  static void assign( ViewMapping< traits_type , void > & dst
+                    , ViewMapping< SrcTraits , void > const & src
+                    , Args ... args )
+    {
+      typedef ViewMapping< traits_type , void >  DstType ;
+      typedef typename DstType::offset_type      dst_offset_type ;
+      typedef typename DstType::handle_type      dst_handle_type ;
+
+      const SubviewExtents< SrcTraits::rank , rank >
+        sub( src.m_offset.m_dim , args... );
+
+      dst.m_offset = dst_offset_type( src.m_offset , sub );
+      dst.m_handle = dst_handle_type(
+        src.m_handle + src.m_offset( sub.domain_offset(0) , sub.domain_offset(1)
+                                   , sub.domain_offset(2) , sub.domain_offset(3)
+                                   , sub.domain_offset(4) , sub.domain_offset(5)
+                                   , sub.domain_offset(6) , sub.domain_offset(7) ) );
+    }
+};
+
+}}} // namespace Kokkos::Experimental::Impl
+
+//----------------------------------------------------------------------------
+//----------------------------------------------------------------------------
+
+#endif /* #ifndef KOKKOS_EXPERIMENTAL_VIEW_ARRAY_MAPPING_HPP */
+
diff --git a/lib/kokkos/core/src/impl/KokkosExp_ViewCtor.hpp b/lib/kokkos/core/src/impl/KokkosExp_ViewCtor.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..6525fed0a5ceb5995db3517b84fec6f7985e6d54
--- /dev/null
+++ b/lib/kokkos/core/src/impl/KokkosExp_ViewCtor.hpp
@@ -0,0 +1,252 @@
+/*
+//@HEADER
+// ************************************************************************
+// 
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+// 
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+// 
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact  H. Carter Edwards (hcedwar@sandia.gov)
+// 
+// ************************************************************************
+//@HEADER
+*/
+
+#ifndef KOKKOS_EXPERIMENTAL_IMPL_VIEW_CTOR_PROP_HPP
+#define KOKKOS_EXPERIMENTAL_IMPL_VIEW_CTOR_PROP_HPP
+
+//----------------------------------------------------------------------------
+//----------------------------------------------------------------------------
+
+namespace Kokkos {
+
+/* For backward compatibility */
+
+struct ViewAllocateWithoutInitializing {
+
+  const std::string label ;  // allocation label; empty when default constructed
+
+  ViewAllocateWithoutInitializing() : label() {}
+
+  explicit ViewAllocateWithoutInitializing( const char * const  arg_label )
+    : label( arg_label ) {}
+
+  explicit ViewAllocateWithoutInitializing( const std::string & arg_label )
+    : label( arg_label ) {}
+};
+
+} /* namespace Kokkos */
+
+//----------------------------------------------------------------------------
+//----------------------------------------------------------------------------
+
+namespace Kokkos {
+namespace Experimental {
+namespace Impl {
+
+struct WithoutInitializing_t {};  // tag: when present in a ViewCtorProp pack, its 'initialize' flag is false
+struct AllowPadding_t {};  // tag: when present in a ViewCtorProp pack, its 'allow_padding' flag is true
+struct NullSpace_t {};  // tag type; NOTE(review): no use is visible in this header -- confirm its purpose
+
+//----------------------------------------------------------------------------
+/**\brief Whether a type can be used for a view label */
+
+template < typename >
+struct is_view_label : std::integral_constant< bool , false > {};
+
+template<>
+struct is_view_label< std::string > : std::integral_constant< bool , true > {};
+
+template< unsigned Len >
+struct is_view_label< char[Len] > : std::integral_constant< bool , true > {};
+
+template< unsigned Len >
+struct is_view_label< const char[Len] > : std::integral_constant< bool , true > {};
+
+//----------------------------------------------------------------------------
+
+template< typename ... P >
+struct ViewCtorProp ;
+
+/*  Stateless placeholder property, constructible from any argument.
+ *  The std::integral_constant<unsigned,I> dummy arguments avoid
+ *  duplicate base class errors.  */
+template< unsigned Index >
+struct ViewCtorProp< void , std::integral_constant<unsigned,Index> >
+{
+  template< typename Ignored >
+  ViewCtorProp( const Ignored & ) {}
+
+  ViewCtorProp() = default ;
+  ViewCtorProp( const ViewCtorProp & ) = default ;
+  ViewCtorProp & operator = ( const ViewCtorProp & ) = default ;
+};
+
+/* Property flags have constexpr value */
+template< typename P >
+struct ViewCtorProp
+  < typename std::enable_if<
+      std::is_same< P , AllowPadding_t >::value ||
+      std::is_same< P , WithoutInitializing_t >::value
+    >::type
+  , P
+  >
+{
+  ViewCtorProp() = default ;
+  ViewCtorProp( const ViewCtorProp & ) = default ;
+  ViewCtorProp & operator = ( const ViewCtorProp & ) = default ;
+
+  typedef P type ;
+
+  ViewCtorProp( const type & ) {}
+
+  static constexpr type value = type();
+};
+
+/* Map input label type (std::string or character array) to std::string */
+template< typename Label >
+struct ViewCtorProp
+  < typename std::enable_if< is_view_label< Label >::value >::type
+  , Label
+  >
+{
+  ViewCtorProp() = default ;
+  ViewCtorProp( const ViewCtorProp & ) = default ;
+  ViewCtorProp & operator = ( const ViewCtorProp & ) = default ;
+  typedef std::string type ;
+
+  ViewCtorProp( const type & arg ) : value( arg ) {}
+  // 'arg' is an lvalue inside the constructor; cast it back to an rvalue so the label is moved, not copied.
+  ViewCtorProp( type && arg ) : value( static_cast< type && >( arg ) ) {}
+
+  type value ;  // the stored label
+};
+
+template< typename SpaceType >
+struct ViewCtorProp
+  < typename std::enable_if<
+      Kokkos::Impl::is_memory_space<SpaceType>::value ||
+      Kokkos::Impl::is_execution_space<SpaceType>::value
+    >::type
+  , SpaceType
+  >
+{
+  typedef SpaceType type ;
+
+  type value ;  // the memory or execution space instance, held by value
+
+  ViewCtorProp() = default ;
+  ViewCtorProp( const ViewCtorProp & ) = default ;
+  ViewCtorProp & operator = ( const ViewCtorProp & ) = default ;
+
+  ViewCtorProp( const type & arg ) : value( arg ) {}
+};
+
+
+template< typename Pointee >
+struct ViewCtorProp < void , Pointee * >
+{
+  typedef Pointee * type ;
+
+  type value ;  // the wrapped raw pointer
+
+  ViewCtorProp() = default ;
+  ViewCtorProp( const ViewCtorProp & ) = default ;
+  ViewCtorProp & operator = ( const ViewCtorProp & ) = default ;
+
+  KOKKOS_INLINE_FUNCTION
+  ViewCtorProp( const type arg ) : value( arg ) {}
+};
+
+
+template< typename ... P >
+struct ViewCtorProp : public ViewCtorProp< void , P > ...  // inherits one ViewCtorProp< void , P > base per property type
+{
+private:
+  // NOTE(review): has_condition is declared elsewhere; presumably it picks the entry of P... satisfying the trait, else the leading default type -- confirm.
+  typedef Kokkos::Impl::has_condition< void , Kokkos::Impl::is_memory_space , P ... >
+    var_memory_space ;
+
+  typedef Kokkos::Impl::has_condition< void , Kokkos::Impl::is_execution_space , P ... >
+    var_execution_space ;
+
+  struct VOIDDUMMY{};  // private placeholder passed as the first has_condition argument for the pointer lookup
+
+  typedef Kokkos::Impl::has_condition< VOIDDUMMY , std::is_pointer , P ... >
+    var_pointer ;
+
+public:
+
+  /* Flags for the common properties */
+  enum { has_memory_space    = var_memory_space::value };
+  enum { has_execution_space = var_execution_space::value };
+  enum { has_pointer         = var_pointer::value };
+  enum { has_label           = Kokkos::Impl::has_type< std::string , P... >::value };
+  enum { allow_padding       = Kokkos::Impl::has_type< AllowPadding_t , P... >::value };
+  enum { initialize          = ! Kokkos::Impl::has_type< WithoutInitializing_t , P ... >::value };
+
+  typedef typename var_memory_space::type     memory_space ;
+  typedef typename var_execution_space::type  execution_space ;
+  typedef typename var_pointer::type          pointer_type ;
+
+  /*  Copy from a matching argument list: args are passed, in order, to the bases ViewCtorProp< void , P >.
+   *  Requires  std::is_same< P , typename ViewCtorProp< void , Args >::type >::value ...
+   */
+  template< typename ... Args >
+  inline
+  ViewCtorProp( Args const & ... args )
+    : ViewCtorProp< void , P >( args ) ...
+    {}
+  /* Leading raw pointer, then the remaining arguments, each given to the base named by ViewCtorProp< void , Args >::type */
+  template< typename ... Args >
+  KOKKOS_INLINE_FUNCTION
+  ViewCtorProp( pointer_type arg0 , Args const & ... args )
+    : ViewCtorProp< void , pointer_type >( arg0 )
+    , ViewCtorProp< void , typename ViewCtorProp< void , Args >::type >( args ) ...
+    {}
+
+  /* Copy from a matching property subset: each base named in Args is copy-constructed from the same base of arg */
+  template< typename ... Args >
+  ViewCtorProp( ViewCtorProp< Args ... > const & arg )
+    : ViewCtorProp< void , Args >( ((ViewCtorProp<void,Args> const &) arg ) ) ...
+    {}
+};
+
+} /* namespace Impl */
+} /* namespace Experimental */
+} /* namespace Kokkos */
+
+//----------------------------------------------------------------------------
+//----------------------------------------------------------------------------
+
+#endif
+
diff --git a/lib/kokkos/core/src/impl/KokkosExp_ViewMapping.hpp b/lib/kokkos/core/src/impl/KokkosExp_ViewMapping.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..ed56536cd91b52f3d0beddc8095eba9a4bb593c9
--- /dev/null
+++ b/lib/kokkos/core/src/impl/KokkosExp_ViewMapping.hpp
@@ -0,0 +1,2932 @@
+/*
+//@HEADER
+// ************************************************************************
+// 
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+// 
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+// 
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact  H. Carter Edwards (hcedwar@sandia.gov)
+// 
+// ************************************************************************
+//@HEADER
+*/
+
+#ifndef KOKKOS_EXPERIMENTAL_VIEW_MAPPING_HPP
+#define KOKKOS_EXPERIMENTAL_VIEW_MAPPING_HPP
+
+#include <type_traits>
+#include <initializer_list>
+
+#include <Kokkos_Core_fwd.hpp>
+#include <Kokkos_Pair.hpp>
+#include <Kokkos_Layout.hpp>
+#include <impl/Kokkos_Error.hpp>
+#include <impl/Kokkos_Traits.hpp>
+#include <impl/KokkosExp_ViewCtor.hpp>
+#include <impl/Kokkos_Atomic_View.hpp>
+
+//----------------------------------------------------------------------------
+//----------------------------------------------------------------------------
+
+namespace Kokkos {
+namespace Experimental {
+namespace Impl {
+
+template< unsigned I , size_t ... Args >
+struct variadic_size_t  // I is past the end of the list: report ~size_t(0)
+  { enum { value = ~size_t(0) }; };
+
+template< unsigned I , size_t Head , size_t ... Tail >
+struct variadic_size_t< I , Head , Tail ... >  // skip the head and count the index down
+  { enum { value = variadic_size_t< I - 1 , Tail ... >::value }; };
+
+template< size_t Head , size_t ... Tail >
+struct variadic_size_t< 0 , Head , Tail ... >  // index reached: take the head
+  { enum { value = Head }; };
+
+template< size_t ... Args >
+struct rank_dynamic ;
+
+template<>
+struct rank_dynamic<> { enum { value = 0 }; };  // empty list: nothing is dynamic
+
+template< size_t Head , size_t ... Tail >
+struct rank_dynamic< Head , Tail... >
+{
+  enum { value = rank_dynamic< Tail... >::value + ( 0 == Head ? 1 : 0 ) };  // each zero entry counts as one dynamic dimension
+};
+
+#define KOKKOS_IMPL_VIEW_DIMENSION( R ) \
+  template< size_t V , unsigned > struct ViewDimension ## R /* primary: extent R is a compile-time value */ \
+    { \
+      enum { ArgN ## R = ( V != ~size_t(0) ? V : 1 ) }; /* ~size_t(0) is mapped to an extent of 1 */ \
+      enum { N ## R = ( V != ~size_t(0) ? V : 1 ) }; \
+      KOKKOS_INLINE_FUNCTION explicit ViewDimension ## R ( size_t ) {} /* the runtime argument is ignored */ \
+      ViewDimension ## R () = default ; \
+      ViewDimension ## R ( const ViewDimension ## R  & ) = default ; \
+      ViewDimension ## R & operator = ( const ViewDimension ## R  & ) = default ; \
+    }; \
+  template< unsigned RD > struct ViewDimension ## R < 0 , RD > /* V == 0: extent R is stored at run time */ \
+    { \
+      enum { ArgN ## R = 0 }; \
+      typename std::conditional<( RD < 3 ), size_t , unsigned >::type N ## R ; /* size_t when RD < 3, otherwise unsigned */ \
+      ViewDimension ## R () = default ; \
+      ViewDimension ## R ( const ViewDimension ## R  & ) = default ; \
+      ViewDimension ## R & operator = ( const ViewDimension ## R  & ) = default ; \
+      KOKKOS_INLINE_FUNCTION explicit ViewDimension ## R ( size_t V ) : N ## R ( V ) {} \
+    };
+
+KOKKOS_IMPL_VIEW_DIMENSION( 0 )  // defines ViewDimension0 ... ViewDimension7, one per dimension index
+KOKKOS_IMPL_VIEW_DIMENSION( 1 )
+KOKKOS_IMPL_VIEW_DIMENSION( 2 )
+KOKKOS_IMPL_VIEW_DIMENSION( 3 )
+KOKKOS_IMPL_VIEW_DIMENSION( 4 )
+KOKKOS_IMPL_VIEW_DIMENSION( 5 )
+KOKKOS_IMPL_VIEW_DIMENSION( 6 )
+KOKKOS_IMPL_VIEW_DIMENSION( 7 )
+
+#undef KOKKOS_IMPL_VIEW_DIMENSION
+
+template< size_t ... Vals >
+struct ViewDimension
+  : public ViewDimension0< variadic_size_t<0,Vals...>::value
+                         , rank_dynamic< Vals... >::value >
+  , public ViewDimension1< variadic_size_t<1,Vals...>::value
+                         , rank_dynamic< Vals... >::value >
+  , public ViewDimension2< variadic_size_t<2,Vals...>::value
+                         , rank_dynamic< Vals... >::value >
+  , public ViewDimension3< variadic_size_t<3,Vals...>::value
+                         , rank_dynamic< Vals... >::value >
+  , public ViewDimension4< variadic_size_t<4,Vals...>::value
+                         , rank_dynamic< Vals... >::value >
+  , public ViewDimension5< variadic_size_t<5,Vals...>::value
+                         , rank_dynamic< Vals... >::value >
+  , public ViewDimension6< variadic_size_t<6,Vals...>::value
+                         , rank_dynamic< Vals... >::value >
+  , public ViewDimension7< variadic_size_t<7,Vals...>::value
+                         , rank_dynamic< Vals... >::value >
+{
+  using D0 = ViewDimension0< variadic_size_t<0,Vals...>::value
+                           , rank_dynamic< Vals... >::value > ;
+  using D1 = ViewDimension1< variadic_size_t<1,Vals...>::value
+                           , rank_dynamic< Vals... >::value > ;
+  using D2 = ViewDimension2< variadic_size_t<2,Vals...>::value
+                           , rank_dynamic< Vals... >::value > ;
+  using D3 = ViewDimension3< variadic_size_t<3,Vals...>::value
+                           , rank_dynamic< Vals... >::value > ;
+  using D4 = ViewDimension4< variadic_size_t<4,Vals...>::value
+                           , rank_dynamic< Vals... >::value > ;
+  using D5 = ViewDimension5< variadic_size_t<5,Vals...>::value
+                           , rank_dynamic< Vals... >::value > ;
+  using D6 = ViewDimension6< variadic_size_t<6,Vals...>::value
+                           , rank_dynamic< Vals... >::value > ;
+  using D7 = ViewDimension7< variadic_size_t<7,Vals...>::value
+                           , rank_dynamic< Vals... >::value > ;
+
+  using D0::ArgN0 ;
+  using D1::ArgN1 ;
+  using D2::ArgN2 ;
+  using D3::ArgN3 ;
+  using D4::ArgN4 ;
+  using D5::ArgN5 ;
+  using D6::ArgN6 ;
+  using D7::ArgN7 ;
+
+  using D0::N0 ;
+  using D1::N1 ;
+  using D2::N2 ;
+  using D3::N3 ;
+  using D4::N4 ;
+  using D5::N5 ;
+  using D6::N6 ;
+  using D7::N7 ;
+
+  enum { rank = sizeof...(Vals) };
+  enum { rank_dynamic = Impl::rank_dynamic< Vals... >::value };
+
+  ViewDimension() = default ;
+  ViewDimension( const ViewDimension & ) = default ;
+  ViewDimension & operator = ( const ViewDimension & ) = default ;
+
+  // Each argument is forwarded to the matching per-dimension base;
+  // bases with a compile-time extent ignore theirs.
+  KOKKOS_INLINE_FUNCTION constexpr
+  ViewDimension( size_t n0 , size_t n1 , size_t n2 , size_t n3
+               , size_t n4 , size_t n5 , size_t n6 , size_t n7 )
+    : D0( n0 ) , D1( n1 )
+    , D2( n2 ) , D3( n3 )
+    , D4( n4 ) , D5( n5 )
+    , D6( n6 ) , D7( n7 )
+    {}
+
+  // Extent of dimension r; zero when r is not in [0,7].
+  // Compile-time dimensions read an enumerator, run-time ones a stored member.
+  KOKKOS_INLINE_FUNCTION constexpr
+  size_t extent( const unsigned r ) const
+    {
+      return r == 0 ? N0 :
+             r == 1 ? N1 :
+             r == 2 ? N2 :
+             r == 3 ? N3 :
+             r == 4 ? N4 :
+             r == 5 ? N5 :
+             r == 6 ? N6 :
+             r == 7 ? N7 : 0 ;
+    }
+
+  // Metafunctions yielding this dimension list with N added at the front / back.
+  template< size_t N >
+  struct prepend { using type = ViewDimension< N , Vals... > ; };
+
+  template< size_t N >
+  struct append { using type = ViewDimension< Vals... , N > ; };
+};
+
+template< class A , class B >
+struct ViewDimensionJoin ;
+
+template< size_t ... Front , size_t ... Back >
+struct ViewDimensionJoin< ViewDimension< Front... > , ViewDimension< Back... > > {
+  using type = ViewDimension< Front... , Back... > ;  // Front's extents followed by Back's
+};
+
+//----------------------------------------------------------------------------
+
+template< class DstDim , class SrcDim >
+struct ViewDimensionAssignable ;
+
+template< size_t ... DstArgs , size_t ... SrcArgs >
+struct ViewDimensionAssignable< ViewDimension< DstArgs ... >
+                              , ViewDimension< SrcArgs ... > >
+{
+  typedef ViewDimension< DstArgs... > dst ;
+  typedef ViewDimension< SrcArgs... > src ;
+
+  // Ranks must agree.  Dimension k is compared only when both lists have at
+  // most k dynamic ranks; then the compile-time arguments must be equal.
+  enum { value =
+    unsigned(dst::rank) == unsigned(src::rank) &&
+    ( ! ( 1 > dst::rank_dynamic && 1 > src::rank_dynamic ) || size_t(dst::ArgN0) == size_t(src::ArgN0) ) &&
+    ( ! ( 2 > dst::rank_dynamic && 2 > src::rank_dynamic ) || size_t(dst::ArgN1) == size_t(src::ArgN1) ) &&
+    ( ! ( 3 > dst::rank_dynamic && 3 > src::rank_dynamic ) || size_t(dst::ArgN2) == size_t(src::ArgN2) ) &&
+    ( ! ( 4 > dst::rank_dynamic && 4 > src::rank_dynamic ) || size_t(dst::ArgN3) == size_t(src::ArgN3) ) &&
+    ( ! ( 5 > dst::rank_dynamic && 5 > src::rank_dynamic ) || size_t(dst::ArgN4) == size_t(src::ArgN4) ) &&
+    ( ! ( 6 > dst::rank_dynamic && 6 > src::rank_dynamic ) || size_t(dst::ArgN5) == size_t(src::ArgN5) ) &&
+    ( ! ( 7 > dst::rank_dynamic && 7 > src::rank_dynamic ) || size_t(dst::ArgN6) == size_t(src::ArgN6) ) &&
+    ( ! ( 8 > dst::rank_dynamic && 8 > src::rank_dynamic ) || size_t(dst::ArgN7) == size_t(src::ArgN7) ) };
+
+};
+
+}}} // namespace Kokkos::Experimental::Impl
+
+//----------------------------------------------------------------------------
+//----------------------------------------------------------------------------
+
+namespace Kokkos {
+namespace Experimental {
+namespace Impl {
+
+struct ALL_t {  // tag type; is_integral_extent_type reports it as an extent, and SubviewExtents::set maps it to begin 0 and the full dimension length
+  KOKKOS_INLINE_FUNCTION
+  constexpr const ALL_t & operator()() const { return *this ; }  // calling the tag returns the tag itself
+};
+
+template< class T >
+struct is_integral_extent_type  // primary template: only the ALL_t tag counts as an extent
+{ enum { value = int( std::is_same< T , Kokkos::Experimental::Impl::ALL_t >::value ) }; };
+
+template< class Int >
+struct is_integral_extent_type< std::pair<Int,Int> >
+{ enum { value = int( std::is_integral<Int>::value ) }; };
+
+template< class Int >
+struct is_integral_extent_type< Kokkos::pair<Int,Int> >
+{ enum { value = int( std::is_integral<Int>::value ) }; };
+
+// Assuming '2 == initializer_list<Int>::size()'
+template< class Int >
+struct is_integral_extent_type< std::initializer_list<Int> >
+{ enum { value = int( std::is_integral<Int>::value ) }; };
+
+template < unsigned I , class ... Args >
+struct is_integral_extent
+{
+  // The I-th argument type with reference and cv qualifiers removed;
+  // get_type is void when sizeof...(Args) <= I
+  typedef typename std::remove_cv<
+    typename std::remove_reference<
+      typename Kokkos::Impl::get_type< I , Args... >::type
+    >::type >::type type ;
+
+  enum { value = is_integral_extent_type<type>::value };
+
+  // Ranges, plain integral indices, and absent arguments (void) are accepted.
+  static_assert( std::is_same<type,void>::value || std::is_integral<type>::value || value
+               , "subview argument must be either integral or integral extent" );
+};
+
+template< unsigned DomainRank , unsigned RangeRank >
+struct SubviewExtents {
+private:
+
+  // Cannot declare zero-length arrays
+  enum { InternalRangeRank = RangeRank ? RangeRank : 1u };
+
+  size_t   m_begin[  DomainRank ];
+  size_t   m_length[ InternalRangeRank ];
+  unsigned m_index[  InternalRangeRank ];
+
+  template< size_t ... DimArgs >
+  KOKKOS_FORCEINLINE_FUNCTION
+  bool set( unsigned domain_rank
+          , unsigned range_rank
+          , const ViewDimension< DimArgs ... > & dim )
+    { return true ; }
+
+  template< class T , size_t ... DimArgs , class ... Args >
+  KOKKOS_FORCEINLINE_FUNCTION
+  bool set( unsigned domain_rank
+          , unsigned range_rank
+          , const ViewDimension< DimArgs ... > & dim
+          , const T & val
+          , Args ... args )
+    {
+      const size_t v = static_cast<size_t>(val);
+
+      m_begin[ domain_rank ] = v ;
+
+      return set( domain_rank + 1 , range_rank , dim , args... )
+#if defined( KOKKOS_ENABLE_DEBUG_BOUNDS_CHECK )
+             && ( v < dim.extent( domain_rank ) )
+#endif
+      ;
+    }
+
+  // ALL_t
+  template< size_t ... DimArgs , class ... Args >
+  KOKKOS_FORCEINLINE_FUNCTION
+  bool set( unsigned domain_rank
+          , unsigned range_rank
+          , const ViewDimension< DimArgs ... > & dim
+          , const Kokkos::Experimental::Impl::ALL_t 
+          , Args ... args )
+    {
+      m_begin[  domain_rank ] = 0 ;
+      m_length[ range_rank  ] = dim.extent( domain_rank );
+      m_index[  range_rank  ] = domain_rank ;
+
+      return set( domain_rank + 1 , range_rank + 1 , dim , args... );
+    }
+
+  // std::pair range : selects [ val.first , val.second ) of this domain rank
+  template< class T , size_t ... DimArgs , class ... Args >
+  KOKKOS_FORCEINLINE_FUNCTION
+  bool set( unsigned domain_rank
+          , unsigned range_rank
+          , const ViewDimension< DimArgs ... > & dim
+          , const std::pair<T,T> & val
+          , Args ... args )
+    {
+      const size_t b = static_cast<size_t>( val.first );
+      const size_t e = static_cast<size_t>( val.second );
+      // Debug check is b <= e <= extent ; 'e <= b + extent' accepted reversed and overrunning ranges.
+      m_begin[  domain_rank ] = b ;
+      m_length[ range_rank  ] = e - b ;
+      m_index[  range_rank  ] = domain_rank ;
+
+      return set( domain_rank + 1 , range_rank + 1 , dim , args... )
+#if defined( KOKKOS_ENABLE_DEBUG_BOUNDS_CHECK )
+             && ( b <= e ) && ( e <= dim.extent( domain_rank ) )
+#endif
+      ;
+    }
+
+  // Kokkos::pair range : selects [ val.first , val.second ) of this domain rank
+  template< class T , size_t ... DimArgs , class ... Args >
+  KOKKOS_FORCEINLINE_FUNCTION
+  bool set( unsigned domain_rank
+          , unsigned range_rank
+          , const ViewDimension< DimArgs ... > & dim
+          , const Kokkos::pair<T,T> & val
+          , Args ... args )
+    {
+      const size_t b = static_cast<size_t>( val.first );
+      const size_t e = static_cast<size_t>( val.second );
+      // Debug check is b <= e <= extent ; 'e <= b + extent' accepted reversed and overrunning ranges.
+      m_begin[  domain_rank ] = b ;
+      m_length[ range_rank  ] = e - b ;
+      m_index[  range_rank  ] = domain_rank ;
+
+      return set( domain_rank + 1 , range_rank + 1 , dim , args... )
+#if defined( KOKKOS_ENABLE_DEBUG_BOUNDS_CHECK )
+             && ( b <= e ) && ( e <= dim.extent( domain_rank ) )
+#endif
+      ;
+    }
+
+  // { begin , end } range : selects [ begin , end ) of this domain rank
+  template< class T , size_t ... DimArgs , class ... Args >
+  KOKKOS_FORCEINLINE_FUNCTION
+  bool set( unsigned domain_rank
+          , unsigned range_rank
+          , const ViewDimension< DimArgs ... > & dim
+          , const std::initializer_list< T > & val
+          , Args ... args )
+    {
+      // Read only elements that exist; a list shorter than two yields an empty range.
+      const size_t b = 0 < val.size() ? static_cast<size_t>( val.begin()[0] ) : 0 ;
+      const size_t e = 1 < val.size() ? static_cast<size_t>( val.begin()[1] ) : b ;
+      m_begin[  domain_rank ] = b ;
+      m_length[ range_rank  ] = e - b ;
+      m_index[  range_rank  ] = domain_rank ;
+
+      return set( domain_rank + 1 , range_rank + 1 , dim , args... )
+#if defined( KOKKOS_ENABLE_DEBUG_BOUNDS_CHECK )
+             && ( val.size() == 2 )
+             && ( b <= e ) && ( e <= dim.extent( domain_rank ) ) // was 'e <= b + extent'
+#endif
+      ;
+    }
+
+  //------------------------------
+
+#if defined( KOKKOS_ENABLE_DEBUG_BOUNDS_CHECK )
+
+  template< size_t ... DimArgs >
+  void error( char * /* buf */
+            , int /* buf_len */
+            , unsigned /* domain_rank */
+            , unsigned /* range_rank */
+            , const ViewDimension< DimArgs ... > & /* dim */ ) const
+    { /* end of recursion : every argument has been formatted */ }
+
+  template< class T , size_t ... DimArgs , class ... Args >
+  void error( char * buf , int buf_len
+            , unsigned domain_rank
+            , unsigned range_rank
+            , const ViewDimension< DimArgs ... > & dim
+            , const T & val
+            , Args ... args ) const
+    {
+      // Append " <index> < <extent> <sep>" ; <sep> is ',' while arguments remain, else ')'.
+      const int sep = int( sizeof...(Args) ? ',' : ')' );
+      const unsigned long index  = static_cast<unsigned long>(val);
+      const unsigned long extent = static_cast<unsigned long>( dim.extent( domain_rank ) );
+      const int wrote = snprintf( buf , buf_len , " %lu < %lu %c" , index , extent , sep );
+      const int n = std::min( buf_len , wrote );
+
+      error( buf+n, buf_len-n, domain_rank + 1 , range_rank , dim , args... );
+    }
+
+  // ALL_t
+  template< size_t ... DimArgs , class ... Args >
+  void error( char * buf , int buf_len
+            , unsigned domain_rank
+            , unsigned range_rank
+            , const ViewDimension< DimArgs ... > & dim
+            , const Kokkos::Experimental::Impl::ALL_t
+            , Args ... args ) const
+    {
+      // Append " Kokkos::ALL <sep>" so that later arguments keep their position in the message.
+      const int sep   = int( sizeof...(Args) ? ',' : ')' );
+      const int wrote = snprintf( buf , buf_len , " Kokkos::ALL %c" , sep );
+      const int n     = std::min( buf_len , wrote );
+
+      error( buf+n , buf_len-n , domain_rank + 1 , range_rank + 1 , dim , args... );
+    }
+
+  // std::pair range : appends " <extent> <= <second> - <first> <sep>" to buf
+  template< class T , size_t ... DimArgs , class ... Args >
+  void error( char * buf , int buf_len
+            , unsigned domain_rank
+            , unsigned range_rank
+            , const ViewDimension< DimArgs ... > & dim
+            , const std::pair<T,T> & val
+            , Args ... args ) const
+    {
+      // d <= e - b   ( std::pair has no 'begin' member : the range start is val.first )
+      const int n = std::min( buf_len ,
+        snprintf( buf , buf_len
+                , " %lu <= %lu - %lu %c"
+                , static_cast<unsigned long>( dim.extent( domain_rank ) )
+                , static_cast<unsigned long>( val.second )
+                , static_cast<unsigned long>( val.first )
+                , int( sizeof...(Args) ? ',' : ')' ) ) );
+
+      error( buf+n , buf_len-n , domain_rank + 1 , range_rank + 1 , dim , args... );
+    }
+
+  // Kokkos::pair range : appends " <extent> <= <second> - <first> <sep>" to buf
+  template< class T , size_t ... DimArgs , class ... Args >
+  void error( char * buf , int buf_len
+            , unsigned domain_rank
+            , unsigned range_rank
+            , const ViewDimension< DimArgs ... > & dim
+            , const Kokkos::pair<T,T> & val
+            , Args ... args ) const
+    {
+      // d <= e - b   ( the range start is val.first , matching the set() overload for Kokkos::pair )
+      const int n = std::min( buf_len ,
+        snprintf( buf , buf_len
+                , " %lu <= %lu - %lu %c"
+                , static_cast<unsigned long>( dim.extent( domain_rank ) )
+                , static_cast<unsigned long>( val.second )
+                , static_cast<unsigned long>( val.first )
+                , int( sizeof...(Args) ? ',' : ')' ) ) );
+
+      error( buf+n , buf_len-n , domain_rank + 1 , range_rank + 1 , dim , args... );
+    }
+
+  // { begin , end } range : appends the range, or the list size when it is not 2, to buf
+  template< class T , size_t ... DimArgs , class ... Args >
+  void error( char * buf , int buf_len
+            , unsigned domain_rank
+            , unsigned range_rank
+            , const ViewDimension< DimArgs ... > & dim
+            , const std::initializer_list< T > & val
+            , Args ... args ) const
+    {
+      // d <= e - b   ( e is element [1] , b is element [0] ; same order as the pair overloads )
+      int n = 0 ;
+      if ( val.size() == 2 ) {
+        n = std::min( buf_len ,
+          snprintf( buf , buf_len
+                  , " %lu <= %lu - %lu %c"
+                  , static_cast<unsigned long>( dim.extent( domain_rank ) )
+                  , static_cast<unsigned long>( val.begin()[1] )
+                  , static_cast<unsigned long>( val.begin()[0] )
+                  , int( sizeof...(Args) ? ',' : ')' ) ) );
+      }
+      else {
+        n = std::min( buf_len ,
+          snprintf( buf , buf_len
+                  , " { ... }.size() == %u %c"
+                  , unsigned(val.size())
+                  , int( sizeof...(Args) ? ',' : ')' ) ) );
+      }
+
+      error( buf+n , buf_len-n , domain_rank + 1 , range_rank + 1 , dim , args... );
+    }
+
+  template< size_t ... DimArgs , class ... Args >
+  KOKKOS_FORCEINLINE_FUNCTION
+  void error( const ViewDimension< DimArgs ... > & dim , Args ... args ) const
+    {
+#if defined( KOKKOS_ACTIVE_EXECUTION_SPACE_HOST )
+      // Host : format every argument into the message, then hand it to throw_runtime_exception.
+      enum { CAPACITY = 1024 };
+      char message[ CAPACITY ];
+      const int used = snprintf(message,CAPACITY,"Kokkos::subview bounds error (");
+      error( message+used , CAPACITY-used , 0 , 0 , dim , args... );
+
+      Kokkos::Impl::throw_runtime_exception(std::string(message));
+#else
+      Kokkos::abort("Kokkos::subview bounds error");
+#endif
+    }
+
+#else
+
+  template< size_t ... DimArgs , class ... Args >
+  KOKKOS_FORCEINLINE_FUNCTION
+  void error( const ViewDimension< DimArgs ... > & , Args ... ) const {} // no-op when KOKKOS_ENABLE_DEBUG_BOUNDS_CHECK is not defined
+
+#endif
+
+public:
+
+  template< size_t ... DimArgs , class ... Args >
+  KOKKOS_INLINE_FUNCTION
+  SubviewExtents( const ViewDimension< DimArgs ... > & dim , Args ... args )
+    {
+      static_assert( sizeof...(Args)    == DomainRank , "" );
+      static_assert( sizeof...(DimArgs) == DomainRank , "" );
+
+      // RangeRank must equal the number of arguments, among positions 0..7,
+      // for which is_integral_extent<>::value is true.
+      static_assert( RangeRank ==
+        unsigned( is_integral_extent<0,Args...>::value ) +
+        unsigned( is_integral_extent<1,Args...>::value ) +
+        unsigned( is_integral_extent<2,Args...>::value ) +
+        unsigned( is_integral_extent<3,Args...>::value ) +
+        unsigned( is_integral_extent<4,Args...>::value ) +
+        unsigned( is_integral_extent<5,Args...>::value ) +
+        unsigned( is_integral_extent<6,Args...>::value ) +
+        unsigned( is_integral_extent<7,Args...>::value ) , "" );
+
+      if ( 0 == RangeRank ) { m_index[0] = ~0u ; m_length[0] = 0 ; }
+      const bool accepted = set( 0 , 0 , dim , args... );
+      if ( ! accepted ) { error( dim , args... ); }
+    }
+
+  template < typename iType >
+  KOKKOS_FORCEINLINE_FUNCTION
+  constexpr size_t domain_offset( const iType i ) const
+    { return ( DomainRank <= unsigned(i) ) ? 0 : m_begin[i] ; } // 0 when i is not a domain rank
+
+  template < typename iType >
+  KOKKOS_FORCEINLINE_FUNCTION
+  constexpr size_t range_extent( const iType i ) const
+    { return ( InternalRangeRank <= unsigned(i) ) ? 0 : m_length[i] ; } // 0 when i is past the stored range ranks
+
+  template < typename iType >
+  KOKKOS_FORCEINLINE_FUNCTION
+  constexpr unsigned range_index( const iType i ) const
+    { return ( InternalRangeRank <= unsigned(i) ) ? ~0u : m_index[i] ; } // ~0u when i is past the stored range ranks
+};
+
+}}} // namespace Kokkos::Experimental::Impl
+
+//----------------------------------------------------------------------------
+//----------------------------------------------------------------------------
+
+namespace Kokkos {
+namespace Experimental {
+namespace Impl {
+
+/** \brief  Build the View data type from a value type and a ViewDimension list */
+template< class T , class Dim >
+struct ViewDataType ;
+
+template< class T >
+struct ViewDataType< T , ViewDimension<> >
+{
+  using type = T ; // no dimension left : the type is complete
+};
+
+template< class T , size_t ... Args >
+struct ViewDataType< T , ViewDimension< 0 , Args... > >
+{
+  using type = typename ViewDataType< T * , ViewDimension< Args... > >::type ; // leading 0 adds a '*'
+};
+
+template< class T , size_t N , size_t ... Args >
+struct ViewDataType< T , ViewDimension< N , Args... > >
+{
+  using type = typename ViewDataType< T , ViewDimension< Args... > >::type [ N ] ; // leading N adds '[N]'
+};
+
+/**\brief  Analysis of View data type.
+ *
+ *  Data type conforms to one of the following patterns :
+ *    {const} value_type [][#][#][#]
+ *    {const} value_type ***[#][#][#]
+ *  Where the sum of counts of '*' and '[#]' is at most ten.
+ *
+ *  Provides the value_type aliases and the ViewDimension<...> lists;
+ *  this primary template is the case with no '*' , '[]' or '[#]' left.
+ */
+template< class T >
+struct ViewArrayAnalysis
+{
+  using value_type           = T ;
+  using const_value_type     = typename std::add_const<    T >::type ;
+  using non_const_value_type = typename std::remove_const< T >::type ;
+  using static_dimension     = ViewDimension<> ;
+  using dynamic_dimension    = ViewDimension<> ;
+  using dimension            = ViewDimension<> ;
+};
+
+template< class T , size_t N >
+struct ViewArrayAnalysis< T[N] >
+{
+private:
+  using nested = ViewArrayAnalysis< T > ; // analysis of the element type
+public:
+  using value_type           = typename nested::value_type ;
+  using const_value_type     = typename nested::const_value_type ;
+  using non_const_value_type = typename nested::non_const_value_type ;
+
+  // '[N]' prepends N to the static list ; the dynamic list is inherited unchanged.
+  using static_dimension = typename nested::static_dimension::template prepend<N>::type ;
+
+  using dynamic_dimension = typename nested::dynamic_dimension ;
+
+  // ViewDimensionJoin of the dynamic list followed by the static list.
+  using dimension =
+    typename ViewDimensionJoin< dynamic_dimension , static_dimension >::type ;
+};
+
+template< class T >
+struct ViewArrayAnalysis< T[] >
+{
+private:
+  using nested           = ViewArrayAnalysis< T > ; // analysis of the element type
+  using nested_dimension = typename nested::dimension ;
+public:
+  using value_type           = typename nested::value_type ;
+  using const_value_type     = typename nested::const_value_type ;
+  using non_const_value_type = typename nested::non_const_value_type ;
+
+  // '[]' prepends 0 to the dynamic list ; the static list is inherited unchanged.
+  using dynamic_dimension = typename nested::dynamic_dimension::template prepend<0>::type ;
+
+  using static_dimension = typename nested::static_dimension ;
+
+  // ViewDimensionJoin of the dynamic list followed by the static list.
+  using dimension =
+    typename ViewDimensionJoin< dynamic_dimension , static_dimension >::type ;
+};
+
+template< class T >
+struct ViewArrayAnalysis< T* >
+{
+private:
+  using nested = ViewArrayAnalysis< T > ; // analysis of the pointee type
+public:
+  using value_type           = typename nested::value_type ;
+  using const_value_type     = typename nested::const_value_type ;
+  using non_const_value_type = typename nested::non_const_value_type ;
+
+  // '*' prepends 0 to the dynamic list ; the static list is inherited unchanged.
+  using dynamic_dimension = typename nested::dynamic_dimension::template prepend<0>::type ;
+
+  using static_dimension = typename nested::static_dimension ;
+
+  // ViewDimensionJoin of the dynamic list followed by the static list.
+  using dimension =
+    typename ViewDimensionJoin< dynamic_dimension , static_dimension >::type ;
+};
+
+
+template< class DataType , class ArrayLayout , class ValueType >
+struct ViewDataAnalysis
+{
+private:
+
+  using array_analysis = ViewArrayAnalysis< DataType > ;
+
+  // ValueType exists so that partial specializations can be selected on it.
+  // This default template requires it to be the array analysis' non-const value type.
+  static_assert( std::is_same< ValueType , typename array_analysis::non_const_value_type >::value , "" );
+
+public:
+
+  using specialize = void ; // No specialization
+
+  using dimension            = typename array_analysis::dimension ;
+  using value_type           = typename array_analysis::value_type ;
+  using const_value_type     = typename array_analysis::const_value_type ;
+  using non_const_value_type = typename array_analysis::non_const_value_type ;
+
+  // Rebuild the multidimensional array specification type for each constness.
+  using type           = typename ViewDataType<           value_type , dimension >::type ;
+  using const_type     = typename ViewDataType<     const_value_type , dimension >::type ;
+  using non_const_type = typename ViewDataType< non_const_value_type , dimension >::type ;
+
+  // The "flattened" array types are the same types in this default template.
+  using scalar_array_type           = type ;
+  using const_scalar_array_type     = const_type ;
+  using non_const_scalar_array_type = non_const_type ;
+};
+
+}}} // namespace Kokkos::Experimental::Impl
+
+//----------------------------------------------------------------------------
+//----------------------------------------------------------------------------
+
+namespace Kokkos {
+namespace Experimental {
+namespace Impl {
+
+template < class Dimension , class Layout , typename Enable = void >
+struct ViewOffset {
+  typedef std::false_type is_mapping_plugin ; // primary template : not a usable mapping
+};
+
+//----------------------------------------------------------------------------
+// LayoutLeft AND ( 1 >= rank OR 0 == rank_dynamic ) : no padding / striding, the extents alone define the mapping
+template < class Dimension >
+struct ViewOffset< Dimension , Kokkos::LayoutLeft
+                 , typename std::enable_if<( 1 >= Dimension::rank
+                                             ||
+                                             0 == Dimension::rank_dynamic
+                                           )>::type >
+{
+  using is_mapping_plugin = std::true_type ;
+  using is_regular        = std::true_type ;
+
+  typedef size_t             size_type ;
+  typedef Dimension          dimension_type ;
+  typedef Kokkos::LayoutLeft array_layout ;
+
+  dimension_type m_dim ; // extents N0..N7 ; the only data member of this specialization
+
+  //----------------------------------------
+  // Index -> offset mapping ; i0 has stride 1 and each later index is scaled by the product of the earlier extents.
+
+  // rank 1 : the offset is the index itself
+  template< typename I0 >
+  KOKKOS_INLINE_FUNCTION constexpr
+  size_type operator()( I0 const & i0 ) const { return i0 ; }
+
+  // rank 2
+  template < typename I0 , typename I1 >
+  KOKKOS_INLINE_FUNCTION constexpr
+  size_type operator()( I0 const & i0 , I1 const & i1 ) const
+    { return i0 + m_dim.N0 * i1 ; }
+
+  //rank 3
+  template < typename I0, typename I1, typename I2 >
+  KOKKOS_INLINE_FUNCTION constexpr
+  size_type operator()( I0 const & i0, I1 const & i1, I2 const & i2 ) const
+  {
+    return i0 + m_dim.N0 * ( i1 + m_dim.N1 * i2 );
+  }
+
+  //rank 4
+  template < typename I0, typename I1, typename I2, typename I3 >
+  KOKKOS_INLINE_FUNCTION constexpr
+  size_type operator()( I0 const & i0, I1 const & i1, I2 const & i2, I3 const & i3 ) const
+  {
+    return i0 + m_dim.N0 * (
+           i1 + m_dim.N1 * (
+           i2 + m_dim.N2 * i3 ));
+  }
+
+  //rank 5
+  template < typename I0, typename I1, typename I2, typename I3
+           , typename I4 >
+  KOKKOS_INLINE_FUNCTION constexpr
+  size_type operator()( I0 const & i0, I1 const & i1, I2 const & i2, I3 const & i3
+                      , I4 const & i4 ) const
+  {
+    return i0 + m_dim.N0 * (
+           i1 + m_dim.N1 * (
+           i2 + m_dim.N2 * (
+           i3 + m_dim.N3 * i4 )));
+  }
+
+  //rank 6
+  template < typename I0, typename I1, typename I2, typename I3
+           , typename I4, typename I5 >
+  KOKKOS_INLINE_FUNCTION constexpr
+  size_type operator()( I0 const & i0, I1 const & i1, I2 const & i2, I3 const & i3
+                      , I4 const & i4, I5 const & i5 ) const
+  {
+    return i0 + m_dim.N0 * (
+           i1 + m_dim.N1 * (
+           i2 + m_dim.N2 * (
+           i3 + m_dim.N3 * (
+           i4 + m_dim.N4 * i5 ))));
+  }
+
+  //rank 7
+  template < typename I0, typename I1, typename I2, typename I3
+           , typename I4, typename I5, typename I6 >
+  KOKKOS_INLINE_FUNCTION constexpr
+  size_type operator()( I0 const & i0, I1 const & i1, I2 const & i2, I3 const & i3
+                      , I4 const & i4, I5 const & i5, I6 const & i6 ) const
+  {
+    return i0 + m_dim.N0 * (
+           i1 + m_dim.N1 * (
+           i2 + m_dim.N2 * (
+           i3 + m_dim.N3 * (
+           i4 + m_dim.N4 * (
+           i5 + m_dim.N5 * i6 )))));
+  }
+
+  //rank 8
+  template < typename I0, typename I1, typename I2, typename I3
+           , typename I4, typename I5, typename I6, typename I7 >
+  KOKKOS_INLINE_FUNCTION constexpr
+  size_type operator()( I0 const & i0, I1 const & i1, I2 const & i2, I3 const & i3
+                      , I4 const & i4, I5 const & i5, I6 const & i6, I7 const & i7 ) const
+  {
+    return i0 + m_dim.N0 * (
+           i1 + m_dim.N1 * (
+           i2 + m_dim.N2 * (
+           i3 + m_dim.N3 * (
+           i4 + m_dim.N4 * (
+           i5 + m_dim.N5 * (
+           i6 + m_dim.N6 * i7 ))))));
+  }
+
+  //----------------------------------------
+
+  KOKKOS_INLINE_FUNCTION
+  constexpr array_layout layout() const
+    {
+      return array_layout( m_dim.N0 , m_dim.N1 , m_dim.N2 , m_dim.N3
+                         , m_dim.N4 , m_dim.N5 , m_dim.N6 , m_dim.N7 );
+    }
+
+  KOKKOS_INLINE_FUNCTION constexpr size_type dimension_0() const { return m_dim.N0 ; }
+  KOKKOS_INLINE_FUNCTION constexpr size_type dimension_1() const { return m_dim.N1 ; }
+  KOKKOS_INLINE_FUNCTION constexpr size_type dimension_2() const { return m_dim.N2 ; }
+  KOKKOS_INLINE_FUNCTION constexpr size_type dimension_3() const { return m_dim.N3 ; }
+  KOKKOS_INLINE_FUNCTION constexpr size_type dimension_4() const { return m_dim.N4 ; }
+  KOKKOS_INLINE_FUNCTION constexpr size_type dimension_5() const { return m_dim.N5 ; }
+  KOKKOS_INLINE_FUNCTION constexpr size_type dimension_6() const { return m_dim.N6 ; }
+  KOKKOS_INLINE_FUNCTION constexpr size_type dimension_7() const { return m_dim.N7 ; }
+
+  /* Cardinality of the domain index space : product of all eight extents */
+  KOKKOS_INLINE_FUNCTION
+  constexpr size_type size() const
+    { return m_dim.N0 * m_dim.N1 * m_dim.N2 * m_dim.N3 * m_dim.N4 * m_dim.N5 * m_dim.N6 * m_dim.N7 ; }
+
+  /* Span of the range space : same product as size() because there is no padding */
+  KOKKOS_INLINE_FUNCTION
+  constexpr size_type span() const
+    { return m_dim.N0 * m_dim.N1 * m_dim.N2 * m_dim.N3 * m_dim.N4 * m_dim.N5 * m_dim.N6 * m_dim.N7 ; }
+
+  KOKKOS_INLINE_FUNCTION constexpr bool span_is_contiguous() const { return true ; }
+
+  /* Strides of dimensions : stride_k is the product of extents N0 .. N(k-1) */
+  KOKKOS_INLINE_FUNCTION constexpr size_type stride_0() const { return 1 ; }
+  KOKKOS_INLINE_FUNCTION constexpr size_type stride_1() const { return m_dim.N0 ; }
+  KOKKOS_INLINE_FUNCTION constexpr size_type stride_2() const { return m_dim.N0 * m_dim.N1 ; }
+  KOKKOS_INLINE_FUNCTION constexpr size_type stride_3() const { return m_dim.N0 * m_dim.N1 * m_dim.N2 ; }
+  KOKKOS_INLINE_FUNCTION constexpr size_type stride_4() const { return m_dim.N0 * m_dim.N1 * m_dim.N2 * m_dim.N3 ; }
+  KOKKOS_INLINE_FUNCTION constexpr size_type stride_5() const { return m_dim.N0 * m_dim.N1 * m_dim.N2 * m_dim.N3 * m_dim.N4 ; }
+  KOKKOS_INLINE_FUNCTION constexpr size_type stride_6() const { return m_dim.N0 * m_dim.N1 * m_dim.N2 * m_dim.N3 * m_dim.N4 * m_dim.N5 ; }
+  KOKKOS_INLINE_FUNCTION constexpr size_type stride_7() const { return m_dim.N0 * m_dim.N1 * m_dim.N2 * m_dim.N3 * m_dim.N4 * m_dim.N5 * m_dim.N6 ; }
+
+  // Writes s[0] .. s[rank] ; the [ rank ] entry is the total length, so s must hold rank + 1 values
+  template< typename iType >
+  KOKKOS_INLINE_FUNCTION
+  void stride( iType * const s ) const
+    {
+      s[0] = 1 ;
+      if ( 0 < dimension_type::rank ) { s[1] = m_dim.N0 ; }
+      if ( 1 < dimension_type::rank ) { s[2] = s[1] * m_dim.N1 ; }
+      if ( 2 < dimension_type::rank ) { s[3] = s[2] * m_dim.N2 ; }
+      if ( 3 < dimension_type::rank ) { s[4] = s[3] * m_dim.N3 ; }
+      if ( 4 < dimension_type::rank ) { s[5] = s[4] * m_dim.N4 ; }
+      if ( 5 < dimension_type::rank ) { s[6] = s[5] * m_dim.N5 ; }
+      if ( 6 < dimension_type::rank ) { s[7] = s[6] * m_dim.N6 ; }
+      if ( 7 < dimension_type::rank ) { s[8] = s[7] * m_dim.N7 ; }
+    }
+
+  //----------------------------------------
+
+  ViewOffset() = default ;
+  ViewOffset( const ViewOffset & ) = default ;
+  ViewOffset & operator = ( const ViewOffset & ) = default ;
+
+  template< unsigned TrivialScalarSize >
+  KOKKOS_INLINE_FUNCTION
+  constexpr ViewOffset
+    ( std::integral_constant<unsigned,TrivialScalarSize> const &
+    , Kokkos::LayoutLeft const & arg_layout
+    )
+    : m_dim( arg_layout.dimension[0], 0, 0, 0, 0, 0, 0, 0 ) // only dimension[0] is read ; presumably any further extents are static here - TODO confirm
+    {}
+
+  template< class DimRHS >
+  KOKKOS_INLINE_FUNCTION
+  constexpr ViewOffset( const ViewOffset< DimRHS , Kokkos::LayoutLeft , void > & rhs )
+    : m_dim( rhs.m_dim.N0 , rhs.m_dim.N1 , rhs.m_dim.N2 , rhs.m_dim.N3 
+           , rhs.m_dim.N4 , rhs.m_dim.N5 , rhs.m_dim.N6 , rhs.m_dim.N7 )
+    {
+      static_assert( int(DimRHS::rank) == int(dimension_type::rank) , "ViewOffset assignment requires equal rank" );
+      // Also requires equal static dimensions ... ( not checked here )
+    } 
+
+  template< class DimRHS >
+  KOKKOS_INLINE_FUNCTION
+  constexpr ViewOffset( const ViewOffset< DimRHS , Kokkos::LayoutRight , void > & rhs )
+    : m_dim( rhs.m_dim.N0, 0, 0, 0, 0, 0, 0, 0 )
+    {
+      static_assert( DimRHS::rank == 1 && dimension_type::rank == 1 && dimension_type::rank_dynamic == 1
+                   , "ViewOffset LayoutLeft and LayoutRight are only compatible when rank == 1" );
+    }
+
+  template< class DimRHS >
+  KOKKOS_INLINE_FUNCTION
+  ViewOffset( const ViewOffset< DimRHS , Kokkos::LayoutStride , void > & rhs )
+    : m_dim( rhs.m_dim.N0, 0, 0, 0, 0, 0, 0, 0 )
+    {
+      static_assert( DimRHS::rank == 1 && dimension_type::rank == 1 && dimension_type::rank_dynamic == 1
+                   , "ViewOffset LayoutLeft and LayoutStride are only compatible when rank == 1" );
+      if ( rhs.m_stride.S0 != 1 ) { // run-time check : a strided source is only accepted with unit stride
+        Kokkos::abort("Kokkos::Experimental::ViewOffset assignment of LayoutLeft from LayoutStride  requires stride == 1" );
+      }
+    }
+
+  //----------------------------------------
+  // Subview construction : only range_extent(0) of the subview extents is used
+
+  template< class DimRHS >
+  KOKKOS_INLINE_FUNCTION
+  constexpr ViewOffset(
+    const ViewOffset< DimRHS , Kokkos::LayoutLeft , void > & rhs ,
+    const SubviewExtents< DimRHS::rank , dimension_type::rank > & sub )
+    : m_dim( sub.range_extent(0), 0, 0, 0, 0, 0, 0, 0 )
+    {
+      static_assert( ( 0 == dimension_type::rank ) ||
+                     ( 1 == dimension_type::rank && 1 == dimension_type::rank_dynamic && 1 <= DimRHS::rank )
+                   , "ViewOffset subview construction requires compatible rank" );
+    }
+};
+
+//----------------------------------------------------------------------------
+// LayoutLeft AND ( 1 < rank AND 0 < rank_dynamic ) : has padding / striding
+template < class Dimension >
+struct ViewOffset< Dimension , Kokkos::LayoutLeft
+                 , typename std::enable_if<( 1 < Dimension::rank
+                                             &&
+                                             0 < Dimension::rank_dynamic
+                                           )>::type >
+{
+  using is_mapping_plugin = std::true_type ;
+  using is_regular        = std::true_type ;
+
+  typedef size_t             size_type ;
+  typedef Dimension          dimension_type ;
+  typedef Kokkos::LayoutLeft array_layout ;
+
+  dimension_type m_dim ;
+  size_type      m_stride ;
+
+  //----------------------------------------
+
+  // rank 1
+  template< typename I0 >
+  KOKKOS_INLINE_FUNCTION constexpr
+  size_type operator()( I0 const & i0 ) const { return i0 ; }
+
+  // rank 2
+  template < typename I0 , typename I1 >
+  KOKKOS_INLINE_FUNCTION constexpr
+  size_type operator()( I0 const & i0 , I1 const & i1 ) const
+    { return i0 + m_stride * i1 ; }
+
+  //rank 3
+  template < typename I0, typename I1, typename I2 >
+  KOKKOS_INLINE_FUNCTION constexpr
+  size_type operator()( I0 const & i0, I1 const & i1, I2 const & i2 ) const
+  {
+    return i0 + m_stride * ( i1 + m_dim.N1 * i2 );
+  }
+
+  //rank 4
+  template < typename I0, typename I1, typename I2, typename I3 >
+  KOKKOS_INLINE_FUNCTION constexpr
+  size_type operator()( I0 const & i0, I1 const & i1, I2 const & i2, I3 const & i3 ) const
+  {
+    return i0 + m_stride * (
+           i1 + m_dim.N1 * (
+           i2 + m_dim.N2 * i3 ));
+  }
+
+  //rank 5
+  template < typename I0, typename I1, typename I2, typename I3
+           , typename I4 >
+  KOKKOS_INLINE_FUNCTION constexpr
+  size_type operator()( I0 const & i0, I1 const & i1, I2 const & i2, I3 const & i3
+                      , I4 const & i4 ) const
+  {
+    return i0 + m_stride * (
+           i1 + m_dim.N1 * (
+           i2 + m_dim.N2 * (
+           i3 + m_dim.N3 * i4 )));
+  }
+
+  //rank 6
+  template < typename I0, typename I1, typename I2, typename I3
+           , typename I4, typename I5 >
+  KOKKOS_INLINE_FUNCTION constexpr
+  size_type operator()( I0 const & i0, I1 const & i1, I2 const & i2, I3 const & i3
+                      , I4 const & i4, I5 const & i5 ) const
+  {
+    return i0 + m_stride * (
+           i1 + m_dim.N1 * (
+           i2 + m_dim.N2 * (
+           i3 + m_dim.N3 * (
+           i4 + m_dim.N4 * i5 ))));
+  }
+
+  //rank 7
+  template < typename I0, typename I1, typename I2, typename I3
+           , typename I4, typename I5, typename I6 >
+  KOKKOS_INLINE_FUNCTION constexpr
+  size_type operator()( I0 const & i0, I1 const & i1, I2 const & i2, I3 const & i3
+                      , I4 const & i4, I5 const & i5, I6 const & i6 ) const
+  {
+    return i0 + m_stride * (
+           i1 + m_dim.N1 * (
+           i2 + m_dim.N2 * (
+           i3 + m_dim.N3 * (
+           i4 + m_dim.N4 * (
+           i5 + m_dim.N5 * i6 )))));
+  }
+
+  //rank 8
+  template < typename I0, typename I1, typename I2, typename I3
+           , typename I4, typename I5, typename I6, typename I7 >
+  KOKKOS_INLINE_FUNCTION constexpr
+  size_type operator()( I0 const & i0, I1 const & i1, I2 const & i2, I3 const & i3
+                      , I4 const & i4, I5 const & i5, I6 const & i6, I7 const & i7 ) const
+  {
+    return i0 + m_stride * (
+           i1 + m_dim.N1 * (
+           i2 + m_dim.N2 * (
+           i3 + m_dim.N3 * (
+           i4 + m_dim.N4 * (
+           i5 + m_dim.N5 * (
+           i6 + m_dim.N6 * i7 ))))));
+  }
+
+  //----------------------------------------
+
+  KOKKOS_INLINE_FUNCTION
+  constexpr array_layout layout() const
+    {
+      return array_layout( m_dim.N0 , m_dim.N1 , m_dim.N2 , m_dim.N3
+                         , m_dim.N4 , m_dim.N5 , m_dim.N6 , m_dim.N7 );
+    }
+
+  KOKKOS_INLINE_FUNCTION constexpr size_type dimension_0() const { return m_dim.N0 ; }
+  KOKKOS_INLINE_FUNCTION constexpr size_type dimension_1() const { return m_dim.N1 ; }
+  KOKKOS_INLINE_FUNCTION constexpr size_type dimension_2() const { return m_dim.N2 ; }
+  KOKKOS_INLINE_FUNCTION constexpr size_type dimension_3() const { return m_dim.N3 ; }
+  KOKKOS_INLINE_FUNCTION constexpr size_type dimension_4() const { return m_dim.N4 ; }
+  KOKKOS_INLINE_FUNCTION constexpr size_type dimension_5() const { return m_dim.N5 ; }
+  KOKKOS_INLINE_FUNCTION constexpr size_type dimension_6() const { return m_dim.N6 ; }
+  KOKKOS_INLINE_FUNCTION constexpr size_type dimension_7() const { return m_dim.N7 ; }
+
+  /* Cardinality of the domain index space */
+  KOKKOS_INLINE_FUNCTION
+  constexpr size_type size() const
+    { return m_dim.N0 * m_dim.N1 * m_dim.N2 * m_dim.N3 * m_dim.N4 * m_dim.N5 * m_dim.N6 * m_dim.N7 ; }
+
+  /* Span of the range space */
+  KOKKOS_INLINE_FUNCTION
+  constexpr size_type span() const
+    { return m_stride * m_dim.N1 * m_dim.N2 * m_dim.N3 * m_dim.N4 * m_dim.N5 * m_dim.N6 * m_dim.N7 ; }
+
+  KOKKOS_INLINE_FUNCTION constexpr bool span_is_contiguous() const { return m_stride == m_dim.N0 ; }
+
+  /* Strides of dimensions */
+  KOKKOS_INLINE_FUNCTION constexpr size_type stride_0() const { return 1 ; }
+  KOKKOS_INLINE_FUNCTION constexpr size_type stride_1() const { return m_stride ; }
+  KOKKOS_INLINE_FUNCTION constexpr size_type stride_2() const { return m_stride * m_dim.N1 ; }
+  KOKKOS_INLINE_FUNCTION constexpr size_type stride_3() const { return m_stride * m_dim.N1 * m_dim.N2 ; }
+  KOKKOS_INLINE_FUNCTION constexpr size_type stride_4() const { return m_stride * m_dim.N1 * m_dim.N2 * m_dim.N3 ; }
+  KOKKOS_INLINE_FUNCTION constexpr size_type stride_5() const { return m_stride * m_dim.N1 * m_dim.N2 * m_dim.N3 * m_dim.N4 ; }
+  KOKKOS_INLINE_FUNCTION constexpr size_type stride_6() const { return m_stride * m_dim.N1 * m_dim.N2 * m_dim.N3 * m_dim.N4 * m_dim.N5 ; }
+  KOKKOS_INLINE_FUNCTION constexpr size_type stride_7() const { return m_stride * m_dim.N1 * m_dim.N2 * m_dim.N3 * m_dim.N4 * m_dim.N5 * m_dim.N6 ; }
+
+  // Stride with [ rank ] value is the total length
+  template< typename iType >
+  KOKKOS_INLINE_FUNCTION
+  void stride( iType * const s ) const
+    {
+      s[0] = 1 ;
+      if ( 0 < dimension_type::rank ) { s[1] = m_stride ; }
+      if ( 1 < dimension_type::rank ) { s[2] = s[1] * m_dim.N1 ; }
+      if ( 2 < dimension_type::rank ) { s[3] = s[2] * m_dim.N2 ; }
+      if ( 3 < dimension_type::rank ) { s[4] = s[3] * m_dim.N3 ; }
+      if ( 4 < dimension_type::rank ) { s[5] = s[4] * m_dim.N4 ; }
+      if ( 5 < dimension_type::rank ) { s[6] = s[5] * m_dim.N5 ; }
+      if ( 6 < dimension_type::rank ) { s[7] = s[6] * m_dim.N6 ; }
+      if ( 7 < dimension_type::rank ) { s[8] = s[7] * m_dim.N7 ; }
+    }
+
+  //----------------------------------------
+
+private:
+
+  template< unsigned TrivialScalarSize >
+  struct Padding {
+    enum { div = TrivialScalarSize == 0 ? 0 : Kokkos::Impl::MEMORY_ALIGNMENT / ( TrivialScalarSize ? TrivialScalarSize : 1 ) };
+    enum { mod = TrivialScalarSize == 0 ? 0 : Kokkos::Impl::MEMORY_ALIGNMENT % ( TrivialScalarSize ? TrivialScalarSize : 1 ) };
+
+    // If memory alignment is a multiple of the trivial scalar size then attempt to align.
+    enum { align = 0 != TrivialScalarSize && 0 == mod ? div : 0 };
+    enum { div_ok = div ? div : 1 }; // To valid modulo zero in constexpr
+
+    KOKKOS_INLINE_FUNCTION
+    static constexpr size_t stride( size_t const N )
+      {
+        return ( align && ( Kokkos::Impl::MEMORY_ALIGNMENT_THRESHOLD * align < N ) && ( N % div_ok ) )
+               ? N + align - ( N % div_ok ) : N ;
+      }
+  };
+
+public:
+
+  ViewOffset() = default ;
+  ViewOffset( const ViewOffset & ) = default ;
+  ViewOffset & operator = ( const ViewOffset & ) = default ;
+
+  /* Enable padding for trivial scalar types with non-zero trivial scalar size */
+  template< unsigned TrivialScalarSize >
+  KOKKOS_INLINE_FUNCTION
+  constexpr ViewOffset
+    ( std::integral_constant<unsigned,TrivialScalarSize> const & padding_type_size
+    , Kokkos::LayoutLeft const & arg_layout
+    )
+    : m_dim( arg_layout.dimension[0] , arg_layout.dimension[1]
+           , arg_layout.dimension[2] , arg_layout.dimension[3]
+           , arg_layout.dimension[4] , arg_layout.dimension[5]
+           , arg_layout.dimension[6] , arg_layout.dimension[7]
+           )
+    , m_stride( Padding<TrivialScalarSize>::stride( arg_layout.dimension[0] ) )
+    {}
+
+  // Converting copy from a LayoutLeft offset whose dimension type differs but whose rank is equal.
+  template< class DimRHS > KOKKOS_INLINE_FUNCTION
+  constexpr ViewOffset( const ViewOffset< DimRHS , Kokkos::LayoutLeft , void > & other )
+    : m_dim( other.m_dim.N0 , other.m_dim.N1 , other.m_dim.N2 , other.m_dim.N3
+           , other.m_dim.N4 , other.m_dim.N5 , other.m_dim.N6 , other.m_dim.N7 )
+    , m_stride( other.stride_1() )   // reuse the source's stride_1() as this offset's stride
+    {
+      // Equal static dimensions are also required, but that is not checked here.
+      static_assert( static_cast<int>(DimRHS::rank) == static_cast<int>(dimension_type::rank) , "ViewOffset assignment requires equal rank" );
+    }
+
+  //----------------------------------------
+  // Subview construction
+  // This subview must be 2 == rank and 2 == rank_dynamic
+  // due to only having stride #0.
+  // The source dimension #0 must be non-zero for stride-one leading dimension.
+  // At most one subsequent dimension can be non-zero.
+
+  template< class DimRHS > // Subview constructor: builds a rank-2 offset from a LayoutLeft source and its subview extents.
+  KOKKOS_INLINE_FUNCTION
+  constexpr ViewOffset
+    ( const ViewOffset< DimRHS , Kokkos::LayoutLeft , void > & rhs ,
+      const SubviewExtents< DimRHS::rank , dimension_type::rank > & sub )
+    : m_dim( sub.range_extent(0)
+           , sub.range_extent(1)
+           , 0, 0, 0, 0, 0, 0 )
+    , m_stride( ( 1 == sub.range_index(1) ? rhs.stride_1() : // take the source stride numbered by range_index(1)
+                ( 2 == sub.range_index(1) ? rhs.stride_2() :
+                ( 3 == sub.range_index(1) ? rhs.stride_3() :
+                ( 4 == sub.range_index(1) ? rhs.stride_4() :
+                ( 5 == sub.range_index(1) ? rhs.stride_5() :
+                ( 6 == sub.range_index(1) ? rhs.stride_6() :
+                ( 7 == sub.range_index(1) ? rhs.stride_7() : 0 )))))))) // any index outside 1..7 gives a stride of 0
+    {
+      static_assert( ( 2 == dimension_type::rank ) &&
+                     ( 2 == dimension_type::rank_dynamic ) &&
+                     ( 2 <= DimRHS::rank )
+                   , "ViewOffset subview construction requires compatible rank" );
+    }
+};
+
+//----------------------------------------------------------------------------
+// LayoutRight AND ( 1 >= rank OR 0 == rank_dynamic ) : no padding / striding; only the extents are stored.
+template < class Dimension >
+struct ViewOffset< Dimension , Kokkos::LayoutRight
+                 , typename std::enable_if<( 1 >= Dimension::rank
+                                             ||
+                                             0 == Dimension::rank_dynamic
+                                           )>::type >
+{
+  using is_mapping_plugin = std::true_type ;
+  using is_regular        = std::true_type ;
+
+  typedef size_t              size_type ;
+  typedef Dimension           dimension_type ;
+  typedef Kokkos::LayoutRight array_layout ;
+
+  dimension_type m_dim ; // extents N0..N7; the sole data member of this specialization
+
+  //----------------------------------------
+  // Index-to-offset mapping, one overload per rank; the last index varies fastest.
+  // rank 1
+  template< typename I0 >
+  KOKKOS_INLINE_FUNCTION constexpr
+  size_type operator()( I0 const & i0 ) const { return i0 ; }
+
+  // rank 2
+  template < typename I0 , typename I1 >
+  KOKKOS_INLINE_FUNCTION constexpr
+  size_type operator()( I0 const & i0 , I1 const & i1 ) const
+    { return i1 + m_dim.N1 * i0 ; }
+
+  //rank 3
+  template < typename I0, typename I1, typename I2 >
+  KOKKOS_INLINE_FUNCTION constexpr
+  size_type operator()( I0 const & i0, I1 const & i1, I2 const & i2 ) const
+  {
+    return i2 + m_dim.N2 * ( i1 + m_dim.N1 * ( i0 ));
+  }
+
+  //rank 4
+  template < typename I0, typename I1, typename I2, typename I3 >
+  KOKKOS_INLINE_FUNCTION constexpr
+  size_type operator()( I0 const & i0, I1 const & i1, I2 const & i2, I3 const & i3 ) const
+  {
+    return i3 + m_dim.N3 * (
+           i2 + m_dim.N2 * (
+           i1 + m_dim.N1 * ( i0 )));
+  }
+
+  //rank 5
+  template < typename I0, typename I1, typename I2, typename I3
+           , typename I4 >
+  KOKKOS_INLINE_FUNCTION constexpr
+  size_type operator()( I0 const & i0, I1 const & i1, I2 const & i2, I3 const & i3
+                      , I4 const & i4 ) const
+  {
+    return i4 + m_dim.N4 * (
+           i3 + m_dim.N3 * (
+           i2 + m_dim.N2 * (
+           i1 + m_dim.N1 * ( i0 ))));
+  }
+
+  //rank 6
+  template < typename I0, typename I1, typename I2, typename I3
+           , typename I4, typename I5 >
+  KOKKOS_INLINE_FUNCTION constexpr
+  size_type operator()( I0 const & i0, I1 const & i1, I2 const & i2, I3 const & i3
+                      , I4 const & i4, I5 const & i5 ) const
+  {
+    return i5 + m_dim.N5 * (
+           i4 + m_dim.N4 * (
+           i3 + m_dim.N3 * (
+           i2 + m_dim.N2 * (
+           i1 + m_dim.N1 * ( i0 )))));
+  }
+
+  //rank 7
+  template < typename I0, typename I1, typename I2, typename I3
+           , typename I4, typename I5, typename I6 >
+  KOKKOS_INLINE_FUNCTION constexpr
+  size_type operator()( I0 const & i0, I1 const & i1, I2 const & i2, I3 const & i3
+                      , I4 const & i4, I5 const & i5, I6 const & i6 ) const
+  {
+    return i6 + m_dim.N6 * (
+           i5 + m_dim.N5 * (
+           i4 + m_dim.N4 * (
+           i3 + m_dim.N3 * (
+           i2 + m_dim.N2 * (
+           i1 + m_dim.N1 * ( i0 ))))));
+  }
+
+  //rank 8
+  template < typename I0, typename I1, typename I2, typename I3
+           , typename I4, typename I5, typename I6, typename I7 >
+  KOKKOS_INLINE_FUNCTION constexpr
+  size_type operator()( I0 const & i0, I1 const & i1, I2 const & i2, I3 const & i3
+                      , I4 const & i4, I5 const & i5, I6 const & i6, I7 const & i7 ) const
+  {
+    return i7 + m_dim.N7 * (
+           i6 + m_dim.N6 * (
+           i5 + m_dim.N5 * (
+           i4 + m_dim.N4 * (
+           i3 + m_dim.N3 * (
+           i2 + m_dim.N2 * (
+           i1 + m_dim.N1 * ( i0 )))))));
+  }
+
+  //----------------------------------------
+  // Rebuild the LayoutRight descriptor from the eight stored extents.
+  KOKKOS_INLINE_FUNCTION
+  constexpr array_layout layout() const
+    {
+      return array_layout( m_dim.N0 , m_dim.N1 , m_dim.N2 , m_dim.N3
+                         , m_dim.N4 , m_dim.N5 , m_dim.N6 , m_dim.N7 );
+    }
+
+  KOKKOS_INLINE_FUNCTION constexpr size_type dimension_0() const { return m_dim.N0 ; }
+  KOKKOS_INLINE_FUNCTION constexpr size_type dimension_1() const { return m_dim.N1 ; }
+  KOKKOS_INLINE_FUNCTION constexpr size_type dimension_2() const { return m_dim.N2 ; }
+  KOKKOS_INLINE_FUNCTION constexpr size_type dimension_3() const { return m_dim.N3 ; }
+  KOKKOS_INLINE_FUNCTION constexpr size_type dimension_4() const { return m_dim.N4 ; }
+  KOKKOS_INLINE_FUNCTION constexpr size_type dimension_5() const { return m_dim.N5 ; }
+  KOKKOS_INLINE_FUNCTION constexpr size_type dimension_6() const { return m_dim.N6 ; }
+  KOKKOS_INLINE_FUNCTION constexpr size_type dimension_7() const { return m_dim.N7 ; }
+
+  /* Cardinality of the domain index space */
+  KOKKOS_INLINE_FUNCTION
+  constexpr size_type size() const
+    { return m_dim.N0 * m_dim.N1 * m_dim.N2 * m_dim.N3 * m_dim.N4 * m_dim.N5 * m_dim.N6 * m_dim.N7 ; }
+
+  /* Span of the range space; the same product as size() because this specialization stores no padded stride */
+  KOKKOS_INLINE_FUNCTION
+  constexpr size_type span() const
+    { return m_dim.N0 * m_dim.N1 * m_dim.N2 * m_dim.N3 * m_dim.N4 * m_dim.N5 * m_dim.N6 * m_dim.N7 ; }
+
+  KOKKOS_INLINE_FUNCTION constexpr bool span_is_contiguous() const { return true ; }
+
+  /* Strides of dimensions: stride_k is the product of the extents of all dimensions after k */
+  KOKKOS_INLINE_FUNCTION constexpr size_type stride_7() const { return 1 ; }
+  KOKKOS_INLINE_FUNCTION constexpr size_type stride_6() const { return m_dim.N7 ; }
+  KOKKOS_INLINE_FUNCTION constexpr size_type stride_5() const { return m_dim.N7 * m_dim.N6 ; }
+  KOKKOS_INLINE_FUNCTION constexpr size_type stride_4() const { return m_dim.N7 * m_dim.N6 * m_dim.N5 ; }
+  KOKKOS_INLINE_FUNCTION constexpr size_type stride_3() const { return m_dim.N7 * m_dim.N6 * m_dim.N5 * m_dim.N4 ; }
+  KOKKOS_INLINE_FUNCTION constexpr size_type stride_2() const { return m_dim.N7 * m_dim.N6 * m_dim.N5 * m_dim.N4 * m_dim.N3 ; }
+  KOKKOS_INLINE_FUNCTION constexpr size_type stride_1() const { return m_dim.N7 * m_dim.N6 * m_dim.N5 * m_dim.N4 * m_dim.N3 * m_dim.N2 ; }
+  KOKKOS_INLINE_FUNCTION constexpr size_type stride_0() const { return m_dim.N7 * m_dim.N6 * m_dim.N5 * m_dim.N4 * m_dim.N3 * m_dim.N2 * m_dim.N1 ; }
+
+  // Writes the strides into s[0..rank-1] and the total length into s[rank]; s needs rank + 1 entries.
+  template< typename iType >
+  KOKKOS_INLINE_FUNCTION
+  void stride( iType * const s ) const
+    {
+      size_type n = 1 ; // running product of the extents already visited, last dimension first
+      if ( 7 < dimension_type::rank ) { s[7] = n ; n *= m_dim.N7 ; }
+      if ( 6 < dimension_type::rank ) { s[6] = n ; n *= m_dim.N6 ; }
+      if ( 5 < dimension_type::rank ) { s[5] = n ; n *= m_dim.N5 ; }
+      if ( 4 < dimension_type::rank ) { s[4] = n ; n *= m_dim.N4 ; }
+      if ( 3 < dimension_type::rank ) { s[3] = n ; n *= m_dim.N3 ; }
+      if ( 2 < dimension_type::rank ) { s[2] = n ; n *= m_dim.N2 ; }
+      if ( 1 < dimension_type::rank ) { s[1] = n ; n *= m_dim.N1 ; }
+      if ( 0 < dimension_type::rank ) { s[0] = n ; }
+      s[dimension_type::rank] = n * m_dim.N0 ;
+    }
+
+  //----------------------------------------
+
+  ViewOffset() = default ;
+  ViewOffset( const ViewOffset & ) = default ;
+  ViewOffset & operator = ( const ViewOffset & ) = default ;
+  // Layout constructor: the padding tag is ignored; only dimension[0] is read and the other seven arguments are 0.
+  template< unsigned TrivialScalarSize >
+  KOKKOS_INLINE_FUNCTION
+  constexpr ViewOffset
+    ( std::integral_constant<unsigned,TrivialScalarSize> const &
+    , Kokkos::LayoutRight const & arg_layout
+    )
+    : m_dim( arg_layout.dimension[0], 0, 0, 0, 0, 0, 0, 0 )
+    {}
+  // Converting copy from another LayoutRight offset of equal rank.
+  template< class DimRHS >
+  KOKKOS_INLINE_FUNCTION
+  constexpr ViewOffset( const ViewOffset< DimRHS , Kokkos::LayoutRight , void > & rhs )
+    : m_dim( rhs.m_dim.N0 , rhs.m_dim.N1 , rhs.m_dim.N2 , rhs.m_dim.N3 // copy all eight extents
+           , rhs.m_dim.N4 , rhs.m_dim.N5 , rhs.m_dim.N6 , rhs.m_dim.N7 )
+    {
+      static_assert( int(DimRHS::rank) == int(dimension_type::rank) , "ViewOffset assignment requires equal rank" );
+      // Also requires equal static dimensions ...
+    } // end of LayoutRight converting constructor
+  // LayoutLeft source: permitted only when both sides are rank 1 (enforced by the static_assert).
+  template< class DimRHS >
+  KOKKOS_INLINE_FUNCTION
+  constexpr ViewOffset( const ViewOffset< DimRHS , Kokkos::LayoutLeft , void > & rhs )
+    : m_dim( rhs.m_dim.N0, 0, 0, 0, 0, 0, 0, 0 )
+    {
+      static_assert( DimRHS::rank == 1 && dimension_type::rank == 1 && dimension_type::rank_dynamic == 1
+                   , "ViewOffset LayoutRight and LayoutLeft are only compatible when rank == 1" );
+    }
+  // LayoutStride source: rank 1 only; aborts at run time unless the source stride S0 is 1 (hence not constexpr).
+  template< class DimRHS >
+  KOKKOS_INLINE_FUNCTION
+  ViewOffset( const ViewOffset< DimRHS , Kokkos::LayoutStride , void > & rhs )
+    : m_dim( rhs.m_dim.N0, 0, 0, 0, 0, 0, 0, 0 )
+    {
+      static_assert( DimRHS::rank == 1 && dimension_type::rank == 1 && dimension_type::rank_dynamic == 1
+                   , "ViewOffset LayoutLeft/Right and LayoutStride are only compatible when rank == 1" );
+      if ( rhs.m_stride.S0 != 1 ) {
+        Kokkos::abort("Kokkos::Experimental::ViewOffset assignment of LayoutLeft/Right from LayoutStride  requires stride == 1" );
+      }
+    }
+
+  //----------------------------------------
+  // Subview construction
+  // Only the first range extent of sub is kept; rhs itself is not read.
+  template< class DimRHS >
+  KOKKOS_INLINE_FUNCTION
+  constexpr ViewOffset
+    ( const ViewOffset< DimRHS , Kokkos::LayoutRight , void > & rhs
+    , const SubviewExtents< DimRHS::rank , dimension_type::rank > & sub
+    )
+    : m_dim( sub.range_extent(0) , 0, 0, 0, 0, 0, 0, 0 )
+    {
+      static_assert( ( 0 == dimension_type::rank_dynamic ) ||
+                     ( 1 == dimension_type::rank && 1 == dimension_type::rank_dynamic && 1 <= DimRHS::rank )
+                   , "ViewOffset subview construction requires compatible rank" );
+    }
+};
+
+//----------------------------------------------------------------------------
+// LayoutRight AND ( 1 < rank AND 0 < rank_dynamic ) : has padding / striding; dimension 0 is stepped by a stored stride.
+template < class Dimension >
+struct ViewOffset< Dimension , Kokkos::LayoutRight
+                 , typename std::enable_if<( 1 < Dimension::rank
+                                             &&
+                                             0 < Dimension::rank_dynamic
+                                           )>::type >
+{
+  using is_mapping_plugin = std::true_type ;
+  using is_regular        = std::true_type ;
+
+  typedef size_t               size_type ;
+  typedef Dimension            dimension_type ;
+  typedef Kokkos::LayoutRight  array_layout ;
+
+  dimension_type m_dim ;     // extents N0..N7
+  size_type      m_stride ;  // stride of dimension 0, possibly padded beyond N1 * ... * N7
+
+  //----------------------------------------
+  // Index-to-offset mapping: indices 1..rank-1 are packed densely, index 0 is scaled by m_stride.
+  // rank 1
+  template< typename I0 >
+  KOKKOS_INLINE_FUNCTION constexpr
+  size_type operator()( I0 const & i0 ) const { return i0 ; }
+
+  // rank 2
+  template < typename I0 , typename I1 >
+  KOKKOS_INLINE_FUNCTION constexpr
+  size_type operator()( I0 const & i0 , I1 const & i1 ) const
+  { return i1 + i0 * m_stride ; }
+
+  //rank 3
+  template < typename I0, typename I1, typename I2 >
+  KOKKOS_INLINE_FUNCTION constexpr
+  size_type operator()( I0 const & i0, I1 const & i1, I2 const & i2 ) const
+  { return i2 + m_dim.N2 * ( i1 ) + i0 * m_stride ; }
+
+  //rank 4
+  template < typename I0, typename I1, typename I2, typename I3 >
+  KOKKOS_INLINE_FUNCTION constexpr
+  size_type operator()( I0 const & i0, I1 const & i1, I2 const & i2, I3 const & i3 ) const
+  {
+    return i3 + m_dim.N3 * (
+           i2 + m_dim.N2 * ( i1 )) +
+           i0 * m_stride ;
+  }
+
+  //rank 5
+  template < typename I0, typename I1, typename I2, typename I3
+           , typename I4 >
+  KOKKOS_INLINE_FUNCTION constexpr
+  size_type operator()( I0 const & i0, I1 const & i1, I2 const & i2, I3 const & i3
+                      , I4 const & i4 ) const
+  {
+    return i4 + m_dim.N4 * (
+           i3 + m_dim.N3 * (
+           i2 + m_dim.N2 * ( i1 ))) +
+           i0 * m_stride ;
+  }
+
+  //rank 6
+  template < typename I0, typename I1, typename I2, typename I3
+           , typename I4, typename I5 >
+  KOKKOS_INLINE_FUNCTION constexpr
+  size_type operator()( I0 const & i0, I1 const & i1, I2 const & i2, I3 const & i3
+                      , I4 const & i4, I5 const & i5 ) const
+  {
+    return i5 + m_dim.N5 * (
+           i4 + m_dim.N4 * (
+           i3 + m_dim.N3 * (
+           i2 + m_dim.N2 * ( i1 )))) +
+           i0 * m_stride ;
+  }
+
+  //rank 7
+  template < typename I0, typename I1, typename I2, typename I3
+           , typename I4, typename I5, typename I6 >
+  KOKKOS_INLINE_FUNCTION constexpr
+  size_type operator()( I0 const & i0, I1 const & i1, I2 const & i2, I3 const & i3
+                      , I4 const & i4, I5 const & i5, I6 const & i6 ) const
+  {
+    return i6 + m_dim.N6 * (
+           i5 + m_dim.N5 * (
+           i4 + m_dim.N4 * (
+           i3 + m_dim.N3 * (
+           i2 + m_dim.N2 * ( i1 ))))) +
+           i0 * m_stride ;
+  }
+
+  //rank 8
+  template < typename I0, typename I1, typename I2, typename I3
+           , typename I4, typename I5, typename I6, typename I7 >
+  KOKKOS_INLINE_FUNCTION constexpr
+  size_type operator()( I0 const & i0, I1 const & i1, I2 const & i2, I3 const & i3
+                      , I4 const & i4, I5 const & i5, I6 const & i6, I7 const & i7 ) const
+  {
+    return i7 + m_dim.N7 * (
+           i6 + m_dim.N6 * (
+           i5 + m_dim.N5 * (
+           i4 + m_dim.N4 * (
+           i3 + m_dim.N3 * (
+           i2 + m_dim.N2 * ( i1 )))))) +
+           i0 * m_stride ;
+  }
+
+  //----------------------------------------
+  // Rebuild the LayoutRight descriptor from the extents only; m_stride is not part of it.
+  KOKKOS_INLINE_FUNCTION
+  constexpr array_layout layout() const
+    {
+      return array_layout( m_dim.N0 , m_dim.N1 , m_dim.N2 , m_dim.N3
+                         , m_dim.N4 , m_dim.N5 , m_dim.N6 , m_dim.N7 );
+    }
+
+  KOKKOS_INLINE_FUNCTION constexpr size_type dimension_0() const { return m_dim.N0 ; }
+  KOKKOS_INLINE_FUNCTION constexpr size_type dimension_1() const { return m_dim.N1 ; }
+  KOKKOS_INLINE_FUNCTION constexpr size_type dimension_2() const { return m_dim.N2 ; }
+  KOKKOS_INLINE_FUNCTION constexpr size_type dimension_3() const { return m_dim.N3 ; }
+  KOKKOS_INLINE_FUNCTION constexpr size_type dimension_4() const { return m_dim.N4 ; }
+  KOKKOS_INLINE_FUNCTION constexpr size_type dimension_5() const { return m_dim.N5 ; }
+  KOKKOS_INLINE_FUNCTION constexpr size_type dimension_6() const { return m_dim.N6 ; }
+  KOKKOS_INLINE_FUNCTION constexpr size_type dimension_7() const { return m_dim.N7 ; }
+
+  /* Cardinality of the domain index space */
+  KOKKOS_INLINE_FUNCTION
+  constexpr size_type size() const
+    { return m_dim.N0 * m_dim.N1 * m_dim.N2 * m_dim.N3 * m_dim.N4 * m_dim.N5 * m_dim.N6 * m_dim.N7 ; }
+
+  /* Span of the range space: N0 blocks of m_stride elements, so any padding is included */
+  KOKKOS_INLINE_FUNCTION
+  constexpr size_type span() const
+    { return m_dim.N0 * m_stride ; }
+
+  KOKKOS_INLINE_FUNCTION constexpr bool span_is_contiguous() const // true only when m_stride equals the unpadded product
+    { return m_stride == m_dim.N7 * m_dim.N6 * m_dim.N5 * m_dim.N4 * m_dim.N3 * m_dim.N2 * m_dim.N1 ; }
+
+  /* Strides of dimensions: stride_1..7 are products of trailing extents, stride_0 is the stored m_stride */
+  KOKKOS_INLINE_FUNCTION constexpr size_type stride_7() const { return 1 ; }
+  KOKKOS_INLINE_FUNCTION constexpr size_type stride_6() const { return m_dim.N7 ; }
+  KOKKOS_INLINE_FUNCTION constexpr size_type stride_5() const { return m_dim.N7 * m_dim.N6 ; }
+  KOKKOS_INLINE_FUNCTION constexpr size_type stride_4() const { return m_dim.N7 * m_dim.N6 * m_dim.N5 ; }
+  KOKKOS_INLINE_FUNCTION constexpr size_type stride_3() const { return m_dim.N7 * m_dim.N6 * m_dim.N5 * m_dim.N4 ; }
+  KOKKOS_INLINE_FUNCTION constexpr size_type stride_2() const { return m_dim.N7 * m_dim.N6 * m_dim.N5 * m_dim.N4 * m_dim.N3 ; }
+  KOKKOS_INLINE_FUNCTION constexpr size_type stride_1() const { return m_dim.N7 * m_dim.N6 * m_dim.N5 * m_dim.N4 * m_dim.N3 * m_dim.N2 ; }
+  KOKKOS_INLINE_FUNCTION constexpr size_type stride_0() const { return m_stride ; }
+
+  // Writes the strides into s[0..rank-1] and the total length into s[rank]; s needs rank + 1 entries.
+  template< typename iType >
+  KOKKOS_INLINE_FUNCTION
+  void stride( iType * const s ) const
+    {
+      size_type n = 1 ; // running product of the extents already visited, last dimension first
+      if ( 7 < dimension_type::rank ) { s[7] = n ; n *= m_dim.N7 ; }
+      if ( 6 < dimension_type::rank ) { s[6] = n ; n *= m_dim.N6 ; }
+      if ( 5 < dimension_type::rank ) { s[5] = n ; n *= m_dim.N5 ; }
+      if ( 4 < dimension_type::rank ) { s[4] = n ; n *= m_dim.N4 ; }
+      if ( 3 < dimension_type::rank ) { s[3] = n ; n *= m_dim.N3 ; }
+      if ( 2 < dimension_type::rank ) { s[2] = n ; n *= m_dim.N2 ; }
+      if ( 1 < dimension_type::rank ) { s[1] = n ; }
+      if ( 0 < dimension_type::rank ) { s[0] = m_stride ; }
+      s[dimension_type::rank] = m_stride * m_dim.N0 ;
+    }
+
+  //----------------------------------------
+
+private:
+
+  template< unsigned TrivialScalarSize >
+  struct Padding { // Compile-time rule for rounding a stride up to an alignment multiple.
+    enum { div = TrivialScalarSize == 0 ? 0 : Kokkos::Impl::MEMORY_ALIGNMENT / ( TrivialScalarSize ? TrivialScalarSize : 1 ) };
+    enum { mod = TrivialScalarSize == 0 ? 0 : Kokkos::Impl::MEMORY_ALIGNMENT % ( TrivialScalarSize ? TrivialScalarSize : 1 ) };
+
+    // If memory alignment is a multiple of the trivial scalar size then attempt to align.
+    enum { align = 0 != TrivialScalarSize && 0 == mod ? div : 0 };
+    enum { div_ok = div ? div : 1 }; // Non-zero divisor: avoids modulo by zero in constexpr
+    // stride(N): N rounded up to the next multiple of align when align != 0, N > THRESHOLD * align and N is not already a multiple; otherwise N.
+    KOKKOS_INLINE_FUNCTION
+    static constexpr size_t stride( size_t const N )
+    {
+      return ( align && ( Kokkos::Impl::MEMORY_ALIGNMENT_THRESHOLD * align < N ) && ( N % div_ok ) )
+             ? N + align - ( N % div_ok ) : N ;
+    }
+  };
+
+public:
+
+  ViewOffset() = default ;
+  ViewOffset( const ViewOffset & ) = default ;
+  ViewOffset & operator = ( const ViewOffset & ) = default ;
+
+  /* Enable padding for trivial scalar types with non-zero trivial scalar size.  */
+  template< unsigned TrivialScalarSize >
+  KOKKOS_INLINE_FUNCTION
+  constexpr ViewOffset
+    ( std::integral_constant<unsigned,TrivialScalarSize> const & /* tag only: unnamed because the object is never read */
+    , Kokkos::LayoutRight const & arg_layout
+    )
+    : m_dim( arg_layout.dimension[0] , arg_layout.dimension[1]
+           , arg_layout.dimension[2] , arg_layout.dimension[3]
+           , arg_layout.dimension[4] , arg_layout.dimension[5]
+           , arg_layout.dimension[6] , arg_layout.dimension[7]
+           )
+    , m_stride( Padding<TrivialScalarSize>::
+                  stride( /* 2 <= rank : product of extents 1..rank-1, read from the already-initialized m_dim */
+                          m_dim.N1 * ( dimension_type::rank == 2 ? 1 :
+                          m_dim.N2 * ( dimension_type::rank == 3 ? 1 :
+                          m_dim.N3 * ( dimension_type::rank == 4 ? 1 :
+                          m_dim.N4 * ( dimension_type::rank == 5 ? 1 :
+                          m_dim.N5 * ( dimension_type::rank == 6 ? 1 :
+                          m_dim.N6 * ( dimension_type::rank == 7 ? 1 : m_dim.N7 )))))) ))
+    {}
+  // Converting copy from another LayoutRight offset of equal rank; the source's stride_0() is carried over.
+  template< class DimRHS >
+  KOKKOS_INLINE_FUNCTION
+  constexpr ViewOffset( const ViewOffset< DimRHS , Kokkos::LayoutRight , void > & rhs )
+    : m_dim( rhs.m_dim.N0 , rhs.m_dim.N1 , rhs.m_dim.N2 , rhs.m_dim.N3
+           , rhs.m_dim.N4 , rhs.m_dim.N5 , rhs.m_dim.N6 , rhs.m_dim.N7 )
+    , m_stride( rhs.stride_0() )
+    {
+      static_assert( int(DimRHS::rank) == int(dimension_type::rank) , "ViewOffset assignment requires equal rank" );
+      // Also requires equal static dimensions ...
+    }
+
+  //----------------------------------------
+  // Subview construction
+  // Last dimension must be non-zero
+
+  template< class DimRHS >
+  KOKKOS_INLINE_FUNCTION
+  constexpr ViewOffset
+    ( const ViewOffset< DimRHS , Kokkos::LayoutRight , void > & rhs
+    , const SubviewExtents< DimRHS::rank , dimension_type::rank > & sub
+    )
+    : m_dim( sub.range_extent(0)
+           , sub.range_extent(1)
+           , 0, 0, 0, 0, 0, 0 )
+    , m_stride( 0 == sub.range_index(0) ? rhs.stride_0() : (
+                1 == sub.range_index(0) ? rhs.stride_1() : (
+                2 == sub.range_index(0) ? rhs.stride_2() : (
+                3 == sub.range_index(0) ? rhs.stride_3() : (
+                4 == sub.range_index(0) ? rhs.stride_4() : (
+                5 == sub.range_index(0) ? rhs.stride_5() : (
+                6 == sub.range_index(0) ? rhs.stride_6() : 0 )))))))
+    {
+      // This subview must be 2 == rank because only one stride (stride #0) is stored.
+      // The stored stride is the source stride numbered by sub.range_index(0); other indices give 0.
+      // The source's last dimension must be non-zero so the trailing dimension stays stride-one.
+      // At most one preceding dimension can be non-zero.
+
+      static_assert( ( 2 == dimension_type::rank ) &&
+                     ( 2 <= DimRHS::rank )
+                   , "ViewOffset subview construction requires compatible rank" );
+    }
+};
+
+//----------------------------------------------------------------------------
+/* Strided array layout only makes sense for 0 < rank */
+/* rank = 0 included for DynRankView case */
+
+template< unsigned Rank >
+struct ViewStride ;
+
+template<>
+struct ViewStride<0> {
+  // Rank 0 stores nothing: every stride is the compile-time constant 0.
+  enum { S0 = 0 , S1 = 0 , S2 = 0 , S3 = 0 , S4 = 0 , S5 = 0 , S6 = 0 , S7 = 0 };
+
+  ViewStride() = default ;
+  ViewStride( const ViewStride & ) = default ;
+  ViewStride & operator = ( const ViewStride & ) = default ;
+
+  // Uniform eight-argument constructor; all arguments are ignored.
+  KOKKOS_INLINE_FUNCTION constexpr
+  ViewStride( size_t , size_t , size_t , size_t , size_t , size_t , size_t , size_t ) {}
+};
+
+template<>
+struct ViewStride<1> {
+  // Rank 1: S0 is stored at run time; S1..S7 are compile-time zeros.
+  size_t S0 ;
+  enum { S1 = 0 , S2 = 0 , S3 = 0 , S4 = 0 , S5 = 0 , S6 = 0 , S7 = 0 };
+
+  ViewStride() = default ;
+  ViewStride( const ViewStride & ) = default ;
+  ViewStride & operator = ( const ViewStride & ) = default ;
+
+  // Uniform eight-argument constructor; arguments past the rank are ignored.
+  KOKKOS_INLINE_FUNCTION constexpr
+  ViewStride( size_t s0 , size_t , size_t , size_t , size_t , size_t , size_t , size_t )
+    : S0( s0 ) {}
+};
+
+template<>
+struct ViewStride<2> {
+  // Rank 2: S0..S1 are stored at run time; S2..S7 are compile-time zeros.
+  size_t S0 , S1 ;
+  enum { S2 = 0 , S3 = 0 , S4 = 0 , S5 = 0 , S6 = 0 , S7 = 0 };
+
+  ViewStride() = default ;
+  ViewStride( const ViewStride & ) = default ;
+  ViewStride & operator = ( const ViewStride & ) = default ;
+
+  // Uniform eight-argument constructor; arguments past the rank are ignored.
+  KOKKOS_INLINE_FUNCTION constexpr
+  ViewStride( size_t s0 , size_t s1 , size_t , size_t , size_t , size_t , size_t , size_t )
+    : S0( s0 ) , S1( s1 ) {}
+};
+
+template<>
+struct ViewStride<3> {
+  // Rank 3: S0..S2 are stored at run time; S3..S7 are compile-time zeros.
+  size_t S0 , S1 , S2 ;
+  enum { S3 = 0 , S4 = 0 , S5 = 0 , S6 = 0 , S7 = 0 };
+
+  ViewStride() = default ;
+  ViewStride( const ViewStride & ) = default ;
+  ViewStride & operator = ( const ViewStride & ) = default ;
+
+  // Uniform eight-argument constructor; arguments past the rank are ignored.
+  KOKKOS_INLINE_FUNCTION constexpr
+  ViewStride( size_t s0 , size_t s1 , size_t s2 , size_t , size_t , size_t , size_t , size_t )
+    : S0( s0 ) , S1( s1 ) , S2( s2 ) {}
+};
+
+template<>
+struct ViewStride<4> {
+  // Rank 4: S0..S3 are stored at run time; S4..S7 are compile-time zeros.
+  size_t S0 , S1 , S2 , S3 ;
+  enum { S4 = 0 , S5 = 0 , S6 = 0 , S7 = 0 };
+
+  ViewStride() = default ;
+  ViewStride( const ViewStride & ) = default ;
+  ViewStride & operator = ( const ViewStride & ) = default ;
+
+  // Uniform eight-argument constructor; arguments past the rank are ignored.
+  KOKKOS_INLINE_FUNCTION constexpr
+  ViewStride( size_t s0 , size_t s1 , size_t s2 , size_t s3 , size_t , size_t , size_t , size_t )
+    : S0( s0 ) , S1( s1 ) , S2( s2 ) , S3( s3 ) {}
+};
+
+template<>
+struct ViewStride<5> {
+  // Rank 5: S0..S4 are stored at run time; S5..S7 are compile-time zeros.
+  size_t S0 , S1 , S2 , S3 , S4 ;
+  enum { S5 = 0 , S6 = 0 , S7 = 0 };
+
+  ViewStride() = default ;
+  ViewStride( const ViewStride & ) = default ;
+  ViewStride & operator = ( const ViewStride & ) = default ;
+
+  // Uniform eight-argument constructor; arguments past the rank are ignored.
+  KOKKOS_INLINE_FUNCTION constexpr
+  ViewStride( size_t s0 , size_t s1 , size_t s2 , size_t s3 , size_t s4 , size_t , size_t , size_t )
+    : S0( s0 ) , S1( s1 ) , S2( s2 ) , S3( s3 )
+    , S4( s4 ) {}
+};
+
+template<>
+struct ViewStride<6> {
+  // Rank 6: S0..S5 are stored at run time; S6..S7 are compile-time zeros.
+  size_t S0 , S1 , S2 , S3 , S4 , S5 ;
+  enum { S6 = 0 , S7 = 0 };
+
+  ViewStride() = default ;
+  ViewStride( const ViewStride & ) = default ;
+  ViewStride & operator = ( const ViewStride & ) = default ;
+
+  // Uniform eight-argument constructor; arguments past the rank are ignored.
+  KOKKOS_INLINE_FUNCTION constexpr
+  ViewStride( size_t s0 , size_t s1 , size_t s2 , size_t s3 , size_t s4 , size_t s5 , size_t , size_t )
+    : S0( s0 ) , S1( s1 ) , S2( s2 ) , S3( s3 )
+    , S4( s4 ) , S5( s5 ) {}
+};
+
+template<>
+struct ViewStride<7> {
+  // Rank 7: S0..S6 are stored at run time; S7 is a compile-time zero.
+  size_t S0 , S1 , S2 , S3 , S4 , S5 , S6 ;
+  enum { S7 = 0 };
+
+  ViewStride() = default ;
+  ViewStride( const ViewStride & ) = default ;
+  ViewStride & operator = ( const ViewStride & ) = default ;
+
+  // Uniform eight-argument constructor; the last argument is ignored.
+  KOKKOS_INLINE_FUNCTION constexpr
+  ViewStride( size_t s0 , size_t s1 , size_t s2 , size_t s3 , size_t s4 , size_t s5 , size_t s6 , size_t )
+    : S0( s0 ) , S1( s1 ) , S2( s2 ) , S3( s3 )
+    , S4( s4 ) , S5( s5 ) , S6( s6 ) {}
+};
+
+template<>
+struct ViewStride<8> {
+  // Rank 8: all eight strides S0..S7 are stored at run time.
+  size_t S0 , S1 , S2 , S3 , S4 , S5 , S6 , S7 ;
+
+  ViewStride() = default ;
+  ViewStride( const ViewStride & ) = default ;
+  ViewStride & operator = ( const ViewStride & ) = default ;
+
+  // Eight-argument constructor; every argument initializes the stride of the same index.
+  KOKKOS_INLINE_FUNCTION constexpr
+  ViewStride( size_t s0 , size_t s1 , size_t s2 , size_t s3 , size_t s4 , size_t s5 , size_t s6 , size_t s7 )
+    : S0( s0 ) , S1( s1 ) , S2( s2 ) , S3( s3 )
+    , S4( s4 ) , S5( s5 ) , S6( s6 ) , S7( s7 ) {}
+};
+
+template < class Dimension >
+struct ViewOffset< Dimension , Kokkos::LayoutStride
+                 , void >
+{
+private:
+  typedef ViewStride< Dimension::rank >  stride_type ;
+public:
+
+  using is_mapping_plugin = std::true_type ;
+  using is_regular        = std::true_type ;
+
+  typedef size_t                size_type ;
+  typedef Dimension             dimension_type ;
+  typedef Kokkos::LayoutStride  array_layout ;
+
+  dimension_type  m_dim ;
+  stride_type     m_stride ;
+
+  //----------------------------------------
+
+  // rank 1 : each overload returns the sum over k of ( index k ) * ( stride k )
+  template< typename I0 >
+  KOKKOS_INLINE_FUNCTION constexpr
+  size_type operator()( I0 const & i0 ) const
+  {
+    return   i0 * m_stride.S0 ;
+  }
+
+  // rank 2
+  template < typename I0 , typename I1 >
+  KOKKOS_INLINE_FUNCTION constexpr
+  size_type operator()( I0 const & i0 , I1 const & i1 ) const
+  {
+    return   i0 * m_stride.S0
+           + i1 * m_stride.S1 ;
+  }
+
+  // rank 3
+  template < typename I0, typename I1, typename I2 >
+  KOKKOS_INLINE_FUNCTION constexpr
+  size_type operator()( I0 const & i0, I1 const & i1, I2 const & i2 ) const
+  {
+    return   i0 * m_stride.S0
+           + i1 * m_stride.S1
+           + i2 * m_stride.S2 ;
+  }
+
+  // rank 4
+  template < typename I0, typename I1, typename I2, typename I3 >
+  KOKKOS_INLINE_FUNCTION constexpr
+  size_type operator()( I0 const & i0, I1 const & i1, I2 const & i2, I3 const & i3 ) const
+  {
+    return   i0 * m_stride.S0
+           + i1 * m_stride.S1
+           + i2 * m_stride.S2
+           + i3 * m_stride.S3 ;
+  }
+
+  // rank 5
+  template < typename I0, typename I1, typename I2, typename I3
+           , typename I4 >
+  KOKKOS_INLINE_FUNCTION constexpr
+  size_type operator()( I0 const & i0, I1 const & i1, I2 const & i2, I3 const & i3
+                      , I4 const & i4 ) const
+  {
+    return   i0 * m_stride.S0
+           + i1 * m_stride.S1
+           + i2 * m_stride.S2
+           + i3 * m_stride.S3
+           + i4 * m_stride.S4 ;
+  }
+
+  // rank 6
+  template < typename I0, typename I1, typename I2, typename I3
+           , typename I4, typename I5 >
+  KOKKOS_INLINE_FUNCTION constexpr
+  size_type operator()( I0 const & i0, I1 const & i1, I2 const & i2, I3 const & i3
+                      , I4 const & i4, I5 const & i5 ) const
+  {
+    return   i0 * m_stride.S0
+           + i1 * m_stride.S1
+           + i2 * m_stride.S2
+           + i3 * m_stride.S3
+           + i4 * m_stride.S4
+           + i5 * m_stride.S5 ;
+  }
+
+  // rank 7
+  template < typename I0, typename I1, typename I2, typename I3
+           , typename I4, typename I5, typename I6 >
+  KOKKOS_INLINE_FUNCTION constexpr
+  size_type operator()( I0 const & i0, I1 const & i1, I2 const & i2, I3 const & i3
+                      , I4 const & i4, I5 const & i5, I6 const & i6 ) const
+  {
+    return   i0 * m_stride.S0
+           + i1 * m_stride.S1
+           + i2 * m_stride.S2
+           + i3 * m_stride.S3
+           + i4 * m_stride.S4
+           + i5 * m_stride.S5
+           + i6 * m_stride.S6 ;
+  }
+
+  // rank 8
+  template < typename I0, typename I1, typename I2, typename I3
+           , typename I4, typename I5, typename I6, typename I7 >
+  KOKKOS_INLINE_FUNCTION constexpr
+  size_type operator()( I0 const & i0, I1 const & i1, I2 const & i2, I3 const & i3
+                      , I4 const & i4, I5 const & i5, I6 const & i6, I7 const & i7 ) const
+  {
+    return   i0 * m_stride.S0
+           + i1 * m_stride.S1
+           + i2 * m_stride.S2
+           + i3 * m_stride.S3
+           + i4 * m_stride.S4
+           + i5 * m_stride.S5
+           + i6 * m_stride.S6
+           + i7 * m_stride.S7 ;
+  }
+
+  //----------------------------------------
+
+  // Rebuild the LayoutStride descriptor from interleaved ( extent , stride ) pairs for dimensions 0..7.
+  KOKKOS_INLINE_FUNCTION constexpr
+  array_layout layout() const
+    {
+      return array_layout( m_dim.N0 , m_stride.S0
+                         , m_dim.N1 , m_stride.S1
+                         , m_dim.N2 , m_stride.S2
+                         , m_dim.N3 , m_stride.S3
+                         , m_dim.N4 , m_stride.S4
+                         , m_dim.N5 , m_stride.S5
+                         , m_dim.N6 , m_stride.S6
+                         , m_dim.N7 , m_stride.S7 );
+    }
+
+  KOKKOS_INLINE_FUNCTION constexpr size_type dimension_0() const { return m_dim.N0 ; } // extent accessors: dimension_k() returns m_dim.Nk
+  KOKKOS_INLINE_FUNCTION constexpr size_type dimension_1() const { return m_dim.N1 ; }
+  KOKKOS_INLINE_FUNCTION constexpr size_type dimension_2() const { return m_dim.N2 ; }
+  KOKKOS_INLINE_FUNCTION constexpr size_type dimension_3() const { return m_dim.N3 ; }
+  KOKKOS_INLINE_FUNCTION constexpr size_type dimension_4() const { return m_dim.N4 ; }
+  KOKKOS_INLINE_FUNCTION constexpr size_type dimension_5() const { return m_dim.N5 ; }
+  KOKKOS_INLINE_FUNCTION constexpr size_type dimension_6() const { return m_dim.N6 ; }
+  KOKKOS_INLINE_FUNCTION constexpr size_type dimension_7() const { return m_dim.N7 ; }
+
+  // Number of index tuples in the domain: the product of all eight extents.
+  KOKKOS_INLINE_FUNCTION constexpr
+  size_type size() const
+    { return m_dim.N0 * m_dim.N1 * m_dim.N2 * m_dim.N3 * m_dim.N4 * m_dim.N5 * m_dim.N6 * m_dim.N7 ; }
+
+private:
+
+  // Larger of two sizes; when they are equal the (identical) first value is returned.
+  KOKKOS_INLINE_FUNCTION static constexpr
+  size_type Max( size_type a , size_type b ) { return b > a ? b : a ; }
+
+public:
+
+  /* Span of the range space: the largest ( dimension * stride ) product.
+   *
+   * When any extent is zero the index space is empty ( size() == 0 ) and
+   * the view spans no memory, even if other ranks have non-zero extents
+   * and strides.  Without the guard below the maximum would report a
+   * non-zero span for such an empty view.
+   */
+  KOKKOS_INLINE_FUNCTION
+  constexpr size_type span() const
+    {
+      return size() == size_type(0) ? size_type(0) :
+             Max( m_dim.N0 * m_stride.S0 ,
+             Max( m_dim.N1 * m_stride.S1 ,
+             Max( m_dim.N2 * m_stride.S2 ,
+             Max( m_dim.N3 * m_stride.S3 ,
+             Max( m_dim.N4 * m_stride.S4 ,
+             Max( m_dim.N5 * m_stride.S5 ,
+             Max( m_dim.N6 * m_stride.S6 ,
+                  m_dim.N7 * m_stride.S7 )))))));
+    }
+
+  // True when the span of the range equals the number of index-space entries.
+  KOKKOS_INLINE_FUNCTION constexpr bool span_is_contiguous() const
+    { return size() == span() ; }
+
+  /* Stride of each of the eight ranks, read from m_stride. */
+  KOKKOS_INLINE_FUNCTION constexpr size_type stride_0() const { return m_stride.S0; }
+  KOKKOS_INLINE_FUNCTION constexpr size_type stride_1() const { return m_stride.S1; }
+  KOKKOS_INLINE_FUNCTION constexpr size_type stride_2() const { return m_stride.S2; }
+  KOKKOS_INLINE_FUNCTION constexpr size_type stride_3() const { return m_stride.S3; }
+  KOKKOS_INLINE_FUNCTION constexpr size_type stride_4() const { return m_stride.S4; }
+  KOKKOS_INLINE_FUNCTION constexpr size_type stride_5() const { return m_stride.S5; }
+  KOKKOS_INLINE_FUNCTION constexpr size_type stride_6() const { return m_stride.S6; }
+  KOKKOS_INLINE_FUNCTION constexpr size_type stride_7() const { return m_stride.S7; }
+
+  // Write the stride of every rank below dimension_type::rank into s[],
+  // then store span() (the total length) in s[ dimension_type::rank ].
+  // The caller must therefore supply at least rank + 1 writable entries.
+  template< typename iType >
+  KOKKOS_INLINE_FUNCTION
+  void stride( iType * const s ) const
+    {
+      if ( dimension_type::rank > 0 ) { s[0] = m_stride.S0 ; }
+      if ( dimension_type::rank > 1 ) { s[1] = m_stride.S1 ; }
+      if ( dimension_type::rank > 2 ) { s[2] = m_stride.S2 ; }
+      if ( dimension_type::rank > 3 ) { s[3] = m_stride.S3 ; }
+      if ( dimension_type::rank > 4 ) { s[4] = m_stride.S4 ; }
+      if ( dimension_type::rank > 5 ) { s[5] = m_stride.S5 ; }
+      if ( dimension_type::rank > 6 ) { s[6] = m_stride.S6 ; }
+      if ( dimension_type::rank > 7 ) { s[7] = m_stride.S7 ; }
+
+      s[ dimension_type::rank ] = span();
+    }
+
+  //----------------------------------------
+
+  // Default construction, copy construction and copy assignment are compiler generated.
+  ViewOffset() = default;
+  ViewOffset( ViewOffset const & ) = default;
+  ViewOffset & operator=( ViewOffset const & ) = default;
+
+  // Construct from a LayoutStride by copying its eight extents and eight strides.
+  // The leading integral_constant<unsigned,0> tag argument is unnamed and unused here.
+  KOKKOS_INLINE_FUNCTION
+  constexpr ViewOffset( std::integral_constant<unsigned,0> const &
+                      , Kokkos::LayoutStride const & rhs )
+    : m_dim( rhs.dimension[0], rhs.dimension[1], rhs.dimension[2], rhs.dimension[3],
+             rhs.dimension[4], rhs.dimension[5], rhs.dimension[6], rhs.dimension[7] )
+    , m_stride( rhs.stride[0], rhs.stride[1], rhs.stride[2], rhs.stride[3],
+                rhs.stride[4], rhs.stride[5], rhs.stride[6], rhs.stride[7] )
+    {}
+
+  // Converting constructor from an offset with another dimension / layout
+  // combination of equal rank: extents and strides are copied rank by rank.
+  template< class DimRHS , class LayoutRHS >
+  KOKKOS_INLINE_FUNCTION
+  constexpr ViewOffset( const ViewOffset< DimRHS , LayoutRHS , void > & rhs )
+    : m_dim( rhs.m_dim.N0, rhs.m_dim.N1, rhs.m_dim.N2, rhs.m_dim.N3,
+             rhs.m_dim.N4, rhs.m_dim.N5, rhs.m_dim.N6, rhs.m_dim.N7 )
+    , m_stride( rhs.stride_0(), rhs.stride_1(), rhs.stride_2(), rhs.stride_3(),
+                rhs.stride_4(), rhs.stride_5(), rhs.stride_6(), rhs.stride_7() )
+    {
+      // Also requires equal static dimensions ...
+      static_assert( int(dimension_type::rank) == int(DimRHS::rank) , "ViewOffset assignment requires equal rank" );
+    }
+
+  //----------------------------------------
+  // Subview construction
+
+private:
+
+  // Select the stride of rank 'r' from 'rhs'; any r above 7 yields zero.
+  template< class DimRHS , class LayoutRHS >
+  KOKKOS_INLINE_FUNCTION static
+  constexpr size_t stride
+    ( unsigned r , const ViewOffset< DimRHS , LayoutRHS , void > & rhs )
+    {
+      return r == 0 ? rhs.stride_0() :
+             r == 1 ? rhs.stride_1() :
+             r == 2 ? rhs.stride_2() :
+             r == 3 ? rhs.stride_3() :
+             r == 4 ? rhs.stride_4() :
+             r == 5 ? rhs.stride_5() :
+             r == 6 ? rhs.stride_6() :
+             r == 7 ? rhs.stride_7() : 0 ;
+    }
+
+public:
+
+  // Subview offset: extents are taken from 'sub', and each stride is looked
+  // up in 'rhs' at the source rank that 'sub' reports for that output rank.
+  template< class DimRHS , class LayoutRHS >
+  KOKKOS_INLINE_FUNCTION
+  constexpr ViewOffset
+    ( const ViewOffset< DimRHS , LayoutRHS , void > & rhs
+    , const SubviewExtents< DimRHS::rank , dimension_type::rank > & sub
+    )
+    // range_extent(r) returns 0 when dimension_type::rank <= r
+    : m_dim( sub.range_extent(0), sub.range_extent(1)
+           , sub.range_extent(2), sub.range_extent(3)
+           , sub.range_extent(4), sub.range_extent(5)
+           , sub.range_extent(6), sub.range_extent(7) )
+    // range_index(r) returns ~0u when dimension_type::rank <= r
+    , m_stride( stride( sub.range_index(0), rhs ), stride( sub.range_index(1), rhs )
+              , stride( sub.range_index(2), rhs ), stride( sub.range_index(3), rhs )
+              , stride( sub.range_index(4), rhs ), stride( sub.range_index(5), rhs )
+              , stride( sub.range_index(6), rhs ), stride( sub.range_index(7), rhs ) )
+    {}
+};
+
+}}} // namespace Kokkos::Experimental::Impl
+
+//----------------------------------------------------------------------------
+//----------------------------------------------------------------------------
+
+namespace Kokkos {
+namespace Experimental {
+namespace Impl {
+
+/** \brief  ViewDataHandle provides the type of the 'data handle' which the view
+ *          uses to access data with the [] operator. It also provides
+ *          an allocate function and a function to extract a raw ptr from the
+ *          data handle. ViewDataHandle also defines an enum ReferenceAble which
+ *          specifies whether references/pointers to elements can be taken and a
+ *          'return_type' which is what the view operators will give back.
+ *          Specialisation of this object allows three things depending
+ *          on ViewTraits and compiler options:
+ *          (i)   Use special allocator (e.g. huge pages/small pages and pinned memory)
+ *          (ii)  Use special data handle type (e.g. add Cuda Texture Object)
+ *          (iii) Use special access intrinsics (e.g. texture fetch and non-caching loads)
+ */
+// Primary template: the handle is a plain pointer and element access yields an lvalue reference.
+template< class Traits , class Enable = void >
+struct ViewDataHandle {
+
+  using value_type  = typename Traits::value_type ;
+  using handle_type = typename Traits::value_type * ;
+  using return_type = typename Traits::value_type & ;
+  using track_type  = Kokkos::Experimental::Impl::SharedAllocationTracker ;
+
+  // The handle is the data pointer itself; the tracker argument is not used.
+  KOKKOS_INLINE_FUNCTION
+  static handle_type assign( value_type * arg_data_ptr
+                           , track_type const & /*arg_tracker*/ )
+  {
+    return arg_data_ptr ;
+  }
+};
+
+// Specialization selected when the value type is non-const, the view is not
+// specialized, and the memory traits request Atomic access.
+template< class Traits >
+struct ViewDataHandle< Traits ,
+  typename std::enable_if<
+    ( std::is_same< typename Traits::non_const_value_type
+                  , typename Traits::value_type >::value
+      &&
+      std::is_same< typename Traits::specialize , void >::value
+      &&
+      Traits::memory_traits::Atomic
+    )>::type >
+{
+  using value_type  = typename Traits::value_type ;
+  using handle_type = typename Kokkos::Impl::AtomicViewDataHandle< Traits > ;
+  using return_type = typename Kokkos::Impl::AtomicDataElement< Traits > ;
+  using track_type  = Kokkos::Experimental::Impl::SharedAllocationTracker ;
+
+  // Wrap the data pointer in the atomic handle; the tracker argument is not used.
+  KOKKOS_INLINE_FUNCTION
+  static handle_type assign( value_type * arg_data_ptr
+                           , track_type const & /*arg_tracker*/ )
+  {
+    return handle_type( arg_data_ptr );
+  }
+};
+
+}}} // namespace Kokkos::Experimental::Impl
+
+//----------------------------------------------------------------------------
+//----------------------------------------------------------------------------
+
+namespace Kokkos {
+namespace Experimental {
+namespace Impl {
+
+//----------------------------------------------------------------------------
+
+/*
+ *  The construction, assignment to default, and destruction
+ *  are merged into a single functor.
+ *  Primarily to work around an unresolved CUDA back-end bug
+ *  that would lose the destruction cuda device function when
+ *  called from the shared memory tracking destruction.
+ *  Secondarily to have two fewer partial specializations.
+ */
+// Declaration only; the scalar and non-scalar cases are provided as partial specializations.
+template< class ExecSpace , class ValueType
+        , bool IsScalar = std::is_scalar< ValueType >::value >
+struct ViewValueFunctor ;
+
+// Non-scalar values: elements are default constructed with placement new
+// and destroyed through an explicit destructor call.
+template< class ExecSpace , class ValueType >
+struct ViewValueFunctor< ExecSpace , ValueType , false /* is_scalar */ >
+{
+  using PolicyType = Kokkos::RangePolicy< ExecSpace > ;
+
+  ExecSpace   space ;
+  ValueType * ptr ;
+  size_t      n ;
+  bool        destroy ;
+
+  // Per-element work: destroy element i when 'destroy' is set, otherwise construct it.
+  KOKKOS_INLINE_FUNCTION
+  void operator()( const size_t i ) const
+    {
+      if ( ! destroy ) { new (ptr+i) ValueType(); }
+      else             { (ptr+i)->~ValueType(); }
+    }
+
+  ViewValueFunctor() = default ;
+  ViewValueFunctor( const ViewValueFunctor & ) = default ;
+  ViewValueFunctor & operator = ( const ViewValueFunctor & ) = default ;
+
+  // Bind the functor to 'arg_n' values starting at 'arg_ptr'; starts in construct mode.
+  ViewValueFunctor( ExecSpace   const & arg_space
+                  , ValueType * const arg_ptr
+                  , size_t      const arg_n )
+    : space( arg_space ) , ptr( arg_ptr ) , n( arg_n ) , destroy( false )
+    {}
+
+  // Apply operator() to indices [0,n) with 'destroy' set to 'arg'.
+  void execute( bool arg )
+    {
+      destroy = arg ;
+
+      if ( space.in_parallel() ) {
+        // in_parallel() reports true: visit the elements with a plain serial loop.
+        for ( size_t i = 0 ; i < n ; ++i ) { operator()(i); }
+      }
+      else {
+        // Otherwise dispatch a ParallelFor over [0,n) and fence the space afterwards.
+        const Kokkos::Impl::ParallelFor< ViewValueFunctor , PolicyType >
+          closure( *this , PolicyType( 0 , n ) );
+        closure.execute();
+        space.fence();
+      }
+    }
+
+  void construct_shared_allocation() { execute( false ); }
+
+  void destroy_shared_allocation() { execute( true ); }
+};
+
+
+// Scalar values: construction assigns a value-initialized ValueType and
+// destruction is a no-op.
+template< class ExecSpace , class ValueType >
+struct ViewValueFunctor< ExecSpace , ValueType , true /* is_scalar */ >
+{
+  using PolicyType = Kokkos::RangePolicy< ExecSpace > ;
+
+  ExecSpace   space ;
+  ValueType * ptr ;
+  size_t      n ;
+
+  // Per-element work: overwrite element i with ValueType().
+  KOKKOS_INLINE_FUNCTION
+  void operator()( const size_t i ) const
+    {
+      ptr[i] = ValueType();
+    }
+
+  ViewValueFunctor() = default ;
+  ViewValueFunctor( const ViewValueFunctor & ) = default ;
+  ViewValueFunctor & operator = ( const ViewValueFunctor & ) = default ;
+
+  // Bind the functor to 'arg_n' values starting at 'arg_ptr'.
+  ViewValueFunctor( ExecSpace   const & arg_space
+                  , ValueType * const arg_ptr
+                  , size_t      const arg_n )
+    : space( arg_space ) , ptr( arg_ptr ) , n( arg_n )
+    {}
+
+  // Apply operator() to indices [0,n).
+  void construct_shared_allocation()
+    {
+      if ( space.in_parallel() ) {
+        // in_parallel() reports true: visit the elements with a plain serial loop.
+        for ( size_t i = 0 ; i < n ; ++i ) { operator()(i); }
+      }
+      else {
+        // Otherwise dispatch a ParallelFor over [0,n) and fence the space afterwards.
+        const Kokkos::Impl::ParallelFor< ViewValueFunctor , PolicyType >
+          closure( *this , PolicyType( 0 , n ) );
+        closure.execute();
+        space.fence();
+      }
+    }
+
+  // Nothing to do when destroying scalar values.
+  void destroy_shared_allocation() {}
+};
+
+//----------------------------------------------------------------------------
+/** \brief  View mapping for non-specialized data type and standard layout */
+template< class Traits >
+class ViewMapping< Traits ,
+  typename std::enable_if<(
+    std::is_same< typename Traits::specialize , void >::value
+    &&
+    ViewOffset< typename Traits::dimension
+              , typename Traits::array_layout
+              , void >::is_mapping_plugin::value
+  )>::type >
+{
+private:
+
+  template< class , class ... > friend class ViewMapping ;
+  template< class , class ... > friend class Kokkos::Experimental::View ;
+
+  typedef ViewOffset< typename Traits::dimension
+                    , typename Traits::array_layout
+                    , void
+                    >  offset_type ;
+
+  typedef typename ViewDataHandle< Traits >::handle_type  handle_type ;
+
+  handle_type  m_handle ;
+  offset_type  m_offset ;
+
+  // Build a mapping directly from an existing data handle and offset.
+  KOKKOS_INLINE_FUNCTION
+  ViewMapping( const handle_type & arg_handle , const offset_type & arg_offset )
+    : m_handle( arg_handle ) , m_offset( arg_offset ) {}
+
+public:
+
+  //----------------------------------------
+  // Domain dimensions
+
+  enum { Rank = Traits::dimension::rank };
+
+  // Extent of rank 'r', forwarded to the offset's dimension object.
+  template< typename iType >
+  KOKKOS_INLINE_FUNCTION constexpr
+  size_t extent( const iType & r ) const
+    { return m_offset.m_dim.extent( r ); }
+
+  // Layout object as reported by the offset mapping.
+  KOKKOS_INLINE_FUNCTION constexpr
+  typename Traits::array_layout layout() const
+    { return m_offset.layout(); }
+
+  // Extent of each rank, forwarded to the offset mapping.
+  KOKKOS_INLINE_FUNCTION constexpr size_t dimension_0() const { return m_offset.dimension_0() ; }
+  KOKKOS_INLINE_FUNCTION constexpr size_t dimension_1() const { return m_offset.dimension_1() ; }
+  KOKKOS_INLINE_FUNCTION constexpr size_t dimension_2() const { return m_offset.dimension_2() ; }
+  KOKKOS_INLINE_FUNCTION constexpr size_t dimension_3() const { return m_offset.dimension_3() ; }
+  KOKKOS_INLINE_FUNCTION constexpr size_t dimension_4() const { return m_offset.dimension_4() ; }
+  KOKKOS_INLINE_FUNCTION constexpr size_t dimension_5() const { return m_offset.dimension_5() ; }
+  KOKKOS_INLINE_FUNCTION constexpr size_t dimension_6() const { return m_offset.dimension_6() ; }
+  KOKKOS_INLINE_FUNCTION constexpr size_t dimension_7() const { return m_offset.dimension_7() ; }
+
+  // Is a regular layout with uniform striding for each index.
+  using is_regular = typename offset_type::is_regular ;
+
+  // Stride of each rank, forwarded to the offset mapping.
+  KOKKOS_INLINE_FUNCTION constexpr size_t stride_0() const { return m_offset.stride_0() ; }
+  KOKKOS_INLINE_FUNCTION constexpr size_t stride_1() const { return m_offset.stride_1() ; }
+  KOKKOS_INLINE_FUNCTION constexpr size_t stride_2() const { return m_offset.stride_2() ; }
+  KOKKOS_INLINE_FUNCTION constexpr size_t stride_3() const { return m_offset.stride_3() ; }
+  KOKKOS_INLINE_FUNCTION constexpr size_t stride_4() const { return m_offset.stride_4() ; }
+  KOKKOS_INLINE_FUNCTION constexpr size_t stride_5() const { return m_offset.stride_5() ; }
+  KOKKOS_INLINE_FUNCTION constexpr size_t stride_6() const { return m_offset.stride_6() ; }
+  KOKKOS_INLINE_FUNCTION constexpr size_t stride_7() const { return m_offset.stride_7() ; }
+
+  // Fill the caller's array 's' by delegating to the offset mapping's stride( s ).
+  template< typename iType >
+  KOKKOS_INLINE_FUNCTION
+  void stride( iType * const s ) const
+    { m_offset.stride( s ); }
+
+  //----------------------------------------
+  // Range span
+
+  /** \brief  Span of the mapped range, as reported by the offset mapping */
+  KOKKOS_INLINE_FUNCTION constexpr
+  size_t span() const
+    { return m_offset.span(); }
+
+  /** \brief  Whether the mapped range span is contiguous, as reported by the offset mapping */
+  KOKKOS_INLINE_FUNCTION constexpr
+  bool span_is_contiguous() const
+    { return m_offset.span_is_contiguous(); }
+
+  using reference_type = typename ViewDataHandle< Traits >::return_type ;
+  using pointer_type   = typename Traits::value_type * ;
+
+  /** \brief  Pointer to the mapped memory.
+   *
+   *  Only available when data references are lvalue references;
+   *  otherwise a null pointer is returned.
+   */
+  KOKKOS_INLINE_FUNCTION constexpr pointer_type data() const
+    {
+      return ! std::is_lvalue_reference< reference_type >::value
+             ? (pointer_type) 0
+             : (pointer_type) m_handle ;
+    }
+
+  //----------------------------------------
+  // Element reference methods.  The View class performs all rank and
+  // bounds checking before calling them.
+
+  // No index: the handle's first element.
+  KOKKOS_FORCEINLINE_FUNCTION
+  reference_type reference() const
+    { return m_handle[0] ; }
+
+  // One integral index, any layout except LayoutStride: index the handle directly.
+  template< typename I0 >
+  KOKKOS_FORCEINLINE_FUNCTION
+  typename std::enable_if
+    < std::is_integral<I0>::value &&
+      ! std::is_same< typename Traits::array_layout , Kokkos::LayoutStride >::value
+    , reference_type
+    >::type
+  reference( const I0 & i0 ) const
+    { return m_handle[ i0 ] ; }
+
+  // One integral index, LayoutStride: the index goes through the offset mapping.
+  template< typename I0 >
+  KOKKOS_FORCEINLINE_FUNCTION
+  typename std::enable_if
+    < std::is_integral<I0>::value &&
+      std::is_same< typename Traits::array_layout , Kokkos::LayoutStride >::value
+    , reference_type
+    >::type
+  reference( const I0 & i0 ) const
+    { return m_handle[ m_offset( i0 ) ] ; }
+
+  // Two to eight indices: the offset mapping turns the index tuple into
+  // the position used to index the handle.
+
+  template< typename I0 , typename I1 >
+  KOKKOS_FORCEINLINE_FUNCTION
+  reference_type
+  reference( const I0 & i0 , const I1 & i1 ) const
+    { return m_handle[ m_offset( i0, i1 ) ]; }
+
+  template< typename I0 , typename I1 , typename I2 >
+  KOKKOS_FORCEINLINE_FUNCTION
+  reference_type
+  reference( const I0 & i0 , const I1 & i1 , const I2 & i2 ) const
+    { return m_handle[ m_offset( i0, i1, i2 ) ]; }
+
+  template< typename I0 , typename I1 , typename I2 , typename I3 >
+  KOKKOS_FORCEINLINE_FUNCTION
+  reference_type
+  reference( const I0 & i0 , const I1 & i1 , const I2 & i2 , const I3 & i3 ) const
+    { return m_handle[ m_offset( i0, i1, i2, i3 ) ]; }
+
+  template< typename I0 , typename I1 , typename I2 , typename I3 , typename I4 >
+  KOKKOS_FORCEINLINE_FUNCTION
+  reference_type
+  reference( const I0 & i0 , const I1 & i1 , const I2 & i2 , const I3 & i3
+           , const I4 & i4 ) const
+    { return m_handle[ m_offset( i0, i1, i2, i3, i4 ) ]; }
+
+  template< typename I0 , typename I1 , typename I2 , typename I3 , typename I4
+          , typename I5 >
+  KOKKOS_FORCEINLINE_FUNCTION
+  reference_type
+  reference( const I0 & i0 , const I1 & i1 , const I2 & i2 , const I3 & i3
+           , const I4 & i4 , const I5 & i5 ) const
+    { return m_handle[ m_offset( i0, i1, i2, i3, i4, i5 ) ]; }
+
+  template< typename I0 , typename I1 , typename I2 , typename I3 , typename I4
+          , typename I5 , typename I6 >
+  KOKKOS_FORCEINLINE_FUNCTION
+  reference_type
+  reference( const I0 & i0 , const I1 & i1 , const I2 & i2 , const I3 & i3
+           , const I4 & i4 , const I5 & i5 , const I6 & i6 ) const
+    { return m_handle[ m_offset( i0, i1, i2, i3, i4, i5, i6 ) ]; }
+
+  template< typename I0 , typename I1 , typename I2 , typename I3 , typename I4
+          , typename I5 , typename I6 , typename I7 >
+  KOKKOS_FORCEINLINE_FUNCTION
+  reference_type
+  reference( const I0 & i0 , const I1 & i1 , const I2 & i2 , const I3 & i3
+           , const I4 & i4 , const I5 & i5 , const I6 & i6 , const I7 & i7 ) const
+    { return m_handle[ m_offset( i0, i1, i2, i3, i4, i5, i6, i7 ) ]; }
+
+  //----------------------------------------
+
+private:
+
+  enum { MemorySpanMask = 8 - 1 /* Force alignment on 8 byte boundary */ };
+  enum { MemorySpanSize = sizeof(typename Traits::value_type) };
+
+public:
+
+  /** \brief  Span, in bytes, of the referenced memory.
+   *
+   *  The byte count is rounded up to the next multiple of MemorySpanMask + 1.
+   */
+  KOKKOS_INLINE_FUNCTION constexpr size_t memory_span() const
+    {
+      return ( m_offset.span() * MemorySpanSize + MemorySpanMask ) & ~size_t(MemorySpanMask);
+    }
+
+  //----------------------------------------
+
+  // Destructor plus default, copy and move construction / assignment.
+  // The move operations copy the handle and offset and leave 'rhs' unchanged.
+  KOKKOS_INLINE_FUNCTION ~ViewMapping() {}
+
+  KOKKOS_INLINE_FUNCTION ViewMapping()
+    : m_handle() , m_offset() {}
+
+  KOKKOS_INLINE_FUNCTION ViewMapping( const ViewMapping & rhs )
+    : m_handle( rhs.m_handle ) , m_offset( rhs.m_offset ) {}
+
+  KOKKOS_INLINE_FUNCTION ViewMapping & operator = ( const ViewMapping & rhs )
+    {
+      m_handle = rhs.m_handle ;
+      m_offset = rhs.m_offset ;
+      return *this ;
+    }
+
+  KOKKOS_INLINE_FUNCTION ViewMapping( ViewMapping && rhs )
+    : m_handle( rhs.m_handle ) , m_offset( rhs.m_offset ) {}
+
+  KOKKOS_INLINE_FUNCTION ViewMapping & operator = ( ViewMapping && rhs )
+    {
+      m_handle = rhs.m_handle ;
+      m_offset = rhs.m_offset ;
+      return *this ;
+    }
+
+  //----------------------------------------
+
+  /**\brief  Span, in bytes, of the memory required for 'arg_layout'.
+   *
+   *  The offset is built with a zero padding tag and the byte count is
+   *  rounded up to the next multiple of MemorySpanMask + 1.
+   */
+  KOKKOS_INLINE_FUNCTION
+  static constexpr size_t memory_span( typename Traits::array_layout const & arg_layout )
+    {
+      return ( offset_type( std::integral_constant< unsigned , 0 >() , arg_layout ).span()
+                 * MemorySpanSize + MemorySpanMask
+             ) & ~size_t(MemorySpanMask);
+    }
+
+  /**\brief  Wrap a span of memory.
+   *
+   *  The handle is taken from the pointer property carried by 'arg_prop'
+   *  and the offset is built from 'arg_layout' with a zero padding tag.
+   */
+  template< class ... P >
+  KOKKOS_INLINE_FUNCTION
+  ViewMapping( ViewCtorProp< P ... > const & arg_prop
+             , typename Traits::array_layout const & arg_layout )
+    : m_handle( ( (ViewCtorProp<void,pointer_type> const &) arg_prop ).value )
+    , m_offset( std::integral_constant< unsigned , 0 >() , arg_layout )
+    {}
+
+  //----------------------------------------
+  /*  Allocate and construct mapped array.
+   *  Allocate via shared allocation record and
+   *  return that record for allocation tracking.
+   *  The record is returned even when the allocation size is zero.
+   */
+  template< class ... P >
+  SharedAllocationRecord<> *
+  allocate_shared( ViewCtorProp< P... > const & arg_prop
+                 , typename Traits::array_layout const & arg_layout )
+  {
+    using alloc_prop      = ViewCtorProp< P... > ;
+    using execution_space = typename alloc_prop::execution_space ;
+    using memory_space    = typename Traits::memory_space ;
+    using value_type      = typename Traits::value_type ;
+    using functor_type    = ViewValueFunctor< execution_space , value_type > ;
+    using record_type     = SharedAllocationRecord< memory_space , functor_type > ;
+
+    // Query the mapping for byte-size of allocation.
+    // If padding is allowed then pass in sizeof value type
+    // for padding computation.
+    using padding =
+      std::integral_constant< unsigned
+                            , alloc_prop::allow_padding ? sizeof(value_type) : 0 > ;
+
+    m_offset = offset_type( padding(), arg_layout );
+
+    // Byte count rounded up to the next multiple of MemorySpanMask + 1.
+    const size_t alloc_size =
+      ( m_offset.span() * MemorySpanSize + MemorySpanMask ) & ~size_t(MemorySpanMask);
+
+    // Create shared memory tracking record with allocated memory from the memory space
+    record_type * const record =
+      record_type::allocate( ( (ViewCtorProp<void,memory_space> const &) arg_prop ).value
+                           , ( (ViewCtorProp<void,std::string>  const &) arg_prop ).value
+                           , alloc_size );
+
+    //  Only set the pointer and initialize if the allocation is non-zero.
+    //  May be zero if one of the dimensions is zero.
+    if ( ! alloc_size ) { return record ; }
+
+    m_handle = handle_type( reinterpret_cast< pointer_type >( record->data() ) );
+
+    if ( alloc_prop::initialize ) {
+      // Assume destruction is only required when construction is requested.
+      // The ViewValueFunctor has both value construction and destruction operators.
+      record->m_destroy = functor_type( ( (ViewCtorProp<void,execution_space> const &) arg_prop).value
+                                      , (value_type *) m_handle
+                                      , m_offset.span()
+                                      );
+
+      // Construct values
+      record->m_destroy.construct_shared_allocation();
+    }
+
+    return record ;
+  }
+};
+
+//----------------------------------------------------------------------------
+//----------------------------------------------------------------------------
+/** \brief  Assign compatible default mappings */
+
+template< class DstTraits , class SrcTraits >
+class ViewMapping< DstTraits , SrcTraits ,
+  typename std::enable_if<(
+    std::is_same< typename DstTraits::memory_space , typename SrcTraits::memory_space >::value
+    &&
+    std::is_same< typename DstTraits::specialize , void >::value
+    &&
+    std::is_same< typename SrcTraits::specialize , void >::value
+    &&
+    (
+      std::is_same< typename DstTraits::array_layout , typename SrcTraits::array_layout >::value
+      ||
+      (
+        (
+          std::is_same< typename DstTraits::array_layout , Kokkos::LayoutLeft >::value ||
+          std::is_same< typename DstTraits::array_layout , Kokkos::LayoutRight >::value ||
+          std::is_same< typename DstTraits::array_layout , Kokkos::LayoutStride >::value
+        )
+        &&
+        (
+          std::is_same< typename SrcTraits::array_layout , Kokkos::LayoutLeft >::value ||
+          std::is_same< typename SrcTraits::array_layout , Kokkos::LayoutRight >::value ||
+          std::is_same< typename SrcTraits::array_layout , Kokkos::LayoutStride >::value
+        )
+      )
+    )
+  )>::type >
+{
+private:
+
+  // Destination value type equals the source value_type or the source const_value_type.
+  enum { is_assignable_value_type =
+    std::is_same< typename DstTraits::value_type , typename SrcTraits::value_type >::value ||
+    std::is_same< typename DstTraits::value_type , typename SrcTraits::const_value_type >::value };
+
+  // Dimension compatibility is decided by ViewDimensionAssignable.
+  enum { is_assignable_dimension =
+    ViewDimensionAssignable< typename DstTraits::dimension , typename SrcTraits::dimension >::value };
+
+  // Layouts are compatible when they are the same type, when the destination
+  // is LayoutStride, when the destination rank is zero, or when the
+  // destination is rank one with one dynamic dimension.
+  enum { is_assignable_layout =
+    std::is_same< typename DstTraits::array_layout , typename SrcTraits::array_layout >::value ||
+    std::is_same< typename DstTraits::array_layout , Kokkos::LayoutStride >::value ||
+    ( DstTraits::dimension::rank == 0 ) ||
+    ( DstTraits::dimension::rank == 1 && DstTraits::dimension::rank_dynamic == 1 )
+    };
+
+public:
+
+  // Assignable only when value type, dimension and layout are all compatible.
+  enum { is_assignable = is_assignable_value_type && is_assignable_dimension && is_assignable_layout };
+
+  using TrackType = Kokkos::Experimental::Impl::SharedAllocationTracker ;
+  using DstType   = ViewMapping< DstTraits , void > ;
+  using SrcType   = ViewMapping< SrcTraits , void > ;
+
+  // Assign 'src' to 'dst'.
+  // Compatibility of value type, dimension and layout is checked at compile time.
+  // When the destination has fewer dynamic dimensions than the source, every
+  // dimension that is dynamic in the source but static in the destination is
+  // compared at run time and a mismatch aborts.
+  KOKKOS_INLINE_FUNCTION
+  static void assign( DstType & dst , const SrcType & src , const TrackType & src_track )
+    {
+      static_assert( is_assignable_value_type
+                   , "View assignment must have same value type or const = non-const" );
+
+      static_assert( is_assignable_dimension
+                   , "View assignment must have compatible dimensions" );
+
+      static_assert( is_assignable_layout
+                   , "View assignment must have compatible layout or have rank <= 1" );
+
+      using dst_offset_type = typename DstType::offset_type ;
+
+      if ( size_t(DstTraits::dimension::rank_dynamic) < size_t(SrcTraits::dimension::rank_dynamic) ) {
+        using dst_dim = typename DstTraits::dimension ;
+
+        // Each term holds unless dimension k is static in dst, dynamic in src, and the extents differ.
+        const bool assignable =
+          ( ! ( DstTraits::dimension::rank_dynamic < 1 && SrcTraits::dimension::rank_dynamic >= 1 ) ||
+            dst_dim::ArgN0 == src.dimension_0() ) &&
+          ( ! ( DstTraits::dimension::rank_dynamic < 2 && SrcTraits::dimension::rank_dynamic >= 2 ) ||
+            dst_dim::ArgN1 == src.dimension_1() ) &&
+          ( ! ( DstTraits::dimension::rank_dynamic < 3 && SrcTraits::dimension::rank_dynamic >= 3 ) ||
+            dst_dim::ArgN2 == src.dimension_2() ) &&
+          ( ! ( DstTraits::dimension::rank_dynamic < 4 && SrcTraits::dimension::rank_dynamic >= 4 ) ||
+            dst_dim::ArgN3 == src.dimension_3() ) &&
+          ( ! ( DstTraits::dimension::rank_dynamic < 5 && SrcTraits::dimension::rank_dynamic >= 5 ) ||
+            dst_dim::ArgN4 == src.dimension_4() ) &&
+          ( ! ( DstTraits::dimension::rank_dynamic < 6 && SrcTraits::dimension::rank_dynamic >= 6 ) ||
+            dst_dim::ArgN5 == src.dimension_5() ) &&
+          ( ! ( DstTraits::dimension::rank_dynamic < 7 && SrcTraits::dimension::rank_dynamic >= 7 ) ||
+            dst_dim::ArgN6 == src.dimension_6() ) &&
+          ( ! ( DstTraits::dimension::rank_dynamic < 8 && SrcTraits::dimension::rank_dynamic >= 8 ) ||
+            dst_dim::ArgN7 == src.dimension_7() ) ;
+
+        if ( ! assignable ) {
+          Kokkos::abort("View Assignment: trying to assign runtime dimension to non matching compile time dimension.");
+        }
+      }
+
+      dst.m_offset = dst_offset_type( src.m_offset );
+      dst.m_handle = Kokkos::Experimental::Impl::ViewDataHandle< DstTraits >::assign( src.m_handle , src_track );
+    }
+};
+
+//----------------------------------------------------------------------------
+//----------------------------------------------------------------------------
+// Subview mapping.
+// Deduce destination view type from source view traits and subview arguments
+
+template< class SrcTraits , class ... Args >
+struct ViewMapping
+  < typename std::enable_if<(
+      std::is_same< typename SrcTraits::specialize , void >::value
+      &&
+      (
+        std::is_same< typename SrcTraits::array_layout
+                    , Kokkos::LayoutLeft >::value ||
+        std::is_same< typename SrcTraits::array_layout
+                    , Kokkos::LayoutRight >::value ||
+        std::is_same< typename SrcTraits::array_layout
+                    , Kokkos::LayoutStride >::value
+      )
+    )>::type
+  , SrcTraits
+  , Args ... >
+{
+private:
+
+  static_assert( SrcTraits::rank == sizeof...(Args) ,
+    "Subview mapping requires one argument for each dimension of source View" );
+
+  // R<k> is the value of is_integral_extent for subview argument k; RZ is always false.
+  enum
+    { RZ = false
+    , R0 = bool( is_integral_extent< 0 , Args... >::value )
+    , R1 = bool( is_integral_extent< 1 , Args... >::value )
+    , R2 = bool( is_integral_extent< 2 , Args... >::value )
+    , R3 = bool( is_integral_extent< 3 , Args... >::value )
+    , R4 = bool( is_integral_extent< 4 , Args... >::value )
+    , R5 = bool( is_integral_extent< 5 , Args... >::value )
+    , R6 = bool( is_integral_extent< 6 , Args... >::value )
+    , R7 = bool( is_integral_extent< 7 , Args... >::value )
+    };
+
+  // Rank of the subview: the number of R<k> flags that are set.
+  enum { rank = unsigned(R0) + unsigned(R1) + unsigned(R2) + unsigned(R3) +
+                unsigned(R4) + unsigned(R5) + unsigned(R6) + unsigned(R7) };
+
+  // Whether right-most rank is a range: the R<k> flag of the last source
+  // rank, or RZ when the source has rank zero.
+  enum { R0_rev = 0 == SrcTraits::rank ? RZ :
+                  1 == SrcTraits::rank ? R0 :
+                  2 == SrcTraits::rank ? R1 :
+                  3 == SrcTraits::rank ? R2 :
+                  4 == SrcTraits::rank ? R3 :
+                  5 == SrcTraits::rank ? R4 :
+                  6 == SrcTraits::rank ? R5 :
+                  7 == SrcTraits::rank ? R6 : R7 };
+
+  // Subview's layout: the source layout is kept in the three cases listed
+  // below; every other subview falls back to LayoutStride.
+  using array_layout = typename std::conditional<
+      ( /* Same array layout IF */
+        ( rank == 0 ) /* output rank zero */
+        ||
+        // OutputRank 1 or 2, InputLayout Left, Interval 0
+        // because single stride one or second index has a stride.
+        ( rank <= 2 && R0 &&
+          std::is_same< typename SrcTraits::array_layout , Kokkos::LayoutLeft >::value ) //replace with input rank
+        ||
+        // OutputRank 1 or 2, InputLayout Right, Interval [InputRank-1]
+        // because single stride one or second index has a stride.
+        ( rank <= 2 && R0_rev &&
+          std::is_same< typename SrcTraits::array_layout , Kokkos::LayoutRight >::value ) //replace input rank
+      )
+    , typename SrcTraits::array_layout
+    , Kokkos::LayoutStride
+    >::type ;
+
+  using value_type = typename SrcTraits::value_type ;
+
+  // Data type of the subview: value_type followed by one '*' per subview rank (0 to 8).
+  using data_type =
+    typename std::conditional< rank == 0 , value_type ,
+    typename std::conditional< rank == 1 , value_type * ,
+    typename std::conditional< rank == 2 , value_type ** ,
+    typename std::conditional< rank == 3 , value_type *** ,
+    typename std::conditional< rank == 4 , value_type **** ,
+    typename std::conditional< rank == 5 , value_type ***** ,
+    typename std::conditional< rank == 6 , value_type ****** ,
+    typename std::conditional< rank == 7 , value_type ******* ,
+                                           value_type ********
+    >::type >::type >::type >::type >::type >::type >::type >::type ;
+
+public:
+
+  // Traits and View type of the subview; device and memory traits are inherited from the source.
+  using traits_type =
+    Kokkos::Experimental::ViewTraits< data_type
+                                    , array_layout
+                                    , typename SrcTraits::device_type
+                                    , typename SrcTraits::memory_traits > ;
+
+  using type =
+    Kokkos::Experimental::View< data_type
+                              , array_layout
+                              , typename SrcTraits::device_type
+                              , typename SrcTraits::memory_traits > ;
+
+  // Same subview traits / View types, but with caller-supplied memory traits.
+  template< class MemoryTraits >
+  struct apply {
+
+    static_assert( Kokkos::Impl::is_memory_traits< MemoryTraits >::value , "" );
+
+    using traits_type =
+      Kokkos::Experimental::ViewTraits< data_type
+                                      , array_layout
+                                      , typename SrcTraits::device_type
+                                      , MemoryTraits > ;
+
+    using type =
+      Kokkos::Experimental::View< data_type
+                                , array_layout
+                                , typename SrcTraits::device_type
+                                , MemoryTraits > ;
+  };
+
+  // Assign to 'dst' the subview of 'src' selected by 'args'.
+  // The presumed type is 'ViewMapping< traits_type , void >'
+  // However, a compatible ViewMapping is acceptable.
+  template< class DstTraits >
+  KOKKOS_INLINE_FUNCTION
+  static void assign( ViewMapping< DstTraits , void > & dst
+                    , ViewMapping< SrcTraits , void > const & src
+                    , Args ... args )
+    {
+      static_assert(
+        ViewMapping< DstTraits , traits_type , void >::is_assignable ,
+        "Subview destination type must be compatible with subview derived type" );
+
+      using DstType         = ViewMapping< DstTraits , void > ;
+      using dst_offset_type = typename DstType::offset_type ;
+      using dst_handle_type = typename DstType::handle_type ;
+
+      // Extents of the subview, derived from the source dimensions and the arguments.
+      const SubviewExtents< SrcTraits::rank , rank >
+        extents( src.m_offset.m_dim , args... );
+
+      dst.m_offset = dst_offset_type( src.m_offset , extents );
+
+      // The new handle is the source handle advanced by the source offset
+      // evaluated at the eight domain offsets of the subview.
+      dst.m_handle = dst_handle_type(
+        src.m_handle +
+        src.m_offset( extents.domain_offset(0), extents.domain_offset(1)
+                    , extents.domain_offset(2), extents.domain_offset(3)
+                    , extents.domain_offset(4), extents.domain_offset(5)
+                    , extents.domain_offset(6), extents.domain_offset(7) ) );
+    }
+};
+
+
+
+//----------------------------------------------------------------------------
+
+}}} // namespace Kokkos::Experimental::Impl
+
+//----------------------------------------------------------------------------
+//----------------------------------------------------------------------------
+
+namespace Kokkos {
+namespace Experimental {
+namespace Impl {
+// Base case of the bounds check: no indices remain, so the check succeeds.
+template< unsigned , class MapType >
+KOKKOS_INLINE_FUNCTION
+bool view_verify_operator_bounds( const MapType & )
+{ return true ; }
+// Recursive case: index 'i' (as size_t) must be below map.extent(R), then the rest is checked.
+template< unsigned R , class MapType , class iType , class ... Args >
+KOKKOS_INLINE_FUNCTION
+bool view_verify_operator_bounds
+  ( const MapType & map
+  , const iType   & i
+  , Args ... args
+  )
+{
+  if ( ! ( size_t(i) < map.extent(R) ) ) return false ;
+  return view_verify_operator_bounds<R+1>( map , args ... );
+}
+// Base case of the message builder: no indices remain, nothing is written.
+template< unsigned , class MapType >
+inline
+void view_error_operator_bounds( char * , int , const MapType & )
+{}
+// Writes " <index> < <extent> ," (or ")" after the last index) for rank R into buf, then recurses.
+template< unsigned R , class MapType , class iType , class ... Args >
+inline
+void view_error_operator_bounds
+  ( char * buf
+  , int len
+  , const MapType & map
+  , const iType   & i
+  , Args ... args
+  )
+{
+  const int n =
+    snprintf(buf,len," %lu < %lu %c"   // %lu matches the unsigned long arguments
+            , static_cast<unsigned long>(i)
+            , static_cast<unsigned long>( map.extent(R) )
+            , ( sizeof...(Args) ? ',' : ')' ) );
+  // Recurse only when the text fit: on error or truncation buf+n would leave the buffer.
+  if ( 0 <= n && n < len ) view_error_operator_bounds<R+1>(buf+n,len-n,map,args...);
+}
+// Checks every index against 'map'; on failure reports it (host: message listing index/extent pairs).
+template< class MapType , class ... Args >
+KOKKOS_INLINE_FUNCTION
+void view_verify_operator_bounds
+  ( const MapType & map , Args ... args )
+{
+  if ( ! view_verify_operator_bounds<0>( map , args ... ) ) {
+#if defined( KOKKOS_ACTIVE_EXECUTION_SPACE_HOST )
+    enum { LEN = 1024 };
+    char buffer[ LEN ];
+    int n = snprintf(buffer,LEN,"View bounds error(" ); // prefix; per-index text is appended after it
+    view_error_operator_bounds<0>( buffer + n , LEN - n , map , args ... );
+    Kokkos::Impl::throw_runtime_exception(std::string(buffer));
+#else
+    Kokkos::abort("View bounds error");
+#endif
+  }
+}
+
+
+class Error_view_scalar_reference_to_non_scalar_view ;
+
+} /* namespace Impl */
+} /* namespace Experimental */
+} /* namespace Kokkos */
+
+//----------------------------------------------------------------------------
+//----------------------------------------------------------------------------
+
+#endif /* #ifndef KOKKOS_EXPERIMENTAL_VIEW_MAPPING_HPP */
+
diff --git a/lib/kokkos/core/src/impl/KokkosExp_ViewTile.hpp b/lib/kokkos/core/src/impl/KokkosExp_ViewTile.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..8b3749e853a85eea341c4ce8462aec755de4bb11
--- /dev/null
+++ b/lib/kokkos/core/src/impl/KokkosExp_ViewTile.hpp
@@ -0,0 +1,227 @@
+/*
+//@HEADER
+// ************************************************************************
+// 
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+// 
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+// 
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact  H. Carter Edwards (hcedwar@sandia.gov)
+// 
+// ************************************************************************
+//@HEADER
+*/
+
+#ifndef KOKKOS_EXPERIMENTAL_VIEWTILE_HPP
+#define KOKKOS_EXPERIMENTAL_VIEWTILE_HPP
+
+//----------------------------------------------------------------------------
+//----------------------------------------------------------------------------
+
+namespace Kokkos {
+namespace Experimental {
+namespace Impl {
+
+// View mapping for rank two tiled array
+// Trait: std::true_type only for Kokkos::LayoutTileLeft<N0,N1,true>, std::false_type otherwise.
+template< class L >
+struct is_layout_tile : public std::false_type {};
+
+template< unsigned N0 , unsigned N1 >
+struct is_layout_tile< Kokkos::LayoutTileLeft<N0,N1,true> > : public std::true_type {};
+// Offset mapping selected for rank-2 dimensions whose layout satisfies is_layout_tile.
+template< class Dimension , class Layout >
+struct ViewOffset< Dimension , Layout ,
+  typename std::enable_if<(
+    ( Dimension::rank == 2 )
+    &&
+    is_layout_tile< Layout >::value
+  )>::type >
+{
+public:
+  // SHIFT_0/SHIFT_1 come from integral_power_of_two of the tile extents, SHIFT_T is their sum, MASK_* are extent minus one.
+  enum { SHIFT_0 = Kokkos::Impl::integral_power_of_two(Layout::N0) };
+  enum { SHIFT_1 = Kokkos::Impl::integral_power_of_two(Layout::N1) };
+  enum { SHIFT_T = SHIFT_0 + SHIFT_1 };
+  enum { MASK_0  = Layout::N0 - 1 };
+  enum { MASK_1  = Layout::N1 - 1 };
+
+  // Is an irregular layout that does not have uniform striding for each index.
+  using is_mapping_plugin = std::true_type ;
+  using is_regular        = std::false_type ;
+
+  typedef size_t     size_type ;
+  typedef Dimension  dimension_type ;
+  typedef Layout     array_layout ;
+
+  dimension_type m_dim ;
+  size_type      m_tile_N0 ; // number of tiles in the first dimension (set by the constructor)
+
+  //----------------------------------------
+  // Maps (i0,i1) to a linear offset; the six trailing int parameters are unnamed and unused.
+  // Only instantiated for rank 2
+  template< typename I0 , typename I1 >
+  KOKKOS_INLINE_FUNCTION constexpr
+  size_type operator()( I0 const & i0 , I1 const & i1
+                      , int = 0 , int = 0
+                      , int = 0 , int = 0
+                      , int = 0 , int = 0
+                      ) const
+    {
+      return /* ( ( Tile offset                               ) * Tile size ) */
+                ( ( (i0>>SHIFT_0) + m_tile_N0 * (i1>>SHIFT_1) ) << SHIFT_T) +
+             /* ( Offset within tile                       ) */
+                ( (i0 & MASK_0) + ((i1 & MASK_1)<<SHIFT_0) ) ;
+    }
+
+  //----------------------------------------
+  // Layout object rebuilt from the two stored dimensions.
+  KOKKOS_INLINE_FUNCTION constexpr
+  array_layout layout() const
+    { return array_layout( m_dim.N0 , m_dim.N1 ); }
+  // Dimensions 0 and 1 come from m_dim; dimensions 2 through 7 are reported as 1.
+  KOKKOS_INLINE_FUNCTION constexpr size_type dimension_0() const { return m_dim.N0 ; }
+  KOKKOS_INLINE_FUNCTION constexpr size_type dimension_1() const { return m_dim.N1 ; }
+  KOKKOS_INLINE_FUNCTION constexpr size_type dimension_2() const { return 1 ; }
+  KOKKOS_INLINE_FUNCTION constexpr size_type dimension_3() const { return 1 ; }
+  KOKKOS_INLINE_FUNCTION constexpr size_type dimension_4() const { return 1 ; }
+  KOKKOS_INLINE_FUNCTION constexpr size_type dimension_5() const { return 1 ; }
+  KOKKOS_INLINE_FUNCTION constexpr size_type dimension_6() const { return 1 ; }
+  KOKKOS_INLINE_FUNCTION constexpr size_type dimension_7() const { return 1 ; }
+  // Product of the two dimensions.
+  KOKKOS_INLINE_FUNCTION constexpr size_type size() const { return m_dim.N0 * m_dim.N1 ; }
+
+  // Strides are meaningless due to irregularity
+  KOKKOS_INLINE_FUNCTION constexpr size_type stride_0() const { return 0 ; }
+  KOKKOS_INLINE_FUNCTION constexpr size_type stride_1() const { return 0 ; }
+  KOKKOS_INLINE_FUNCTION constexpr size_type stride_2() const { return 0 ; }
+  KOKKOS_INLINE_FUNCTION constexpr size_type stride_3() const { return 0 ; }
+  KOKKOS_INLINE_FUNCTION constexpr size_type stride_4() const { return 0 ; }
+  KOKKOS_INLINE_FUNCTION constexpr size_type stride_5() const { return 0 ; }
+  KOKKOS_INLINE_FUNCTION constexpr size_type stride_6() const { return 0 ; }
+  KOKKOS_INLINE_FUNCTION constexpr size_type stride_7() const { return 0 ; }
+  // Span counts whole tiles: tile count along each dimension times the tile size.
+  KOKKOS_INLINE_FUNCTION constexpr size_type span() const
+    {
+      // ( TileDim0 * ( TileDim1 ) ) * TileSize
+      return ( m_tile_N0 * ( ( m_dim.N1 + MASK_1 ) >> SHIFT_1 ) ) << SHIFT_T ;
+    }
+
+  KOKKOS_INLINE_FUNCTION constexpr bool span_is_contiguous() const
+    {
+      // Only if dimensions align with tile size
+      return ( m_dim.N0 & MASK_0 ) == 0 && ( m_dim.N1 & MASK_1 ) == 0 ;
+    }
+
+  //----------------------------------------
+
+  ~ViewOffset() = default ;
+  ViewOffset() = default ;
+  ViewOffset( const ViewOffset & ) = default ;
+  ViewOffset & operator = ( const ViewOffset & ) = default ;
+  // Builds the offset from the layout's first two dimension entries; the integral_constant tag is unused.
+  template< unsigned TrivialScalarSize >
+  KOKKOS_INLINE_FUNCTION
+  constexpr ViewOffset( std::integral_constant<unsigned,TrivialScalarSize> const & ,
+                        array_layout const arg_layout )
+    : m_dim( arg_layout.dimension[0], arg_layout.dimension[1], 0, 0, 0, 0, 0, 0 )
+    , m_tile_N0( ( arg_layout.dimension[0] + MASK_0 ) >> SHIFT_0 /* number of tiles in first dimension */ )
+    {}
+};
+// Subview mapping that extracts a single N0 x N1 tile from a rank-2 LayoutTileLeft view.
+template< typename T , unsigned N0 , unsigned N1 , class ... P
+        , typename iType0 , typename iType1
+        >
+struct ViewMapping
+  < void
+  , Kokkos::Experimental::ViewTraits<T**,Kokkos::LayoutTileLeft<N0,N1,true>,P...>
+  , Kokkos::LayoutTileLeft<N0,N1,true>
+  , iType0
+  , iType1 >
+{
+  using src_layout = Kokkos::LayoutTileLeft<N0,N1,true> ;
+  using src_traits = Kokkos::Experimental::ViewTraits< T** , src_layout , P... > ;
+  using traits     = Kokkos::Experimental::ViewTraits< T[N0][N1] , LayoutLeft , P ... > ;
+  using type       = Kokkos::Experimental::View< T[N0][N1] , LayoutLeft , P ... > ;
+
+  KOKKOS_INLINE_FUNCTION static
+  void assign( ViewMapping< traits , void > & dst
+             , const ViewMapping< src_traits , void > & src
+             , const src_layout &
+             , const size_t i_tile0
+             , const size_t i_tile1
+             )
+    {
+      using dst_map_type    = ViewMapping< traits , void > ;
+      using src_map_type    = ViewMapping< src_traits , void > ;
+      using dst_handle_type = typename dst_map_type::handle_type ;
+      using dst_offset_type = typename dst_map_type::offset_type ;
+      using src_offset_type = typename src_map_type::offset_type ;
+      // Linear tile index; shifting it by SHIFT_T gives the offset added to the source handle.
+      const size_t tile_index = i_tile0 + src.m_offset.m_tile_N0 * i_tile1 ;
+      dst = dst_map_type(
+         dst_handle_type( src.m_handle + ( tile_index << src_offset_type::SHIFT_T ) ) ,
+         dst_offset_type() );
+    }
+};
+
+} /* namespace Impl */
+} /* namespace Experimental */
+} /* namespace Kokkos */
+
+namespace Kokkos {
+namespace Experimental {
+// Returns tile (i_tile0,i_tile1) of the tiled view 'src' as a T[N0][N1] LayoutLeft view.
+template< typename T , unsigned N0 , unsigned N1 , class ... P >
+KOKKOS_INLINE_FUNCTION
+Kokkos::Experimental::View< T[N0][N1] , LayoutLeft , P... >
+tile_subview( const Kokkos::Experimental::View<T**,Kokkos::LayoutTileLeft<N0,N1,true>,P...> & src
+            , const size_t i_tile0
+            , const size_t i_tile1
+            )
+{
+  // Passing the source layout as the first subview argument forces
+  // the ViewMapping specialization that extracts a tile.
+  using tile_layout = Kokkos::LayoutTileLeft<N0,N1,true> ;
+  using tile_view   = Kokkos::Experimental::View< T[N0][N1] , LayoutLeft , P... > ;
+
+  return tile_view( src , tile_layout() , i_tile0 , i_tile1 );
+}
+
+} /* namespace Experimental */
+} /* namespace Kokkos */
+
+//----------------------------------------------------------------------------
+//----------------------------------------------------------------------------
+
+#endif /* #ifndef KOKKOS_EXPERIMENTAL_VIEWTILE_HPP */
+
diff --git a/lib/kokkos/core/src/impl/Kokkos_AnalyzePolicy.hpp b/lib/kokkos/core/src/impl/Kokkos_AnalyzePolicy.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..0246a7b9af8f968fe6295bfad20f765265049906
--- /dev/null
+++ b/lib/kokkos/core/src/impl/Kokkos_AnalyzePolicy.hpp
@@ -0,0 +1,197 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+//
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact  H. Carter Edwards (hcedwar@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#ifndef KOKKOS_IMPL_ANALYZE_POLICY_HPP
+#define KOKKOS_IMPL_ANALYZE_POLICY_HPP
+
+#include <Kokkos_Core_fwd.hpp>
+#include <Kokkos_Concepts.hpp>
+#include <impl/Kokkos_Tags.hpp>
+
+namespace Kokkos { namespace Impl {
+// Bundle of the five policy traits; each defaults to void, which the Set* helpers treat as "not yet given".
+template < typename ExecutionSpace   = void
+         , typename Schedule         = void
+         , typename WorkTag          = void
+         , typename IndexType        = void
+         , typename IterationPattern = void
+         >
+struct PolicyTraitsBase
+{
+  using type = PolicyTraitsBase< ExecutionSpace, Schedule, WorkTag, IndexType, IterationPattern>;
+
+  using execution_space   = ExecutionSpace;
+  using schedule_type     = Schedule;
+  using work_tag          = WorkTag;
+  using index_type        = IndexType;
+  using iteration_pattern = IterationPattern;
+};
+
+// 'type' is PolicyBase with its execution space replaced; asserts that none was set before.
+template <typename PolicyBase, typename ExecutionSpace>
+struct SetExecutionSpace
+{
+  static_assert( is_void<typename PolicyBase::execution_space>::value
+               , "Kokkos Error: More than one execution space given" );
+  using type = PolicyTraitsBase< ExecutionSpace
+                               , typename PolicyBase::schedule_type
+                               , typename PolicyBase::work_tag
+                               , typename PolicyBase::index_type
+                               , typename PolicyBase::iteration_pattern
+                               >;
+};
+// 'type' is PolicyBase with its schedule type replaced; asserts that none was set before.
+template <typename PolicyBase, typename Schedule>
+struct SetSchedule
+{
+  static_assert( is_void<typename PolicyBase::schedule_type>::value
+               , "Kokkos Error: More than one schedule type given" );
+  using type = PolicyTraitsBase< typename PolicyBase::execution_space
+                               , Schedule
+                               , typename PolicyBase::work_tag
+                               , typename PolicyBase::index_type
+                               , typename PolicyBase::iteration_pattern
+                               >;
+};
+// 'type' is PolicyBase with its work tag replaced; asserts that none was set before.
+template <typename PolicyBase, typename WorkTag>
+struct SetWorkTag
+{
+  static_assert( is_void<typename PolicyBase::work_tag>::value
+               , "Kokkos Error: More than one work tag given" );
+  using type = PolicyTraitsBase< typename PolicyBase::execution_space
+                               , typename PolicyBase::schedule_type
+                               , WorkTag
+                               , typename PolicyBase::index_type
+                               , typename PolicyBase::iteration_pattern
+                               >;
+};
+// 'type' is PolicyBase with its index type replaced; asserts that none was set before.
+template <typename PolicyBase, typename IndexType>
+struct SetIndexType
+{
+  static_assert( is_void<typename PolicyBase::index_type>::value
+               , "Kokkos Error: More than one index type given" );
+  using type = PolicyTraitsBase< typename PolicyBase::execution_space
+                               , typename PolicyBase::schedule_type
+                               , typename PolicyBase::work_tag
+                               , IndexType
+                               , typename PolicyBase::iteration_pattern
+                               >;
+};
+
+// 'type' is PolicyBase with its iteration pattern replaced; asserts that none was set before.
+template <typename PolicyBase, typename IterationPattern>
+struct SetIterationPattern
+{
+  static_assert( is_void<typename PolicyBase::iteration_pattern>::value
+               , "Kokkos Error: More than one iteration_pattern given" );
+  using type = PolicyTraitsBase< typename PolicyBase::execution_space
+                               , typename PolicyBase::schedule_type
+                               , typename PolicyBase::work_tag
+                               , typename PolicyBase::index_type
+                               , IterationPattern
+                               >;
+};
+
+// Folds the policy arguments Traits... into Base, one argument per recursion step.
+template <typename Base, typename... Traits>
+struct AnalyzePolicy;
+// Classifies T by the first matching test below (no match: T becomes the work tag), then recurses on the rest.
+template <typename Base, typename T, typename... Traits>
+struct AnalyzePolicy<Base, T, Traits...> : public
+  AnalyzePolicy<
+      typename std::conditional< is_execution_space<T>::value  , SetExecutionSpace<Base,T>
+    , typename std::conditional< is_schedule_type<T>::value    , SetSchedule<Base,T>
+    , typename std::conditional< is_index_type<T>::value       , SetIndexType<Base,T>
+    , typename std::conditional< std::is_integral<T>::value    , SetIndexType<Base, IndexType<T> >
+    , typename std::conditional< is_iteration_pattern<T>::value, SetIterationPattern<Base,T>
+    , SetWorkTag<Base,T>
+    >::type >::type >::type >::type>::type::type
+  , Traits...
+  >
+{};
+// End of the argument list: traits that are still void receive their defaults here.
+template <typename Base>
+struct AnalyzePolicy<Base>
+{
+  using execution_space = typename std::conditional< is_void< typename Base::execution_space >::value
+                                                   , DefaultExecutionSpace
+                                                   , typename Base::execution_space
+                                                   >::type;
+
+  using schedule_type = typename std::conditional< is_void< typename Base::schedule_type >::value
+                                                 , Schedule< Static >
+                                                 , typename Base::schedule_type
+                                                 >::type;
+  // The work tag has no default: it is passed through unchanged, void included.
+  using work_tag = typename Base::work_tag;
+
+  using index_type = typename std::conditional< is_void< typename Base::index_type >::value
+                                              , IndexType< typename execution_space::size_type >
+                                              , typename Base::index_type
+                                              >::type
+                                               ::type // nasty hack to make index_type into an integral_type
+                                              ;       // instead of the wrapped IndexType<T> for backwards compatibility
+
+  using iteration_pattern = typename std::conditional< is_void< typename Base::iteration_pattern >::value
+                                                     , void // TODO set default iteration pattern
+                                                     , typename Base::iteration_pattern
+                                                     >::type;
+  using type = PolicyTraitsBase< execution_space
+                               , schedule_type
+                               , work_tag
+                               , index_type
+                               , iteration_pattern
+                               >;
+};
+// Traits obtained by analyzing Traits... starting from an all-void PolicyTraitsBase.
+template <typename... Traits>
+struct PolicyTraits
+  : public AnalyzePolicy< PolicyTraitsBase<>, Traits... >::type
+{};
+
+}} // namespace Kokkos::Impl
+
+
+#endif //KOKKOS_IMPL_ANALYZE_POLICY_HPP
diff --git a/lib/kokkos/core/src/impl/Kokkos_AnalyzeShape.hpp b/lib/kokkos/core/src/impl/Kokkos_AnalyzeShape.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..2de9df008ee5b42b5d38727ead56bae768869c43
--- /dev/null
+++ b/lib/kokkos/core/src/impl/Kokkos_AnalyzeShape.hpp
@@ -0,0 +1,260 @@
+/*
+//@HEADER
+// ************************************************************************
+// 
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+// 
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+// 
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact  H. Carter Edwards (hcedwar@sandia.gov)
+// 
+// ************************************************************************
+//@HEADER
+*/
+
+#ifndef KOKKOS_ANALYZESHAPE_HPP
+#define KOKKOS_ANALYZESHAPE_HPP
+
+#include <impl/Kokkos_Shape.hpp>
+
+//----------------------------------------------------------------------------
+//----------------------------------------------------------------------------
+
+namespace Kokkos {
+namespace Impl {
+
+//----------------------------------------------------------------------------
+
+/** \brief  Analyze the array shape defined by a Kokkos::View data type.
+ *
+ *  It is presumed that the data type can be mapped down to a multidimensional
+ *  array of an intrinsic scalar numerical type (double, float, int, ... ).
+ *  The 'value_type' of an array may be an embedded aggregate type such
+ *  as a fixed length array 'Array<T,N>'.
+ *  In this case the 'array_intrinsic_type' represents the
+ *  underlying array of intrinsic scalar numerical type.
+ *
+ *  The embedded aggregate type must have an AnalyzeShape specialization
+ *  to map it down to a shape and intrinsic scalar numerical type.
+ */
+template< class T >
+struct AnalyzeShape : public Shape< sizeof(T) , 0 >
+{
+  typedef void specialize ;
+
+  typedef Shape< sizeof(T), 0 >  shape ;
+  // Primary template: array-intrinsic, value and full type are all T itself.
+  typedef       T  array_intrinsic_type ;
+  typedef       T  value_type ;
+  typedef       T  type ;
+  // Const-qualified counterparts.
+  typedef const T  const_array_intrinsic_type ;
+  typedef const T  const_value_type ;
+  typedef const T  const_type ;
+  // Counterparts named non_const; here they are T as given.
+  typedef       T  non_const_array_intrinsic_type ;
+  typedef       T  non_const_value_type ;
+  typedef       T  non_const_type ;
+};
+// Specialization for void (sizeof cannot be applied): shape is Shape<0,0>, all nested types are void.
+template<>
+struct AnalyzeShape<void> : public Shape< 0 , 0 >
+{
+  typedef void specialize ;
+
+  typedef Shape< 0 , 0 >  shape ;
+
+  typedef       void  array_intrinsic_type ;
+  typedef       void  value_type ;
+  typedef       void  type ;
+  typedef const void  const_array_intrinsic_type ;
+  typedef const void  const_value_type ;
+  typedef const void  const_type ;
+  typedef       void  non_const_array_intrinsic_type ;
+  typedef       void  non_const_value_type ;
+  typedef       void  non_const_type ;
+};
+// const T: same shape as T; the unqualified type names map to T's const_ variants.
+template< class T >
+struct AnalyzeShape< const T > : public AnalyzeShape<T>::shape
+{
+private:
+  typedef AnalyzeShape<T> nested ;
+public:
+
+  typedef typename nested::specialize specialize ;
+
+  typedef typename nested::shape shape ;
+
+  typedef typename nested::const_array_intrinsic_type  array_intrinsic_type ;
+  typedef typename nested::const_value_type            value_type ;
+  typedef typename nested::const_type                  type ;
+
+  typedef typename nested::const_array_intrinsic_type  const_array_intrinsic_type ;
+  typedef typename nested::const_value_type            const_value_type ;
+  typedef typename nested::const_type                  const_type ;
+
+  typedef typename nested::non_const_array_intrinsic_type  non_const_array_intrinsic_type ;
+  typedef typename nested::non_const_value_type            non_const_value_type ;
+  typedef typename nested::non_const_type                  non_const_type ;
+};
+// T*: shape is ShapeInsert< shape of T , 0 >; array and full types gain one '*', value types are unchanged.
+template< class T >
+struct AnalyzeShape< T * >
+  : public ShapeInsert< typename AnalyzeShape<T>::shape , 0 >::type
+{
+private:
+  typedef AnalyzeShape<T> nested ;
+public:
+
+  typedef typename nested::specialize specialize ;
+
+  typedef typename ShapeInsert< typename nested::shape , 0 >::type shape ;
+
+  typedef typename nested::array_intrinsic_type * array_intrinsic_type ;
+  typedef typename nested::value_type             value_type ;
+  typedef typename nested::type                 * type ;
+
+  typedef typename nested::const_array_intrinsic_type * const_array_intrinsic_type ;
+  typedef typename nested::const_value_type             const_value_type ;
+  typedef typename nested::const_type                 * const_type ;
+
+  typedef typename nested::non_const_array_intrinsic_type * non_const_array_intrinsic_type ;
+  typedef typename nested::non_const_value_type             non_const_value_type ;
+  typedef typename nested::non_const_type                 * non_const_type ;
+};
+// T[]: shape is ShapeInsert< shape of T , 0 >; array and full types gain one '[]', value types are unchanged.
+template< class T >
+struct AnalyzeShape< T[] >
+  : public ShapeInsert< typename AnalyzeShape<T>::shape , 0 >::type
+{
+private:
+  typedef AnalyzeShape<T> nested ;
+public:
+
+  typedef typename nested::specialize specialize ;
+
+  typedef typename ShapeInsert< typename nested::shape , 0 >::type shape ;
+
+  typedef typename nested::array_intrinsic_type  array_intrinsic_type [] ;
+  typedef typename nested::value_type            value_type ;
+  typedef typename nested::type                  type [] ;
+
+  typedef typename nested::const_array_intrinsic_type  const_array_intrinsic_type [] ;
+  typedef typename nested::const_value_type            const_value_type ;
+  typedef typename nested::const_type                  const_type [] ;
+
+  typedef typename nested::non_const_array_intrinsic_type  non_const_array_intrinsic_type [] ;
+  typedef typename nested::non_const_value_type            non_const_value_type ;
+  typedef typename nested::non_const_type                  non_const_type [] ;
+};
+// const T[]: same construction as T[], but the element is analyzed through AnalyzeShape< const T >.
+template< class T >
+struct AnalyzeShape< const T[] >
+  : public ShapeInsert< typename AnalyzeShape< const T >::shape , 0 >::type
+{
+private:
+  typedef AnalyzeShape< const T > nested ;
+public:
+
+  typedef typename nested::specialize specialize ;
+
+  typedef typename ShapeInsert< typename nested::shape , 0 >::type shape ;
+
+  typedef typename nested::array_intrinsic_type  array_intrinsic_type [] ;
+  typedef typename nested::value_type            value_type ;
+  typedef typename nested::type                  type [] ;
+
+  typedef typename nested::const_array_intrinsic_type  const_array_intrinsic_type [] ;
+  typedef typename nested::const_value_type            const_value_type ;
+  typedef typename nested::const_type                  const_type [] ;
+
+  typedef typename nested::non_const_array_intrinsic_type  non_const_array_intrinsic_type [] ;
+  typedef typename nested::non_const_value_type            non_const_value_type ;
+  typedef typename nested::non_const_type                  non_const_type [] ;
+};
+// T[N]: shape is ShapeInsert< shape of T , N >; array and full types gain one '[N]', value types are unchanged.
+template< class T , unsigned N >
+struct AnalyzeShape< T[N] >
+  : public ShapeInsert< typename AnalyzeShape<T>::shape , N >::type
+{
+private:
+  typedef AnalyzeShape<T> nested ;
+public:
+
+  typedef typename nested::specialize specialize ;
+
+  typedef typename ShapeInsert< typename nested::shape , N >::type shape ;
+
+  typedef typename nested::array_intrinsic_type  array_intrinsic_type [N] ;
+  typedef typename nested::value_type            value_type ;
+  typedef typename nested::type                  type [N] ;
+
+  typedef typename nested::const_array_intrinsic_type  const_array_intrinsic_type [N] ;
+  typedef typename nested::const_value_type            const_value_type ;
+  typedef typename nested::const_type                  const_type [N] ;
+
+  typedef typename nested::non_const_array_intrinsic_type  non_const_array_intrinsic_type [N] ;
+  typedef typename nested::non_const_value_type            non_const_value_type ;
+  typedef typename nested::non_const_type                  non_const_type [N] ;
+};
+// const T[N]: same construction as T[N], but the element is analyzed through AnalyzeShape< const T >.
+template< class T , unsigned N >
+struct AnalyzeShape< const T[N] >
+  : public ShapeInsert< typename AnalyzeShape< const T >::shape , N >::type
+{
+private:
+  typedef AnalyzeShape< const T > nested ;
+public:
+
+  typedef typename nested::specialize specialize ;
+
+  typedef typename ShapeInsert< typename nested::shape , N >::type shape ;
+
+  typedef typename nested::array_intrinsic_type  array_intrinsic_type [N] ;
+  typedef typename nested::value_type            value_type ;
+  typedef typename nested::type                  type [N] ;
+
+  typedef typename nested::const_array_intrinsic_type  const_array_intrinsic_type [N] ;
+  typedef typename nested::const_value_type            const_value_type ;
+  typedef typename nested::const_type                  const_type [N] ;
+
+  typedef typename nested::non_const_array_intrinsic_type  non_const_array_intrinsic_type [N] ;
+  typedef typename nested::non_const_value_type            non_const_value_type ;
+  typedef typename nested::non_const_type                  non_const_type [N] ;
+};
+
+} // namespace Impl
+} // namespace Kokkos
+
+#endif /* #ifndef KOKKOS_ANALYZESHAPE_HPP */
+
diff --git a/lib/kokkos/core/src/impl/Kokkos_Atomic_Assembly.hpp b/lib/kokkos/core/src/impl/Kokkos_Atomic_Assembly.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..500e2b5a21f50c34200920946d7e852eb057a16f
--- /dev/null
+++ b/lib/kokkos/core/src/impl/Kokkos_Atomic_Assembly.hpp
@@ -0,0 +1,112 @@
+/*
+//@HEADER
+// ************************************************************************
+// 
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+// 
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+// 
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact  H. Carter Edwards (hcedwar@sandia.gov)
+// 
+// ************************************************************************
+//@HEADER
+*/
+#if defined( KOKKOS_ATOMIC_HPP ) && ! defined( KOKKOS_ATOMIC_ASSEMBLY_HPP )
+#define KOKKOS_ATOMIC_ASSEMBLY_HPP
+namespace Kokkos {
+
+namespace Impl {
+  // 128-bit compare-and-swap payload: two 64-bit words, 16-byte aligned by the
+  // attribute that closes the definition.
+  struct cas128_t
+  {
+    uint64_t lower;
+    uint64_t upper;
+
+    // Both words start at zero.
+    KOKKOS_INLINE_FUNCTION
+    cas128_t () : lower( 0 ) , upper( 0 ) {}
+
+    // Word-by-word copy.
+    KOKKOS_INLINE_FUNCTION
+    cas128_t (const cas128_t& a) : lower( a.lower ) , upper( a.upper ) {}
+
+    // Snapshot through a volatile pointer: 'lower' is read first, then 'upper'
+    // (two separate loads, so the snapshot as a whole is not atomic).
+    KOKKOS_INLINE_FUNCTION
+    cas128_t (volatile cas128_t* a) : lower( a->lower ) , upper( a->upper ) {}
+
+    // True when either word differs.
+    KOKKOS_INLINE_FUNCTION
+    bool operator != (const cas128_t& a) const {
+      if ( lower != a.lower ) return true;
+      return upper != a.upper;
+    }
+
+    // Assignment to a non-volatile target; returns void.
+    KOKKOS_INLINE_FUNCTION
+    void operator = (const cas128_t& a) {
+      lower = a.lower; upper = a.upper;
+    }
+    // Assignment to a volatile target; returns void.
+    KOKKOS_INLINE_FUNCTION
+    void operator = (const cas128_t& a) volatile {
+      lower = a.lower; upper = a.upper;
+    }
+  }
+  __attribute__ (( __aligned__( 16 ) ));
+
+
+  #if defined( KOKKOS_ENABLE_ASM ) && defined ( KOKKOS_USE_ISA_X86_64 )
+  // 16-byte compare-and-swap via 'lock cmpxchg16b'.  If *ptr equals cmp it is
+  // replaced by swap; otherwise cmp (held in rdx:rax) is overwritten with the
+  // current *ptr.  Either way the value returned is what *ptr held beforehand.
+  // ptr must be 16-byte aligned (cas128_t carries that alignment).
+  inline cas128_t cas128( volatile cas128_t * ptr, cas128_t cmp,  cas128_t swap )
+  {
+      bool swapped = false;  // receives ZF via setz; kept so operand numbering is unchanged
+      __asm__ __volatile__
+      (
+       "lock cmpxchg16b %1\n\t"
+       "setz %0"
+       : "=q" ( swapped ) , "+m" ( *ptr ) , "+d" ( cmp.upper ) , "+a" ( cmp.lower )
+       : "c" ( swap.upper ) , "b" ( swap.lower ) , "q" ( swapped )
+       : "cc" , "memory"  // flags are written; the CAS must also act as a compiler barrier
+     );
+      return cmp;
+  }
+  #endif
+
+}
+}
+
+#endif
diff --git a/lib/kokkos/core/src/impl/Kokkos_Atomic_Compare_Exchange_Strong.hpp b/lib/kokkos/core/src/impl/Kokkos_Atomic_Compare_Exchange_Strong.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..fd7ea845e7633d7415b0b9cd147f1da51ef93632
--- /dev/null
+++ b/lib/kokkos/core/src/impl/Kokkos_Atomic_Compare_Exchange_Strong.hpp
@@ -0,0 +1,271 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+//
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact  H. Carter Edwards (hcedwar@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#if defined( KOKKOS_ATOMIC_HPP ) && ! defined( KOKKOS_ATOMIC_COMPARE_EXCHANGE_STRONG_HPP )
+#define KOKKOS_ATOMIC_COMPARE_EXCHANGE_STRONG_HPP
+
+namespace Kokkos {
+
+//----------------------------------------------------------------------------
+// Cuda native CAS supports int, unsigned int, and unsigned long long int (non-standard type).
+// Must cast-away 'volatile' for the CAS call.
+
+#if defined( KOKKOS_ATOMICS_USE_CUDA )
+
+__inline__ __device__  // native 32-bit signed CAS; the result is whatever atomicCAS returns
+int atomic_compare_exchange( volatile int * const dest, const int compare, const int val)
+{ int * const target = const_cast< int * >( dest ); return atomicCAS( target , compare , val ); }
+// native 32-bit unsigned CAS
+__inline__ __device__
+unsigned int atomic_compare_exchange( volatile unsigned int * const dest, const unsigned int compare, const unsigned int val)
+{ unsigned int * const target = const_cast< unsigned int * >( dest ); return atomicCAS( target , compare , val ); }
+// native 64-bit unsigned CAS
+__inline__ __device__
+unsigned long long int atomic_compare_exchange( volatile unsigned long long int * const dest ,
+                                                const unsigned long long int compare ,
+                                                const unsigned long long int val )
+{ unsigned long long int * const target = const_cast< unsigned long long int * >( dest ); return atomicCAS( target , compare , val ); }
+
+template < typename T > __inline__ __device__  // sizeof(T)==sizeof(int): reuse the native int CAS
+T atomic_compare_exchange( volatile T * const dest , const T & compare ,
+  typename Kokkos::Impl::enable_if< sizeof(T) == sizeof(int) , const T & >::type val )
+{
+  const int expected = *((int*)&compare) , desired = *((int*)&val) ,  // bit patterns, not conversions
+            previous = atomicCAS( (int*) dest , expected , desired );
+  return *((T*)&previous);
+}
+
+template < typename T > __inline__ __device__  // T the size of unsigned long long (not int): reuse the native CAS
+T atomic_compare_exchange( volatile T * const dest , const T & compare ,
+  typename Kokkos::Impl::enable_if< sizeof(T) != sizeof(int) &&
+                                    sizeof(T) == sizeof(unsigned long long int) , const T & >::type val )
+{
+  typedef unsigned long long int word_t ;
+  const word_t expected = *((word_t*)&compare) , desired = *((word_t*)&val) ,  // bit patterns, not conversions
+               previous = atomicCAS( (word_t*) dest , expected , desired );
+  return *((T*)&previous);
+}
+
+template < typename T >  // CUDA fallback CAS for T that is neither 4 nor 8 bytes wide
+__inline__ __device__
+T atomic_compare_exchange( volatile T * const dest , const T & compare ,
+    typename ::Kokkos::Impl::enable_if<
+                  ( sizeof(T) != 4 )
+               && ( sizeof(T) != 8 )
+             , const T >::type& val )
+{
+  T return_val;  // value read from *dest under the lock; this is what the caller gets back
+  // This is a way to (hopefully) avoid dead lock in a warp
+  int done = 1;  // stays positive (and keeps counting up) until the lock has been obtained once
+  while ( done>0 ) {
+    done++;
+    if( Impl::lock_address_cuda_space( (void*) dest ) ) {  // NOTE(review): presumably a non-blocking try-lock keyed on the address - confirm
+      return_val = *dest;
+      if( return_val == compare )
+        *dest = val;  // store only when the current value matched 'compare'
+      Impl::unlock_address_cuda_space( (void*) dest );
+      done = 0;  // leaves the while loop
+    }
+  }
+  return return_val;
+}
+
+//----------------------------------------------------------------------------
+// GCC native CAS supports int, long, unsigned int, unsigned long.
+// Intel native CAS supports int and long with the same interface as GCC.
+
+#elif defined(KOKKOS_ATOMICS_USE_GCC) || defined(KOKKOS_ATOMICS_USE_INTEL)
+
+KOKKOS_INLINE_FUNCTION  // int CAS through the __sync builtin; yields the value the builtin returns
+int atomic_compare_exchange( volatile int * const dest, const int compare, const int val)
+{ const int previous = __sync_val_compare_and_swap( dest , compare , val ); return previous; }
+// long CAS through the same builtin
+KOKKOS_INLINE_FUNCTION
+long atomic_compare_exchange( volatile long * const dest, const long compare, const long val )
+{ const long previous = __sync_val_compare_and_swap( dest , compare , val ); return previous; }
+
+#if defined( KOKKOS_ATOMICS_USE_GCC )
+
+// GCC supports unsigned
+
+KOKKOS_INLINE_FUNCTION  // unsigned int CAS through the __sync builtin
+unsigned int atomic_compare_exchange( volatile unsigned int * const dest, const unsigned int compare, const unsigned int val )
+{ const unsigned int previous = __sync_val_compare_and_swap( dest , compare , val ); return previous; }
+// unsigned long CAS through the same builtin
+KOKKOS_INLINE_FUNCTION
+unsigned long atomic_compare_exchange( volatile unsigned long * const dest ,
+                                       const unsigned long compare ,
+                                       const unsigned long val )
+{ const unsigned long previous = __sync_val_compare_and_swap( dest , compare , val ); return previous; }
+
+#endif
+
+// CAS for any T whose size equals int: the operands are reinterpreted as int,
+// handed to the __sync builtin, and the int it returns is read back as T
+// through a union.
+template < typename T >
+KOKKOS_INLINE_FUNCTION
+T atomic_compare_exchange( volatile T * const dest, const T & compare,
+  typename Kokkos::Impl::enable_if< sizeof(T) == sizeof(int) , const T & >::type val )
+{
+  union U {
+    int i ;
+    T t ;
+#ifdef KOKKOS_HAVE_CXX11
+    // Only the C++11 build declares a constructor for the union.
+    KOKKOS_INLINE_FUNCTION U() {}
+#endif
+  } result ;
+  const int expected = *((int*)&compare) ;
+  const int desired  = *((int*)&val) ;
+  result.i = __sync_val_compare_and_swap( (int*) dest , expected , desired );
+  return result.t ;
+}
+
+// CAS for any T whose size equals long but not int: the operands are
+// reinterpreted as long for the __sync builtin and read back through a union.
+template < typename T >
+KOKKOS_INLINE_FUNCTION
+T atomic_compare_exchange( volatile T * const dest, const T & compare,
+  typename Kokkos::Impl::enable_if< sizeof(T) != sizeof(int) &&
+                                    sizeof(T) == sizeof(long) , const T & >::type val )
+{
+  union U {
+    long i ;
+    T t ;
+#ifdef KOKKOS_HAVE_CXX11
+    // Only the C++11 build declares a constructor for the union.
+    KOKKOS_INLINE_FUNCTION U() {}
+#endif
+  } result ;
+
+  const long expected = *((long*)&compare) ;
+  const long desired  = *((long*)&val) ;
+  result.i = __sync_val_compare_and_swap( (long*) dest , expected , desired );
+  return result.t ;
+}
+
+#if defined( KOKKOS_ENABLE_ASM) && defined ( KOKKOS_USE_ISA_X86_64 )
+// CAS for any T the size of Impl::cas128_t (and of neither int nor long):
+// the operands are viewed as Impl::cas128_t, passed to Impl::cas128, and the
+// value it returns is read back as T through a union.
+template < typename T >
+KOKKOS_INLINE_FUNCTION
+T atomic_compare_exchange( volatile T * const dest, const T & compare,
+  typename Kokkos::Impl::enable_if< sizeof(T) != sizeof(int) &&
+                                    sizeof(T) != sizeof(long) &&
+                                    sizeof(T) == sizeof(Impl::cas128_t), const T & >::type val )
+{
+  union U { Impl::cas128_t i ; T t ; KOKKOS_INLINE_FUNCTION U() {} } result ;
+  const Impl::cas128_t & expected = *((Impl::cas128_t*)&compare) ;
+  const Impl::cas128_t & desired  = *((Impl::cas128_t*)&val) ;
+  result.i = Impl::cas128( (Impl::cas128_t*) dest , expected , desired );
+  return result.t ;
+}
+#endif
+
+template < typename T >  // host fallback CAS for sizes not covered by the 4-, 8- (and, with x86-64 asm, 16-) byte overloads
+inline
+T atomic_compare_exchange( volatile T * const dest , const T compare ,
+    typename ::Kokkos::Impl::enable_if<
+                  ( sizeof(T) != 4 )
+               && ( sizeof(T) != 8 )
+            #if defined(KOKKOS_ENABLE_ASM) && defined ( KOKKOS_USE_ISA_X86_64 )
+               && ( sizeof(T) != 16 )
+            #endif
+             , const T >::type& val )
+{
+  while( !Impl::lock_address_host_space( (void*) dest ) );  // spin until the call reports success
+  T return_val = *dest;  // value seen under the lock; returned whether or not the store happens
+  if( return_val == compare ) {
+    // Don't use the following line of code here:
+    //
+    //const T tmp = *dest = val;
+    //
+    // Instead, put each assignment in its own statement.  This is
+    // because the overload of T::operator= for volatile *this should
+    // return void, not volatile T&.  See Kokkos #177:
+    //
+    // https://github.com/kokkos/kokkos/issues/177
+    *dest = val;
+    const T tmp = *dest;  // read-back of the value just stored; tmp is otherwise unused
+    #ifndef KOKKOS_COMPILER_CLANG
+    (void) tmp;
+    #endif
+  }
+  Impl::unlock_address_host_space( (void*) dest );
+  return return_val;
+}
+//----------------------------------------------------------------------------
+
+#elif defined( KOKKOS_ATOMICS_USE_OMP31 )
+
+template< typename T >
+KOKKOS_INLINE_FUNCTION
+T atomic_compare_exchange( volatile T * const dest, const T compare, const T val )
+{
+  // OpenMP 3.1 fallback: compare and conditional store share one unnamed critical section.
+  T previous;
+#pragma omp critical
+  {
+    previous = *dest;
+    if ( previous == compare ) { *dest = val; }
+  }
+  return previous;
+}
+
+#endif
+
+template <typename T>  // Boolean CAS: true when the value atomic_compare_exchange reports equals 'compare'
+KOKKOS_INLINE_FUNCTION
+bool atomic_compare_exchange_strong(volatile T* const dest, const T compare, const T val)
+{ const T observed = atomic_compare_exchange(dest, compare, val);
+  return compare == observed;
+}
+
+//----------------------------------------------------------------------------
+
+} // namespace Kokkos
+
+#endif
+
diff --git a/lib/kokkos/core/src/impl/Kokkos_Atomic_Decrement.hpp b/lib/kokkos/core/src/impl/Kokkos_Atomic_Decrement.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..1438a37e454e556832549e2137202d971b4a09ce
--- /dev/null
+++ b/lib/kokkos/core/src/impl/Kokkos_Atomic_Decrement.hpp
@@ -0,0 +1,117 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+//
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact  H. Carter Edwards (hcedwar@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#if defined( KOKKOS_ATOMIC_HPP) && ! defined( KOKKOS_ATOMIC_DECREMENT )
+#define KOKKOS_ATOMIC_DECREMENT
+
+namespace Kokkos {
+
+// Atomic decrement of a char: one locked 'decb' on x86-64, atomic_fetch_add elsewhere.
+template<>
+KOKKOS_INLINE_FUNCTION
+void atomic_decrement<char>(volatile char* a) {
+#if defined( KOKKOS_ENABLE_ASM ) && defined( KOKKOS_USE_ISA_X86_64 ) && ! defined(_WIN32) && ! defined(__CUDA_ARCH__)
+  __asm__ __volatile__(
+      "lock decb %0"
+      : "+m" (a[0])   /* read-modify-write operand, so it must be an output */
+      : /* no input registers */
+      : "memory", "cc" /* dec writes the flags */
+    );
+#else
+  Kokkos::atomic_fetch_add(a,static_cast<char>(-1)); // keep the argument type equal to the pointee type
+#endif
+}
+
+template<>  // short: one locked 'decw' on x86-64, atomic_fetch_add elsewhere
+KOKKOS_INLINE_FUNCTION
+void atomic_decrement<short>(volatile short* a) {
+#if defined( KOKKOS_ENABLE_ASM ) && defined( KOKKOS_USE_ISA_X86_64 ) && ! defined(_WIN32) && ! defined(__CUDA_ARCH__)
+  __asm__ __volatile__(
+      "lock decw %0"
+      : "+m" (a[0])   /* read-modify-write operand, so it must be an output */
+      : /* no input registers */
+      : "memory", "cc" /* dec writes the flags */
+    );
+#else
+  Kokkos::atomic_fetch_add(a,static_cast<short>(-1)); // keep the argument type equal to the pointee type
+#endif
+}
+
+template<>  // int: one locked 'decl' on x86-64, atomic_fetch_add elsewhere
+KOKKOS_INLINE_FUNCTION
+void atomic_decrement<int>(volatile int* a) {
+#if defined( KOKKOS_ENABLE_ASM ) && defined( KOKKOS_USE_ISA_X86_64 ) && ! defined(_WIN32) && ! defined(__CUDA_ARCH__)
+  __asm__ __volatile__(
+      "lock decl %0"
+      : "+m" (a[0])   /* read-modify-write operand, so it must be an output */
+      : /* no input registers */
+      : "memory", "cc" /* dec writes the flags */
+    );
+#else
+  Kokkos::atomic_fetch_add(a,static_cast<int>(-1)); // keep the argument type equal to the pointee type
+#endif
+}
+
+template<>  // long long int: one locked 'decq' on x86-64, atomic_fetch_add elsewhere
+KOKKOS_INLINE_FUNCTION
+void atomic_decrement<long long int>(volatile long long int* a) {
+#if defined( KOKKOS_ENABLE_ASM ) && defined( KOKKOS_USE_ISA_X86_64 ) && ! defined(_WIN32) && ! defined(__CUDA_ARCH__)
+  __asm__ __volatile__(
+      "lock decq %0"
+      : "+m" (a[0])   /* read-modify-write operand, so it must be an output */
+      : /* no input registers */
+      : "memory", "cc" /* dec writes the flags */
+    );
+#else
+  Kokkos::atomic_fetch_add(a,static_cast<long long int>(-1)); // keep the argument type equal to the pointee type
+#endif
+}
+
+template<typename T>  // generic fallback: add -1 converted to T, so both arguments agree on T
+KOKKOS_INLINE_FUNCTION
+void atomic_decrement(volatile T* a) {
+  Kokkos::atomic_fetch_add(a,static_cast<T>(-1));
+}
+
+} // End of namespace Kokkos
+#endif
diff --git a/lib/kokkos/core/src/impl/Kokkos_Atomic_Exchange.hpp b/lib/kokkos/core/src/impl/Kokkos_Atomic_Exchange.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..e8cac4ba3b82ba097016a3ba80b03b010a7df8c3
--- /dev/null
+++ b/lib/kokkos/core/src/impl/Kokkos_Atomic_Exchange.hpp
@@ -0,0 +1,359 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+//
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact  H. Carter Edwards (hcedwar@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#if defined( KOKKOS_ATOMIC_HPP ) && ! defined( KOKKOS_ATOMIC_EXCHANGE_HPP )
+#define KOKKOS_ATOMIC_EXCHANGE_HPP
+
+namespace Kokkos {
+
+//----------------------------------------------------------------------------
+
+#if defined( KOKKOS_ATOMICS_USE_CUDA )
+
+__inline__ __device__  // native 32-bit signed exchange; 'volatile' is cast away for atomicExch
+int atomic_exchange( volatile int * const dest , const int val )
+{
+  int * const target = const_cast< int * >( dest );
+  return atomicExch( target , val );
+}
+// Native 32-bit unsigned exchange.
+__inline__ __device__
+unsigned int atomic_exchange( volatile unsigned int * const dest , const unsigned int val )
+{
+  unsigned int * const target = const_cast< unsigned int * >( dest );
+  return atomicExch( target , val );
+}
+// Native 64-bit unsigned exchange.
+__inline__ __device__
+unsigned long long int atomic_exchange( volatile unsigned long long int * const dest , const unsigned long long int val )
+{
+  unsigned long long * const target = const_cast< unsigned long long * >( dest );
+  return atomicExch( target , val );
+}
+
+/** \brief  Atomic exchange for any type the size of int, via the native int atomicExch */
+template< typename T >
+__inline__ __device__
+T atomic_exchange(
+  volatile T * const dest ,
+  typename Kokkos::Impl::enable_if< sizeof(T) == sizeof(int) , const T & >::type val )
+{
+  const int desired = *((int*)&val);  // bit pattern of val, not a conversion
+  int previous = atomicExch( ((int*)dest) , desired );
+  return *((T*)&previous);
+}
+
+template< typename T >  // exchange for any type the size of unsigned long long (and not of int)
+__inline__ __device__
+T atomic_exchange(
+  volatile T * const dest ,
+  typename Kokkos::Impl::enable_if< sizeof(T) != sizeof(int) &&
+                                    sizeof(T) == sizeof(unsigned long long int) , const T & >::type val )
+{
+  typedef unsigned long long int word_t ;
+  const word_t desired = *((word_t*)&val);  // bit pattern of val, not a conversion
+  word_t previous = atomicExch( ((word_t*)dest) , desired );
+  return *((T*)&previous);
+}
+
+template < typename T >  // CUDA fallback exchange for T that is neither 4 nor 8 bytes wide
+__inline__ __device__
+T atomic_exchange( volatile T * const dest ,
+    typename ::Kokkos::Impl::enable_if<
+                  ( sizeof(T) != 4 )
+               && ( sizeof(T) != 8 )
+             , const T >::type& val )
+{
+  T return_val;  // value read from *dest under the lock; this is what the caller gets back
+  // This is a way to (hopefully) avoid dead lock in a warp
+  int done = 1;  // stays positive (and keeps counting up) until the lock has been obtained once
+  while ( done > 0 ) {
+    done++;
+    if( Impl::lock_address_cuda_space( (void*) dest ) ) {  // NOTE(review): presumably a non-blocking try-lock keyed on the address - confirm
+      return_val = *dest;
+      *dest = val;  // unconditional store: this is an exchange, not a compare-and-swap
+      Impl::unlock_address_cuda_space( (void*) dest );
+      done = 0;  // leaves the while loop
+    }
+  }
+  return return_val;
+}
+/** \brief  Atomic store for any type the size of int; the value atomicExch returns is discarded */
+template< typename T >
+__inline__ __device__
+void atomic_assign(
+  volatile T * const dest ,
+  typename Kokkos::Impl::enable_if< sizeof(T) == sizeof(int) , const T & >::type val )
+{
+  const int desired = *((int*)&val);  // bit pattern of val, not a conversion
+  (void) atomicExch( ((int*)dest) , desired );
+}
+
+template< typename T >  // atomic store for any type the size of unsigned long long (and not of int)
+__inline__ __device__
+void atomic_assign(
+  volatile T * const dest ,
+  typename Kokkos::Impl::enable_if< sizeof(T) != sizeof(int) &&
+                                    sizeof(T) == sizeof(unsigned long long int) , const T & >::type val )
+{
+  typedef unsigned long long int word_t ;
+  const word_t desired = *((word_t*)&val);  // bit pattern of val, not a conversion
+  (void) atomicExch( ((word_t*)dest) , desired );
+}
+
+template< typename T >  // CUDA atomic store for T sized like neither int nor unsigned long long
+__inline__ __device__
+void atomic_assign(
+  volatile T * const dest ,
+  typename Kokkos::Impl::enable_if< sizeof(T) != sizeof(int) &&
+                                    sizeof(T) != sizeof(unsigned long long int)
+                                  , const T & >::type val )
+{
+  (void) atomic_exchange(dest,val);  // forwards to atomic_exchange and discards the value it returns
+}
+
+//----------------------------------------------------------------------------
+
+#elif defined(KOKKOS_ATOMICS_USE_GCC) || defined(KOKKOS_ATOMICS_USE_INTEL)
+
+// Exchange for any T the size of int or long, built from a compare-and-swap
+// retry loop on the same-sized integer type.  Returns the value that was
+// in *dest when the swap finally succeeded.
+template< typename T >
+KOKKOS_INLINE_FUNCTION
+T atomic_exchange( volatile T * const dest ,
+  typename Kokkos::Impl::enable_if< sizeof(T) == sizeof(int) || sizeof(T) == sizeof(long)
+                                  , const T & >::type val )
+{
+  typedef typename Kokkos::Impl::if_c< sizeof(T) == sizeof(int) , int , long >::type word_t ;
+  // Copy the bits of val up front so the value doesn't change between retries.
+  const word_t desired = *((word_t*)&val);
+
+  union U {
+    T val_T ;
+    word_t val_type ;
+#ifdef KOKKOS_HAVE_CXX11
+    KOKKOS_INLINE_FUNCTION U() {}  // only the C++11 build declares a constructor
+#endif
+  } seen ;
+
+  seen.val_T = *dest ;  // first guess at the current contents
+
+  for (;;) {
+    const word_t expected = seen.val_type ;
+    seen.val_type = __sync_val_compare_and_swap( (volatile word_t *) dest , expected , desired );
+    if ( expected == seen.val_type ) break ;  // the swap went through
+  }
+
+  return seen.val_T ;
+}
+
+#if defined(KOKKOS_ENABLE_ASM) && defined ( KOKKOS_USE_ISA_X86_64 )
+// Exchange for any T the size of Impl::cas128_t: a retry loop around Impl::cas128.
+template< typename T >
+KOKKOS_INLINE_FUNCTION
+T atomic_exchange( volatile T * const dest ,
+  typename Kokkos::Impl::enable_if< sizeof(T) == sizeof(Impl::cas128_t)
+                                  , const T & >::type val )
+{
+  union U {
+    Impl::cas128_t i ;
+    T t ;
+    KOKKOS_INLINE_FUNCTION U() {}
+  } expected , seen , desired ;
+  seen.t = *dest ;      // first guess at the current contents
+  desired.t = val;
+
+  for (;;) {
+    expected.i = seen.i ;
+    seen.i = Impl::cas128( (volatile Impl::cas128_t*) dest , expected.i , desired.i );
+    if ( !( expected.i != seen.i ) ) break ;  // loop ends once operator!= reports no difference
+  }
+  return seen.t ;
+}
+#endif
+
+//----------------------------------------------------------------------------
+
+template < typename T >  // host fallback exchange for sizes not covered by the 4-, 8- (and, with x86-64 asm, 16-) byte overloads
+inline
+T atomic_exchange( volatile T * const dest ,
+    typename ::Kokkos::Impl::enable_if<
+                  ( sizeof(T) != 4 )
+               && ( sizeof(T) != 8 )
+              #if defined(KOKKOS_ENABLE_ASM) && defined ( KOKKOS_USE_ISA_X86_64 )
+               && ( sizeof(T) != 16 )
+              #endif
+                 , const T >::type& val )
+{
+  while( !Impl::lock_address_host_space( (void*) dest ) );  // spin until the call reports success
+  T return_val = *dest;  // previous contents, handed back to the caller
+  // Don't use the following line of code here:
+  //
+  //const T tmp = *dest = val;
+  //
+  // Instead, put each assignment in its own statement.  This is
+  // because the overload of T::operator= for volatile *this should
+  // return void, not volatile T&.  See Kokkos #177:
+  //
+  // https://github.com/kokkos/kokkos/issues/177
+  *dest = val;
+  const T tmp = *dest;  // read-back of the value just stored; tmp is otherwise unused
+  #ifndef KOKKOS_COMPILER_CLANG
+  (void) tmp;
+  #endif
+  Impl::unlock_address_host_space( (void*) dest );
+  return return_val;
+}
+
+// Atomic store for any T the size of int or long: a compare-and-swap retry
+// loop on the same-sized integer type; the previous value is not returned.
+template< typename T >
+KOKKOS_INLINE_FUNCTION
+void atomic_assign( volatile T * const dest ,
+  typename Kokkos::Impl::enable_if< sizeof(T) == sizeof(int) || sizeof(T) == sizeof(long)
+                                  , const T & >::type val )
+{
+  typedef typename Kokkos::Impl::if_c< sizeof(T) == sizeof(int) , int , long >::type word_t ;
+
+  // Copy the bits of val up front so the value doesn't change between retries.
+  const word_t desired = *((word_t*)&val);
+
+  union U {
+    T val_T ;
+    word_t val_type ;
+#ifdef KOKKOS_HAVE_CXX11
+    KOKKOS_INLINE_FUNCTION U() {}  // only the C++11 build declares a constructor
+#endif
+  } seen ;
+
+  seen.val_T = *dest ;  // first guess at the current contents
+
+  for (;;) {
+    const word_t expected = seen.val_type ;
+    seen.val_type = __sync_val_compare_and_swap( (volatile word_t *) dest , expected , desired );
+    if ( expected == seen.val_type ) break ;  // the swap went through
+  }
+}
+
+#if defined( KOKKOS_ENABLE_ASM ) && defined ( KOKKOS_USE_ISA_X86_64 )
+// Atomic store for any T the size of Impl::cas128_t: retry Impl::cas128 until it goes through.
+template< typename T >
+KOKKOS_INLINE_FUNCTION
+void atomic_assign( volatile T * const dest ,
+  typename Kokkos::Impl::enable_if< sizeof(T) == sizeof(Impl::cas128_t)
+                                  , const T & >::type val )
+{
+  union U {
+    Impl::cas128_t i ;
+    T t ;
+    KOKKOS_INLINE_FUNCTION U() {}
+  } expected , seen , desired ;
+  seen.t = *dest ; desired.t = val;
+  for (;;) {
+    expected.i = seen.i ;
+    seen.i = Impl::cas128( (volatile Impl::cas128_t*) dest , expected.i , desired.i);
+    if ( !( expected.i != seen.i ) ) break ;  // loop ends once operator!= reports no difference
+  }
+}
+#endif
+
+template < typename T >  // host fallback store for sizes not covered by the 4-, 8- (and, with x86-64 asm, 16-) byte overloads
+inline
+void atomic_assign( volatile T * const dest ,
+    typename ::Kokkos::Impl::enable_if<
+                  ( sizeof(T) != 4 )
+               && ( sizeof(T) != 8 )
+              #if defined(KOKKOS_ENABLE_ASM) && defined ( KOKKOS_USE_ISA_X86_64 )
+               && ( sizeof(T) != 16 )
+              #endif
+                 , const T >::type& val )
+{
+  while( !Impl::lock_address_host_space( (void*) dest ) );  // spin until the call reports success
+  // This is likely an aggregate type with a defined
+  // 'volatile T & operator = ( const T & ) volatile'
+  // member.  The volatile return value implicitly defines a
+  // dereference that some compilers (gcc 4.7.2) warn is being ignored.
+  // An earlier form silenced that with '(void)( *dest = val );'; the
+  // assignment below is written as a plain statement, without the cast.
+  *dest = val;
+
+  Impl::unlock_address_host_space( (void*) dest );
+}
+//----------------------------------------------------------------------------
+
+#elif defined( KOKKOS_ATOMICS_USE_OMP31 )
+
+template < typename T >
+KOKKOS_INLINE_FUNCTION
+T atomic_exchange( volatile T * const dest , const T val )
+{
+  // OpenMP 3.1 fallback: read-then-write inside one unnamed critical section.
+  T previous;
+  #pragma omp critical
+  {
+    previous = *dest;
+    *dest = val;
+  }
+  return previous;
+}
+
+template < typename T >
+KOKKOS_INLINE_FUNCTION
+void atomic_assign( volatile T * const dest , const T val )
+{
+  // OpenMP 3.1 fallback: the store runs inside an unnamed critical section.
+  #pragma omp critical
+  {
+    *dest = val;
+  }
+}
+
+#endif
+
+} // namespace Kokkos
+
+#endif
+
+//----------------------------------------------------------------------------
+
diff --git a/lib/kokkos/core/src/impl/Kokkos_Atomic_Fetch_Add.hpp b/lib/kokkos/core/src/impl/Kokkos_Atomic_Fetch_Add.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..62dfcdd2f88934f8d48b51e0637e9487d92c9a7e
--- /dev/null
+++ b/lib/kokkos/core/src/impl/Kokkos_Atomic_Fetch_Add.hpp
@@ -0,0 +1,340 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+//
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact  H. Carter Edwards (hcedwar@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#if defined( KOKKOS_ATOMIC_HPP ) && ! defined( KOKKOS_ATOMIC_FETCH_ADD_HPP )
+#define KOKKOS_ATOMIC_FETCH_ADD_HPP
+
+namespace Kokkos {
+
+//----------------------------------------------------------------------------
+
+#if defined( KOKKOS_ATOMICS_USE_CUDA )
+
+// Support for int, unsigned int, unsigned long long int, and float
+
+// int overload: forwards to CUDA atomicAdd and returns its result.
+__inline__ __device__ int atomic_fetch_add( volatile int * const dest , const int val )
+{ return atomicAdd( (int*) dest , val ); }
+
+// unsigned int overload: forwards to CUDA atomicAdd and returns its result.
+__inline__ __device__ unsigned int atomic_fetch_add( volatile unsigned int * const dest , const unsigned int val )
+{ return atomicAdd( (unsigned int*) dest , val ); }
+
+// unsigned long long int overload: forwards to CUDA atomicAdd and returns its result.
+__inline__ __device__ unsigned long long int
+atomic_fetch_add( volatile unsigned long long int * const dest , const unsigned long long int val )
+{ return atomicAdd( (unsigned long long int*) dest , val ); }
+
+// float overload: forwards to CUDA atomicAdd and returns its result.
+__inline__ __device__ float atomic_fetch_add( volatile float * const dest , const float val )
+{ return atomicAdd( (float*) dest , val ); }
+
+// Generic CUDA atomic_fetch_add for any T with sizeof(T) == sizeof(int).
+// The addition is carried out on T and published with atomicCAS on the
+// int-typed view of the same bytes; the pre-add value is returned.
+template < typename T >
+__inline__ __device__
+T atomic_fetch_add( volatile T * const dest ,
+  typename Kokkos::Impl::enable_if< sizeof(T) == sizeof(int) , const T >::type val )
+{
+  union U {
+    int bits ;
+    T value ;
+#ifdef KOKKOS_HAVE_CXX11
+    // NOTE(review): presumably keeps U default-constructible for any T -- confirm.
+    KOKKOS_INLINE_FUNCTION U() {};
+#endif
+  };
+  U expected , observed , desired ;
+
+  observed.value = *dest ;
+  for (;;) {
+    expected.bits = observed.bits ;
+    desired.value = expected.value + val ;
+    // atomicCAS hands back what was actually stored at dest.
+    observed.bits = atomicCAS( (int*)dest , expected.bits , desired.bits );
+    if ( expected.bits == observed.bits ) break ;
+  }
+  return observed.value ;
+}
+
+// Generic CUDA atomic_fetch_add for any T that is not int-sized but has
+// sizeof(T) == sizeof(unsigned long long int).  The addition is carried out
+// on T and published with atomicCAS; the pre-add value is returned.
+template < typename T >
+__inline__ __device__
+T atomic_fetch_add( volatile T * const dest ,
+  typename Kokkos::Impl::enable_if< sizeof(T) != sizeof(int) &&
+                                    sizeof(T) == sizeof(unsigned long long int) , const T >::type val )
+{
+  union U {
+    unsigned long long int bits ;
+    T value ;
+#ifdef KOKKOS_HAVE_CXX11
+    // NOTE(review): presumably keeps U default-constructible for any T -- confirm.
+    KOKKOS_INLINE_FUNCTION U() {};
+#endif
+  };
+  U expected , observed , desired ;
+
+  observed.value = *dest ;
+  for (;;) {
+    expected.bits = observed.bits ;
+    desired.value = expected.value + val ;
+    // atomicCAS hands back what was actually stored at dest.
+    observed.bits = atomicCAS( (unsigned long long int*)dest , expected.bits , desired.bits );
+    if ( expected.bits == observed.bits ) break ;
+  }
+  return observed.value ;
+}
+
+//----------------------------------------------------------------------------
+
+template < typename T > // CUDA fallback, enabled when sizeof(T) is neither 4 nor 8
+__inline__ __device__
+T atomic_fetch_add( volatile T * const dest ,
+    typename ::Kokkos::Impl::enable_if<
+                  ( sizeof(T) != 4 )
+               && ( sizeof(T) != 8 )
+             , const T >::type& val )
+{
+  T return_val; // receives the value read from *dest before the addition
+  // This is a way to (hopefully) avoid dead lock in a warp
+  int done = 1; // stays positive until the locked update has been carried out
+  while ( done>0 ) {
+    done++;
+    if( Impl::lock_address_cuda_space( (void*) dest ) ) { // presumably true once the lock is held -- confirm
+      return_val = *dest;
+      *dest = return_val + val;
+      Impl::unlock_address_cuda_space( (void*) dest );
+      done = 0; // ends the retry loop
+    }
+  }
+  return return_val;
+}
+//----------------------------------------------------------------------------
+
+#elif defined(KOKKOS_ATOMICS_USE_GCC) || defined(KOKKOS_ATOMICS_USE_INTEL)
+
+#if defined( KOKKOS_ENABLE_ASM ) && defined ( KOKKOS_USE_ISA_X86_64 )
+KOKKOS_INLINE_FUNCTION // int overload used when the x86-64 assembly path is enabled
+int atomic_fetch_add( volatile int * dest , const int val )
+{
+        int original = val; // goes in as the addend, comes back as the function result
+
+        __asm__ __volatile__(
+                "lock xadd %1, %0" // %0 is *dest (memory), %1 is 'original' (register)
+                : "+m" (*dest), "+r" (original) // outputs: both operands are read-write
+                : "m" (*dest), "r" (original) // inputs: the same two operands listed again
+                : "memory" // clobber list
+        );
+
+        return original;
+}
+#else
+// int overload for builds without the x86-64 assembly path: uses the __sync builtin.
+KOKKOS_INLINE_FUNCTION int atomic_fetch_add( volatile int * const dest , const int val )
+{ const int previous = __sync_fetch_and_add( dest , val ); return previous; }
+#endif
+
+// long int overload: forwards to __sync_fetch_and_add and returns its result.
+KOKKOS_INLINE_FUNCTION long int atomic_fetch_add( volatile long int * const dest , const long int val )
+{ const long int previous = __sync_fetch_and_add( dest , val ); return previous; }
+
+#if defined( KOKKOS_ATOMICS_USE_GCC )
+
+// unsigned int overload (GCC atomics only): forwards to __sync_fetch_and_add.
+KOKKOS_INLINE_FUNCTION unsigned int atomic_fetch_add( volatile unsigned int * const dest , const unsigned int val )
+{ const unsigned int previous = __sync_fetch_and_add( dest , val ); return previous; }
+
+// unsigned long int overload (GCC atomics only): forwards to __sync_fetch_and_add.
+KOKKOS_INLINE_FUNCTION unsigned long int atomic_fetch_add( volatile unsigned long int * const dest , const unsigned long int val )
+{ const unsigned long int previous = __sync_fetch_and_add( dest , val ); return previous; }
+
+#endif
+
+// Generic host atomic_fetch_add for any T with sizeof(T) == sizeof(int).
+// The addition is carried out on T and published with a compare-and-swap
+// on the int-typed view of the same bytes; the pre-add value is returned.
+template < typename T >
+KOKKOS_INLINE_FUNCTION
+T atomic_fetch_add( volatile T * const dest ,
+  typename Kokkos::Impl::enable_if< sizeof(T) == sizeof(int) , const T >::type val )
+{
+  union U {
+    int bits ;
+    T value ;
+#ifdef KOKKOS_HAVE_CXX11
+    // NOTE(review): presumably keeps U default-constructible for any T -- confirm.
+    KOKKOS_INLINE_FUNCTION U() {};
+#endif
+  };
+  U expected , observed , desired ;
+
+  observed.value = *dest ;
+  for (;;) {
+    expected.bits = observed.bits ;
+    desired.value = expected.value + val ;
+    // The builtin hands back what was actually stored at dest.
+    observed.bits = __sync_val_compare_and_swap( (int*) dest , expected.bits , desired.bits );
+    if ( expected.bits == observed.bits ) break ;
+  }
+  return observed.value ;
+}
+
+// Generic host atomic_fetch_add for any T that is not int-sized but has
+// sizeof(T) == sizeof(long).  The addition is carried out on T and published
+// with a compare-and-swap on the long-typed view; the pre-add value is returned.
+template < typename T >
+KOKKOS_INLINE_FUNCTION
+T atomic_fetch_add( volatile T * const dest ,
+  typename Kokkos::Impl::enable_if< sizeof(T) != sizeof(int) &&
+                                    sizeof(T) == sizeof(long) , const T >::type val )
+{
+  union U {
+    long bits ;
+    T value ;
+#ifdef KOKKOS_HAVE_CXX11
+    // NOTE(review): presumably keeps U default-constructible for any T -- confirm.
+    KOKKOS_INLINE_FUNCTION U() {};
+#endif
+  };
+  U expected , observed , desired ;
+
+  observed.value = *dest ;
+  for (;;) {
+    expected.bits = observed.bits ;
+    desired.value = expected.value + val ;
+    // The builtin hands back what was actually stored at dest.
+    observed.bits = __sync_val_compare_and_swap( (long*) dest , expected.bits , desired.bits );
+    if ( expected.bits == observed.bits ) break ;
+  }
+  return observed.value ;
+}
+
+#if defined( KOKKOS_ENABLE_ASM ) && defined ( KOKKOS_USE_ISA_X86_64 )
+// Host atomic_fetch_add for any T with sizeof(T) == sizeof(Impl::cas128_t)
+// that is neither int- nor long-sized: retry loop built on Impl::cas128.
+template < typename T > KOKKOS_INLINE_FUNCTION
+T atomic_fetch_add( volatile T * const dest ,
+  typename Kokkos::Impl::enable_if< sizeof(T) != sizeof(int) &&
+                                    sizeof(T) != sizeof(long) &&
+                                    sizeof(T) == sizeof(Impl::cas128_t) , const T >::type val )
+{
+  union U {
+    Impl::cas128_t bits ;
+    T value ;
+    KOKKOS_INLINE_FUNCTION U() {};
+  };
+  U expected , observed , desired ;
+
+  observed.value = *dest ;
+  do {
+    expected.bits = observed.bits ;
+    desired.value = expected.value + val ;
+    observed.bits = Impl::cas128( (volatile Impl::cas128_t*) dest , expected.bits , desired.bits );
+  } while ( expected.bits != observed.bits );
+  return observed.value ; // value held by *dest before the addition
+}
+#endif
+
+//----------------------------------------------------------------------------
+
+// Lock-based atomic_fetch_add for types whose size is neither 4 nor 8 bytes
+// (nor 16 bytes when the x86-64 assembly path is compiled in).
+// Returns the value '*dest' held before the addition.
+template < typename T >
+inline
+T atomic_fetch_add( volatile T * const dest ,
+    typename ::Kokkos::Impl::enable_if<
+                  ( sizeof(T) != 4 )
+               && ( sizeof(T) != 8 )
+              #if defined(KOKKOS_ENABLE_ASM) && defined ( KOKKOS_USE_ISA_X86_64 )
+               && ( sizeof(T) != 16 )
+              #endif
+                 , const T >::type& val )
+{
+  void * const address = (void*) dest ;
+  // Busy-wait until lock_address_host_space reports success.
+  while( !Impl::lock_address_host_space( address ) ) {}
+  T previous = *dest;
+  // Keep the store and the read-back as two separate statements rather than
+  // chaining them ('const T tmp = *dest = previous + val;'): T::operator=
+  // for volatile *this should return void, not volatile T&.  See Kokkos #177,
+  // https://github.com/kokkos/kokkos/issues/177
+  *dest = previous + val;
+  const T reread = *dest;
+  (void) reread;
+  Impl::unlock_address_host_space( address );
+  return previous;
+}
+//----------------------------------------------------------------------------
+
+#elif defined( KOKKOS_ATOMICS_USE_OMP31 )
+
+template< typename T > // OpenMP 3.1 path: fetch-and-add written as an 'omp atomic capture' block
+T atomic_fetch_add( volatile T * const dest , const T val )
+{
+  T retval; // receives the value of dest[0] from before the update
+#pragma omp atomic capture
+  { // the pragma above applies to the two statements in this block
+    retval = dest[0];
+    dest[0] += val;
+  }
+  return retval;
+}
+
+#endif
+
+//----------------------------------------------------------------------------
+
+// Simpler version of atomic_fetch_add without the fetch
+template <typename T> KOKKOS_INLINE_FUNCTION
+void atomic_add(volatile T * const dest, const T src)
+{
+  (void) atomic_fetch_add( dest , src ); // the fetched value is intentionally dropped
+}
+
+}
+#endif
+
diff --git a/lib/kokkos/core/src/impl/Kokkos_Atomic_Fetch_And.hpp b/lib/kokkos/core/src/impl/Kokkos_Atomic_Fetch_And.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..9b7ebae4ac6df12bae659e50aa7da34429ac3187
--- /dev/null
+++ b/lib/kokkos/core/src/impl/Kokkos_Atomic_Fetch_And.hpp
@@ -0,0 +1,125 @@
+/*
+//@HEADER
+// ************************************************************************
+// 
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+// 
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+// 
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact  H. Carter Edwards (hcedwar@sandia.gov)
+// 
+// ************************************************************************
+//@HEADER
+*/
+
+#if defined( KOKKOS_ATOMIC_HPP ) && ! defined( KOKKOS_ATOMIC_FETCH_AND_HPP )
+#define KOKKOS_ATOMIC_FETCH_AND_HPP
+
+namespace Kokkos {
+
+//----------------------------------------------------------------------------
+
+#if defined( KOKKOS_ATOMICS_USE_CUDA )
+
+// Support for int, unsigned int, and (when __CUDA_ARCH__ >= 350) unsigned long long int
+
+// int overload: forwards to CUDA atomicAnd and returns its result.
+__inline__ __device__ int atomic_fetch_and( volatile int * const dest , const int val )
+{ return atomicAnd( (int*) dest , val ); }
+
+// unsigned int overload: forwards to CUDA atomicAnd and returns its result.
+__inline__ __device__ unsigned int atomic_fetch_and( volatile unsigned int * const dest , const unsigned int val )
+{ return atomicAnd( (unsigned int*) dest , val ); }
+
+#if defined( __CUDA_ARCH__ ) && ( 350 <= __CUDA_ARCH__ )
+// unsigned long long int overload: forwards to CUDA atomicAnd and returns its result.
+__inline__ __device__ unsigned long long int
+atomic_fetch_and( volatile unsigned long long int * const dest , const unsigned long long int val )
+{ return atomicAnd( (unsigned long long int*) dest , val ); }
+#endif
+
+//----------------------------------------------------------------------------
+
+#elif defined(KOKKOS_ATOMICS_USE_GCC) || defined(KOKKOS_ATOMICS_USE_INTEL)
+
+// int overload: forwards to __sync_fetch_and_and and returns its result.
+KOKKOS_INLINE_FUNCTION int atomic_fetch_and( volatile int * const dest , const int val )
+{ const int previous = __sync_fetch_and_and( dest , val ); return previous; }
+
+// long int overload: forwards to __sync_fetch_and_and and returns its result.
+KOKKOS_INLINE_FUNCTION long int atomic_fetch_and( volatile long int * const dest , const long int val )
+{ const long int previous = __sync_fetch_and_and( dest , val ); return previous; }
+
+#if defined( KOKKOS_ATOMICS_USE_GCC )
+
+// unsigned int overload (GCC atomics only): forwards to __sync_fetch_and_and.
+KOKKOS_INLINE_FUNCTION unsigned int atomic_fetch_and( volatile unsigned int * const dest , const unsigned int val )
+{ const unsigned int previous = __sync_fetch_and_and( dest , val ); return previous; }
+
+// unsigned long int overload (GCC atomics only): forwards to __sync_fetch_and_and.
+KOKKOS_INLINE_FUNCTION unsigned long int atomic_fetch_and( volatile unsigned long int * const dest , const unsigned long int val )
+{ const unsigned long int previous = __sync_fetch_and_and( dest , val ); return previous; }
+
+#endif
+
+//----------------------------------------------------------------------------
+
+#elif defined( KOKKOS_ATOMICS_USE_OMP31 )
+
+template< typename T > // OpenMP 3.1 path: fetch-and-AND written as an 'omp atomic capture' block
+T atomic_fetch_and( volatile T * const dest , const T val )
+{
+  T retval; // receives the value of dest[0] from before the update
+#pragma omp atomic capture
+  { // the pragma above applies to the two statements in this block
+    retval = dest[0];
+    dest[0] &= val;
+  }
+  return retval;
+}
+
+#endif
+
+//----------------------------------------------------------------------------
+
+// Simpler version of atomic_fetch_and without the fetch
+template <typename T> KOKKOS_INLINE_FUNCTION
+void atomic_and(volatile T * const dest, const T src)
+{
+  (void) atomic_fetch_and( dest , src ); // the fetched value is intentionally dropped
+}
+
+}
+
+#endif
+
+
diff --git a/lib/kokkos/core/src/impl/Kokkos_Atomic_Fetch_Or.hpp b/lib/kokkos/core/src/impl/Kokkos_Atomic_Fetch_Or.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..f15e61a3aea2ac2e7120d88a7151390cc2bf0b73
--- /dev/null
+++ b/lib/kokkos/core/src/impl/Kokkos_Atomic_Fetch_Or.hpp
@@ -0,0 +1,125 @@
+/*
+//@HEADER
+// ************************************************************************
+// 
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+// 
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+// 
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact  H. Carter Edwards (hcedwar@sandia.gov)
+// 
+// ************************************************************************
+//@HEADER
+*/
+
+#if defined( KOKKOS_ATOMIC_HPP ) && ! defined( KOKKOS_ATOMIC_FETCH_OR_HPP )
+#define KOKKOS_ATOMIC_FETCH_OR_HPP
+
+namespace Kokkos {
+
+//----------------------------------------------------------------------------
+
+#if defined( KOKKOS_ATOMICS_USE_CUDA )
+
+// Support for int, unsigned int, and (when __CUDA_ARCH__ >= 350) unsigned long long int
+
+// int overload: forwards to CUDA atomicOr and returns its result.
+__inline__ __device__ int atomic_fetch_or( volatile int * const dest , const int val )
+{ return atomicOr( (int*) dest , val ); }
+
+// unsigned int overload: forwards to CUDA atomicOr and returns its result.
+__inline__ __device__ unsigned int atomic_fetch_or( volatile unsigned int * const dest , const unsigned int val )
+{ return atomicOr( (unsigned int*) dest , val ); }
+
+#if defined( __CUDA_ARCH__ ) && ( 350 <= __CUDA_ARCH__ )
+// unsigned long long int overload: forwards to CUDA atomicOr and returns its result.
+__inline__ __device__ unsigned long long int
+atomic_fetch_or( volatile unsigned long long int * const dest , const unsigned long long int val )
+{ return atomicOr( (unsigned long long int*) dest , val ); }
+#endif
+
+//----------------------------------------------------------------------------
+
+#elif defined(KOKKOS_ATOMICS_USE_GCC) || defined(KOKKOS_ATOMICS_USE_INTEL)
+
+// int overload: forwards to __sync_fetch_and_or and returns its result.
+KOKKOS_INLINE_FUNCTION int atomic_fetch_or( volatile int * const dest , const int val )
+{ const int previous = __sync_fetch_and_or( dest , val ); return previous; }
+
+// long int overload: forwards to __sync_fetch_and_or and returns its result.
+KOKKOS_INLINE_FUNCTION long int atomic_fetch_or( volatile long int * const dest , const long int val )
+{ const long int previous = __sync_fetch_and_or( dest , val ); return previous; }
+
+#if defined( KOKKOS_ATOMICS_USE_GCC )
+
+// unsigned int overload (GCC atomics only): forwards to __sync_fetch_and_or.
+KOKKOS_INLINE_FUNCTION unsigned int atomic_fetch_or( volatile unsigned int * const dest , const unsigned int val )
+{ const unsigned int previous = __sync_fetch_and_or( dest , val ); return previous; }
+
+// unsigned long int overload (GCC atomics only): forwards to __sync_fetch_and_or.
+KOKKOS_INLINE_FUNCTION unsigned long int atomic_fetch_or( volatile unsigned long int * const dest , const unsigned long int val )
+{ const unsigned long int previous = __sync_fetch_and_or( dest , val ); return previous; }
+
+#endif
+
+//----------------------------------------------------------------------------
+
+#elif defined( KOKKOS_ATOMICS_USE_OMP31 )
+
+template< typename T > // OpenMP 3.1 path: fetch-and-OR written as an 'omp atomic capture' block
+T atomic_fetch_or( volatile T * const dest , const T val )
+{
+  T retval; // receives the value of dest[0] from before the update
+#pragma omp atomic capture
+  { // the pragma above applies to the two statements in this block
+    retval = dest[0];
+    dest[0] |= val;
+  }
+  return retval;
+}
+
+#endif
+
+//----------------------------------------------------------------------------
+
+// Simpler version of atomic_fetch_or without the fetch
+template <typename T> KOKKOS_INLINE_FUNCTION
+void atomic_or(volatile T * const dest, const T src)
+{
+  (void) atomic_fetch_or( dest , src ); // the fetched value is intentionally dropped
+}
+
+}
+
+#endif
+
+
diff --git a/lib/kokkos/core/src/impl/Kokkos_Atomic_Fetch_Sub.hpp b/lib/kokkos/core/src/impl/Kokkos_Atomic_Fetch_Sub.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..a3a57aa81c7f303cf74fe5d8d7c6a50dc36eeb2d
--- /dev/null
+++ b/lib/kokkos/core/src/impl/Kokkos_Atomic_Fetch_Sub.hpp
@@ -0,0 +1,235 @@
+/*
+//@HEADER
+// ************************************************************************
+// 
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+// 
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+// 
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact  H. Carter Edwards (hcedwar@sandia.gov)
+// 
+// ************************************************************************
+//@HEADER
+*/
+
+#if defined( KOKKOS_ATOMIC_HPP ) && ! defined( KOKKOS_ATOMIC_FETCH_SUB_HPP )
+#define KOKKOS_ATOMIC_FETCH_SUB_HPP
+
+namespace Kokkos {
+
+//----------------------------------------------------------------------------
+
+#if defined( KOKKOS_ATOMICS_USE_CUDA )
+
+// Native support for int and unsigned int; other types are handled by the templates below
+
+// int overload: forwards to CUDA atomicSub and returns its result.
+__inline__ __device__ int atomic_fetch_sub( volatile int * const dest , const int val )
+{ return atomicSub( (int*) dest , val ); }
+
+// unsigned int overload: forwards to CUDA atomicSub and returns its result.
+__inline__ __device__ unsigned int atomic_fetch_sub( volatile unsigned int * const dest , const unsigned int val )
+{ return atomicSub( (unsigned int*) dest , val ); }
+
+// Generic CUDA atomic_fetch_sub for any T with sizeof(T) == sizeof(int):
+// a compare-and-swap loop on the int-typed view of the bytes of *dest.
+template < typename T > __inline__ __device__
+T atomic_fetch_sub( volatile T * const dest ,
+  typename Kokkos::Impl::enable_if< sizeof(T) == sizeof(int) , const T >::type val )
+{
+  union { int bits ; T value ; } expected , observed , desired ;
+
+  observed.value = *dest ;
+  for (;;) {
+    expected.bits = observed.bits ;
+    desired.value = expected.value - val ;
+    observed.bits = atomicCAS( (int*)dest , expected.bits , desired.bits );
+    if ( expected.bits == observed.bits ) break ;
+  }
+  return observed.value ; // value held by *dest before the subtraction
+}
+
+// Generic CUDA atomic_fetch_sub for any T that is not int-sized but has
+// sizeof(T) == sizeof(unsigned long long int): compare-and-swap loop.
+template < typename T > __inline__ __device__
+T atomic_fetch_sub( volatile T * const dest ,
+  typename Kokkos::Impl::enable_if< sizeof(T) != sizeof(int) &&
+                                    sizeof(T) == sizeof(unsigned long long int) , const T >::type val )
+{
+  union { unsigned long long int bits ; T value ; } expected , observed , desired ;
+
+  observed.value = *dest ;
+  for (;;) {
+    expected.bits = observed.bits ;
+    desired.value = expected.value - val ;
+    observed.bits = atomicCAS( (unsigned long long int*)dest , expected.bits , desired.bits );
+    if ( expected.bits == observed.bits ) break ;
+  }
+  return observed.value ; // value held by *dest before the subtraction
+}
+
+
+//----------------------------------------------------------------------------
+
+// CUDA fallback for sizeof(T) not 4 or 8: subtract 'val' under the address lock.
+template < typename T > __inline__ __device__
+T atomic_fetch_sub( volatile T * const dest ,
+    typename ::Kokkos::Impl::enable_if<
+                  ( sizeof(T) != 4 )
+               && ( sizeof(T) != 8 )
+             , const T >::type& val )
+{
+  T return_val; // value read from *dest before the subtraction
+  // Retry loop meant to (hopefully) avoid dead lock in a warp.
+  int done = 1; // must start positive, otherwise the loop below never runs
+  while ( done>0 ) {
+    done++;
+    if( Impl::lock_address_cuda_space( (void*) dest ) ) {
+      return_val = *dest;
+      *dest = return_val - val;
+      Impl::unlock_address_cuda_space( (void*) dest );
+      done = 0;
+    }
+  }
+  return return_val;
+}
+
+//----------------------------------------------------------------------------
+
+#elif defined(KOKKOS_ATOMICS_USE_GCC) || defined(KOKKOS_ATOMICS_USE_INTEL)
+
+// int overload: forwards to __sync_fetch_and_sub and returns its result.
+KOKKOS_INLINE_FUNCTION int atomic_fetch_sub( volatile int * const dest , const int val )
+{ const int previous = __sync_fetch_and_sub( dest , val ); return previous; }
+
+// long int overload: forwards to __sync_fetch_and_sub and returns its result.
+KOKKOS_INLINE_FUNCTION long int atomic_fetch_sub( volatile long int * const dest , const long int val )
+{ const long int previous = __sync_fetch_and_sub( dest , val ); return previous; }
+
+#if defined( KOKKOS_ATOMICS_USE_GCC )
+
+// unsigned int overload (GCC atomics only): forwards to __sync_fetch_and_sub.
+KOKKOS_INLINE_FUNCTION unsigned int atomic_fetch_sub( volatile unsigned int * const dest , const unsigned int val )
+{ const unsigned int previous = __sync_fetch_and_sub( dest , val ); return previous; }
+
+// unsigned long int overload (GCC atomics only): forwards to __sync_fetch_and_sub.
+KOKKOS_INLINE_FUNCTION unsigned long int atomic_fetch_sub( volatile unsigned long int * const dest , const unsigned long int val )
+{ const unsigned long int previous = __sync_fetch_and_sub( dest , val ); return previous; }
+
+#endif
+
+// Generic host atomic_fetch_sub for any T with sizeof(T) == sizeof(int):
+// a compare-and-swap loop on the int-typed view of the bytes of *dest.
+template < typename T > KOKKOS_INLINE_FUNCTION
+T atomic_fetch_sub( volatile T * const dest ,
+  typename Kokkos::Impl::enable_if< sizeof(T) == sizeof(int) , const T >::type val )
+{
+  union { int bits ; T value ; } expected , observed , desired ;
+
+  observed.value = *dest ;
+  for (;;) {
+    expected.bits = observed.bits ;
+    desired.value = expected.value - val ;
+    observed.bits = __sync_val_compare_and_swap( (int*) dest , expected.bits , desired.bits );
+    if ( expected.bits == observed.bits ) break ;
+  }
+  return observed.value ; // value held by *dest before the subtraction
+}
+
+// Generic host atomic_fetch_sub for any T that is not int-sized but has
+// sizeof(T) == sizeof(long): compare-and-swap loop on the long-typed view.
+template < typename T > KOKKOS_INLINE_FUNCTION
+T atomic_fetch_sub( volatile T * const dest ,
+  typename Kokkos::Impl::enable_if< sizeof(T) != sizeof(int) &&
+                                    sizeof(T) == sizeof(long) , const T >::type val )
+{
+  union { long bits ; T value ; } expected , observed , desired ;
+
+  observed.value = *dest ;
+  for (;;) {
+    expected.bits = observed.bits ;
+    desired.value = expected.value - val ;
+    observed.bits = __sync_val_compare_and_swap( (long*) dest , expected.bits , desired.bits );
+    if ( expected.bits == observed.bits ) break ;
+  }
+  return observed.value ; // value held by *dest before the subtraction
+}
+
+
+//----------------------------------------------------------------------------
+
+// Lock-based atomic_fetch_sub for sizeof(T) not 4 or 8; returns the pre-subtraction value.
+template < typename T > inline
+T atomic_fetch_sub( volatile T * const dest ,
+    typename ::Kokkos::Impl::enable_if<
+                  ( sizeof(T) != 4 )
+               && ( sizeof(T) != 8 )
+             , const T >::type& val )
+{
+  while( !Impl::lock_address_host_space( (void*) dest ) ) {} // busy-wait for the address lock
+  T previous = *dest;
+  *dest = previous - val;
+  Impl::unlock_address_host_space( (void*) dest );
+  return previous;
+}
+
+//----------------------------------------------------------------------------
+
+#elif defined( KOKKOS_ATOMICS_USE_OMP31 )
+
+template< typename T > // OpenMP 3.1 path: fetch-and-subtract written as an 'omp atomic capture' block
+T atomic_fetch_sub( volatile T * const dest , const T val )
+{
+  T retval; // receives the value of dest[0] from before the update
+#pragma omp atomic capture
+  { // the pragma above applies to the two statements in this block
+    retval = dest[0];
+    dest[0] -= val;
+  }
+  return retval;
+}
+
+#endif
+
+// Simpler version of atomic_fetch_sub without the fetch
+template <typename T> KOKKOS_INLINE_FUNCTION
+void atomic_sub(volatile T * const dest, const T src)
+{
+  (void) atomic_fetch_sub( dest , src ); // the fetched value is intentionally dropped
+}
+
+}
+
+#include<impl/Kokkos_Atomic_Assembly.hpp>
+#endif
+
+
diff --git a/lib/kokkos/core/src/impl/Kokkos_Atomic_Generic.hpp b/lib/kokkos/core/src/impl/Kokkos_Atomic_Generic.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..343e9bf4c48fa499199930ebbf9a1fb893e475da
--- /dev/null
+++ b/lib/kokkos/core/src/impl/Kokkos_Atomic_Generic.hpp
@@ -0,0 +1,419 @@
+/*
+//@HEADER
+// ************************************************************************
+// 
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+// 
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+// 
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact  H. Carter Edwards (hcedwar@sandia.gov)
+// 
+// ************************************************************************
+//@HEADER
+*/
+#if defined( KOKKOS_ATOMIC_HPP ) && ! defined( KOKKOS_ATOMIC_GENERIC_HPP )
+#define KOKKOS_ATOMIC_GENERIC_HPP
+#include <Kokkos_Macros.hpp>
+
+// Combination operands to be used in a Compare and Exchange based atomic operation
+namespace Kokkos {
+namespace Impl {
+
+template<class Scalar1, class Scalar2>
+struct MaxOper {  // Combiner for atomic max: yields the operand that compares greater, as a Scalar1.
+  KOKKOS_FORCEINLINE_FUNCTION
+  static Scalar1 apply(const Scalar1& lhs, const Scalar2& rhs) {
+    return (lhs > rhs) ? lhs : rhs;
+  }
+};
+
+template<class Scalar1, class Scalar2>
+struct MinOper {  // Combiner for atomic min: yields the operand that compares less, as a Scalar1.
+  KOKKOS_FORCEINLINE_FUNCTION
+  static Scalar1 apply(const Scalar1& lhs, const Scalar2& rhs) {
+    return (lhs < rhs) ? lhs : rhs;
+  }
+};
+
+template<class Scalar1, class Scalar2>
+struct AddOper {  // Combiner: sum of the two operands, returned as a Scalar1.
+  KOKKOS_FORCEINLINE_FUNCTION
+  static Scalar1 apply(const Scalar1& lhs, const Scalar2& rhs) {
+    return lhs + rhs;
+  }
+};
+
+template<class Scalar1, class Scalar2>
+struct SubOper {  // Combiner: first operand minus the second, returned as a Scalar1.
+  KOKKOS_FORCEINLINE_FUNCTION
+  static Scalar1 apply(const Scalar1& lhs, const Scalar2& rhs) {
+    return lhs - rhs;
+  }
+};
+
+template<class Scalar1, class Scalar2>
+struct MulOper {  // Combiner: product of the two operands, returned as a Scalar1.
+  KOKKOS_FORCEINLINE_FUNCTION
+  static Scalar1 apply(const Scalar1& lhs, const Scalar2& rhs) {
+    return lhs * rhs;
+  }
+};
+
+template<class Scalar1, class Scalar2>
+struct DivOper {  // Combiner: first operand divided by the second, returned as a Scalar1.
+  KOKKOS_FORCEINLINE_FUNCTION
+  static Scalar1 apply(const Scalar1& lhs, const Scalar2& rhs) {
+    return lhs / rhs;
+  }
+};
+
+template<class Scalar1, class Scalar2>
+struct ModOper {  // Combiner: remainder of the first operand by the second, returned as a Scalar1.
+  KOKKOS_FORCEINLINE_FUNCTION
+  static Scalar1 apply(const Scalar1& lhs, const Scalar2& rhs) {
+    return lhs % rhs;
+  }
+};
+
+template<class Scalar1, class Scalar2>
+struct AndOper {  // Combiner: bitwise AND of the two operands, returned as a Scalar1.
+  KOKKOS_FORCEINLINE_FUNCTION
+  static Scalar1 apply(const Scalar1& lhs, const Scalar2& rhs) {
+    return lhs & rhs;
+  }
+};
+
+template<class Scalar1, class Scalar2>
+struct OrOper {  // Combiner: bitwise OR of the two operands, returned as a Scalar1.
+  KOKKOS_FORCEINLINE_FUNCTION
+  static Scalar1 apply(const Scalar1& lhs, const Scalar2& rhs) {
+    return lhs | rhs;
+  }
+};
+
+template<class Scalar1, class Scalar2>
+struct XorOper {  // Combiner: bitwise XOR of the two operands, returned as a Scalar1.
+  KOKKOS_FORCEINLINE_FUNCTION
+  static Scalar1 apply(const Scalar1& lhs, const Scalar2& rhs) {
+    return lhs ^ rhs;
+  }
+};
+
+template<class Scalar1, class Scalar2>
+struct LShiftOper {  // Combiner: first operand shifted left by the second, returned as a Scalar1.
+  KOKKOS_FORCEINLINE_FUNCTION
+  static Scalar1 apply(const Scalar1& lhs, const Scalar2& rhs) {
+    return lhs << rhs;
+  }
+};
+
+template<class Scalar1, class Scalar2>
+struct RShiftOper {  // Combiner: first operand shifted right by the second, returned as a Scalar1.
+  KOKKOS_FORCEINLINE_FUNCTION
+  static Scalar1 apply(const Scalar1& lhs, const Scalar2& rhs) {
+    return lhs >> rhs;
+  }
+};
+
+template < class Oper, typename T >
+KOKKOS_INLINE_FUNCTION
+T atomic_fetch_oper( const Oper& op, volatile T * const dest ,
+  typename ::Kokkos::Impl::enable_if< sizeof(T) != sizeof(int) &&
+                                    sizeof(T) == sizeof(unsigned long long int) , const T >::type val )
+{
+  // Compare-and-exchange retry loop on the integer image of *dest; 'op' itself is not read, only Oper::apply.
+  typedef unsigned long long int word_type ;
+  union { word_type i ; T t ; } seen , expected , desired ;
+  seen.t = *dest ;
+  do {
+    expected.i = seen.i ;
+    desired.t = Oper::apply(expected.t, val) ;
+    seen.i = ::Kokkos::atomic_compare_exchange( (word_type*)dest , expected.i , desired.i );
+  } while ( expected.i != seen.i );
+  // Fetch-then-operate flavour: hand back the value that preceded our update.
+  return seen.t ;
+}
+
+template < class Oper, typename T >
+KOKKOS_INLINE_FUNCTION
+T atomic_oper_fetch( const Oper& op, volatile T * const dest ,
+  typename ::Kokkos::Impl::enable_if< sizeof(T) != sizeof(int) &&
+                                    sizeof(T) == sizeof(unsigned long long int) , const T >::type val )
+{
+  // Compare-and-exchange retry loop on the integer image of *dest; 'op' itself is not read, only Oper::apply.
+  typedef unsigned long long int word_type ;
+  union { word_type i ; T t ; } seen , expected , desired ;
+  seen.t = *dest ;
+  do {
+    expected.i = seen.i ;
+    desired.t = Oper::apply(expected.t, val) ;
+    seen.i = ::Kokkos::atomic_compare_exchange( (word_type*)dest , expected.i , desired.i );
+  } while ( expected.i != seen.i );
+  // Operate-then-fetch flavour: hand back the value computed for the final, accepted exchange.
+  return desired.t ;
+}
+
+template < class Oper, typename T >
+KOKKOS_INLINE_FUNCTION
+T atomic_fetch_oper( const Oper& op, volatile T * const dest ,
+  typename ::Kokkos::Impl::enable_if< sizeof(T) == sizeof(int) , const T >::type val )
+{
+  // Compare-and-exchange retry loop on the int-sized image of *dest; 'op' itself is not read, only Oper::apply.
+  typedef int word_type ;
+  union { word_type i ; T t ; } seen , expected , desired ;
+  seen.t = *dest ;
+  do {
+    expected.i = seen.i ;
+    desired.t = Oper::apply(expected.t, val) ;
+    seen.i = ::Kokkos::atomic_compare_exchange( (word_type*)dest , expected.i , desired.i );
+  } while ( expected.i != seen.i );
+  // Fetch-then-operate flavour: hand back the value that preceded our update.
+  return seen.t ;
+}
+
+template < class Oper, typename T >
+KOKKOS_INLINE_FUNCTION
+T atomic_oper_fetch( const Oper& op, volatile T * const dest ,
+  typename ::Kokkos::Impl::enable_if< sizeof(T) == sizeof(int), const T >::type val )
+{
+  // Compare-and-exchange retry loop on the int-sized image of *dest; 'op' itself is not read, only Oper::apply.
+  typedef int word_type ;
+  union { word_type i ; T t ; } seen , expected , desired ;
+  seen.t = *dest ;
+  do {
+    expected.i = seen.i ;
+    desired.t = Oper::apply(expected.t, val) ;
+    seen.i = ::Kokkos::atomic_compare_exchange( (word_type*)dest , expected.i , desired.i );
+  } while ( expected.i != seen.i );
+  // Operate-then-fetch flavour: hand back the value computed for the final, accepted exchange.
+  return desired.t ;
+}
+
+template < class Oper, typename T >
+KOKKOS_INLINE_FUNCTION
+T atomic_fetch_oper( const Oper& op, volatile T * const dest ,
+  typename ::Kokkos::Impl::enable_if<
+                ( sizeof(T) != 4 )
+             && ( sizeof(T) != 8 )
+          #if defined(KOKKOS_ENABLE_ASM) && defined(KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST)
+             && ( sizeof(T) != 16 )
+          #endif
+           , const T >::type val )
+{
+  // Lock-based path for T whose size is neither 4 nor 8 (nor 16 on the ASM host path); returns the prior *dest.
+#ifdef KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST
+  while( !Impl::lock_address_host_space( (void*) dest ) );  // retry until the lock call reports success
+  T return_val = *dest;
+  *dest = Oper::apply(return_val, val);
+  Impl::unlock_address_host_space( (void*) dest );
+  return return_val;
+#else
+  T return_val;  // declared before the retry loop so it is still in scope at the return below
+  int done = 1;  // This retry scheme is a way to (hopefully) avoid deadlock in a warp
+  while ( done>0 ) {
+    done++;
+    if( Impl::lock_address_cuda_space( (void*) dest ) ) {
+      return_val = *dest;
+      *dest = Oper::apply(return_val, val);
+      Impl::unlock_address_cuda_space( (void*) dest );
+      done=0;
+    }
+  }
+  return return_val;
+#endif
+}
+
+template < class Oper, typename T >
+KOKKOS_INLINE_FUNCTION
+T atomic_oper_fetch( const Oper& op, volatile T * const dest ,
+  typename ::Kokkos::Impl::enable_if<
+                ( sizeof(T) != 4 )
+             && ( sizeof(T) != 8 )
+          #if defined(KOKKOS_ENABLE_ASM) && defined(KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST)
+             && ( sizeof(T) != 16 )
+          #endif
+           , const T >::type& val )
+{
+  // Lock-based path for T whose size is neither 4 nor 8 (nor 16 on the ASM host path); returns the updated *dest.
+#ifdef KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST
+  while( !Impl::lock_address_host_space( (void*) dest ) );  // retry until the lock call reports success
+  T return_val = Oper::apply(*dest, val);
+  *dest = return_val;
+  Impl::unlock_address_host_space( (void*) dest );
+  return return_val;
+#else
+  T return_val;  // declared before the retry loop so it is still in scope at the return below
+  int done = 1;  // This retry scheme is a way to (hopefully) avoid deadlock in a warp
+  while ( done>0 ) {
+    done++;
+    if( Impl::lock_address_cuda_space( (void*) dest ) ) {
+      return_val = Oper::apply(*dest, val);
+      *dest = return_val;
+      Impl::unlock_address_cuda_space( (void*) dest );
+      done=0;
+    }
+  }
+  return return_val;
+#endif
+}
+
+}
+}
+
+namespace Kokkos {
+
+// Fetch_Oper atomics: return value before operation
+template < typename T >
+KOKKOS_INLINE_FUNCTION
+T atomic_fetch_max(volatile T * const dest, const T val) {  // *dest = max(*dest,val) via MaxOper; returns the value before the update
+  return Impl::atomic_fetch_oper(Impl::MaxOper<T,const T>(),dest,val);
+}
+
+template < typename T >
+KOKKOS_INLINE_FUNCTION
+T atomic_fetch_min(volatile T * const dest, const T val) {  // *dest = min(*dest,val) via MinOper; returns the value before the update
+  return Impl::atomic_fetch_oper(Impl::MinOper<T,const T>(),dest,val);
+}
+
+template < typename T >
+KOKKOS_INLINE_FUNCTION
+T atomic_fetch_mul(volatile T * const dest, const T val) {  // *dest = *dest * val via MulOper; returns the value before the update
+  return Impl::atomic_fetch_oper(Impl::MulOper<T,const T>(),dest,val);
+}
+
+template < typename T >
+KOKKOS_INLINE_FUNCTION
+T atomic_fetch_div(volatile T * const dest, const T val) {  // *dest = *dest / val via DivOper; returns the value before the update
+  return Impl::atomic_fetch_oper(Impl::DivOper<T,const T>(),dest,val);
+}
+
+template < typename T >
+KOKKOS_INLINE_FUNCTION
+T atomic_fetch_mod(volatile T * const dest, const T val) {  // *dest = *dest % val via ModOper; returns the value before the update
+  return Impl::atomic_fetch_oper(Impl::ModOper<T,const T>(),dest,val);
+}
+
+template < typename T >
+KOKKOS_INLINE_FUNCTION
+T atomic_fetch_and(volatile T * const dest, const T val) {  // *dest = *dest & val via AndOper; returns the value before the update
+  return Impl::atomic_fetch_oper(Impl::AndOper<T,const T>(),dest,val);
+}
+
+template < typename T >
+KOKKOS_INLINE_FUNCTION
+T atomic_fetch_or(volatile T * const dest, const T val) {  // *dest = *dest | val via OrOper; returns the value before the update
+  return Impl::atomic_fetch_oper(Impl::OrOper<T,const T>(),dest,val);
+}
+
+template < typename T >
+KOKKOS_INLINE_FUNCTION
+T atomic_fetch_xor(volatile T * const dest, const T val) {  // *dest = *dest ^ val via XorOper; returns the value before the update
+  return Impl::atomic_fetch_oper(Impl::XorOper<T,const T>(),dest,val);
+}
+
+template < typename T >
+KOKKOS_INLINE_FUNCTION
+T atomic_fetch_lshift(volatile T * const dest, const unsigned int val) {  // *dest = *dest << val via LShiftOper; returns the value before the update
+  return Impl::atomic_fetch_oper(Impl::LShiftOper<T,const unsigned int>(),dest,val);
+}
+
+template < typename T >
+KOKKOS_INLINE_FUNCTION
+T atomic_fetch_rshift(volatile T * const dest, const unsigned int val) {  // *dest = *dest >> val via RShiftOper; returns the value before the update
+  return Impl::atomic_fetch_oper(Impl::RShiftOper<T,const unsigned int>(),dest,val);
+}
+
+
+// Oper Fetch atomics: return value after operation
+template < typename T >
+KOKKOS_INLINE_FUNCTION
+T atomic_max_fetch(volatile T * const dest, const T val) {  // *dest = max(*dest,val) via MaxOper; returns the value after the update
+  return Impl::atomic_oper_fetch(Impl::MaxOper<T,const T>(),dest,val);
+}
+
+template < typename T >
+KOKKOS_INLINE_FUNCTION
+T atomic_min_fetch(volatile T * const dest, const T val) {  // *dest = min(*dest,val) via MinOper; returns the value after the update
+  return Impl::atomic_oper_fetch(Impl::MinOper<T,const T>(),dest,val);
+}
+
+template < typename T >
+KOKKOS_INLINE_FUNCTION
+T atomic_mul_fetch(volatile T * const dest, const T val) {  // *dest = *dest * val via MulOper; returns the value after the update
+  return Impl::atomic_oper_fetch(Impl::MulOper<T,const T>(),dest,val);
+}
+
+template < typename T >
+KOKKOS_INLINE_FUNCTION
+T atomic_div_fetch(volatile T * const dest, const T val) {  // *dest = *dest / val via DivOper; returns the value after the update
+  return Impl::atomic_oper_fetch(Impl::DivOper<T,const T>(),dest,val);
+}
+
+template < typename T >
+KOKKOS_INLINE_FUNCTION
+T atomic_mod_fetch(volatile T * const dest, const T val) {  // *dest = *dest % val via ModOper; returns the value after the update
+  return Impl::atomic_oper_fetch(Impl::ModOper<T,const T>(),dest,val);
+}
+
+template < typename T >
+KOKKOS_INLINE_FUNCTION
+T atomic_and_fetch(volatile T * const dest, const T val) {  // *dest = *dest & val via AndOper; returns the value after the update
+  return Impl::atomic_oper_fetch(Impl::AndOper<T,const T>(),dest,val);
+}
+
+template < typename T >
+KOKKOS_INLINE_FUNCTION
+T atomic_or_fetch(volatile T * const dest, const T val) {  // *dest = *dest | val via OrOper; returns the value after the update
+  return Impl::atomic_oper_fetch(Impl::OrOper<T,const T>(),dest,val);
+}
+
+template < typename T >
+KOKKOS_INLINE_FUNCTION
+T atomic_xor_fetch(volatile T * const dest, const T val) {  // *dest = *dest ^ val via XorOper; returns the value after the update
+  return Impl::atomic_oper_fetch(Impl::XorOper<T,const T>(),dest,val);
+}
+
+template < typename T >
+KOKKOS_INLINE_FUNCTION
+T atomic_lshift_fetch(volatile T * const dest, const unsigned int val) {  // *dest = *dest << val via LShiftOper; returns the value after the update
+  return Impl::atomic_oper_fetch(Impl::LShiftOper<T,const unsigned int>(),dest,val);
+}
+
+template < typename T >
+KOKKOS_INLINE_FUNCTION
+T atomic_rshift_fetch(volatile T * const dest, const unsigned int val) {  // *dest = *dest >> val via RShiftOper; returns the value after the update
+  return Impl::atomic_oper_fetch(Impl::RShiftOper<T,const unsigned int>(),dest,val);
+}
+
+
+}
+#endif
diff --git a/lib/kokkos/core/src/impl/Kokkos_Atomic_Increment.hpp b/lib/kokkos/core/src/impl/Kokkos_Atomic_Increment.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..eecda29f1c20524c4ff95acec646417c9160c1a6
--- /dev/null
+++ b/lib/kokkos/core/src/impl/Kokkos_Atomic_Increment.hpp
@@ -0,0 +1,117 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+//
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact  H. Carter Edwards (hcedwar@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#if defined( KOKKOS_ATOMIC_HPP) && ! defined( KOKKOS_ATOMIC_INCREMENT )
+#define KOKKOS_ATOMIC_INCREMENT
+
+namespace Kokkos {
+
+// Atomic increment
+template<>
+KOKKOS_INLINE_FUNCTION
+void atomic_increment<char>(volatile char* a) {  // atomically adds one to *a; nothing is returned
+#if defined( KOKKOS_ENABLE_ASM ) && defined( KOKKOS_USE_ISA_X86_64 ) && ! defined(_WIN32) && ! defined(__CUDA_ARCH__)
+  __asm__ __volatile__(
+      "lock incb %0"
+      : "+m" (a[0])  /* read-modify-write operand: the instruction updates it, so it is not input-only */
+      : /* no input operands */
+      : "memory"
+    );
+#else
+  Kokkos::atomic_fetch_add(a,char(1));  // typed constant: avoids a char/int mismatch if the value takes part in deduction
+#endif
+}
+
+template<>
+KOKKOS_INLINE_FUNCTION
+void atomic_increment<short>(volatile short* a) {  // atomically adds one to *a; nothing is returned
+#if defined( KOKKOS_ENABLE_ASM ) && defined( KOKKOS_USE_ISA_X86_64 ) && ! defined(_WIN32) && ! defined(__CUDA_ARCH__)
+  __asm__ __volatile__(
+      "lock incw %0"
+      : "+m" (a[0])  /* read-modify-write operand: the instruction updates it, so it is not input-only */
+      : /* no input operands */
+      : "memory"
+    );
+#else
+  Kokkos::atomic_fetch_add(a,short(1));  // typed constant: avoids a short/int mismatch if the value takes part in deduction
+#endif
+}
+
+template<>
+KOKKOS_INLINE_FUNCTION
+void atomic_increment<int>(volatile int* a) {  // atomically adds one to *a; nothing is returned
+#if defined( KOKKOS_ENABLE_ASM ) && defined( KOKKOS_USE_ISA_X86_64 ) && ! defined(_WIN32) && ! defined(__CUDA_ARCH__)
+  __asm__ __volatile__(
+      "lock incl %0"
+      : "+m" (a[0])  /* read-modify-write operand: the instruction updates it, so it is not input-only */
+      : /* no input operands */
+      : "memory"
+    );
+#else
+  Kokkos::atomic_fetch_add(a,1);
+#endif
+}
+
+template<>
+KOKKOS_INLINE_FUNCTION
+void atomic_increment<long long int>(volatile long long int* a) {  // atomically adds one to *a; nothing is returned
+#if defined( KOKKOS_ENABLE_ASM ) && defined( KOKKOS_USE_ISA_X86_64 ) && ! defined(_WIN32) && ! defined(__CUDA_ARCH__)
+  __asm__ __volatile__(
+      "lock incq %0"
+      : "+m" (a[0])  /* read-modify-write operand: the instruction updates it, so it is not input-only */
+      : /* no input operands */
+      : "memory"
+    );
+#else
+  Kokkos::atomic_fetch_add(a,1LL);  // typed constant: avoids a long long/int mismatch if the value takes part in deduction
+#endif
+}
+
+template<typename T>
+KOKKOS_INLINE_FUNCTION
+void atomic_increment(volatile T* a) {  // generic fallback: adds one to *a through atomic_fetch_add, discarding the result
+  Kokkos::atomic_fetch_add(a,T(1));  // T(1) keeps the increment the same type as *a for template deduction
+}
+
+} // End of namespace Kokkos
+#endif
diff --git a/lib/kokkos/core/src/impl/Kokkos_Atomic_View.hpp b/lib/kokkos/core/src/impl/Kokkos_Atomic_View.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..6e48faa6948e808c3460b4408ebb85a75617d035
--- /dev/null
+++ b/lib/kokkos/core/src/impl/Kokkos_Atomic_View.hpp
@@ -0,0 +1,430 @@
+/*
+//@HEADER
+// ************************************************************************
+// 
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+// 
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+// 
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact  H. Carter Edwards (hcedwar@sandia.gov)
+// 
+// ************************************************************************
+//@HEADER
+*/
+#ifndef KOKKOS_ATOMIC_VIEW_HPP
+#define KOKKOS_ATOMIC_VIEW_HPP
+
+#include <Kokkos_Macros.hpp>
+#include <Kokkos_Atomic.hpp>
+
+namespace Kokkos { namespace Impl {
+
+// Empty tag type. It is taken as an extra constructor argument so that the constructor
+// cannot be called implicitly when trying to assign a literal 0 int ( = 0 ).
+struct AtomicViewConstTag {};
+
+template<class ViewTraits>
+class AtomicDataElement {  // Proxy for one element: compound assignments go through Kokkos::atomic_* calls, other operators read *ptr.
+public:
+  typedef typename ViewTraits::value_type value_type;
+  typedef typename ViewTraits::const_value_type const_value_type;
+  typedef typename ViewTraits::non_const_value_type non_const_value_type;
+  volatile value_type* const ptr;  // address of the proxied element; every access below goes through this pointer
+
+  KOKKOS_INLINE_FUNCTION
+  AtomicDataElement(value_type* ptr_, AtomicViewConstTag ):ptr(ptr_){}  // the tag argument keeps this from being an implicit conversion
+
+  KOKKOS_INLINE_FUNCTION
+  const_value_type operator = (const_value_type& val) const {  // plain volatile store (no atomic_* call); returns val
+    *ptr = val;
+    return val;
+  }
+  KOKKOS_INLINE_FUNCTION
+  const_value_type operator = (volatile const_value_type& val) const {
+    *ptr = val;
+    return val;
+  }
+
+  KOKKOS_INLINE_FUNCTION
+  void inc() const {
+    Kokkos::atomic_increment(ptr);
+  }
+
+  KOKKOS_INLINE_FUNCTION
+  void dec() const {
+    Kokkos::atomic_decrement(ptr);
+  }
+
+  KOKKOS_INLINE_FUNCTION
+  const_value_type operator ++ () const {  // pre-increment: returns the fetched value plus one
+    const_value_type tmp = Kokkos::atomic_fetch_add(ptr,1);
+    return tmp+1;
+  }
+
+  KOKKOS_INLINE_FUNCTION
+  const_value_type operator -- () const {  // pre-decrement: returns the fetched value minus one
+    const_value_type tmp = Kokkos::atomic_fetch_add(ptr,-1);
+    return tmp-1;
+  }
+
+  KOKKOS_INLINE_FUNCTION
+  const_value_type operator ++ (int) const {  // post-increment: returns the value fetched before the add
+    return Kokkos::atomic_fetch_add(ptr,1);
+  }
+
+  KOKKOS_INLINE_FUNCTION
+  const_value_type operator -- (int) const {  // post-decrement: returns the value fetched before the add of -1
+    return Kokkos::atomic_fetch_add(ptr,-1);
+  }
+
+  KOKKOS_INLINE_FUNCTION
+  const_value_type operator += (const_value_type& val) const {  // returns fetched value + val, i.e. the result of this update
+    const_value_type tmp = Kokkos::atomic_fetch_add(ptr,val);
+    return tmp+val;
+  }
+  KOKKOS_INLINE_FUNCTION
+  const_value_type operator += (volatile const_value_type& val) const {
+    const_value_type tmp = Kokkos::atomic_fetch_add(ptr,val);
+    return tmp+val;
+  }
+
+  KOKKOS_INLINE_FUNCTION
+  const_value_type operator -= (const_value_type& val) const {  // implemented as an atomic add of -val
+    const_value_type tmp = Kokkos::atomic_fetch_add(ptr,-val);
+    return tmp-val;
+  }
+  KOKKOS_INLINE_FUNCTION
+  const_value_type operator -= (volatile const_value_type& val) const {
+    const_value_type tmp = Kokkos::atomic_fetch_add(ptr,-val);
+    return tmp-val;
+  }
+
+  KOKKOS_INLINE_FUNCTION
+  const_value_type operator *= (const_value_type& val) const {  // the remaining compound assignments forward to atomic_<op>_fetch
+    return Kokkos::atomic_mul_fetch(ptr,val);
+  }
+  KOKKOS_INLINE_FUNCTION
+  const_value_type operator *= (volatile const_value_type& val) const {
+    return Kokkos::atomic_mul_fetch(ptr,val);
+  }
+
+  KOKKOS_INLINE_FUNCTION
+  const_value_type operator /= (const_value_type& val) const {
+    return Kokkos::atomic_div_fetch(ptr,val);
+  }
+  KOKKOS_INLINE_FUNCTION
+  const_value_type operator /= (volatile const_value_type& val) const {
+    return Kokkos::atomic_div_fetch(ptr,val);
+  }
+
+  KOKKOS_INLINE_FUNCTION
+  const_value_type operator %= (const_value_type& val) const {
+    return Kokkos::atomic_mod_fetch(ptr,val);
+  }
+  KOKKOS_INLINE_FUNCTION
+  const_value_type operator %= (volatile const_value_type& val) const {
+    return Kokkos::atomic_mod_fetch(ptr,val);
+  }
+
+  KOKKOS_INLINE_FUNCTION
+  const_value_type operator &= (const_value_type& val) const {
+    return Kokkos::atomic_and_fetch(ptr,val);
+  }
+  KOKKOS_INLINE_FUNCTION
+  const_value_type operator &= (volatile const_value_type& val) const {
+    return Kokkos::atomic_and_fetch(ptr,val);
+  }
+
+  KOKKOS_INLINE_FUNCTION
+  const_value_type operator ^= (const_value_type& val) const {
+    return Kokkos::atomic_xor_fetch(ptr,val);
+  }
+  KOKKOS_INLINE_FUNCTION
+  const_value_type operator ^= (volatile const_value_type& val) const {
+    return Kokkos::atomic_xor_fetch(ptr,val);
+  }
+
+  KOKKOS_INLINE_FUNCTION
+  const_value_type operator |= (const_value_type& val) const {
+    return Kokkos::atomic_or_fetch(ptr,val);
+  }
+  KOKKOS_INLINE_FUNCTION
+  const_value_type operator |= (volatile const_value_type& val) const {
+    return Kokkos::atomic_or_fetch(ptr,val);
+  }
+
+  KOKKOS_INLINE_FUNCTION
+  const_value_type operator <<= (const_value_type& val) const {
+    return Kokkos::atomic_lshift_fetch(ptr,val);
+  }
+  KOKKOS_INLINE_FUNCTION
+  const_value_type operator <<= (volatile const_value_type& val) const {
+    return Kokkos::atomic_lshift_fetch(ptr,val);
+  }
+
+  KOKKOS_INLINE_FUNCTION
+  const_value_type operator >>= (const_value_type& val) const {
+    return Kokkos::atomic_rshift_fetch(ptr,val);
+  }
+  KOKKOS_INLINE_FUNCTION
+  const_value_type operator >>= (volatile const_value_type& val) const {
+    return Kokkos::atomic_rshift_fetch(ptr,val);
+  }
+
+  KOKKOS_INLINE_FUNCTION
+  const_value_type operator + (const_value_type& val) const {  // from here on: non-modifying operators that only read *ptr
+    return *ptr+val;
+  }
+  KOKKOS_INLINE_FUNCTION
+  const_value_type operator + (volatile const_value_type& val) const {
+    return *ptr+val;
+  }
+
+  KOKKOS_INLINE_FUNCTION
+  const_value_type operator - (const_value_type& val) const {
+    return *ptr-val;
+  }
+  KOKKOS_INLINE_FUNCTION
+  const_value_type operator - (volatile const_value_type& val) const {
+    return *ptr-val;
+  }
+
+  KOKKOS_INLINE_FUNCTION
+  const_value_type operator * (const_value_type& val) const {
+    return *ptr*val;
+  }
+  KOKKOS_INLINE_FUNCTION
+  const_value_type operator * (volatile const_value_type& val) const {
+    return *ptr*val;
+  }
+
+  KOKKOS_INLINE_FUNCTION
+  const_value_type operator / (const_value_type& val) const {
+    return *ptr/val;
+  }
+  KOKKOS_INLINE_FUNCTION
+  const_value_type operator / (volatile const_value_type& val) const {
+    return *ptr/val;
+  }
+
+  KOKKOS_INLINE_FUNCTION
+  const_value_type operator % (const_value_type& val) const {  // remainder of *ptr by val
+    return *ptr%val;
+  }
+  KOKKOS_INLINE_FUNCTION
+  const_value_type operator % (volatile const_value_type& val) const {
+    return *ptr%val;
+  }
+
+  KOKKOS_INLINE_FUNCTION
+  const_value_type operator ! () const {
+    return !*ptr;
+  }
+
+  KOKKOS_INLINE_FUNCTION
+  const_value_type operator && (const_value_type& val) const {
+    return *ptr&&val;
+  }
+  KOKKOS_INLINE_FUNCTION
+  const_value_type operator && (volatile const_value_type& val) const {
+    return *ptr&&val;
+  }
+
+  KOKKOS_INLINE_FUNCTION
+  const_value_type operator || (const_value_type& val) const {  // logical OR (the bitwise form is operator | below)
+    return *ptr||val;
+  }
+  KOKKOS_INLINE_FUNCTION
+  const_value_type operator || (volatile const_value_type& val) const {
+    return *ptr||val;
+  }
+
+  KOKKOS_INLINE_FUNCTION
+  const_value_type operator & (const_value_type& val) const {
+    return *ptr&val;
+  }
+  KOKKOS_INLINE_FUNCTION
+  const_value_type operator & (volatile const_value_type& val) const {
+    return *ptr&val;
+  }
+
+  KOKKOS_INLINE_FUNCTION
+  const_value_type operator | (const_value_type& val) const {
+    return *ptr|val;
+  }
+  KOKKOS_INLINE_FUNCTION
+  const_value_type operator | (volatile const_value_type& val) const {
+    return *ptr|val;
+  }
+
+  KOKKOS_INLINE_FUNCTION
+  const_value_type operator ^ (const_value_type& val) const {
+    return *ptr^val;
+  }
+  KOKKOS_INLINE_FUNCTION
+  const_value_type operator ^ (volatile const_value_type& val) const {
+    return *ptr^val;
+  }
+
+  KOKKOS_INLINE_FUNCTION
+  const_value_type operator ~ () const {
+    return ~*ptr;
+  }
+
+  KOKKOS_INLINE_FUNCTION
+  const_value_type operator << (const unsigned int& val) const {
+    return *ptr<<val;
+  }
+  KOKKOS_INLINE_FUNCTION
+  const_value_type operator << (volatile const unsigned int& val) const {
+    return *ptr<<val;
+  }
+
+  KOKKOS_INLINE_FUNCTION
+  const_value_type operator >> (const unsigned int& val) const {
+    return *ptr>>val;
+  }
+  KOKKOS_INLINE_FUNCTION
+  const_value_type operator >> (volatile const unsigned int& val) const {
+    return *ptr>>val;
+  }
+
+  KOKKOS_INLINE_FUNCTION
+  bool operator == (const_value_type& val) const {  // comparisons read *ptr once and return bool
+    return *ptr == val;
+  }
+  KOKKOS_INLINE_FUNCTION
+  bool operator == (volatile const_value_type& val) const {
+    return *ptr == val;
+  }
+
+  KOKKOS_INLINE_FUNCTION
+  bool operator != (const_value_type& val) const {
+    return *ptr != val;
+  }
+  KOKKOS_INLINE_FUNCTION
+  bool operator != (volatile const_value_type& val) const {
+    return *ptr != val;
+  }
+
+  KOKKOS_INLINE_FUNCTION
+  bool operator >= (const_value_type& val) const {
+    return *ptr >= val;
+  }
+  KOKKOS_INLINE_FUNCTION
+  bool operator >= (volatile const_value_type& val) const {
+    return *ptr >= val;
+  }
+
+  KOKKOS_INLINE_FUNCTION
+  bool operator <= (const_value_type& val) const {
+    return *ptr <= val;
+  }
+  KOKKOS_INLINE_FUNCTION
+  bool operator <= (volatile const_value_type& val) const {
+    return *ptr <= val;
+  }
+
+  KOKKOS_INLINE_FUNCTION
+  bool operator < (const_value_type& val) const {
+    return *ptr < val;
+  }
+  KOKKOS_INLINE_FUNCTION
+  bool operator < (volatile const_value_type& val) const {
+    return *ptr < val;
+  }
+
+  KOKKOS_INLINE_FUNCTION
+  bool operator > (const_value_type& val) const {
+    return *ptr > val;
+  }
+  KOKKOS_INLINE_FUNCTION
+  bool operator > (volatile const_value_type& val) const {
+    return *ptr > val;
+  }
+
+  KOKKOS_INLINE_FUNCTION
+  operator const_value_type () const {  // implicit read of the element through the volatile pointer
+    //return Kokkos::atomic_load(ptr);
+    return *ptr;
+  }
+
+  KOKKOS_INLINE_FUNCTION
+  operator volatile non_const_value_type () volatile const {
+    //return Kokkos::atomic_load(ptr);
+    return *ptr;
+  }
+};
+
+template<class ViewTraits>
+class AtomicViewDataHandle {  // Pointer wrapper whose operator[] hands out AtomicDataElement proxies.
+public:
+  typename ViewTraits::value_type* ptr;  // raw element storage; this class defines no destructor that releases it
+
+  // Default handle: holds a null pointer.
+  KOKKOS_INLINE_FUNCTION AtomicViewDataHandle() : ptr(NULL) {}
+
+  // Wraps existing storage that begins at 'data'.
+  KOKKOS_INLINE_FUNCTION
+  AtomicViewDataHandle(typename ViewTraits::value_type* data) : ptr(data) {}
+
+  // Element access: builds a proxy bound to the address ptr+idx (no bounds check is made here).
+  template<class iType>
+  KOKKOS_INLINE_FUNCTION
+  AtomicDataElement<ViewTraits> operator[] (const iType& idx) const {
+    return AtomicDataElement<ViewTraits>( ptr + idx ,
+                                          AtomicViewConstTag() );
+  }
+
+  // Implicit conversion back to the raw pointer.
+  KOKKOS_INLINE_FUNCTION
+  operator typename ViewTraits::value_type * () const {
+    return ptr ;
+  }
+};
+
+template<unsigned Size>
+struct Kokkos_Atomic_is_only_allowed_with_32bit_and_64bit_scalars;  // declared only: any Size other than 4 or 8 names an incomplete type
+// Size 4: 'type' is int.
+template<>
+struct Kokkos_Atomic_is_only_allowed_with_32bit_and_64bit_scalars<4> {
+  typedef int type;
+};
+// Size 8: 'type' is int64_t.
+template<>
+struct Kokkos_Atomic_is_only_allowed_with_32bit_and_64bit_scalars<8> {
+  typedef int64_t type;
+};
+
+}} // namespace Kokkos::Impl
+
+#endif
diff --git a/lib/kokkos/core/src/impl/Kokkos_Atomic_Windows.hpp b/lib/kokkos/core/src/impl/Kokkos_Atomic_Windows.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..14066e8be25f42e7a33b2f9261d90769dff6060d
--- /dev/null
+++ b/lib/kokkos/core/src/impl/Kokkos_Atomic_Windows.hpp
@@ -0,0 +1,232 @@
+/*
+//@HEADER
+// ************************************************************************
+// 
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+// 
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+// 
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact  H. Carter Edwards (hcedwar@sandia.gov)
+// 
+// ************************************************************************
+//@HEADER
+*/
+#ifndef KOKKOS_ATOMIC_WINDOWS_HPP
+#define KOKKOS_ATOMIC_WINDOWS_HPP
+#ifdef _WIN32
+
+#define NOMINMAX
+#include <winsock2.h>
+#include <Windows.h>
+
+namespace Kokkos {
+  namespace Impl {
+    // 16-byte aligned pair of 64-bit integers.  The 128-bit
+    // atomic_compare_exchange overload uses it as the integer view of a T.
+    _declspec(align(16))
+    struct cas128_t
+    {
+      LONGLONG lower;
+      LONGLONG upper;
+
+      // Two values are unequal as soon as either half differs.
+      KOKKOS_INLINE_FUNCTION
+      bool operator != (const cas128_t& a) const
+      {
+        const bool lower_same = ( lower == a.lower );
+        const bool upper_same = ( upper == a.upper );
+        return !( lower_same && upper_same );
+      }
+    };
+  }
+
+  // Compare-and-swap for types with sizeof(T) == sizeof(LONG).
+  // Passes dest, the bits of val (exchange) and the bits of compare
+  // (comparand) to _InterlockedCompareExchange and returns the intrinsic's
+  // result reinterpreted as T.
+  template < typename T >
+  KOKKOS_INLINE_FUNCTION
+    T atomic_compare_exchange(volatile T * const dest, const T & compare,
+    typename Kokkos::Impl::enable_if< sizeof(T) == sizeof(LONG), const T & >::type val)
+  {
+    // Union lets the LONG returned by the intrinsic be read back as a T.
+    union U {
+      LONG i;
+      T t;
+      KOKKOS_INLINE_FUNCTION U() {};
+    } previous;
+
+    const LONG exchange_bits  = *((LONG*)&val);
+    const LONG comparand_bits = *((LONG*)&compare);
+
+    previous.i = _InterlockedCompareExchange((LONG*)dest, exchange_bits, comparand_bits);
+    return previous.t;
+  }
+
+  // Compare-and-swap for types with sizeof(T) == sizeof(LONGLONG).
+  // Passes dest, the bits of val (exchange) and the bits of compare
+  // (comparand) to _InterlockedCompareExchange64 and returns the intrinsic's
+  // result reinterpreted as T.
+  template < typename T >
+  KOKKOS_INLINE_FUNCTION
+    T atomic_compare_exchange(volatile T * const dest, const T & compare,
+    typename Kokkos::Impl::enable_if< sizeof(T) == sizeof(LONGLONG), const T & >::type val)
+  {
+    // Union lets the LONGLONG returned by the intrinsic be read back as a T.
+    union U {
+      LONGLONG i;
+      T t;
+      KOKKOS_INLINE_FUNCTION U() {};
+    } previous;
+
+    const LONGLONG exchange_bits  = *((LONGLONG*)&val);
+    const LONGLONG comparand_bits = *((LONGLONG*)&compare);
+
+    previous.i = _InterlockedCompareExchange64((LONGLONG*)dest, exchange_bits, comparand_bits);
+    return previous.t;
+  }
+
+  // Compare-and-swap for types with sizeof(T) == sizeof(Impl::cas128_t).
+  //
+  // If *dest equals `compare`, `val` is stored into *dest.  Returns the value
+  // *dest held before the operation, matching the 32- and 64-bit overloads.
+  //
+  // _InterlockedCompareExchange128 takes the comparand through a writable
+  // pointer and overwrites it with the original destination value, so the
+  // comparand is first copied into a local buffer: the caller's const
+  // `compare` is never written to, and that buffer doubles as the result.
+  template < typename T >
+  KOKKOS_INLINE_FUNCTION
+    T atomic_compare_exchange(volatile T * const dest, const T & compare,
+    typename Kokkos::Impl::enable_if< sizeof(T) == sizeof(Impl::cas128_t), const T & >::type val)
+  {
+    union U {
+      Impl::cas128_t i;
+      T t;
+      KOKKOS_INLINE_FUNCTION U() {};
+    } tmp, newval;
+    newval.t = val;
+    tmp.t = compare;   // in: comparand, out: previous value of *dest
+    // cas128_t stores `lower` first, matching the intrinsic's {low, high}
+    // layout for the comparand/result array.
+    _InterlockedCompareExchange128((LONGLONG*)dest, newval.i.upper, newval.i.lower, (LONGLONG*)&tmp.i);
+    return tmp.t;
+  }
+
+  // Forwards to atomic_compare_exchange with the same arguments and returns
+  // its result unchanged.
+  template < typename T >
+  KOKKOS_INLINE_FUNCTION
+    T atomic_compare_exchange_strong(volatile T * const dest, const T & compare, const T & val)
+  {
+    return atomic_compare_exchange( dest , compare , val );
+  }
+
+  // Replaces *dest with (val | *dest) using a compare-and-swap retry loop.
+  // Returns the value that was in *dest when the swap succeeded, i.e. the
+  // value before the update.
+  template< typename T >
+  T atomic_fetch_or(volatile T * const dest, const T val) {
+    T observed = *dest;
+    for (;;) {
+      T expected = observed;
+      T desired = val | expected;
+      observed = atomic_compare_exchange(dest, expected, desired);
+      // Done once the value seen by the swap is the one we expected.
+      if (!(expected != observed)) break;
+    }
+    return observed;
+  }
+
+  // Replaces *dest with (val & *dest) using a compare-and-swap retry loop.
+  // Returns the value that was in *dest when the swap succeeded, i.e. the
+  // value before the update.
+  template< typename T >
+  T atomic_fetch_and(volatile T * const dest, const T val) {
+    T observed = *dest;
+    for (;;) {
+      T expected = observed;
+      T desired = val & expected;
+      observed = atomic_compare_exchange(dest, expected, desired);
+      // Done once the value seen by the swap is the one we expected.
+      if (!(expected != observed)) break;
+    }
+    return observed;
+  }
+
+  // Replaces *dest with (val + *dest) using a compare-and-swap retry loop.
+  // Returns the value that was in *dest when the swap succeeded, i.e. the
+  // value before the update.
+  template< typename T >
+  T atomic_fetch_add(volatile T * const dest, const T val) {
+    T observed = *dest;
+    for (;;) {
+      T expected = observed;
+      T desired = val + expected;
+      observed = atomic_compare_exchange(dest, expected, desired);
+      // Done once the value seen by the swap is the one we expected.
+      if (!(expected != observed)) break;
+    }
+    return observed;
+  }
+
+  // Replaces *dest with (*dest - val) using a compare-and-swap retry loop.
+  // Returns the value that was in *dest when the swap succeeded, i.e. the
+  // value before the update.
+  template< typename T >
+  T atomic_fetch_sub(volatile T * const dest, const T val) {
+    T oldval = *dest;
+    T assume;
+    do {
+      assume = oldval;
+      // Subtract val FROM the current value; the operands were previously
+      // reversed, which stored (val - *dest) instead.
+      T newval = oldval - val;
+      oldval = atomic_compare_exchange(dest, assume, newval);
+    } while (assume != oldval);
+
+    return oldval;
+  }
+
+  // Stores val into *dest using a compare-and-swap retry loop and returns the
+  // value *dest held when the swap succeeded.
+  template< typename T >
+  T atomic_exchange(volatile T * const dest, const T val) {
+    T observed = *dest;
+    for (;;) {
+      T expected = observed;
+      observed = atomic_compare_exchange(dest, expected, val);
+      // Done once the value seen by the swap is the one we expected.
+      if (!(expected != observed)) break;
+    }
+    return observed;
+  }
+
+  // Same update as atomic_fetch_or; the previous value is discarded.
+  template< typename T >
+  void atomic_or(volatile T * const dest, const T val)
+  {
+    (void) atomic_fetch_or(dest, val);
+  }
+
+  // Same update as atomic_fetch_and; the previous value is discarded.
+  template< typename T >
+  void atomic_and(volatile T * const dest, const T val)
+  {
+    (void) atomic_fetch_and(dest, val);
+  }
+
+  // Same update as atomic_fetch_add; the previous value is discarded.
+  template< typename T >
+  void atomic_add(volatile T * const dest, const T val)
+  {
+    (void) atomic_fetch_add(dest, val);
+  }
+
+  // Same update as atomic_fetch_sub; the previous value is discarded.
+  template< typename T >
+  void atomic_sub(volatile T * const dest, const T val)
+  {
+    (void) atomic_fetch_sub(dest, val);
+  }
+
+  // Atomically stores val into *dest; the previous value is discarded.
+  // Implemented with atomic_exchange from this header (the name
+  // atomic_fetch_exchange used previously is not defined in this header, so
+  // any instantiation failed to compile).
+  template< typename T >
+  void atomic_assign(volatile T * const dest, const T val) {
+    atomic_exchange(dest, val);
+  }
+
+  // Atomically increments *dest by one using a compare-and-swap retry loop.
+  // Returns the value *dest held before the increment, mirroring the
+  // atomic_fetch_* functions in this header.
+  // NOTE(review): the function previously fell off the end without returning;
+  // confirm that callers expect the pre-increment value.
+  template< typename T >
+  T atomic_increment(volatile T * const dest) {
+    T oldval = *dest;
+    T assume;
+    do {
+      assume = oldval;
+      // Build the incremented value from a copy so that `assume` stays the
+      // comparand (post-incrementing `assume` itself changed the comparand
+      // and left newval un-incremented).
+      T newval = assume;
+      ++newval;
+      oldval = atomic_compare_exchange(dest, assume, newval);
+    } while (assume != oldval);
+
+    return oldval;
+  }
+
+  // Atomically decrements *dest by one using a compare-and-swap retry loop.
+  // Returns the value *dest held before the decrement, mirroring the
+  // atomic_fetch_* functions in this header.
+  // NOTE(review): the function previously fell off the end without returning;
+  // confirm that callers expect the pre-decrement value.
+  template< typename T >
+  T atomic_decrement(volatile T * const dest) {
+    T oldval = *dest;
+    T assume;
+    do {
+      assume = oldval;
+      // Build the decremented value from a copy so that `assume` stays the
+      // comparand (post-decrementing `assume` itself changed the comparand
+      // and left newval un-decremented).
+      T newval = assume;
+      --newval;
+      oldval = atomic_compare_exchange(dest, assume, newval);
+    } while (assume != oldval);
+
+    return oldval;
+  }
+
+}
+#endif
+#endif
+
diff --git a/lib/kokkos/core/src/impl/Kokkos_BitOps.hpp b/lib/kokkos/core/src/impl/Kokkos_BitOps.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..0ffbc0548ab663c9b6afa8799f162e3c7bbd7510
--- /dev/null
+++ b/lib/kokkos/core/src/impl/Kokkos_BitOps.hpp
@@ -0,0 +1,122 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+//
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact  H. Carter Edwards (hcedwar@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#ifndef KOKKOS_BITOPS_HPP
+#define KOKKOS_BITOPS_HPP
+
+#include <Kokkos_Macros.hpp>
+#include <stdint.h>
+#include <climits>
+
+namespace Kokkos {
+namespace Impl {
+
+/// Zero-based index, counted from the least significant bit, of the lowest
+/// set bit of i.
+///
+/// Callers should pass a non-zero value: for i == 0 the GNU branch yields -1
+/// (__builtin_ffs(0) is 0) while the portable fallback yields 0.
+KOKKOS_FORCEINLINE_FUNCTION
+int bit_scan_forward( unsigned i )
+{
+#if defined( __CUDA_ARCH__ )
+  return __ffs(i) - 1;
+#elif defined( __GNUC__ ) || defined( __GNUG__ )
+  return __builtin_ffs(i) - 1;
+#elif defined( __INTEL_COMPILER )
+  return _bit_scan_forward(i);
+#else
+
+  unsigned t = 1u;
+  int r = 0;
+  // The mask test must be parenthesized: '==' binds tighter than '&', so
+  // 'i & t == 0' evaluated as 'i & (t == 0)' and the loop never advanced.
+  while ( i && ( ( i & t ) == 0 ) )
+  {
+    t = t << 1;
+    ++r;
+  }
+  return r;
+#endif
+}
+
+/// Zero-based index, counted from the least significant bit, of the highest
+/// set bit of i.
+///
+/// Callers should pass a non-zero value: __builtin_clz(0) is undefined; the
+/// portable fallback returns 0 for i == 0.
+KOKKOS_FORCEINLINE_FUNCTION
+int bit_scan_reverse( unsigned i )
+{
+  // Index of the most significant bit of an unsigned.
+  enum { shift = static_cast<int>( sizeof(unsigned) * CHAR_BIT - 1 ) };
+#if defined( __CUDA_ARCH__ )
+  return shift - __clz(i);
+#elif defined( __GNUC__ ) || defined( __GNUG__ )
+  return shift - __builtin_clz(i);
+#elif defined( __INTEL_COMPILER )
+  return _bit_scan_reverse(i);
+#else
+  unsigned t = 1u << shift;
+  int r = shift;
+  // Walk the probe bit down from the top until it hits a set bit, tracking
+  // its index so the result agrees with 'shift - clz(i)' above.  The mask
+  // test is parenthesized because '==' binds tighter than '&'.
+  while ( i && ( ( i & t ) == 0 ) )
+  {
+    t = t >> 1;
+    --r;
+  }
+  return i ? r : 0;
+#endif
+}
+
+/// Number of bits set to one in i (population count).
+KOKKOS_FORCEINLINE_FUNCTION
+int bit_count( unsigned i )
+{
+#if defined( __CUDA_ARCH__ )
+  return __popc(i);
+#elif defined( __GNUC__ ) || defined( __GNUG__ )
+  return __builtin_popcount(i);
+#elif defined ( __INTEL_COMPILER )
+  return _popcnt32(i);
+#else
+  // Parallel bit count, see
+  // http://graphics.stanford.edu/~seander/bithacks.html#CountBitsSetParallel
+  const unsigned pair_mask   = ~0u / 3u;          // 0101... pattern
+  const unsigned nibble_mask = ~0u / 15u * 3u;    // 0011... pattern
+  const unsigned byte_mask   = ~0u / 255u * 15u;  // 00001111... pattern
+  const unsigned byte_ones   = ~0u / 255u;        // 0x01 in every byte
+
+  i -= ( i >> 1 ) & pair_mask;                              // 2-bit sums
+  i  = ( i & nibble_mask ) + ( ( i >> 2 ) & nibble_mask );  // 4-bit sums
+  i  = ( i + ( i >> 4 ) ) & byte_mask;                      // 8-bit sums
+
+  // The multiply accumulates all byte sums into the top byte.
+  const unsigned top_byte_shift = ( sizeof(unsigned) - 1 ) * CHAR_BIT;
+  return (int)( ( i * byte_ones ) >> top_byte_shift );
+#endif
+}
+
+} // namespace Impl
+} // namespace Kokkos
+
+#endif // KOKKOS_BITOPS_HPP
diff --git a/lib/kokkos/core/src/impl/Kokkos_CPUDiscovery.cpp b/lib/kokkos/core/src/impl/Kokkos_CPUDiscovery.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..b9d23bd815433a0a91c282dd6e787b7d16f8b0e3
--- /dev/null
+++ b/lib/kokkos/core/src/impl/Kokkos_CPUDiscovery.cpp
@@ -0,0 +1,124 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+//
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact  H. Carter Edwards (hcedwar@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#ifdef _WIN32
+#define WIN32_LEAN_AND_MEAN
+#include <windows.h>
+#else
+#include <unistd.h>
+#endif
+#include <cstdio>
+#include <cstdlib>
+#include <cstring>
+#include <cerrno>
+
+namespace Kokkos {
+namespace Impl {
+
+//The following function (processors_per_node) is copied from here:
+// https://lists.gnu.org/archive/html/autoconf/2002-08/msg00126.html
+// Philip Willoughby
+
+int processors_per_node() {  // Returns the number of online processors, or -1 when it cannot be determined.
+  int nprocs = -1;           // online processor count; the value returned on success
+  int nprocs_max = -1;       // configured processor count; only validated below, never returned
+#ifdef _WIN32
+#ifndef _SC_NPROCESSORS_ONLN  // no sysconf on this Windows build: emulate it with GetSystemInfo
+SYSTEM_INFO info;
+GetSystemInfo(&info);
+#define sysconf(a) info.dwNumberOfProcessors  // every sysconf(...) below, whatever its argument, reads this one field
+#define _SC_NPROCESSORS_ONLN  // defined (empty) so the #ifdef below is taken; both macros stay defined for the rest of the file
+#endif
+#endif
+#ifdef _SC_NPROCESSORS_ONLN
+  nprocs = sysconf(_SC_NPROCESSORS_ONLN);
+  if (nprocs < 1)  // query failed or reported no processors
+  {
+    return -1;
+  }
+  nprocs_max = sysconf(_SC_NPROCESSORS_CONF);
+  if (nprocs_max < 1)  // a bad configured count also makes the whole query fail
+  {
+    return -1;
+  }
+  return nprocs;
+#else
+  return -1;  // no way to query the processor count on this platform
+#endif
+}
+
+// Number of MPI ranks per node as advertised through the environment.
+// The variables are consulted in the order listed; each one that is set
+// overrides the result so far, and a parsed value <= 0 is replaced by 1.
+// Returns 1 when none of the variables is set.
+int mpi_ranks_per_node() {
+  static const char* const env_names[] = {
+    "SLURM_TASKS_PER_NODE",
+    "MV2_COMM_WORLD_LOCAL_SIZE",
+    "OMPI_COMM_WORLD_LOCAL_SIZE"
+  };
+  const unsigned env_count = sizeof(env_names) / sizeof(env_names[0]);
+
+  int ppn = 1;
+  for (unsigned k = 0; k < env_count; ++k) {
+    const char* const text = getenv(env_names[k]);
+    if (!text) continue;
+    const int parsed = atoi(text);
+    ppn = (parsed <= 0) ? 1 : parsed;
+  }
+  return ppn;
+}
+
+// Node-local MPI rank as advertised through the environment.
+// The variables are consulted in the order listed; each one that is set
+// overrides the result so far with atoi() of its value.
+// Returns 0 when none of the variables is set.
+int mpi_local_rank_on_node() {
+  static const char* const env_names[] = {
+    "SLURM_LOCALID",
+    "MV2_COMM_WORLD_LOCAL_RANK",
+    "OMPI_COMM_WORLD_LOCAL_RANK"
+  };
+  const unsigned env_count = sizeof(env_names) / sizeof(env_names[0]);
+
+  int local_rank = 0;
+  for (unsigned k = 0; k < env_count; ++k) {
+    const char* const text = getenv(env_names[k]);
+    if (text) {
+      local_rank = atoi(text);
+    }
+  }
+  return local_rank;
+}
+
+}
+}
diff --git a/lib/kokkos/core/src/impl/Kokkos_CPUDiscovery.hpp b/lib/kokkos/core/src/impl/Kokkos_CPUDiscovery.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..af474bc40617968fbc87fb2cf6b70e1bec0d42f5
--- /dev/null
+++ b/lib/kokkos/core/src/impl/Kokkos_CPUDiscovery.hpp
@@ -0,0 +1,51 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+//
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact  H. Carter Edwards (hcedwar@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+namespace Kokkos {
+namespace Impl {
+
+int processors_per_node();
+int mpi_ranks_per_node();
+int mpi_local_rank_on_node();
+
+}
+}
diff --git a/lib/kokkos/core/src/impl/Kokkos_Core.cpp b/lib/kokkos/core/src/impl/Kokkos_Core.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..567a2141405719e3331b2327ca40097c24af775a
--- /dev/null
+++ b/lib/kokkos/core/src/impl/Kokkos_Core.cpp
@@ -0,0 +1,454 @@
+/*
+//@HEADER
+// ************************************************************************
+// 
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+// 
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+// 
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact  H. Carter Edwards (hcedwar@sandia.gov)
+// 
+// ************************************************************************
+//@HEADER
+*/
+
+#include <Kokkos_Core.hpp>
+#include <impl/Kokkos_Error.hpp>
+#include <cctype>
+#include <cstring>
+#include <iostream>
+#include <cstdlib>
+
+//----------------------------------------------------------------------------
+
+namespace Kokkos {
+namespace Impl {
+namespace {
+
+// Returns true when every character of the NUL-terminated string `str` is a
+// decimal digit.  An empty string therefore yields true; callers that need
+// at least one digit must check the length themselves.
+bool is_unsigned_int(const char* str)
+{
+  for (const char* p = str; *p != '\0'; ++p) {
+    // isdigit() requires a value representable as unsigned char (or EOF);
+    // passing a negative plain char is undefined behavior, hence the cast.
+    if (! isdigit (static_cast<unsigned char>(*p))) {
+      return false;
+    }
+  }
+  return true;
+}
+
+void initialize_internal(const InitArguments& args)  // Initializes the compiled-in execution spaces selected below from args.num_threads, args.num_numa and args.device_id.
+{
+// This is an experimental setting
+// For KNL in Flat mode this variable should be set, so that
+// memkind allocates high bandwidth memory correctly.
+#ifdef KOKKOS_HAVE_HBWSPACE
+setenv("MEMKIND_HBW_NODES", "1", 0);  // final argument 0: a value already present in the environment is not replaced (POSIX setenv)
+#endif
+
+  // Protect declarations, to prevent "unused variable" warnings.
+#if defined( KOKKOS_HAVE_OPENMP ) || defined( KOKKOS_HAVE_PTHREAD )
+  const int num_threads = args.num_threads;
+  const int use_numa = args.num_numa;
+#endif // defined( KOKKOS_HAVE_OPENMP ) || defined( KOKKOS_HAVE_PTHREAD )
+#if defined( KOKKOS_HAVE_CUDA )
+  const int use_gpu = args.device_id;
+#endif // defined( KOKKOS_HAVE_CUDA )
+
+#if defined( KOKKOS_HAVE_OPENMP )
+  if( Impl::is_same< Kokkos::OpenMP , Kokkos::DefaultExecutionSpace >::value ||  // OpenMP is initialized only when it is the default or the host execution space
+      Impl::is_same< Kokkos::OpenMP , Kokkos::HostSpace::execution_space >::value ) {
+    if(num_threads>0) {  // a positive thread count was requested
+      if(use_numa>0) {  // a positive NUMA count is forwarded only together with a thread count
+        Kokkos::OpenMP::initialize(num_threads,use_numa);
+      }
+      else {
+        Kokkos::OpenMP::initialize(num_threads);
+      }
+    } else {  // no thread count given: use the backend's defaults (num_numa is ignored in this case)
+      Kokkos::OpenMP::initialize();
+    }
+    //std::cout << "Kokkos::initialize() fyi: OpenMP enabled and initialized" << std::endl ;
+  }
+  else {
+    //std::cout << "Kokkos::initialize() fyi: OpenMP enabled but not initialized" << std::endl ;
+  }
+#endif
+
+#if defined( KOKKOS_HAVE_PTHREAD )
+  if( Impl::is_same< Kokkos::Threads , Kokkos::DefaultExecutionSpace >::value ||  // Threads is initialized only when it is the default or the host execution space
+      Impl::is_same< Kokkos::Threads , Kokkos::HostSpace::execution_space >::value ) {
+    if(num_threads>0) {  // a positive thread count was requested
+      if(use_numa>0) {  // a positive NUMA count is forwarded only together with a thread count
+        Kokkos::Threads::initialize(num_threads,use_numa);
+      }
+      else {
+        Kokkos::Threads::initialize(num_threads);
+      }
+    } else {  // no thread count given: use the backend's defaults (num_numa is ignored in this case)
+      Kokkos::Threads::initialize();
+    }
+    //std::cout << "Kokkos::initialize() fyi: Pthread enabled and initialized" << std::endl ;
+  }
+  else {
+    //std::cout << "Kokkos::initialize() fyi: Pthread enabled but not initialized" << std::endl ;
+  }
+#endif
+
+#if defined( KOKKOS_HAVE_SERIAL )
+  // Prevent "unused variable" warning for 'args' input struct.  If
+  // Serial::initialize() ever needs to take arguments from the input
+  // struct, you may remove this line of code.
+  (void) args;
+
+  if( Impl::is_same< Kokkos::Serial , Kokkos::DefaultExecutionSpace >::value ||  // Serial is initialized only when it is the default or the host execution space
+      Impl::is_same< Kokkos::Serial , Kokkos::HostSpace::execution_space >::value ) {
+    Kokkos::Serial::initialize();
+  }
+#endif
+
+#if defined( KOKKOS_HAVE_CUDA )
+  if( Impl::is_same< Kokkos::Cuda , Kokkos::DefaultExecutionSpace >::value || 0 < use_gpu ) {  // NOTE(review): device_id == 0 does not satisfy '0 < use_gpu', so when Cuda is not the default space a request for device 0 skips this block - confirm intended
+    if (use_gpu > -1) {  // non-negative id: select that device explicitly
+      Kokkos::Cuda::initialize( Kokkos::Cuda::SelectDevice( use_gpu ) );
+    }
+    else {  // negative id: let Cuda::initialize() use its default arguments
+      Kokkos::Cuda::initialize();
+    }
+    //std::cout << "Kokkos::initialize() fyi: Cuda enabled and initialized" << std::endl ;
+  }
+#endif
+
+#if (KOKKOS_ENABLE_PROFILING)  // tested by value, not with defined(): the macro must expand to a non-zero expression to enable profiling
+    Kokkos::Profiling::initialize();
+#endif
+}
+
+// Finalizes the compiled-in execution spaces in the order Cuda, OpenMP,
+// Threads, Serial, followed by profiling when enabled.
+//
+// A space is considered when it is the default execution space (for the host
+// backends: also when it is the host execution space) or when all_spaces is
+// true; it is finalized only if it reports being initialized.
+void finalize_internal( const bool all_spaces = false )
+{
+
+#if defined( KOKKOS_HAVE_CUDA )
+  {
+    const bool cuda_selected = all_spaces ||
+      Impl::is_same< Kokkos::Cuda , Kokkos::DefaultExecutionSpace >::value ;
+    if( cuda_selected && Kokkos::Cuda::is_initialized() ) {
+      Kokkos::Cuda::finalize();
+    }
+  }
+#endif
+
+#if defined( KOKKOS_HAVE_OPENMP )
+  {
+    const bool openmp_selected = all_spaces ||
+      Impl::is_same< Kokkos::OpenMP , Kokkos::DefaultExecutionSpace >::value ||
+      Impl::is_same< Kokkos::OpenMP , Kokkos::HostSpace::execution_space >::value ;
+    if( openmp_selected && Kokkos::OpenMP::is_initialized() ) {
+      Kokkos::OpenMP::finalize();
+    }
+  }
+#endif
+
+#if defined( KOKKOS_HAVE_PTHREAD )
+  {
+    const bool threads_selected = all_spaces ||
+      Impl::is_same< Kokkos::Threads , Kokkos::DefaultExecutionSpace >::value ||
+      Impl::is_same< Kokkos::Threads , Kokkos::HostSpace::execution_space >::value ;
+    if( threads_selected && Kokkos::Threads::is_initialized() ) {
+      Kokkos::Threads::finalize();
+    }
+  }
+#endif
+
+#if defined( KOKKOS_HAVE_SERIAL )
+  {
+    const bool serial_selected = all_spaces ||
+      Impl::is_same< Kokkos::Serial , Kokkos::DefaultExecutionSpace >::value ||
+      Impl::is_same< Kokkos::Serial , Kokkos::HostSpace::execution_space >::value ;
+    if( serial_selected && Kokkos::Serial::is_initialized() ) {
+      Kokkos::Serial::finalize();
+    }
+  }
+#endif
+
+#if (KOKKOS_ENABLE_PROFILING)
+    Kokkos::Profiling::finalize();
+#endif
+
+}
+
+// Calls fence() on the compiled-in execution spaces in the order Cuda,
+// OpenMP, Threads, Serial.  Cuda is fenced only when it is the default
+// execution space; each host backend is fenced when it is the default or the
+// host execution space.
+void fence_internal()
+{
+
+#if defined( KOKKOS_HAVE_CUDA )
+  {
+    const bool cuda_selected =
+      Impl::is_same< Kokkos::Cuda , Kokkos::DefaultExecutionSpace >::value ;
+    if( cuda_selected ) { Kokkos::Cuda::fence(); }
+  }
+#endif
+
+#if defined( KOKKOS_HAVE_OPENMP )
+  {
+    const bool openmp_selected =
+      Impl::is_same< Kokkos::OpenMP , Kokkos::DefaultExecutionSpace >::value ||
+      Impl::is_same< Kokkos::OpenMP , Kokkos::HostSpace::execution_space >::value ;
+    if( openmp_selected ) { Kokkos::OpenMP::fence(); }
+  }
+#endif
+
+#if defined( KOKKOS_HAVE_PTHREAD )
+  {
+    const bool threads_selected =
+      Impl::is_same< Kokkos::Threads , Kokkos::DefaultExecutionSpace >::value ||
+      Impl::is_same< Kokkos::Threads , Kokkos::HostSpace::execution_space >::value ;
+    if( threads_selected ) { Kokkos::Threads::fence(); }
+  }
+#endif
+
+#if defined( KOKKOS_HAVE_SERIAL )
+  {
+    const bool serial_selected =
+      Impl::is_same< Kokkos::Serial , Kokkos::DefaultExecutionSpace >::value ||
+      Impl::is_same< Kokkos::Serial , Kokkos::HostSpace::execution_space >::value ;
+    if( serial_selected ) { Kokkos::Serial::fence(); }
+  }
+#endif
+
+}
+
+} // namespace
+} // namespace Impl
+} // namespace Kokkos
+
+//----------------------------------------------------------------------------
+
+namespace Kokkos {
+
+void initialize(int& narg, char* arg[])
+{
+    int num_threads = -1;
+    int numa = -1;
+    int device = -1;
+
+    int kokkos_threads_found = 0;
+    int kokkos_numa_found = 0;
+    int kokkos_device_found = 0;
+    int kokkos_ndevices_found = 0;
+
+    int iarg = 0;
+
+    while (iarg < narg) {
+      if ((strncmp(arg[iarg],"--kokkos-threads",16) == 0) || (strncmp(arg[iarg],"--threads",9) == 0)) {
+        //Find the number of threads (expecting --threads=XX)
+        if (!((strncmp(arg[iarg],"--kokkos-threads=",17) == 0) || (strncmp(arg[iarg],"--threads=",10) == 0)))
+          Impl::throw_runtime_exception("Error: expecting an '=INT' after command line argument '--threads/--kokkos-threads'. Raised by Kokkos::initialize(int narg, char* argc[]).");
+
+        char* number =  strchr(arg[iarg],'=')+1;
+
+        if(!Impl::is_unsigned_int(number) || (strlen(number)==0))
+          Impl::throw_runtime_exception("Error: expecting an '=INT' after command line argument '--threads/--kokkos-threads'. Raised by Kokkos::initialize(int narg, char* argc[]).");
+
+        if((strncmp(arg[iarg],"--kokkos-threads",16) == 0) || !kokkos_threads_found)
+          num_threads = atoi(number);
+
+        //Remove the --kokkos-threads argument from the list but leave --threads
+        if(strncmp(arg[iarg],"--kokkos-threads",16) == 0) {
+          for(int k=iarg;k<narg-1;k++) {
+            arg[k] = arg[k+1];
+          }
+          kokkos_threads_found=1;
+          narg--;
+        } else {
+          iarg++;
+        }
+      } else if ((strncmp(arg[iarg],"--kokkos-numa",13) == 0) || (strncmp(arg[iarg],"--numa",6) == 0)) {
+        //Find the number of numa (expecting --numa=XX)
+        if (!((strncmp(arg[iarg],"--kokkos-numa=",14) == 0) || (strncmp(arg[iarg],"--numa=",7) == 0)))
+          Impl::throw_runtime_exception("Error: expecting an '=INT' after command line argument '--numa/--kokkos-numa'. Raised by Kokkos::initialize(int narg, char* argc[]).");
+
+        char* number =  strchr(arg[iarg],'=')+1;
+
+        if(!Impl::is_unsigned_int(number) || (strlen(number)==0))
+          Impl::throw_runtime_exception("Error: expecting an '=INT' after command line argument '--numa/--kokkos-numa'. Raised by Kokkos::initialize(int narg, char* argc[]).");
+
+        if((strncmp(arg[iarg],"--kokkos-numa",13) == 0) || !kokkos_numa_found)
+          numa = atoi(number);
+
+        //Remove the --kokkos-numa argument from the list but leave --numa
+        if(strncmp(arg[iarg],"--kokkos-numa",13) == 0) {
+          for(int k=iarg;k<narg-1;k++) {
+            arg[k] = arg[k+1];
+          }
+          kokkos_numa_found=1;
+          narg--;
+        } else {
+          iarg++;
+        }
+      } else if ((strncmp(arg[iarg],"--kokkos-device",15) == 0) || (strncmp(arg[iarg],"--device",8) == 0)) {
+        //Find the number of device (expecting --device=XX)
+        if (!((strncmp(arg[iarg],"--kokkos-device=",16) == 0) || (strncmp(arg[iarg],"--device=",9) == 0)))
+          Impl::throw_runtime_exception("Error: expecting an '=INT' after command line argument '--device/--kokkos-device'. Raised by Kokkos::initialize(int narg, char* argc[]).");
+
+        char* number =  strchr(arg[iarg],'=')+1;
+
+        if(!Impl::is_unsigned_int(number) || (strlen(number)==0))
+          Impl::throw_runtime_exception("Error: expecting an '=INT' after command line argument '--device/--kokkos-device'. Raised by Kokkos::initialize(int narg, char* argc[]).");
+
+        if((strncmp(arg[iarg],"--kokkos-device",15) == 0) || !kokkos_device_found)
+          device = atoi(number);
+
+        //Remove the --kokkos-device argument from the list but leave --device
+        if(strncmp(arg[iarg],"--kokkos-device",15) == 0) {
+          for(int k=iarg;k<narg-1;k++) {
+            arg[k] = arg[k+1];
+          }
+          kokkos_device_found=1;
+          narg--;
+        } else {
+          iarg++;
+        }
+      } else if ((strncmp(arg[iarg],"--kokkos-ndevices",17) == 0) || (strncmp(arg[iarg],"--ndevices",10) == 0)) {
+
+        //Find the number of device (expecting --device=XX)
+        if (!((strncmp(arg[iarg],"--kokkos-ndevices=",18) == 0) || (strncmp(arg[iarg],"--ndevices=",11) == 0)))
+          Impl::throw_runtime_exception("Error: expecting an '=INT[,INT]' after command line argument '--ndevices/--kokkos-ndevices'. Raised by Kokkos::initialize(int narg, char* argc[]).");
+
+        int ndevices=-1;
+        int skip_device = 9999;
+
+        char* num1 = strchr(arg[iarg],'=')+1;
+        char* num2 = strpbrk(num1,",");
+        int num1_len = num2==NULL?strlen(num1):num2-num1;
+        char* num1_only = new char[num1_len+1];
+        strncpy(num1_only,num1,num1_len);
+        num1_only[num1_len]=0;
+
+        if(!Impl::is_unsigned_int(num1_only) || (strlen(num1_only)==0)) {
+          Impl::throw_runtime_exception("Error: expecting an integer number after command line argument '--kokkos-ndevices'. Raised by Kokkos::initialize(int narg, char* argc[]).");
+        }
+        if((strncmp(arg[iarg],"--kokkos-ndevices",17) == 0) || !kokkos_ndevices_found)
+          ndevices = atoi(num1_only);
+
+        if( num2 != NULL ) {
+          if(( !Impl::is_unsigned_int(num2+1) ) || (strlen(num2)==1) )
+            Impl::throw_runtime_exception("Error: expecting an integer number after command line argument '--kokkos-ndevices=XX,'. Raised by Kokkos::initialize(int narg, char* argc[]).");
+
+          if((strncmp(arg[iarg],"--kokkos-ndevices",17) == 0) || !kokkos_ndevices_found)
+            skip_device = atoi(num2+1);
+        }
+
+        if((strncmp(arg[iarg],"--kokkos-ndevices",17) == 0) || !kokkos_ndevices_found) {
+          char *str;
+          if ((str = getenv("SLURM_LOCALID"))) {
+            int local_rank = atoi(str);
+            device = local_rank % ndevices;
+            if (device >= skip_device) device++;
+          }
+          if ((str = getenv("MV2_COMM_WORLD_LOCAL_RANK"))) {
+            int local_rank = atoi(str);
+            device = local_rank % ndevices;
+            if (device >= skip_device) device++;
+          }
+          if ((str = getenv("OMPI_COMM_WORLD_LOCAL_RANK"))) {
+            int local_rank = atoi(str);
+            device = local_rank % ndevices;
+            if (device >= skip_device) device++;
+          }
+          if(device==-1) {
+            device = 0;
+            if (device >= skip_device) device++;
+          }
+        }
+
+        //Remove the --kokkos-ndevices argument from the list but leave --ndevices
+        if(strncmp(arg[iarg],"--kokkos-ndevices",17) == 0) {
+          for(int k=iarg;k<narg-1;k++) {
+            arg[k] = arg[k+1];
+          }
+          kokkos_ndevices_found=1;
+          narg--;
+        } else {
+          iarg++;
+        }
+      } else if ((strcmp(arg[iarg],"--kokkos-help") == 0) || (strcmp(arg[iarg],"--help") == 0)) {
+         std::cout << std::endl;
+         std::cout << "--------------------------------------------------------------------------------" << std::endl;
+         std::cout << "-------------Kokkos command line arguments--------------------------------------" << std::endl;
+         std::cout << "--------------------------------------------------------------------------------" << std::endl;
+         std::cout << "The following arguments exist also without prefix 'kokkos' (e.g. --help)." << std::endl;
+         std::cout << "The prefixed arguments will be removed from the list by Kokkos::initialize()," << std::endl;
+         std::cout << "the non-prefixed ones are not removed. Prefixed versions take precedence over " << std::endl;
+         std::cout << "non prefixed ones, and the last occurence of an argument overwrites prior" << std::endl;
+         std::cout << "settings." << std::endl;
+         std::cout << std::endl;
+         std::cout << "--kokkos-help               : print this message" << std::endl;
+         std::cout << "--kokkos-threads=INT        : specify total number of threads or" << std::endl;
+         std::cout << "                              number of threads per NUMA region if " << std::endl;
+         std::cout << "                              used in conjunction with '--numa' option. " << std::endl;
+         std::cout << "--kokkos-numa=INT           : specify number of NUMA regions used by process." << std::endl;
+         std::cout << "--kokkos-device=INT         : specify device id to be used by Kokkos. " << std::endl;
+         std::cout << "--kokkos-ndevices=INT[,INT] : used when running MPI jobs. Specify number of" << std::endl;
+         std::cout << "                              devices per node to be used. Process to device" << std::endl;
+         std::cout << "                              mapping happens by obtaining the local MPI rank" << std::endl;
+         std::cout << "                              and assigning devices round-robin. The optional" << std::endl;
+         std::cout << "                              second argument allows for an existing device" << std::endl;
+         std::cout << "                              to be ignored. This is most useful on workstations" << std::endl;
+         std::cout << "                              with multiple GPUs of which one is used to drive" << std::endl;
+         std::cout << "                              screen output." << std::endl;
+         std::cout << std::endl;
+         std::cout << "--------------------------------------------------------------------------------" << std::endl;
+         std::cout << std::endl;
+
+         //Remove the --kokkos-help argument from the list but leave --ndevices
+         if(strcmp(arg[iarg],"--kokkos-help") == 0) {
+           for(int k=iarg;k<narg-1;k++) {
+             arg[k] = arg[k+1];
+           }
+           narg--;
+         } else {
+           iarg++;
+         }
+      } else
+      iarg++;
+    }
+
+    InitArguments arguments;
+    arguments.num_threads = num_threads;
+    arguments.num_numa = numa;
+    arguments.device_id = device;
+    Impl::initialize_internal(arguments);
+}
+
+// Initialize Kokkos from a caller-populated InitArguments structure.
+// The structure is forwarded unchanged to Impl::initialize_internal().
+void initialize(const InitArguments& arguments) {
+  Impl::initialize_internal(arguments);
+}
+
+// Shut Kokkos down by calling Impl::finalize_internal() with no arguments.
+void finalize()
+{
+  Impl::finalize_internal();
+}
+
+// Shut Kokkos down by calling Impl::finalize_internal() with a 'true' flag.
+// NOTE(review): the flag name suggests it requests finalization of every
+// space rather than only the default ones -- confirm against
+// Impl::finalize_internal.
+void finalize_all()
+{
+  enum { all_spaces = true };  // named constant so the call site is self-describing
+  Impl::finalize_internal( all_spaces );
+}
+
+// Public fence entry point; all work is delegated to Impl::fence_internal().
+void fence()
+{
+  Impl::fence_internal();
+}
+
+} // namespace Kokkos
+
diff --git a/lib/kokkos/core/src/impl/Kokkos_Error.cpp b/lib/kokkos/core/src/impl/Kokkos_Error.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..36224990d048c2e0394889390cfe78cf826a5fdc
--- /dev/null
+++ b/lib/kokkos/core/src/impl/Kokkos_Error.cpp
@@ -0,0 +1,193 @@
+/*
+//@HEADER
+// ************************************************************************
+// 
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+// 
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+// 
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact  H. Carter Edwards (hcedwar@sandia.gov)
+// 
+// ************************************************************************
+//@HEADER
+*/
+
+#include <stdio.h>
+#include <string.h>
+#include <stdlib.h>
+
+#include <ostream>
+#include <sstream>
+#include <iomanip>
+#include <stdexcept>
+#include <impl/Kokkos_Error.hpp>
+
+//----------------------------------------------------------------------------
+//----------------------------------------------------------------------------
+
+namespace Kokkos {
+namespace Impl {
+
+// Write 'message' to stderr (no newline is appended), flush stderr,
+// and terminate the process with ::abort().  This function never returns.
+//
+// A null 'message' is tolerated: nothing is written, but the process is
+// still aborted.  Previously a null pointer reached strlen(), which is
+// undefined behavior and could crash before the abort was reported.
+void host_abort( const char * const message )
+{
+  if ( message ) {
+    fputs(message,stderr);
+  }
+  fflush(stderr);
+  ::abort();
+}
+
+// Throw a std::runtime_error whose what() text is 'msg' followed by
+// whatever traceback_callstack() appends to the stream.
+// Always throws; never returns normally.
+void throw_runtime_exception( const std::string & msg )
+{
+  std::ostringstream report ;
+
+  report << msg ;
+  traceback_callstack( report );
+
+  const std::string text = report.str();
+  throw std::runtime_error( text );
+}
+
+
+// Format a byte count as a short human-readable string.
+//
+// The count is scaled by powers of 1024 and suffixed with " B", " K",
+// " M" or " G".  " G" is the largest unit, so larger counts are still
+// reported in G.  The number is printed with std::setprecision(4) in the
+// stream's default floating-point notation.
+std::string human_memory_size(size_t arg_bytes)
+{
+  const char * const suffixes[] = { " B" , " K" , " M" , " G" };
+  enum { LAST_UNIT = 3 };
+
+  double scaled = arg_bytes;
+  int unit = 0;
+
+  // Dividing by 1024 is exact in binary floating point, so stepping one
+  // unit at a time gives the same value as a single larger division.
+  while ( unit < LAST_UNIT && scaled >= 1024 ) {
+    scaled /= 1024;
+    ++unit;
+  }
+
+  std::ostringstream out;
+  out << std::setprecision(4) << scaled << suffixes[unit];
+  return out.str();
+}
+
+}
+}
+
+//----------------------------------------------------------------------------
+//----------------------------------------------------------------------------
+
+#if defined( __GNUC__ ) && defined( ENABLE_TRACEBACK )
+
+/*  This is only known to work with GNU C++
+ *  Must be compiled with '-rdynamic'
+ *  Must be linked with   '-ldl'
+ */
+
+/* Print call stack into an error stream,
+ * so one knows in which function the error occurred.
+ *
+ * Code copied from:
+ *   http://stupefydeveloper.blogspot.com/2008/10/cc-call-stack.html
+ *
+ * License on this site:
+ *   This blog is licensed under a
+ *   Creative Commons Attribution-Share Alike 3.0 Unported License.
+ *
+ *   http://creativecommons.org/licenses/by-sa/3.0/
+ *
+ * Modified to output to std::ostream.
+ */
+#include <signal.h>
+#include <execinfo.h>
+#include <cxxabi.h>
+#include <dlfcn.h>
+#include <stdlib.h>
+
+namespace Kokkos {
+namespace Impl {
+
+// Append a description of the current call stack to 'msg'.
+//
+// Output format: a newline, "Call stack {", one line per resolvable frame
+// ("  object: <file> function: <name>"), then a closing "}" with no
+// trailing newline.  At most MAX_DEPTH frames are captured.  Frames that
+// dladdr() cannot resolve, or that have no symbol name, are omitted.
+void traceback_callstack( std::ostream & msg )
+{
+  enum { MAX_DEPTH = 32 };
+
+  void * frames[MAX_DEPTH];
+  const int frame_count = backtrace(frames, MAX_DEPTH);
+
+  msg << std::endl << "Call stack {" << std::endl ;
+
+  // Index 0 is skipped; it is presumably the frame of this function itself.
+  for (int i = 1; i < frame_count; ++i)
+  {
+    Dl_info info;
+    if ( 0 == dladdr(frames[i], &info) ) { continue; }
+
+    int status = 0;
+    char * const demangled = abi::__cxa_demangle(info.dli_sname, NULL, 0, &status);
+
+    // Prefer the demangled name; fall back to the raw symbol name.
+    const char * const name =
+      ( status == 0 && demangled ) ? demangled : info.dli_sname ;
+
+    if ( name && *name != 0 ) {
+      msg << "  object: " << info.dli_fname
+          << " function: " << name
+          << std::endl ;
+    }
+
+    // __cxa_demangle returns malloc'ed storage; free(NULL) is a no-op.
+    free(demangled);
+  }
+  msg << "}" ;
+}
+
+}
+}
+
+#else
+
+namespace Kokkos {
+namespace Impl {
+
+// Fallback used when the traceback implementation is not compiled in:
+// writes a fixed notice, surrounded by newlines, to 'msg'.
+void traceback_callstack( std::ostream & msg )
+{
+  msg << std::endl ;
+  msg << "Traceback functionality not available" ;
+  msg << std::endl ;
+}
+
+}
+}
+
+#endif
+
diff --git a/lib/kokkos/core/src/impl/Kokkos_Error.hpp b/lib/kokkos/core/src/impl/Kokkos_Error.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..5f88d662069bcb6313c803073385736e23a93456
--- /dev/null
+++ b/lib/kokkos/core/src/impl/Kokkos_Error.hpp
@@ -0,0 +1,82 @@
+/*
+//@HEADER
+// ************************************************************************
+// 
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+// 
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+// 
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact  H. Carter Edwards (hcedwar@sandia.gov)
+// 
+// ************************************************************************
+//@HEADER
+*/
+
+#ifndef KOKKOS_IMPL_ERROR_HPP
+#define KOKKOS_IMPL_ERROR_HPP
+
+#include <string>
+#include <iosfwd>
+#include <KokkosCore_config.h>
+#ifdef KOKKOS_HAVE_CUDA
+#include <Cuda/Kokkos_Cuda_abort.hpp>
+#endif
+
+namespace Kokkos {
+namespace Impl {
+
+/** \brief  Write the given C string to stderr, flush, and call ::abort(). */
+void host_abort( const char * const );
+
+/** \brief  Throw std::runtime_error whose text is the given message
+ *          followed by the output of traceback_callstack().
+ */
+void throw_runtime_exception( const std::string & );
+
+/** \brief  Append a call-stack description to the stream, or a
+ *          "not available" notice when traceback support is not built in.
+ */
+void traceback_callstack( std::ostream & );
+
+/** \brief  Format a byte count with a B / K / M / G suffix
+ *          (scaled by powers of 1024).
+ */
+std::string human_memory_size(size_t arg_bytes);
+
+}
+}
+
+//----------------------------------------------------------------------------
+//----------------------------------------------------------------------------
+
+#if defined( KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST )
+namespace Kokkos {
+/** \brief  Kokkos::abort for host compilation: forwards the message
+ *          to Kokkos::Impl::host_abort().
+ */
+inline
+void abort( const char * const message ) { Kokkos::Impl::host_abort(message); }
+}
+#endif /* defined( KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST ) */
+
+//----------------------------------------------------------------------------
+//----------------------------------------------------------------------------
+
+#endif /* #ifndef KOKKOS_IMPL_ERROR_HPP */
+
diff --git a/lib/kokkos/core/src/impl/Kokkos_ExecPolicy.cpp b/lib/kokkos/core/src/impl/Kokkos_ExecPolicy.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..cabf5a3caccb0bd0beca292f5dcc895867bb1a2e
--- /dev/null
+++ b/lib/kokkos/core/src/impl/Kokkos_ExecPolicy.cpp
@@ -0,0 +1,19 @@
+#include <Kokkos_Core.hpp>
+namespace Kokkos {
+namespace Impl {
+
+// Store the given count in the wrapper's 'value' member.
+PerTeamValue::PerTeamValue(int arg)
+  : value(arg)
+{}
+
+// Store the given count in the wrapper's 'value' member.
+PerThreadValue::PerThreadValue(int arg)
+  : value(arg)
+{}
+
+} // namespace Impl
+
+// Wrap 'arg' in an Impl::PerTeamValue so the integer carries a
+// "per team" meaning in its type.
+Impl::PerTeamValue PerTeam(const int& arg)
+{
+  return Impl::PerTeamValue(arg);
+}
+
+// Wrap 'arg' in an Impl::PerThreadValue so the integer carries a
+// "per thread" meaning in its type.
+Impl::PerThreadValue PerThread(const int& arg)
+{
+  return Impl::PerThreadValue(arg);
+}
+
+} // namespace Kokkos
diff --git a/lib/kokkos/core/src/impl/Kokkos_FunctorAdapter.hpp b/lib/kokkos/core/src/impl/Kokkos_FunctorAdapter.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..78b6794491a77b78c1025b10fbe3d214fdc71fdb
--- /dev/null
+++ b/lib/kokkos/core/src/impl/Kokkos_FunctorAdapter.hpp
@@ -0,0 +1,1131 @@
+/*
+//@HEADER
+// ************************************************************************
+// 
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+// 
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+// 
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact  H. Carter Edwards (hcedwar@sandia.gov)
+// 
+// ************************************************************************
+//@HEADER
+*/
+
+#ifndef KOKKOS_FUNCTORADAPTER_HPP
+#define KOKKOS_FUNCTORADAPTER_HPP
+
+#include <cstddef>
+#include <Kokkos_Core_fwd.hpp>
+#include <impl/Kokkos_Traits.hpp>
+#include <impl/Kokkos_Tags.hpp>
+
+//----------------------------------------------------------------------------
+//----------------------------------------------------------------------------
+
+namespace Kokkos {
+namespace Impl {
+
+/** \brief  Compile-time test: does FunctorType declare a nested 'value_type'?
+ *
+ *  The general case derives from Impl::false_type.  ArgTag is carried
+ *  along but does not take part in the test.
+ */
+template< class FunctorType , class ArgTag , class Enable = void >
+struct FunctorDeclaresValueType : public Impl::false_type {};
+
+// Chosen only when 'typename FunctorType::value_type' is well-formed;
+// otherwise substitution fails and the false_type definition above is used.
+// NOTE(review): relies on Impl::enable_if_type<...>::type matching the
+// 'void' default of Enable -- confirm in impl/Kokkos_Traits.hpp.
+template< class FunctorType , class ArgTag >
+struct FunctorDeclaresValueType< FunctorType , ArgTag
+                               , typename Impl::enable_if_type< typename FunctorType::value_type >::type >
+  : public Impl::true_type {};
+
+
+/** \brief  Query Functor and execution policy argument tag for value type.
+ *
+ *  If C++11 enabled and 'value_type' is not explicitly declared then attempt
+ *  to deduce the type from FunctorType::operator().
+ *
+ *  The third parameter defaults to whether FunctorType declares
+ *  'value_type'.  Partial specializations for both 'true' and 'false'
+ *  appear later in this file; this general definition reports
+ *  "no value": all member types are void and all sizes are zero.
+ */
+template< class FunctorType , class ArgTag , bool Dec = FunctorDeclaresValueType<FunctorType,ArgTag>::value >
+struct FunctorValueTraits
+{
+  typedef void value_type ;
+  typedef void pointer_type ;
+  typedef void reference_type ;
+  typedef void functor_type ;
+
+  enum { StaticValueSize = 0 };
+
+  // No value is associated with the functor: zero values ...
+  KOKKOS_FORCEINLINE_FUNCTION static
+  unsigned value_count( const FunctorType & ) { return 0 ; }
+
+  // ... occupying zero bytes.
+  KOKKOS_FORCEINLINE_FUNCTION static
+  unsigned value_size( const FunctorType & ) { return 0 ; }
+};
+
+/** \brief  Traits for a 'void' functor type: every member type is void.
+ *
+ *  Unlike the other definitions of FunctorValueTraits, this one declares
+ *  no StaticValueSize, value_count, or value_size members.
+ */
+template<class ArgTag>
+struct FunctorValueTraits<void, ArgTag,false>
+{
+  typedef void value_type ;
+  typedef void pointer_type ;
+  typedef void reference_type ;
+  typedef void functor_type ;
+};
+
+/** \brief  FunctorType::value_type is explicitly declared so use it.
+ *
+ * Two options for declaration
+ *
+ *   1) A plain-old-data (POD) type
+ *        typedef {pod_type} value_type ;
+ *
+ *   2) An array of POD of a runtime specified count.
+ *        typedef {pod_type} value_type[] ;
+ *        const unsigned     value_count ;
+ *
+ * The element type's size must be a multiple of sizeof(int); this is
+ * enforced by the static_assert below.
+ */
+template< class FunctorType , class ArgTag >
+struct FunctorValueTraits< FunctorType , ArgTag , true /* == exists FunctorType::value_type */ >
+{
+  // Element type: the declared value_type with any array extent removed.
+  typedef typename Impl::remove_extent< typename FunctorType::value_type >::type  value_type ;
+  typedef FunctorType functor_type;
+
+  static_assert( 0 == ( sizeof(value_type) % sizeof(int) ) ,
+    "Reduction functor's declared value_type requires: 0 == sizeof(value_type) % sizeof(int)" );
+
+  // If not an array then what is the sizeof(value_type)
+  // (zero marks the array case, whose length is only known at run time).
+  enum { StaticValueSize = Impl::is_array< typename FunctorType::value_type >::value ? 0 : sizeof(value_type) };
+
+  typedef value_type                 * pointer_type ;
+
+  // The reference_type for an array is 'value_type *'
+  // The reference_type for a single value is 'value_type &'
+
+  typedef typename Impl::if_c< ! StaticValueSize , value_type *
+                                                 , value_type & >::type  reference_type ;
+
+  // Number of values if single value
+  template< class F >
+  KOKKOS_FORCEINLINE_FUNCTION static
+  typename Impl::enable_if< Impl::is_same<F,FunctorType>::value && StaticValueSize , unsigned >::type
+    value_count( const F & ) { return 1 ; }
+
+  // Number of values if an array, protect via templating because 'f.value_count'
+  // will only exist when the functor declares the value_type to be an array.
+  template< class F >
+  KOKKOS_FORCEINLINE_FUNCTION static
+  typename Impl::enable_if< Impl::is_same<F,FunctorType>::value && ! StaticValueSize , unsigned >::type
+    value_count( const F & f ) { return f.value_count ; }
+
+  // Total size of the value
+  // in bytes: element count times element size.
+  KOKKOS_INLINE_FUNCTION static
+  unsigned value_size( const FunctorType & f ) { return value_count( f ) * sizeof(value_type) ; }
+};
+
+
+/** \brief  FunctorType::value_type is not declared: deduce it from the
+ *          signature of FunctorType::operator().
+ *
+ *  The private deduce_reduce_type overloads below are used only inside
+ *  decltype; their return type encodes the result:
+ *    - T          the operator has a 'T &' reduction/scan argument,
+ *    - VOIDTAG    the operator has no value argument (parallel_for form),
+ *    - REJECTTAG  the operator takes a tag but the policy tag is void.
+ *  VOIDTAG and REJECTTAG both map to 'void' in the public typedefs.
+ */
+template< class FunctorType , class ArgTag >
+struct FunctorValueTraits< FunctorType
+                         , ArgTag
+                         , false  /* == FunctorType::value_type is not declared */
+                         >
+{
+private:
+
+  struct VOIDTAG {};   // Allow declaration of non-matching operator() with void argument tag.
+  struct REJECTTAG {}; // Reject tagged operator() when using non-tagged execution policy.
+
+  // Tag used to select overloads: VOIDTAG stands in for a 'void' ArgTag.
+  typedef typename
+    Impl::if_c< Impl::is_same< ArgTag , void >::value , VOIDTAG , ArgTag >::type tag_type ;
+
+  //----------------------------------------
+  // parallel_for operator without a tag:
+
+  template< class ArgMember >
+  KOKKOS_INLINE_FUNCTION
+  static VOIDTAG deduce_reduce_type( VOIDTAG , void (FunctorType::*)( ArgMember ) const ) {}
+
+  template< class ArgMember >
+  KOKKOS_INLINE_FUNCTION
+  static VOIDTAG deduce_reduce_type( VOIDTAG , void (FunctorType::*)( const ArgMember & ) const ) {}
+
+  template< class TagType , class ArgMember >
+  KOKKOS_INLINE_FUNCTION
+  static REJECTTAG deduce_reduce_type( VOIDTAG , void (FunctorType::*)( TagType , ArgMember ) const ) {}
+
+  template< class TagType , class ArgMember >
+  KOKKOS_INLINE_FUNCTION
+  static REJECTTAG deduce_reduce_type( VOIDTAG , void (FunctorType::*)( TagType , const ArgMember & ) const ) {}
+
+  template< class TagType , class ArgMember >
+  KOKKOS_INLINE_FUNCTION
+  static REJECTTAG deduce_reduce_type( VOIDTAG , void (FunctorType::*)( const TagType & , ArgMember ) const ) {}
+
+  template< class TagType , class ArgMember >
+  KOKKOS_INLINE_FUNCTION
+  static REJECTTAG deduce_reduce_type( VOIDTAG , void (FunctorType::*)( const TagType & , const ArgMember & ) const ) {}
+
+  //----------------------------------------
+  // parallel_for operator with a tag:
+
+  template< class ArgMember >
+  KOKKOS_INLINE_FUNCTION
+  static VOIDTAG deduce_reduce_type( tag_type , void (FunctorType::*)( tag_type , ArgMember ) const ) {}
+
+  template< class ArgMember >
+  KOKKOS_INLINE_FUNCTION
+  static VOIDTAG deduce_reduce_type( tag_type , void (FunctorType::*)( const tag_type & , ArgMember ) const ) {}
+
+  template< class ArgMember >
+  KOKKOS_INLINE_FUNCTION
+  static VOIDTAG deduce_reduce_type( tag_type , void (FunctorType::*)( tag_type , const ArgMember & ) const ) {}
+
+  template< class ArgMember >
+  KOKKOS_INLINE_FUNCTION
+  static VOIDTAG deduce_reduce_type( tag_type , void (FunctorType::*)( const tag_type & , const ArgMember & ) const ) {}
+
+  //----------------------------------------
+  // parallel_reduce operator without a tag:
+
+  template< class ArgMember , class T >
+  KOKKOS_INLINE_FUNCTION
+  static T deduce_reduce_type( VOIDTAG , void (FunctorType::*)( ArgMember , T & ) const ) {}
+
+  template< class ArgMember , class T >
+  KOKKOS_INLINE_FUNCTION
+  static T deduce_reduce_type( VOIDTAG , void (FunctorType::*)( const ArgMember & , T & ) const ) {}
+
+  template< class TagType , class ArgMember , class T >
+  KOKKOS_INLINE_FUNCTION
+  static REJECTTAG deduce_reduce_type( VOIDTAG , void (FunctorType::*)( TagType , ArgMember , T & ) const ) {}
+
+  template< class TagType , class ArgMember , class T >
+  KOKKOS_INLINE_FUNCTION
+  static REJECTTAG deduce_reduce_type( VOIDTAG , void (FunctorType::*)( TagType , const ArgMember & , T & ) const ) {}
+
+  template< class TagType , class ArgMember , class T >
+  KOKKOS_INLINE_FUNCTION
+  static REJECTTAG deduce_reduce_type( VOIDTAG , void (FunctorType::*)( const TagType & , ArgMember , T & ) const ) {}
+
+  template< class TagType , class ArgMember , class T >
+  KOKKOS_INLINE_FUNCTION
+  static REJECTTAG deduce_reduce_type( VOIDTAG , void (FunctorType::*)( const TagType & , const ArgMember & , T & ) const ) {}
+
+  //----------------------------------------
+  // parallel_reduce operator with a tag:
+
+  template< class ArgMember , class T >
+  KOKKOS_INLINE_FUNCTION
+  static T deduce_reduce_type( tag_type , void (FunctorType::*)( tag_type , ArgMember , T & ) const ) {}
+
+  template< class ArgMember , class T >
+  KOKKOS_INLINE_FUNCTION
+  static T deduce_reduce_type( tag_type , void (FunctorType::*)( const tag_type & , ArgMember , T & ) const ) {}
+
+  template< class ArgMember , class T >
+  KOKKOS_INLINE_FUNCTION
+  static T deduce_reduce_type( tag_type , void (FunctorType::*)( tag_type , const ArgMember & , T & ) const ) {}
+
+  template< class ArgMember , class T >
+  KOKKOS_INLINE_FUNCTION
+  static T deduce_reduce_type( tag_type , void (FunctorType::*)( const tag_type & , const ArgMember & , T & ) const ) {}
+
+  //----------------------------------------
+  // parallel_scan operator without a tag:
+  // (the trailing flag may be taken as 'bool' or as 'const bool&')
+
+  template< class ArgMember , class T >
+  KOKKOS_INLINE_FUNCTION
+  static T deduce_reduce_type( VOIDTAG , void (FunctorType::*)( ArgMember , T & , bool ) const ) {}
+
+  template< class ArgMember , class T >
+  KOKKOS_INLINE_FUNCTION
+  static T deduce_reduce_type( VOIDTAG , void (FunctorType::*)( const ArgMember & , T & , bool ) const ) {}
+
+  template< class TagType , class ArgMember , class T >
+  KOKKOS_INLINE_FUNCTION
+  static REJECTTAG deduce_reduce_type( VOIDTAG , void (FunctorType::*)( TagType , ArgMember , T & , bool ) const ) {}
+
+  template< class TagType , class ArgMember , class T >
+  KOKKOS_INLINE_FUNCTION
+  static REJECTTAG deduce_reduce_type( VOIDTAG , void (FunctorType::*)( TagType , const ArgMember & , T & , bool ) const ) {}
+
+  template< class TagType , class ArgMember , class T >
+  KOKKOS_INLINE_FUNCTION
+  static REJECTTAG deduce_reduce_type( VOIDTAG , void (FunctorType::*)( const TagType & , ArgMember , T & , bool ) const ) {}
+
+  template< class TagType , class ArgMember , class T >
+  KOKKOS_INLINE_FUNCTION
+  static REJECTTAG deduce_reduce_type( VOIDTAG , void (FunctorType::*)( const TagType & , const ArgMember & , T & , bool ) const ) {}
+
+  template< class ArgMember , class T >
+  KOKKOS_INLINE_FUNCTION
+  static T deduce_reduce_type( VOIDTAG , void (FunctorType::*)( ArgMember , T & , const bool& ) const ) {}
+
+  template< class ArgMember , class T >
+  KOKKOS_INLINE_FUNCTION
+  static T deduce_reduce_type( VOIDTAG , void (FunctorType::*)( const ArgMember & , T & , const bool& ) const ) {}
+
+  template< class TagType , class ArgMember , class T >
+  KOKKOS_INLINE_FUNCTION
+  static REJECTTAG deduce_reduce_type( VOIDTAG , void (FunctorType::*)( TagType , ArgMember , T & , const bool& ) const ) {}
+
+  template< class TagType , class ArgMember , class T >
+  KOKKOS_INLINE_FUNCTION
+  static REJECTTAG deduce_reduce_type( VOIDTAG , void (FunctorType::*)( TagType , const ArgMember & , T & , const bool& ) const ) {}
+
+  template< class TagType , class ArgMember , class T >
+  KOKKOS_INLINE_FUNCTION
+  static REJECTTAG deduce_reduce_type( VOIDTAG , void (FunctorType::*)( const TagType & , ArgMember , T & , const bool& ) const ) {}
+
+  template< class TagType , class ArgMember , class T >
+  KOKKOS_INLINE_FUNCTION
+  static REJECTTAG deduce_reduce_type( VOIDTAG , void (FunctorType::*)( const TagType & , const ArgMember & , T & , const bool& ) const ) {}
+  //----------------------------------------
+  // parallel_scan operator with a tag:
+
+  template< class ArgMember , class T >
+  KOKKOS_INLINE_FUNCTION
+  static T deduce_reduce_type( tag_type , void (FunctorType::*)( tag_type , ArgMember , T & , bool ) const ) {}
+
+  template< class ArgMember , class T >
+  KOKKOS_INLINE_FUNCTION
+  static T deduce_reduce_type( tag_type , void (FunctorType::*)( const tag_type & , ArgMember , T & , bool ) const ) {}
+
+  template< class ArgMember , class T >
+  KOKKOS_INLINE_FUNCTION
+  static T deduce_reduce_type( tag_type , void (FunctorType::*)( tag_type , const ArgMember& , T & , bool ) const ) {}
+
+  template< class ArgMember , class T >
+  KOKKOS_INLINE_FUNCTION
+  static T deduce_reduce_type( tag_type , void (FunctorType::*)( const tag_type & , const ArgMember& , T & , bool ) const ) {}
+
+  template< class ArgMember , class T >
+  KOKKOS_INLINE_FUNCTION
+  static T deduce_reduce_type( tag_type , void (FunctorType::*)( tag_type , ArgMember , T & , const bool& ) const ) {}
+
+  template< class ArgMember , class T >
+  KOKKOS_INLINE_FUNCTION
+  static T deduce_reduce_type( tag_type , void (FunctorType::*)( const tag_type & , ArgMember , T & , const bool& ) const ) {}
+
+  template< class ArgMember , class T >
+  KOKKOS_INLINE_FUNCTION
+  static T deduce_reduce_type( tag_type , void (FunctorType::*)( tag_type , const ArgMember& , T & , const bool& ) const ) {}
+
+  template< class ArgMember , class T >
+  KOKKOS_INLINE_FUNCTION
+  static T deduce_reduce_type( tag_type , void (FunctorType::*)( const tag_type & , const ArgMember& , T & , const bool& ) const ) {}
+  //----------------------------------------
+
+  // Result of overload resolution against FunctorType::operator():
+  // the deduced value type T, or one of the VOIDTAG / REJECTTAG markers.
+  typedef decltype( deduce_reduce_type( tag_type() , & FunctorType::operator() ) ) ValueType ;
+
+  enum { IS_VOID   = Impl::is_same<VOIDTAG  ,ValueType>::value };
+  enum { IS_REJECT = Impl::is_same<REJECTTAG,ValueType>::value };
+
+public:
+
+  // Both marker types are reported to users as 'void'.
+  typedef typename Impl::if_c< IS_VOID || IS_REJECT , void , ValueType   >::type  value_type ;
+  typedef typename Impl::if_c< IS_VOID || IS_REJECT , void , ValueType * >::type  pointer_type ;
+  typedef typename Impl::if_c< IS_VOID || IS_REJECT , void , ValueType & >::type  reference_type ;
+  typedef FunctorType functor_type;
+
+  static_assert( IS_VOID || IS_REJECT || 0 == ( sizeof(ValueType) % sizeof(int) ) ,
+    "Reduction functor's value_type deduced from functor::operator() requires: 0 == sizeof(value_type) % sizeof(int)" );
+
+  // Size in bytes of the deduced value; zero when there is no value.
+  enum { StaticValueSize = IS_VOID || IS_REJECT ? 0 : sizeof(ValueType) };
+
+  KOKKOS_FORCEINLINE_FUNCTION static
+  unsigned value_size( const FunctorType & ) { return StaticValueSize ; }
+
+  // A deduced value is always a single value (never an array).
+  KOKKOS_FORCEINLINE_FUNCTION static
+  unsigned value_count( const FunctorType & ) { return IS_VOID || IS_REJECT ? 0 : 1 ; }
+};
+
+} // namespace Impl
+} // namespace Kokkos
+
+//----------------------------------------------------------------------------
+//----------------------------------------------------------------------------
+
+namespace Kokkos {
+namespace Impl {
+
+/** Function signatures for FunctorType::init function with a tag.
+ *  reference_type is 'value_type &' for scalar and 'value_type *' for array.
+ *
+ *  The enable_if overloads are declared but never defined: they are meant
+ *  to be named inside decltype, where overload resolution succeeds only
+ *  if '& FunctorType::init' has one of the listed signatures.
+ */
+template< class FunctorType , class ArgTag >
+struct FunctorValueInitFunction {
+
+  typedef typename FunctorValueTraits<FunctorType,ArgTag>::reference_type
+    reference_type ;
+
+  // const member function, tag taken by value
+  KOKKOS_INLINE_FUNCTION static void
+    enable_if( void (FunctorType::*)( ArgTag         , reference_type ) const );
+  // const member function, tag taken by const reference
+  KOKKOS_INLINE_FUNCTION static void
+    enable_if( void (FunctorType::*)( ArgTag const & , reference_type ) const );
+  // static (free-function pointer) form, tag taken by value
+  KOKKOS_INLINE_FUNCTION static void
+    enable_if( void (             *)( ArgTag         , reference_type ) );
+  // static (free-function pointer) form, tag taken by const reference
+  KOKKOS_INLINE_FUNCTION static void
+    enable_if( void (             *)( ArgTag const & , reference_type ) );
+
+};
+
+/** Function signatures for FunctorType::init function without a tag.
+ *  reference_type is 'value_type &' for scalar and 'value_type *' for array.
+ *
+ *  As in the tagged form, the enable_if overloads are declarations only,
+ *  intended for use inside decltype.
+ */
+template< class FunctorType >
+struct FunctorValueInitFunction< FunctorType , void > {
+
+  typedef typename FunctorValueTraits<FunctorType,void>::reference_type
+    reference_type ;
+
+  // const member function form
+  KOKKOS_INLINE_FUNCTION static void
+    enable_if( void (FunctorType::*)( reference_type ) const );
+  // static (free-function pointer) form
+  KOKKOS_INLINE_FUNCTION static void
+    enable_if( void (             *)( reference_type ) );
+};
+
+// Adapter for value initialization function.
+// If a proper FunctorType::init is declared then use it,
+// otherwise use default constructor.
+//
+// T defaults to the functor's reference_type ('value_type &' for a single
+// value, 'value_type *' for an array); the specializations dispatch on it.
+// Enable is the SFINAE slot used to detect a compatible FunctorType::init.
+template< class FunctorType , class ArgTag
+        , class T = typename FunctorValueTraits<FunctorType,ArgTag>::reference_type
+        , class Enable = void >
+struct FunctorValueInit ;
+
+/* No 'init' function provided for single value:
+ * default-construct a T in the storage at 'p' with placement new
+ * and return a reference to it.  The functor argument is not used.
+ */
+template< class FunctorType , class ArgTag , class T , class Enable >
+struct FunctorValueInit< FunctorType , ArgTag , T & , Enable >
+{
+  KOKKOS_FORCEINLINE_FUNCTION static
+  T & init( const FunctorType & f , void * p )
+    {
+      T * const value = new(p) T() ;
+      return *value ;
+    }
+};
+
+/* No 'init' function provided for array value:
+ * default-construct each of the functor's value_count(f) elements
+ * in the storage at 'p' and return a pointer to the first one.
+ */
+template< class FunctorType , class ArgTag , class T , class Enable >
+struct FunctorValueInit< FunctorType , ArgTag , T * , Enable >
+{
+  KOKKOS_FORCEINLINE_FUNCTION static
+  T * init( const FunctorType & f , void * p )
+    {
+      T * const values = static_cast<T*>(p) ;
+      const int count = FunctorValueTraits< FunctorType , ArgTag >::value_count(f);
+
+      for ( int i = 0 ; i < count ; ++i ) {
+        new( values + i ) T();
+      }
+
+      return values ;
+    }
+};
+
+/* Functor supplies an untagged 'init' for a single value: forward to it. */
+template< class FunctorType , class T >
+struct FunctorValueInit
+  < FunctorType
+  , void
+  , T &
+    // Substitution fails, discarding this specialization, when
+    // FunctorType::init does not exist or has an incompatible signature.
+  , decltype( FunctorValueInitFunction< FunctorType , void >::enable_if( & FunctorType::init ) )
+  >
+{
+  // Runs f.init on the value stored at 'p' and returns a reference to it.
+  KOKKOS_FORCEINLINE_FUNCTION static
+  T & init( const FunctorType & f , void * p )
+    {
+      T & value = *((T*)p) ;
+      f.init( value );
+      return value ;
+    }
+};
+
+/* Functor supplies an untagged 'init' for an array value: forward to it. */
+template< class FunctorType , class T >
+struct FunctorValueInit
+  < FunctorType
+  , void
+  , T *
+    // Substitution fails, discarding this specialization, when
+    // FunctorType::init does not exist or has an incompatible signature.
+  , decltype( FunctorValueInitFunction< FunctorType , void >::enable_if( & FunctorType::init ) )
+  >
+{
+  // Runs f.init on the array stored at 'p' and returns a pointer to it.
+  KOKKOS_FORCEINLINE_FUNCTION static
+  T * init( const FunctorType & f , void * p )
+    {
+      T * const values = (T*)p ;
+      f.init( values );
+      return values ;
+    }
+};
+
+/* Functor supplies a tagged 'init' for a single value: forward to it. */
+template< class FunctorType , class ArgTag , class T >
+struct FunctorValueInit
+  < FunctorType
+  , ArgTag
+  , T &
+    // Substitution fails, discarding this specialization, when
+    // FunctorType::init does not exist or has an incompatible signature.
+  , decltype( FunctorValueInitFunction< FunctorType , ArgTag >::enable_if( & FunctorType::init ) )
+  >
+{
+  // Runs f.init with a default-constructed tag on the value stored at 'p'
+  // and returns a reference to that value.
+  KOKKOS_FORCEINLINE_FUNCTION static
+  T & init( const FunctorType & f , void * p )
+    {
+      T & value = *((T*)p) ;
+      f.init( ArgTag() , value );
+      return value ;
+    }
+};
+
+/* Functor supplies a tagged 'init' for an array value: forward to it. */
+template< class FunctorType , class ArgTag , class T >
+struct FunctorValueInit
+  < FunctorType
+  , ArgTag
+  , T *
+    // Substitution fails, discarding this specialization, when
+    // FunctorType::init does not exist or has an incompatible signature.
+  , decltype( FunctorValueInitFunction< FunctorType , ArgTag >::enable_if( & FunctorType::init ) )
+  >
+{
+  // Runs f.init with a default-constructed tag on the array stored at 'p'
+  // and returns a pointer to that array.
+  KOKKOS_FORCEINLINE_FUNCTION static
+  T * init( const FunctorType & f , void * p )
+    {
+      T * const values = (T*)p ;
+      f.init( ArgTag() , values );
+      return values ;
+    }
+};
+
+} // namespace Impl
+} // namespace Kokkos
+
+//----------------------------------------------------------------------------
+//----------------------------------------------------------------------------
+
+namespace Kokkos {
+namespace Impl {
+
+// Signatures for compatible FunctorType::join with tag and not an array
+//
+// IsArray defaults to true when FunctorValueTraits reports a StaticValueSize
+// of zero; this primary template serves the remaining ( false ) case.
+// The enable_if overloads are declarations only: FunctorValueJoin names them
+// inside decltype( enable_if( & FunctorType::join ) ) to detect a join with
+// one of the listed signatures.
+template< class FunctorType , class ArgTag , bool IsArray = 0 == FunctorValueTraits<FunctorType,ArgTag>::StaticValueSize >
+struct FunctorValueJoinFunction {
+
+  typedef typename FunctorValueTraits<FunctorType,ArgTag>::value_type value_type ;
+
+  // Destination is a volatile reference, source a const volatile reference.
+  typedef       volatile value_type & vref_type ;
+  typedef const volatile value_type & cvref_type ;
+
+  // Const member functions and static/free functions; tag by value or by const reference.
+  KOKKOS_INLINE_FUNCTION static void enable_if( void (FunctorType::*)( ArgTag         , vref_type , cvref_type ) const );
+  KOKKOS_INLINE_FUNCTION static void enable_if( void (FunctorType::*)( ArgTag const & , vref_type , cvref_type ) const );
+  KOKKOS_INLINE_FUNCTION static void enable_if( void (             *)( ArgTag         , vref_type , cvref_type ) );
+  KOKKOS_INLINE_FUNCTION static void enable_if( void (             *)( ArgTag const & , vref_type , cvref_type ) );
+};
+
+// Signatures for compatible FunctorType::join with tag and is an array
+//
+// Same role as the non-array form, but the values are passed as pointers.
+// The enable_if overloads are declarations only, intended for use in decltype.
+template< class FunctorType , class ArgTag >
+struct FunctorValueJoinFunction< FunctorType , ArgTag , true > {
+
+  typedef typename FunctorValueTraits<FunctorType,ArgTag>::value_type value_type ;
+
+  // Destination is a pointer to volatile, source a pointer to const volatile.
+  typedef       volatile value_type * vptr_type ;
+  typedef const volatile value_type * cvptr_type ;
+
+  // Const member functions and static/free functions; tag by value or by const reference.
+  KOKKOS_INLINE_FUNCTION static void enable_if( void (FunctorType::*)( ArgTag         , vptr_type , cvptr_type ) const );
+  KOKKOS_INLINE_FUNCTION static void enable_if( void (FunctorType::*)( ArgTag const & , vptr_type , cvptr_type ) const );
+  KOKKOS_INLINE_FUNCTION static void enable_if( void (             *)( ArgTag         , vptr_type , cvptr_type ) );
+  KOKKOS_INLINE_FUNCTION static void enable_if( void (             *)( ArgTag const & , vptr_type , cvptr_type ) );
+};
+
+// Signatures for compatible FunctorType::join without tag and not an array
+//
+// The enable_if overloads are declarations only, intended for use in decltype
+// to detect an untagged join( volatile value & , const volatile value & ).
+template< class FunctorType >
+struct FunctorValueJoinFunction< FunctorType , void , false > {
+
+  typedef typename FunctorValueTraits<FunctorType,void>::value_type value_type ;
+
+  typedef       volatile value_type & vref_type ;
+  typedef const volatile value_type & cvref_type ;
+
+  // Const member function, then static/free function.
+  KOKKOS_INLINE_FUNCTION static void enable_if( void (FunctorType::*)( vref_type , cvref_type ) const );
+  KOKKOS_INLINE_FUNCTION static void enable_if( void (             *)( vref_type , cvref_type ) );
+};
+
+// Signatures for compatible FunctorType::join without tag and is an array
+//
+// The enable_if overloads are declarations only, intended for use in decltype
+// to detect an untagged join( volatile value * , const volatile value * ).
+template< class FunctorType >
+struct FunctorValueJoinFunction< FunctorType , void , true > {
+
+  typedef typename FunctorValueTraits<FunctorType,void>::value_type value_type ;
+
+  typedef       volatile value_type * vptr_type ;
+  typedef const volatile value_type * cvptr_type ;
+
+  // Const member function, then static/free function.
+  KOKKOS_INLINE_FUNCTION static void enable_if( void (FunctorType::*)( vptr_type , cvptr_type ) const );
+  KOKKOS_INLINE_FUNCTION static void enable_if( void (             *)( vptr_type , cvptr_type ) );
+};
+
+
+// Adapter for the value join (combine) function.
+// Specializations that detect a compatible FunctorType::join forward to it;
+// the remaining specializations combine values with operator+=.
+// T defaults to the functor's reference_type and selects between the
+// single-value ( T & ) and array ( T * ) forms; Enable is the SFINAE slot.
+template< class FunctorType , class ArgTag
+        , class T = typename FunctorValueTraits<FunctorType,ArgTag>::reference_type
+        , class Enable = void >
+struct FunctorValueJoin ;
+
+/* Fallback when the functor offers no compatible 'join': single value,
+ * combined with operator+=.
+ */
+template< class FunctorType , class ArgTag , class T , class Enable >
+struct FunctorValueJoin< FunctorType , ArgTag , T & , Enable >
+{
+  // The functor is not needed by this fallback, so nothing is stored.
+  KOKKOS_FORCEINLINE_FUNCTION
+  FunctorValueJoin(const FunctorType& ){}
+
+  // Type-erased form: adds the value at 'rhs' into the value at 'lhs'.
+  KOKKOS_FORCEINLINE_FUNCTION static
+  void join( const FunctorType & , volatile void * const lhs , const volatile void * const rhs )
+    {
+      volatile T & dst = *((volatile T*)lhs) ;
+      const volatile T & src = *((const volatile T*)rhs) ;
+      dst += src ;
+    }
+
+  // Typed form for volatile operands.
+  KOKKOS_FORCEINLINE_FUNCTION
+  void operator()( volatile T& dst , const volatile T& src ) const
+    { dst += src ; }
+
+  // Typed form for non-volatile operands.
+  KOKKOS_FORCEINLINE_FUNCTION
+  void operator() ( T& dst , const T& src ) const
+    { dst += src ; }
+};
+
+/* Fallback when the functor offers no compatible 'join': array of values,
+ * combined element by element with operator+=.
+ */
+template< class FunctorType , class ArgTag , class T , class Enable >
+struct FunctorValueJoin< FunctorType , ArgTag , T * , Enable >
+{
+  // Kept so the call operators can query the array length from the functor.
+  const FunctorType& f;
+
+  KOKKOS_FORCEINLINE_FUNCTION
+  FunctorValueJoin(const FunctorType& f_):f(f_){}
+
+  // Type-erased form: adds value_count(f_) elements of 'rhs' into 'lhs'.
+  KOKKOS_FORCEINLINE_FUNCTION static
+  void join( const FunctorType & f_ , volatile void * const lhs , const volatile void * const rhs )
+    {
+      volatile T * const dst = (volatile T*)lhs ;
+      const volatile T * const src = (const volatile T*)rhs ;
+      const int count = FunctorValueTraits<FunctorType,ArgTag>::value_count(f_);
+
+      int k = 0 ;
+      while ( k < count ) {
+        dst[k] += src[k] ;
+        ++k ;
+      }
+    }
+
+  // Typed form for volatile operands; the length comes from the stored functor.
+  KOKKOS_FORCEINLINE_FUNCTION
+  void operator()( volatile T* const lhs , const volatile T* const rhs ) const
+    {
+      const int count = FunctorValueTraits<FunctorType,ArgTag>::value_count(f);
+
+      int k = 0 ;
+      while ( k < count ) {
+        lhs[k] += rhs[k] ;
+        ++k ;
+      }
+    }
+
+  // Typed form for non-volatile operands; the length comes from the stored functor.
+  KOKKOS_FORCEINLINE_FUNCTION
+  void operator() ( T* lhs , const T* rhs ) const
+    {
+      const int count = FunctorValueTraits<FunctorType,ArgTag>::value_count(f);
+
+      int k = 0 ;
+      while ( k < count ) {
+        lhs[k] += rhs[k] ;
+        ++k ;
+      }
+    }
+};
+
+/* Functor supplies a tagged 'join' for a single value: forward to it. */
+template< class FunctorType , class ArgTag , class T >
+struct FunctorValueJoin
+  < FunctorType
+  , ArgTag
+  , T &
+    // Substitution fails, discarding this specialization, when
+    // FunctorType::join does not exist or no enable_if overload accepts it.
+  , decltype( FunctorValueJoinFunction< FunctorType , ArgTag >::enable_if( & FunctorType::join ) )
+  >
+{
+  // Functor whose join is invoked by the call operators.
+  const FunctorType& f;
+
+  KOKKOS_FORCEINLINE_FUNCTION
+  FunctorValueJoin(const FunctorType& f_):f(f_){}
+
+  // Type-erased form: casts both operands to T and calls f_.join with a tag.
+  KOKKOS_FORCEINLINE_FUNCTION static
+  void join( const FunctorType & f_ , volatile void * const lhs , const volatile void * const rhs )
+    {
+      volatile T & dst = *((volatile T *)lhs) ;
+      const volatile T & src = *((const volatile T *)rhs) ;
+      f_.join( ArgTag() , dst , src );
+    }
+
+  // Typed form for volatile operands.
+  KOKKOS_FORCEINLINE_FUNCTION
+  void operator()( volatile T& dst , const volatile T& src ) const
+    { f.join( ArgTag() , dst , src ); }
+
+  // Typed form for non-volatile operands.
+  KOKKOS_FORCEINLINE_FUNCTION
+  void operator() ( T& dst , const T& src ) const
+    { f.join( ArgTag(), dst , src ); }
+};
+
+/* Functor supplies an untagged 'join' for a single value: forward to it. */
+template< class FunctorType , class T >
+struct FunctorValueJoin
+  < FunctorType
+  , void
+  , T &
+    // Substitution fails, discarding this specialization, when
+    // FunctorType::join does not exist or no enable_if overload accepts it.
+  , decltype( FunctorValueJoinFunction< FunctorType , void >::enable_if( & FunctorType::join ) )
+  >
+{
+  // Functor whose join is invoked by the call operators.
+  const FunctorType& f;
+
+  KOKKOS_FORCEINLINE_FUNCTION
+  FunctorValueJoin(const FunctorType& f_):f(f_){}
+
+  // Type-erased form: casts both operands to T and calls f_.join.
+  KOKKOS_FORCEINLINE_FUNCTION static
+  void join( const FunctorType & f_ , volatile void * const lhs , const volatile void * const rhs )
+    {
+      volatile T & dst = *((volatile T *)lhs) ;
+      const volatile T & src = *((const volatile T *)rhs) ;
+      f_.join( dst , src );
+    }
+
+  // Typed form for volatile operands.
+  KOKKOS_FORCEINLINE_FUNCTION
+  void operator()( volatile T& dst , const volatile T& src ) const
+    { f.join( dst , src ); }
+
+  // Typed form for non-volatile operands.
+  KOKKOS_FORCEINLINE_FUNCTION
+  void operator() ( T& dst , const T& src ) const
+    { f.join( dst , src ); }
+};
+
+/* Functor supplies a tagged 'join' for an array value: forward to it. */
+template< class FunctorType , class ArgTag , class T >
+struct FunctorValueJoin
+  < FunctorType
+  , ArgTag
+  , T *
+    // Substitution fails, discarding this specialization, when
+    // FunctorType::join does not exist or no enable_if overload accepts it.
+  , decltype( FunctorValueJoinFunction< FunctorType , ArgTag >::enable_if( & FunctorType::join ) )
+  >
+{
+  // Functor whose join is invoked by the call operators.
+  const FunctorType& f;
+
+  KOKKOS_FORCEINLINE_FUNCTION
+  FunctorValueJoin(const FunctorType& f_):f(f_){}
+
+  // Type-erased form: casts both operands to T pointers and calls f_.join
+  // with a default-constructed tag.
+  KOKKOS_FORCEINLINE_FUNCTION static
+  void join( const FunctorType & f_ , volatile void * const lhs , const volatile void * const rhs )
+    { f_.join( ArgTag() , (volatile T *)lhs , (const volatile T *)rhs ); }
+
+  // Typed form for volatile operands.
+  KOKKOS_FORCEINLINE_FUNCTION
+  void operator()( volatile T* const dst , const volatile T* const src ) const
+    { f.join( ArgTag() , dst , src ); }
+
+  // Typed form for non-volatile operands.
+  KOKKOS_FORCEINLINE_FUNCTION
+  void operator() ( T* dst , const T* src ) const
+    { f.join( ArgTag(), dst , src ); }
+};
+
+/* Functor supplies an untagged 'join' for an array value: forward to it. */
+template< class FunctorType , class T >
+struct FunctorValueJoin
+  < FunctorType
+  , void
+  , T *
+    // Substitution fails, discarding this specialization, when
+    // FunctorType::join does not exist or no enable_if overload accepts it.
+  , decltype( FunctorValueJoinFunction< FunctorType , void >::enable_if( & FunctorType::join ) )
+  >
+{
+  // Functor whose join is invoked by the call operators.
+  const FunctorType& f;
+
+  KOKKOS_FORCEINLINE_FUNCTION
+  FunctorValueJoin(const FunctorType& f_):f(f_){}
+
+  // Type-erased form: casts both operands to T pointers and calls f_.join.
+  KOKKOS_FORCEINLINE_FUNCTION static
+  void join( const FunctorType & f_ , volatile void * const lhs , const volatile void * const rhs )
+    { f_.join( (volatile T *)lhs , (const volatile T *)rhs ); }
+
+  // Typed form for volatile operands.
+  KOKKOS_FORCEINLINE_FUNCTION
+  void operator() ( volatile T* const dst , const volatile T* const src ) const
+    { f.join( dst , src ); }
+
+  // Typed form for non-volatile operands.
+  KOKKOS_FORCEINLINE_FUNCTION
+  void operator() ( T* dst , const T* src ) const
+    { f.join( dst , src ); }
+};
+
+} // namespace Impl
+} // namespace Kokkos
+
+namespace Kokkos {
+
+namespace Impl {
+
+  // Wraps a callable join operation so it can be used both through join()
+  // and through operator().  This primary template handles a JoinOp that is
+  // invoked as lambda( dst , src ); every entry point forwards to that call.
+  template<typename ValueType, class JoinOp, class Enable = void>
+  struct JoinLambdaAdapter {
+    typedef ValueType value_type;
+
+    // Reference to the wrapped operation; the adapter does not copy it.
+    const JoinOp& lambda;
+
+    KOKKOS_INLINE_FUNCTION
+    JoinLambdaAdapter(const JoinOp& lambda_):lambda(lambda_) {}
+
+    // Volatile operands.
+    KOKKOS_INLINE_FUNCTION
+    void join(volatile value_type& dst, const volatile value_type& src) const
+      { lambda(dst,src); }
+
+    // Non-volatile operands.
+    KOKKOS_INLINE_FUNCTION
+    void join(value_type& dst, const value_type& src) const
+      { lambda(dst,src); }
+
+    // Call-operator spelling of join for volatile operands.
+    KOKKOS_INLINE_FUNCTION
+    void operator() (volatile value_type& dst, const volatile value_type& src) const
+      { join(dst,src); }
+
+    // Call-operator spelling of join for non-volatile operands.
+    KOKKOS_INLINE_FUNCTION
+    void operator() (value_type& dst, const value_type& src) const
+      { join(dst,src); }
+  };
+
+  // Specialization chosen when JoinOp itself has a compatible untagged
+  // 'join' member (detected through FunctorValueJoinFunction::enable_if).
+  // Every entry point forwards to lambda.join( dst , src ).
+  template<typename ValueType, class JoinOp>
+  struct JoinLambdaAdapter<ValueType, JoinOp, decltype( FunctorValueJoinFunction< JoinOp , void >::enable_if( & JoinOp::join ) )> {
+    typedef ValueType value_type;
+    typedef StaticAssertSame<ValueType,typename JoinOp::value_type> assert_value_types_match;
+
+    // Reference to the wrapped operation; the adapter does not copy it.
+    const JoinOp& lambda;
+
+    KOKKOS_INLINE_FUNCTION
+    JoinLambdaAdapter(const JoinOp& lambda_):lambda(lambda_) {}
+
+    // Volatile operands.
+    KOKKOS_INLINE_FUNCTION
+    void join(volatile value_type& dst, const volatile value_type& src) const
+      { lambda.join(dst,src); }
+
+    // Non-volatile operands.
+    KOKKOS_INLINE_FUNCTION
+    void join(value_type& dst, const value_type& src) const
+      { lambda.join(dst,src); }
+
+    // Call-operator spelling of join for volatile operands.
+    KOKKOS_INLINE_FUNCTION
+    void operator() (volatile value_type& dst, const volatile value_type& src) const
+      { join(dst,src); }
+
+    // Call-operator spelling of join for non-volatile operands.
+    KOKKOS_INLINE_FUNCTION
+    void operator() (value_type& dst, const value_type& src) const
+      { join(dst,src); }
+  };
+
+  // Join operation that accumulates with operator+=.
+  template<typename ValueType>
+  struct JoinAdd {
+    typedef ValueType value_type;
+
+    KOKKOS_INLINE_FUNCTION
+    JoinAdd() {}
+
+    // Named form, volatile operands only.
+    KOKKOS_INLINE_FUNCTION
+    void join(volatile value_type& lhs, const volatile value_type& rhs) const
+      { lhs += rhs; }
+
+    // Call-operator form for non-volatile operands.
+    KOKKOS_INLINE_FUNCTION
+    void operator() (value_type& lhs, const value_type& rhs) const
+      { lhs += rhs; }
+
+    // Call-operator form for volatile operands.
+    KOKKOS_INLINE_FUNCTION
+    void operator() (volatile value_type& lhs, const volatile value_type& rhs) const
+      { lhs += rhs; }
+  };
+
+}
+}
+
+//----------------------------------------------------------------------------
+//----------------------------------------------------------------------------
+
+namespace Kokkos {
+namespace Impl {
+
+// Helper operations on a reduction value addressed through untyped storage:
+// pointer(), reference() and copy().  T defaults to the functor's
+// reference_type and selects the single-value ( T & ) or array ( T * ) form.
+template< class FunctorType , class ArgTag
+        , class T = typename FunctorValueTraits<FunctorType,ArgTag>::reference_type >
+struct FunctorValueOps ;
+
+/* Value operations for a single value ( reference_type is T & ). */
+template< class FunctorType , class ArgTag , class T >
+struct FunctorValueOps< FunctorType , ArgTag , T & >
+{
+  // Address of the referenced value.
+  KOKKOS_FORCEINLINE_FUNCTION static
+  T * pointer( T & r )
+    { return & r ; }
+
+  // Reinterprets the storage at 'p' as a T and returns a reference to it.
+  KOKKOS_FORCEINLINE_FUNCTION static
+  T & reference( void * p )
+    { return *((T*)p); }
+
+  // Copy-assigns the T at 'rhs' to the T at 'lhs'; the functor is unused.
+  KOKKOS_FORCEINLINE_FUNCTION static
+  void copy( const FunctorType & , void * const lhs , const void * const rhs )
+    {
+      T & dst = *((T*)lhs) ;
+      const T & src = *((const T*)rhs) ;
+      dst = src ;
+    }
+};
+
+/* Value operations for an array of values ( reference_type is T * ). */
+template< class FunctorType , class ArgTag , class T >
+struct FunctorValueOps< FunctorType , ArgTag , T * >
+{
+  // The array form already is a pointer; returned unchanged.
+  KOKKOS_FORCEINLINE_FUNCTION static
+  T * pointer( T * p )
+    { return p ; }
+
+  // Reinterprets the storage at 'p' as an array of T.
+  KOKKOS_FORCEINLINE_FUNCTION static
+  T * reference( void * p )
+    { return ((T*)p); }
+
+  // Copy-assigns value_count(f) elements from 'rhs' to 'lhs', in index order.
+  KOKKOS_FORCEINLINE_FUNCTION static
+  void copy( const FunctorType & f , void * const lhs , const void * const rhs )
+    {
+      T * const dst = (T*)lhs ;
+      const T * const src = (const T*)rhs ;
+      const int count = FunctorValueTraits<FunctorType,ArgTag>::value_count(f);
+      int k = 0 ;
+      while ( k < count ) {
+        dst[k] = src[k] ;
+        ++k ;
+      }
+    }
+};
+
+} // namespace Impl
+} // namespace Kokkos
+
+//----------------------------------------------------------------------------
+//----------------------------------------------------------------------------
+
+namespace Kokkos {
+namespace Impl {
+
+// Compatible functions for 'final' function and value_type not an array
+//
+// IsArray defaults to true when FunctorValueTraits reports a StaticValueSize
+// of zero; this primary template serves the remaining ( false ) case.
+// The enable_if overloads are declarations only: FunctorFinal names them
+// inside decltype( enable_if( & FunctorType::final ) ) to detect a 'final'
+// with one of the listed signatures.  Accepted forms: const or non-const
+// member function, or static/free function; tag by value or const reference;
+// value by reference or by const reference.
+template< class FunctorType , class ArgTag , bool IsArray = 0 == FunctorValueTraits<FunctorType,ArgTag>::StaticValueSize >
+struct FunctorFinalFunction {
+
+  typedef typename FunctorValueTraits<FunctorType,ArgTag>::value_type value_type ;
+
+  // Value passed by non-const reference.
+  KOKKOS_INLINE_FUNCTION static void enable_if( void (FunctorType::*)( ArgTag         , value_type & ) const );
+  KOKKOS_INLINE_FUNCTION static void enable_if( void (FunctorType::*)( ArgTag const & , value_type & ) const );
+  KOKKOS_INLINE_FUNCTION static void enable_if( void (FunctorType::*)( ArgTag         , value_type & ) );
+  KOKKOS_INLINE_FUNCTION static void enable_if( void (FunctorType::*)( ArgTag const & , value_type & ) );
+  KOKKOS_INLINE_FUNCTION static void enable_if( void (             *)( ArgTag         , value_type & ) );
+  KOKKOS_INLINE_FUNCTION static void enable_if( void (             *)( ArgTag const & , value_type & ) );
+
+  // Volatile value signatures are not accepted: their declarations are commented out.
+  // KOKKOS_INLINE_FUNCTION static void enable_if( void (FunctorType::*)( ArgTag         , value_type volatile & ) const );
+  // KOKKOS_INLINE_FUNCTION static void enable_if( void (FunctorType::*)( ArgTag const & , value_type volatile & ) const );
+  // KOKKOS_INLINE_FUNCTION static void enable_if( void (FunctorType::*)( ArgTag         , value_type volatile & ) );
+  // KOKKOS_INLINE_FUNCTION static void enable_if( void (FunctorType::*)( ArgTag const & , value_type volatile & ) );
+  // KOKKOS_INLINE_FUNCTION static void enable_if( void (             *)( ArgTag         , value_type volatile & ) );
+  // KOKKOS_INLINE_FUNCTION static void enable_if( void (             *)( ArgTag const & , value_type volatile & ) );
+
+  // Value passed by const reference.
+  KOKKOS_INLINE_FUNCTION static void enable_if( void (FunctorType::*)( ArgTag         , value_type const & ) const );
+  KOKKOS_INLINE_FUNCTION static void enable_if( void (FunctorType::*)( ArgTag const & , value_type const & ) const );
+  KOKKOS_INLINE_FUNCTION static void enable_if( void (FunctorType::*)( ArgTag         , value_type const & ) );
+  KOKKOS_INLINE_FUNCTION static void enable_if( void (FunctorType::*)( ArgTag const & , value_type const & ) );
+  KOKKOS_INLINE_FUNCTION static void enable_if( void (             *)( ArgTag         , value_type const & ) );
+  KOKKOS_INLINE_FUNCTION static void enable_if( void (             *)( ArgTag const & , value_type const & ) );
+
+  // Const volatile value signatures are not accepted: their declarations are commented out.
+  // KOKKOS_INLINE_FUNCTION static void enable_if( void (FunctorType::*)( ArgTag         , value_type const volatile & ) const );
+  // KOKKOS_INLINE_FUNCTION static void enable_if( void (FunctorType::*)( ArgTag const & , value_type const volatile & ) const );
+  // KOKKOS_INLINE_FUNCTION static void enable_if( void (FunctorType::*)( ArgTag         , value_type const volatile & ) );
+  // KOKKOS_INLINE_FUNCTION static void enable_if( void (FunctorType::*)( ArgTag const & , value_type const volatile & ) );
+  // KOKKOS_INLINE_FUNCTION static void enable_if( void (             *)( ArgTag         , value_type const volatile & ) );
+  // KOKKOS_INLINE_FUNCTION static void enable_if( void (             *)( ArgTag const & , value_type const volatile & ) );
+};
+
+// Compatible functions for 'final' function and value_type is an array
+//
+// Same role as the non-array form, but the value is passed as a pointer.
+// The enable_if overloads are declarations only, intended for use in decltype.
+// Accepted forms: const or non-const member function, or static/free
+// function; tag by value or const reference; pointer to value or to const value.
+template< class FunctorType , class ArgTag >
+struct FunctorFinalFunction< FunctorType , ArgTag , true > {
+
+  typedef typename FunctorValueTraits<FunctorType,ArgTag>::value_type value_type ;
+
+  // Value passed as pointer to non-const.
+  KOKKOS_INLINE_FUNCTION static void enable_if( void (FunctorType::*)( ArgTag         , value_type * ) const );
+  KOKKOS_INLINE_FUNCTION static void enable_if( void (FunctorType::*)( ArgTag const & , value_type * ) const );
+  KOKKOS_INLINE_FUNCTION static void enable_if( void (FunctorType::*)( ArgTag         , value_type * ) );
+  KOKKOS_INLINE_FUNCTION static void enable_if( void (FunctorType::*)( ArgTag const & , value_type * ) );
+  KOKKOS_INLINE_FUNCTION static void enable_if( void (             *)( ArgTag         , value_type * ) );
+  KOKKOS_INLINE_FUNCTION static void enable_if( void (             *)( ArgTag const & , value_type * ) );
+
+  // Pointer-to-volatile signatures are not accepted: their declarations are commented out.
+  // KOKKOS_INLINE_FUNCTION static void enable_if( void (FunctorType::*)( ArgTag         , value_type volatile * ) const );
+  // KOKKOS_INLINE_FUNCTION static void enable_if( void (FunctorType::*)( ArgTag const & , value_type volatile * ) const );
+  // KOKKOS_INLINE_FUNCTION static void enable_if( void (FunctorType::*)( ArgTag         , value_type volatile * ) );
+  // KOKKOS_INLINE_FUNCTION static void enable_if( void (FunctorType::*)( ArgTag const & , value_type volatile * ) );
+  // KOKKOS_INLINE_FUNCTION static void enable_if( void (             *)( ArgTag         , value_type volatile * ) );
+  // KOKKOS_INLINE_FUNCTION static void enable_if( void (             *)( ArgTag const & , value_type volatile * ) );
+
+  // Value passed as pointer to const.
+  KOKKOS_INLINE_FUNCTION static void enable_if( void (FunctorType::*)( ArgTag         , value_type const * ) const );
+  KOKKOS_INLINE_FUNCTION static void enable_if( void (FunctorType::*)( ArgTag const & , value_type const * ) const );
+  KOKKOS_INLINE_FUNCTION static void enable_if( void (FunctorType::*)( ArgTag         , value_type const * ) );
+  KOKKOS_INLINE_FUNCTION static void enable_if( void (FunctorType::*)( ArgTag const & , value_type const * ) );
+  KOKKOS_INLINE_FUNCTION static void enable_if( void (             *)( ArgTag         , value_type const * ) );
+  KOKKOS_INLINE_FUNCTION static void enable_if( void (             *)( ArgTag const & , value_type const * ) );
+
+  // Pointer-to-const-volatile signatures are not accepted: their declarations are commented out.
+  // KOKKOS_INLINE_FUNCTION static void enable_if( void (FunctorType::*)( ArgTag         , value_type const volatile * ) const );
+  // KOKKOS_INLINE_FUNCTION static void enable_if( void (FunctorType::*)( ArgTag const & , value_type const volatile * ) const );
+  // KOKKOS_INLINE_FUNCTION static void enable_if( void (FunctorType::*)( ArgTag         , value_type const volatile * ) );
+  // KOKKOS_INLINE_FUNCTION static void enable_if( void (FunctorType::*)( ArgTag const & , value_type const volatile * ) );
+  // KOKKOS_INLINE_FUNCTION static void enable_if( void (             *)( ArgTag         , value_type const volatile * ) );
+  // KOKKOS_INLINE_FUNCTION static void enable_if( void (             *)( ArgTag const & , value_type const volatile * ) );
+};
+
+// Compatible functions for 'final' function without tag and value_type not an array.
+// The enable_if overloads are declarations only, intended for use in decltype.
+template< class FunctorType >
+struct FunctorFinalFunction< FunctorType , void , false > {
+
+  typedef typename FunctorValueTraits<FunctorType,void>::value_type value_type ;
+
+  // Value passed by non-const reference: const member, non-const member, static/free.
+  KOKKOS_INLINE_FUNCTION static void enable_if( void (FunctorType::*)( value_type & ) const );
+  KOKKOS_INLINE_FUNCTION static void enable_if( void (FunctorType::*)( value_type & ) );
+  KOKKOS_INLINE_FUNCTION static void enable_if( void (             *)( value_type & ) );
+
+  // Value passed by const reference: const member, non-const member, static/free.
+  KOKKOS_INLINE_FUNCTION static void enable_if( void (FunctorType::*)( const value_type & ) const );
+  KOKKOS_INLINE_FUNCTION static void enable_if( void (FunctorType::*)( const value_type & ) );
+  KOKKOS_INLINE_FUNCTION static void enable_if( void (             *)( const value_type & ) );
+};
+
+// Compatible functions for 'final' function without tag and value_type is an array.
+// The enable_if overloads are declarations only, intended for use in decltype.
+template< class FunctorType >
+struct FunctorFinalFunction< FunctorType , void , true > {
+
+  typedef typename FunctorValueTraits<FunctorType,void>::value_type value_type ;
+
+  // Value passed as pointer to non-const: const member, non-const member, static/free.
+  KOKKOS_INLINE_FUNCTION static void enable_if( void (FunctorType::*)( value_type * ) const );
+  KOKKOS_INLINE_FUNCTION static void enable_if( void (FunctorType::*)( value_type * ) );
+  KOKKOS_INLINE_FUNCTION static void enable_if( void (             *)( value_type * ) );
+
+  // Value passed as pointer to const: const member, non-const member, static/free.
+  KOKKOS_INLINE_FUNCTION static void enable_if( void (FunctorType::*)( const value_type * ) const );
+  KOKKOS_INLINE_FUNCTION static void enable_if( void (FunctorType::*)( const value_type * ) );
+  KOKKOS_INLINE_FUNCTION static void enable_if( void (             *)( const value_type * ) );
+};
+
+/* No 'final' function provided */
+// Primary template: used when no specialization detects a compatible
+// FunctorType::final.  ResultType defaults to the functor's reference_type;
+// Enable is the SFINAE slot filled by the 'final provided' specializations.
+template< class FunctorType , class ArgTag
+        , class ResultType = typename FunctorValueTraits<FunctorType,ArgTag>::reference_type
+        , class Enable = void >
+struct FunctorFinal
+{
+  // Intentionally empty: with no 'final' there is nothing to do.
+  KOKKOS_FORCEINLINE_FUNCTION static
+  void final( const FunctorType & , void * ) {}
+};
+
+/* 'final' function provided for single value, no tag */
+template< class FunctorType , class T >
+struct FunctorFinal
+  < FunctorType
+  , void
+  , T &
+    // First  substitution failure when FunctorType::final does not exist.
+    // Second substitution failure when enable_if( & Functor::final ) does not exist
+  , decltype( FunctorFinalFunction< FunctorType , void >::enable_if( & FunctorType::final ) )
+  >
+{
+  // Calls f.final on the value stored at 'p' (const functor).
+  KOKKOS_FORCEINLINE_FUNCTION static
+  void final( const FunctorType & f , void * p ) { f.final( *((T*)p) ); }
+
+  // Calls f.final on the value stored at 'p' (non-const functor).
+  KOKKOS_FORCEINLINE_FUNCTION static
+  void final( FunctorType & f , void * p ) { f.final( *((T*)p) ); }
+};
+
+/* 'final' function provided for single value, with tag.
+ * The tag is forwarded as the first argument so that the call matches the
+ * tagged signatures FunctorFinalFunction< FunctorType , ArgTag > accepts;
+ * calling f.final( value ) without the tag would not select that function.
+ */
+template< class FunctorType , class ArgTag , class T >
+struct FunctorFinal
+  < FunctorType
+  , ArgTag
+  , T &
+    // First  substitution failure when FunctorType::final does not exist.
+    // Second substitution failure when enable_if( & Functor::final ) does not exist
+  , decltype( FunctorFinalFunction< FunctorType , ArgTag >::enable_if( & FunctorType::final ) )
+  >
+{
+  // Calls f.final with a default-constructed tag on the value at 'p' (const functor).
+  KOKKOS_FORCEINLINE_FUNCTION static
+  void final( const FunctorType & f , void * p ) { f.final( ArgTag() , *((T*)p) ); }
+
+  // Calls f.final with a default-constructed tag on the value at 'p' (non-const functor).
+  KOKKOS_FORCEINLINE_FUNCTION static
+  void final( FunctorType & f , void * p ) { f.final( ArgTag() , *((T*)p) ); }
+};
+
+/* 'final' function provided for array value, no tag */
+template< class FunctorType , class T >
+struct FunctorFinal
+  < FunctorType
+  , void
+  , T *
+    // First  substitution failure when FunctorType::final does not exist.
+    // Second substitution failure when enable_if( & Functor::final ) does not exist
+  , decltype( FunctorFinalFunction< FunctorType , void >::enable_if( & FunctorType::final ) )
+  >
+{
+  // Calls f.final on the array stored at 'p' (const functor).
+  KOKKOS_FORCEINLINE_FUNCTION static
+  void final( const FunctorType & f , void * p ) { f.final( (T*)p ); }
+
+  // Calls f.final on the array stored at 'p' (non-const functor).
+  KOKKOS_FORCEINLINE_FUNCTION static
+  void final( FunctorType & f , void * p ) { f.final( (T*)p ); }
+};
+
+/* 'final' function provided for array value, with tag.
+ * The tag is forwarded as the first argument so that the call matches the
+ * tagged signatures FunctorFinalFunction< FunctorType , ArgTag > accepts;
+ * calling f.final( pointer ) without the tag would not select that function.
+ */
+template< class FunctorType , class ArgTag , class T >
+struct FunctorFinal
+  < FunctorType
+  , ArgTag
+  , T *
+    // First  substitution failure when FunctorType::final does not exist.
+    // Second substitution failure when enable_if( & Functor::final ) does not exist
+  , decltype( FunctorFinalFunction< FunctorType , ArgTag >::enable_if( & FunctorType::final ) )
+  >
+{
+  // Calls f.final with a default-constructed tag on the array at 'p' (const functor).
+  KOKKOS_FORCEINLINE_FUNCTION static
+  void final( const FunctorType & f , void * p ) { f.final( ArgTag() , (T*)p ); }
+
+  // Calls f.final with a default-constructed tag on the array at 'p' (non-const functor).
+  KOKKOS_FORCEINLINE_FUNCTION static
+  void final( FunctorType & f , void * p ) { f.final( ArgTag() , (T*)p ); }
+};
+
+} // namespace Impl
+} // namespace Kokkos
+
+//----------------------------------------------------------------------------
+//----------------------------------------------------------------------------
+
+namespace Kokkos {
+namespace Impl {
+
+// Compatible signatures for a tagged FunctorType::apply taking one value.
+// ReferenceType defaults to the functor's reference_type.  The enable_if
+// overloads are declarations only: FunctorApply names them inside
+// decltype( enable_if( & FunctorType::apply ) ) to detect a matching 'apply'.
+template< class FunctorType , class ArgTag
+        , class ReferenceType = typename FunctorValueTraits<FunctorType,ArgTag>::reference_type >
+struct FunctorApplyFunction {
+
+  // Const member, non-const member, and static/free function; tag by value or const reference.
+  KOKKOS_INLINE_FUNCTION static void enable_if( void (FunctorType::*)( ArgTag         , ReferenceType ) const );
+  KOKKOS_INLINE_FUNCTION static void enable_if( void (FunctorType::*)( ArgTag const & , ReferenceType ) const );
+  KOKKOS_INLINE_FUNCTION static void enable_if( void (FunctorType::*)( ArgTag         , ReferenceType ) );
+  KOKKOS_INLINE_FUNCTION static void enable_if( void (FunctorType::*)( ArgTag const & , ReferenceType ) );
+  KOKKOS_INLINE_FUNCTION static void enable_if( void (             *)( ArgTag         , ReferenceType ) );
+  KOKKOS_INLINE_FUNCTION static void enable_if( void (             *)( ArgTag const & , ReferenceType ) );
+};
+
+// Compatible signatures for an untagged FunctorType::apply taking one value.
+// The enable_if overloads are declarations only, intended for use in decltype.
+template< class FunctorType , class ReferenceType >
+struct FunctorApplyFunction< FunctorType , void , ReferenceType > {
+
+  // Const member, non-const member, and static/free function.
+  KOKKOS_INLINE_FUNCTION static void enable_if( void (FunctorType::*)( ReferenceType ) const );
+  KOKKOS_INLINE_FUNCTION static void enable_if( void (FunctorType::*)( ReferenceType ) );
+  KOKKOS_INLINE_FUNCTION static void enable_if( void (             *)( ReferenceType ) );
+};
+
+// Compatible signatures for an untagged FunctorType::apply taking no value.
+// Only member functions are listed; no static/free-function form is declared.
+template< class FunctorType >
+struct FunctorApplyFunction< FunctorType , void , void > {
+
+  // Const member, then non-const member.
+  KOKKOS_INLINE_FUNCTION static void enable_if( void (FunctorType::*)() const );
+  KOKKOS_INLINE_FUNCTION static void enable_if( void (FunctorType::*)() );
+};
+
+// Adapter for an optional FunctorType::apply.
+// Primary template: used when no specialization detects a compatible 'apply'.
+// Enable is the SFINAE slot filled by the 'apply provided' specializations.
+template< class FunctorType , class ArgTag , class ReferenceType
+        , class Enable = void >
+struct FunctorApply
+{
+  // Intentionally empty: with no 'apply' there is nothing to do.
+  KOKKOS_FORCEINLINE_FUNCTION static
+  void apply( const FunctorType & , void * ) {}
+};
+
+/* 'apply' function provided for void value */
+// Note that these overloads take only the functor, whereas the primary
+// template's apply also takes a 'void *' argument.
+// NOTE(review): ArgTag is not passed to f.apply(); for a non-void ArgTag the
+// detection expression uses FunctorApplyFunction< FunctorType , ArgTag , void >
+// - confirm this specialization is only meant to be used with ArgTag = void.
+template< class FunctorType , class ArgTag >
+struct FunctorApply
+  < FunctorType
+  , ArgTag
+  , void
+    // First  substitution failure when FunctorType::apply does not exist.
+    // Second substitution failure when enable_if( & Functor::apply ) does not exist
+  , decltype( FunctorApplyFunction< FunctorType , ArgTag , void >::enable_if( & FunctorType::apply ) )
+  >
+{
+  // Calls f.apply() on a non-const functor.
+  KOKKOS_FORCEINLINE_FUNCTION static
+  void apply( FunctorType & f ) { f.apply(); }
+
+  // Calls f.apply() on a const functor.
+  KOKKOS_FORCEINLINE_FUNCTION static
+  void apply( const FunctorType & f ) { f.apply(); }
+};
+
+/* 'apply' function provided for single value, no tag */
+template< class FunctorType , class T >
+struct FunctorApply
+  < FunctorType
+  , void
+  , T &
+    // First  substitution failure when FunctorType::apply does not exist.
+    // Second substitution failure when enable_if( & Functor::apply ) does not exist
+  , decltype( FunctorApplyFunction< FunctorType , void >::enable_if( & FunctorType::apply ) )
+  >
+{
+  // Calls f.apply on the value stored at 'p' (const functor).
+  KOKKOS_FORCEINLINE_FUNCTION static
+  void apply( const FunctorType & f , void * p ) { f.apply( *((T*)p) ); }
+
+  // Calls f.apply on the value stored at 'p' (non-const functor).
+  KOKKOS_FORCEINLINE_FUNCTION static
+  void apply( FunctorType & f , void * p ) { f.apply( *((T*)p) ); }
+};
+
+/* 'apply' function provided for single value, with tag.
+ * The tag is forwarded as the first argument so that the call matches the
+ * tagged signatures FunctorApplyFunction< FunctorType , ArgTag > accepts;
+ * calling f.apply( value ) without the tag would not select that function.
+ */
+template< class FunctorType , class ArgTag , class T >
+struct FunctorApply
+  < FunctorType
+  , ArgTag
+  , T &
+    // First  substitution failure when FunctorType::apply does not exist.
+    // Second substitution failure when enable_if( & Functor::apply ) does not exist
+  , decltype( FunctorApplyFunction< FunctorType , ArgTag >::enable_if( & FunctorType::apply ) )
+  >
+{
+  // Calls f.apply with a default-constructed tag on the value at 'p' (const functor).
+  KOKKOS_FORCEINLINE_FUNCTION static
+  void apply( const FunctorType & f , void * p ) { f.apply( ArgTag() , *((T*)p) ); }
+
+  // Calls f.apply with a default-constructed tag on the value at 'p' (non-const functor).
+  KOKKOS_FORCEINLINE_FUNCTION static
+  void apply( FunctorType & f , void * p ) { f.apply( ArgTag() , *((T*)p) ); }
+};
+
+} // namespace Impl
+} // namespace Kokkos
+
+//----------------------------------------------------------------------------
+//----------------------------------------------------------------------------
+
+#endif /* KOKKOS_FUNCTORADAPTER_HPP */
+
diff --git a/lib/kokkos/core/src/impl/Kokkos_HBWAllocators.cpp b/lib/kokkos/core/src/impl/Kokkos_HBWAllocators.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..4eb80d03f1fa0c26a2ba9524b16719dcf2a72e99
--- /dev/null
+++ b/lib/kokkos/core/src/impl/Kokkos_HBWAllocators.cpp
@@ -0,0 +1,108 @@
+/*
+//@HEADER
+// ************************************************************************
+// 
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+// 
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+// 
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact  H. Carter Edwards (hcedwar@sandia.gov)
+// 
+// ************************************************************************
+//@HEADER
+*/
+
+#include <Kokkos_HostSpace.hpp>
+
+#include <impl/Kokkos_HBWAllocators.hpp>
+#include <impl/Kokkos_Error.hpp>
+
+
+#include <stdint.h>    // uintptr_t
+#include <cstdlib>     // for malloc, realloc, and free
+#include <cstring>     // for memcpy
+
+#if defined(KOKKOS_POSIX_MEMALIGN_AVAILABLE)
+#include <sys/mman.h>  // for mmap, munmap, MAP_ANON, etc
+#include <unistd.h>    // for sysconf, _SC_PAGE_SIZE, _SC_PHYS_PAGES
+#endif
+
+#include <sstream>
+#include <iostream>
+
+#ifdef KOKKOS_HAVE_HBWSPACE
+#include <memkind.h>
+
+namespace Kokkos {
+namespace Experimental {
+namespace Impl {
+#define MEMKIND_TYPE MEMKIND_HBW //hbw_get_kind(HBW_PAGESIZE_4KB)
+/*--------------------------------------------------------------------------*/
+
+// Allocates 'size' bytes with memkind_malloc using MEMKIND_TYPE.
+// Returns NULL without calling memkind when 'size' is zero.
+// Throws through Kokkos::Impl::throw_runtime_exception when memkind_malloc
+// returns NULL for a non-zero request.
+// This function writes nothing to stdout: an allocator inside a library must
+// not emit a line for every allocation made by the host application.
+void* HBWMallocAllocator::allocate( size_t size )
+{
+  if (!size) {
+    return NULL;
+  }
+
+  void * const ptr = memkind_malloc(MEMKIND_TYPE,size);
+
+  if (!ptr)
+  {
+    std::ostringstream msg ;
+    msg << name() << ": allocate(" << size << ") FAILED";
+    Kokkos::Impl::throw_runtime_exception( msg.str() );
+  }
+  return ptr;
+}
+
+// Releases 'ptr' with memkind_free using MEMKIND_TYPE.
+// A NULL pointer is ignored; the size argument is unused.
+void HBWMallocAllocator::deallocate( void * ptr, size_t /*size*/ )
+{
+  if (!ptr) {
+    return;
+  }
+  memkind_free(MEMKIND_TYPE,ptr);
+}
+
+// Resizes the allocation at 'old_ptr' to 'new_size' bytes with
+// memkind_realloc using MEMKIND_TYPE and returns the resulting pointer.
+// Throws through Kokkos::Impl::throw_runtime_exception when a non-zero
+// request yields NULL.  The old size argument is unused.
+void * HBWMallocAllocator::reallocate(void * old_ptr, size_t /*old_size*/, size_t new_size)
+{
+  void * const new_ptr = memkind_realloc(MEMKIND_TYPE, old_ptr, new_size);
+
+  const bool failed = (new_size > 0u) && (new_ptr == NULL);
+  if (failed) {
+    Kokkos::Impl::throw_runtime_exception("Error: Malloc Allocator could not reallocate memory");
+  }
+  return new_ptr;
+}
+
+} // namespace Impl
+} // namespace Experimental
+} // namespace Kokkos
+#endif
diff --git a/lib/kokkos/core/src/impl/Kokkos_HBWAllocators.hpp b/lib/kokkos/core/src/impl/Kokkos_HBWAllocators.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..be0134460b279f0cbb5f0bc1efda36863c0342ca
--- /dev/null
+++ b/lib/kokkos/core/src/impl/Kokkos_HBWAllocators.hpp
@@ -0,0 +1,75 @@
+/*
+//@HEADER
+// ************************************************************************
+// 
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+// 
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+// 
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact  H. Carter Edwards (hcedwar@sandia.gov)
+// 
+// ************************************************************************
+//@HEADER
+*/
+
+#ifndef KOKKOS_HBW_ALLOCATORS_HPP
+#define KOKKOS_HBW_ALLOCATORS_HPP
+
+#ifdef KOKKOS_HAVE_HBWSPACE
+
+namespace Kokkos {
+namespace Experimental {
+namespace Impl {
+
+/// class HBWMallocAllocator: static allocate / deallocate / reallocate interface (implemented with memkind).
+class HBWMallocAllocator
+{
+public:
+  static const char * name()  // label that allocate() puts in front of its failure message
+  {
+    return "HBW Malloc Allocator";
+  }
+  /// Allocate size bytes; the implementation returns NULL when size is zero and throws on failure.
+  static void* allocate(size_t size);
+  /// Release memory obtained from this allocator; the implementation ignores size and a NULL ptr.
+  static void deallocate(void * ptr, size_t size);
+  /// Resize an allocation to new_size bytes; the implementation ignores old_size.
+  static void * reallocate(void * old_ptr, size_t old_size, size_t new_size);
+};
+
+}
+}
+} // namespace Kokkos::Experimental::Impl
+#endif //KOKKOS_HAVE_HBWSPACE
+#endif //KOKKOS_HBW_ALLOCATORS_HPP
+
+
diff --git a/lib/kokkos/core/src/impl/Kokkos_HBWSpace.cpp b/lib/kokkos/core/src/impl/Kokkos_HBWSpace.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..11cc120212b25804df0afb9f660ff8b165e0f217
--- /dev/null
+++ b/lib/kokkos/core/src/impl/Kokkos_HBWSpace.cpp
@@ -0,0 +1,379 @@
+/*
+//@HEADER
+// ************************************************************************
+// 
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+// 
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+// 
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact  H. Carter Edwards (hcedwar@sandia.gov)
+// 
+// ************************************************************************
+//@HEADER
+*/
+
+
+#include <Kokkos_Macros.hpp>
+
+
+#include <stddef.h>
+#include <stdlib.h>
+#include <stdint.h>
+#include <memory.h>
+
+#include <iostream>
+#include <sstream>
+#include <cstring>
+#include <algorithm>
+
+#include <Kokkos_HBWSpace.hpp>
+#include <impl/Kokkos_Error.hpp>
+#include <Kokkos_Atomic.hpp>
+#ifdef KOKKOS_HAVE_HBWSPACE
+#include <memkind.h>
+#endif
+
+//----------------------------------------------------------------------------
+//----------------------------------------------------------------------------
+#ifdef KOKKOS_HAVE_HBWSPACE
+#define MEMKIND_TYPE MEMKIND_HBW //hbw_get_kind(HBW_PAGESIZE_4KB)
+
+namespace Kokkos {
+namespace Experimental {
+namespace {
+// Capacity of the callback table below.
+static const int QUERY_SPACE_IN_PARALLEL_MAX = 16 ;
+// Zero-argument callback; this file treats a non-zero result as "currently in parallel".
+typedef int (* QuerySpaceInParallelPtr )();
+// Registered callbacks, followed by the number of slots in use.
+QuerySpaceInParallelPtr s_in_parallel_query[ QUERY_SPACE_IN_PARALLEL_MAX ] ;
+int s_in_parallel_query_count = 0 ;
+
+} // namespace <empty>
+
+void HBWSpace::register_in_parallel( int (*device_in_parallel)() )
+{
+  // Adds device_in_parallel to the callbacks polled by in_parallel().
+  // Throws for a NULL callback, when any callback reports a parallel region,
+  // or when a new (not yet registered) callback does not fit in the table.
+  if ( 0 == device_in_parallel ) {
+    Kokkos::Impl::throw_runtime_exception( std::string("Kokkos::Experimental::HBWSpace::register_in_parallel ERROR : given NULL" ) );
+  }
+
+  int i = -1 ; // stays -1 (always < count) when the new callback itself reports "in parallel"
+
+  if ( ! (device_in_parallel)() ) {
+    for ( i = 0 ; i < s_in_parallel_query_count && ! (*(s_in_parallel_query[i]))() ; ++i );
+  }
+
+  if ( i < s_in_parallel_query_count ) {
+    Kokkos::Impl::throw_runtime_exception( std::string("Kokkos::Experimental::HBWSpace::register_in_parallel_query ERROR : called in_parallel" ) );
+  }
+
+  // Search for an existing entry first: re-registering must stay a no-op even when the table is full.
+  for ( i = 0 ; i < s_in_parallel_query_count && s_in_parallel_query[i] != device_in_parallel ; ++i );
+  if ( i == s_in_parallel_query_count ) {
+    if ( QUERY_SPACE_IN_PARALLEL_MAX <= i ) {
+      Kokkos::Impl::throw_runtime_exception( std::string("Kokkos::Experimental::HBWSpace::register_in_parallel_query ERROR : exceeded maximum" ) );
+    }
+    s_in_parallel_query[s_in_parallel_query_count++] = device_in_parallel ;
+  }
+}
+
+int HBWSpace::in_parallel()
+{
+  // Returns 1 as soon as a registered callback reports a parallel region
+  // (later callbacks are then not invoked), 0 when none does.
+  const int count = s_in_parallel_query_count ;
+  for ( int q = 0 ; q < count ; ++q ) {
+    if ( (*(s_in_parallel_query[q]))() ) { return 1 ; }
+  }
+  return 0 ;
+}
+
+} // namespace Experimental
+} // namespace Kokkos
+
+/*--------------------------------------------------------------------------*/
+
+namespace Kokkos {
+namespace Experimental {
+
+/* Default constructor: the only mechanism this space offers is STD_MALLOC. */
+HBWSpace::HBWSpace()
+  : m_alloc_mech( HBWSpace::STD_MALLOC )
+{
+  // Supply a default MEMKIND_HBW_NODES value for memkind; the last argument
+  // (0) keeps any value that is already present in the environment.
+  // The former debug print of "Init" to stdout has been removed.
+  setenv("MEMKIND_HBW_NODES", "1", 0);
+}
+
+/* Construct from a requested mechanism; every request resolves to STD_MALLOC. */
+HBWSpace::HBWSpace( const HBWSpace::AllocationMechanism & arg_alloc_mech )
+  : m_alloc_mech( HBWSpace::STD_MALLOC )
+{
+  // Debug print of "Init2" removed.  setenv's last argument (0) keeps an existing value.
+  setenv("MEMKIND_HBW_NODES", "1", 0);
+  if ( arg_alloc_mech == STD_MALLOC ) {
+    m_alloc_mech = HBWSpace::STD_MALLOC ;
+  }
+}
+
+void * HBWSpace::allocate( const size_t arg_alloc_size ) const
+{
+  static_assert( sizeof(void*) == sizeof(uintptr_t)
+               , "Error sizeof(void*) != sizeof(uintptr_t)" );
+
+  static_assert( Kokkos::Impl::power_of_two< Kokkos::Impl::MEMORY_ALIGNMENT >::value
+               , "Memory alignment must be power of two" );
+  // Purpose: return memkind memory aligned to MEMORY_ALIGNMENT, or report to std::cerr and throw.
+  constexpr uintptr_t alignment = Kokkos::Impl::MEMORY_ALIGNMENT ;
+  constexpr uintptr_t alignment_mask = alignment - 1 ;
+
+  void * ptr = 0 ;
+  // A zero-byte request skips the allocation, leaves ptr NULL and therefore ends in the failure branch.
+  if ( arg_alloc_size ) {
+
+    if ( m_alloc_mech == STD_MALLOC ) {
+      // Over-allocate and round up to guarantee proper alignment.
+      size_t size_padded = arg_alloc_size + sizeof(void*) + alignment ;
+
+      void * alloc_ptr = memkind_malloc(MEMKIND_TYPE, size_padded );
+
+      if (alloc_ptr) {
+        uintptr_t address = reinterpret_cast<uintptr_t>(alloc_ptr);
+        // Resulting layout: [ raw block ... | saved alloc_ptr | aligned user memory ... ]
+        // offset enough to record the alloc_ptr
+        address += sizeof(void *);
+        uintptr_t rem = address % alignment;
+        uintptr_t offset = rem ? (alignment - rem) : 0u;
+        address += offset;
+        ptr = reinterpret_cast<void *>(address);
+        // record the alloc'd pointer
+        address -= sizeof(void *);
+        *reinterpret_cast<void **>(address) = alloc_ptr;
+      }
+    }
+  }
+  // Fail on NULL, on the all-ones address, and on any address that is not aligned.
+  if ( ( ptr == 0 ) || ( reinterpret_cast<uintptr_t>(ptr) == ~uintptr_t(0) )
+       || ( reinterpret_cast<uintptr_t>(ptr) & alignment_mask ) ) {
+    std::ostringstream msg ;
+    msg << "Kokkos::Experimental::HBWSpace::allocate[ " ;
+    switch( m_alloc_mech ) {
+    case STD_MALLOC: msg << "STD_MALLOC" ; break ;
+    }
+    msg << " ]( " << arg_alloc_size << " ) FAILED" ;
+    if ( ptr == NULL ) { msg << " NULL" ; } 
+    else { msg << " NOT ALIGNED " << ptr ; }
+
+    std::cerr << msg.str() << std::endl ;
+    std::cerr.flush();
+
+    Kokkos::Impl::throw_runtime_exception( msg.str() );
+  }
+
+  return ptr;
+}
+
+
+void HBWSpace::deallocate( void * const arg_alloc_ptr , const size_t arg_alloc_size ) const
+{
+  // NULL pointers and mechanisms other than STD_MALLOC are ignored; arg_alloc_size is unused.
+  if ( ! arg_alloc_ptr || m_alloc_mech != STD_MALLOC ) { return ; }
+
+  // allocate() stored the raw memkind pointer in the word directly below the
+  // aligned address it returned; read it back and hand it to memkind.
+  void ** const slot = reinterpret_cast<void **>(arg_alloc_ptr) - 1 ;
+  void * const raw_ptr = *slot ;
+  memkind_free(MEMKIND_TYPE, raw_ptr );
+}
+
+} // namespace Experimental
+} // namespace Kokkos
+
+//----------------------------------------------------------------------------
+//----------------------------------------------------------------------------
+
+namespace Kokkos {
+namespace Experimental {
+namespace Impl {
+
+SharedAllocationRecord< void , void >  // root record given to each HBWSpace record's base constructor and to print_records()
+SharedAllocationRecord< Kokkos::Experimental::HBWSpace , void >::s_root_record ;
+
+void SharedAllocationRecord< Kokkos::Experimental::HBWSpace , void >::
+deallocate( SharedAllocationRecord< void , void > * arg_rec )
+{
+  SharedAllocationRecord * const hbw_rec = static_cast<SharedAllocationRecord*>(arg_rec);
+  delete hbw_rec ; // runs ~SharedAllocationRecord(), which hands the memory back to m_space
+}
+
+SharedAllocationRecord< Kokkos::Experimental::HBWSpace , void >::
+~SharedAllocationRecord()
+{
+  // Return the block held by the base record (pointer and byte count) to the owning space.
+  typedef SharedAllocationRecord< void , void > base_type ;
+  m_space.deallocate( base_type::m_alloc_ptr , base_type::m_alloc_size );
+}
+
+SharedAllocationRecord< Kokkos::Experimental::HBWSpace , void >::
+SharedAllocationRecord( const Kokkos::Experimental::HBWSpace & arg_space
+                      , const std::string       & arg_label
+                      , const size_t              arg_alloc_size
+                      , const SharedAllocationRecord< void , void >::function_type arg_dealloc
+                      )
+  // Allocates [ SharedAllocationHeader , arg_alloc_size bytes of user memory ]
+  // from arg_space and passes that block plus arg_dealloc to the base record.
+  : SharedAllocationRecord< void , void >
+      ( & SharedAllocationRecord< Kokkos::Experimental::HBWSpace , void >::s_root_record
+      , reinterpret_cast<SharedAllocationHeader*>( arg_space.allocate( sizeof(SharedAllocationHeader) + arg_alloc_size ) )
+      , sizeof(SharedAllocationHeader) + arg_alloc_size
+      , arg_dealloc
+      )
+  , m_space( arg_space )
+{
+  // Fill in the Header information
+  RecordBase::m_alloc_ptr->m_record = static_cast< SharedAllocationRecord< void , void > * >( this );
+
+  // strncpy writes no terminator when arg_label has maximum_label_length or more
+  // characters, so force one (assumes m_label holds maximum_label_length chars).
+  strncpy( RecordBase::m_alloc_ptr->m_label , arg_label.c_str() , SharedAllocationHeader::maximum_label_length );
+  RecordBase::m_alloc_ptr->m_label[ SharedAllocationHeader::maximum_label_length - 1 ] = (char) 0 ;
+}
+
+//----------------------------------------------------------------------------
+
+void * SharedAllocationRecord< Kokkos::Experimental::HBWSpace , void >::
+allocate_tracked( const Kokkos::Experimental::HBWSpace & arg_space
+                , const std::string & arg_alloc_label
+                , const size_t arg_alloc_size )
+{
+  // A zero-byte request yields a NULL pointer and creates no record.
+  if ( 0 == arg_alloc_size ) { return (void *) 0 ; }
+
+  // Create the record, take a reference on it, and return its data() pointer.
+  SharedAllocationRecord * const record = allocate( arg_space , arg_alloc_label , arg_alloc_size );
+  RecordBase::increment( record );
+  void * const user_ptr = record->data();
+  return user_ptr ;
+}
+
+void SharedAllocationRecord< Kokkos::Experimental::HBWSpace , void >::
+deallocate_tracked( void * const arg_alloc_ptr )
+{
+  // NULL is accepted and ignored.
+  if ( 0 == arg_alloc_ptr ) { return ; }
+  // Look up the record that owns this pointer and release one reference on it.
+  SharedAllocationRecord * const record = get_record( arg_alloc_ptr );
+  RecordBase::decrement( record );
+}
+
+void * SharedAllocationRecord< Kokkos::Experimental::HBWSpace , void >::
+reallocate_tracked( void * const arg_alloc_ptr
+                  , const size_t arg_alloc_size )
+{
+  // Allocate a replacement record with the old label in the old space, copy as
+  // many bytes as both records hold, then move the reference from old to new.
+  SharedAllocationRecord * const old_rec = get_record( arg_alloc_ptr );
+  SharedAllocationRecord * const new_rec = allocate( old_rec->m_space , old_rec->get_label() , arg_alloc_size );
+
+  const auto copy_bytes = std::min( old_rec->size() , new_rec->size() );
+  Kokkos::Impl::DeepCopy<HBWSpace,HBWSpace>( new_rec->data() , old_rec->data() , copy_bytes );
+  RecordBase::increment( new_rec );
+  RecordBase::decrement( old_rec );
+  return new_rec->data();
+}
+
+SharedAllocationRecord< Kokkos::Experimental::HBWSpace , void > *
+SharedAllocationRecord< Kokkos::Experimental::HBWSpace , void >::get_record( void * alloc_ptr )
+{
+  typedef SharedAllocationHeader  Header ;
+  typedef SharedAllocationRecord< Kokkos::Experimental::HBWSpace , void >  RecordHost ;
+  // Maps a user pointer to its record via the header in front of it; throws if the two do not match.
+  SharedAllocationHeader const * const head   = alloc_ptr ? Header::get_header( alloc_ptr ) : (SharedAllocationHeader *)0 ;
+  RecordHost                   * const record = head ? static_cast< RecordHost * >( head->m_record ) : (RecordHost *) 0 ;
+  // A header whose m_record is NULL must be rejected before record is dereferenced.
+  if ( ! alloc_ptr || ! record || record->m_alloc_ptr != head ) {
+    Kokkos::Impl::throw_runtime_exception( std::string("Kokkos::Experimental::Impl::SharedAllocationRecord< Kokkos::Experimental::HBWSpace , void >::get_record ERROR" ) );
+  }
+
+  return record ;
+}
+
+// Forwards to print_host_accessible_records() with the name "HBWSpace", this space's root record and detail; space is unused.
+void SharedAllocationRecord< Kokkos::Experimental::HBWSpace , void >::
+print_records( std::ostream & s , const Kokkos::Experimental::HBWSpace & space , bool detail )
+{
+  SharedAllocationRecord< void , void >::print_host_accessible_records( s , "HBWSpace" , & s_root_record , detail );
+}
+
+} // namespace Impl
+} // namespace Experimental
+} // namespace Kokkos
+
+/*--------------------------------------------------------------------------*/
+/*--------------------------------------------------------------------------*/
+
+namespace Kokkos {
+namespace Experimental {
+namespace {
+  const unsigned HBW_SPACE_ATOMIC_MASK = 0xFFFF;  // keeps 16 bits of the shifted address; the table has MASK+1 entries
+  const unsigned HBW_SPACE_ATOMIC_XOR_MASK = 0x5A39;  // XOR-ed into the masked index by lock/unlock
+  static int HBW_SPACE_ATOMIC_LOCKS[HBW_SPACE_ATOMIC_MASK+1];  // lock_address writes 1 over 0, unlock_address writes 0
+}
+
+namespace Impl {
+void init_lock_array_hbw_space() {
+  static int is_initialized = 0;
+  if ( is_initialized ) return ;  // clear the table once only: a later call must not wipe held locks
+  for(int i = 0; i < static_cast<int> (HBW_SPACE_ATOMIC_MASK+1); i++)
+    HBW_SPACE_ATOMIC_LOCKS[i] = 0;
+  is_initialized = 1;  // previously never set, so every call re-zeroed the whole table
+}
+
+bool lock_address_hbw_space(void* ptr) {
+  // Map the address to a table slot; true when the compare-exchange (0 -> 1) reports a previous value of 0.
+  const size_t slot = (( size_t(ptr) >> 2 ) & HBW_SPACE_ATOMIC_MASK) ^ HBW_SPACE_ATOMIC_XOR_MASK ;
+  return 0 == atomic_compare_exchange( &HBW_SPACE_ATOMIC_LOCKS[slot] , 0 , 1);
+}
+
+void unlock_address_hbw_space(void* ptr) {
+  // Same slot computation as lock_address_hbw_space(); the slot is overwritten with 0.
+  const size_t slot = (( size_t(ptr) >> 2 ) & HBW_SPACE_ATOMIC_MASK) ^ HBW_SPACE_ATOMIC_XOR_MASK ;
+  atomic_exchange( &HBW_SPACE_ATOMIC_LOCKS[slot] , 0);
+}
+
+}
+}
+}
+#endif
diff --git a/lib/kokkos/core/src/impl/Kokkos_HostSpace.cpp b/lib/kokkos/core/src/impl/Kokkos_HostSpace.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..b52f4591ef0b8c0b71445f6e33b4d913822e5446
--- /dev/null
+++ b/lib/kokkos/core/src/impl/Kokkos_HostSpace.cpp
@@ -0,0 +1,537 @@
+/*
+//@HEADER
+// ************************************************************************
+// 
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+// 
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+// 
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact  H. Carter Edwards (hcedwar@sandia.gov)
+// 
+// ************************************************************************
+//@HEADER
+*/
+
+#include <algorithm>
+#include <Kokkos_Macros.hpp>
+
+/*--------------------------------------------------------------------------*/
+
+#if defined( __INTEL_COMPILER ) && ! defined ( KOKKOS_HAVE_CUDA )
+
+// Intel specialized allocator does not interoperate with CUDA memory allocation
+
+#define KOKKOS_INTEL_MM_ALLOC_AVAILABLE
+
+#endif
+
+/*--------------------------------------------------------------------------*/
+
+#if defined(KOKKOS_POSIX_MEMALIGN_AVAILABLE)
+
+#include <unistd.h>
+#include <sys/mman.h>
+
+/* mmap flags for private anonymous memory allocation */
+
+#if defined( MAP_ANONYMOUS ) && defined( MAP_PRIVATE )
+  #define KOKKOS_POSIX_MMAP_FLAGS (MAP_PRIVATE | MAP_ANONYMOUS)
+#elif defined( MAP_ANON ) && defined( MAP_PRIVATE )
+  #define KOKKOS_POSIX_MMAP_FLAGS (MAP_PRIVATE | MAP_ANON)
+#endif
+
+// mmap flags for huge page tables
+// the Cuda driver does not interoperate with MAP_HUGETLB
+#if defined( KOKKOS_POSIX_MMAP_FLAGS )
+  #if defined( MAP_HUGETLB ) && ! defined( KOKKOS_HAVE_CUDA )
+    #define KOKKOS_POSIX_MMAP_FLAGS_HUGE (KOKKOS_POSIX_MMAP_FLAGS | MAP_HUGETLB )
+  #else
+    #define KOKKOS_POSIX_MMAP_FLAGS_HUGE KOKKOS_POSIX_MMAP_FLAGS
+  #endif
+#endif
+
+#endif
+
+/*--------------------------------------------------------------------------*/
+
+#include <stddef.h>
+#include <stdlib.h>
+#include <stdint.h>
+#include <memory.h>
+
+#include <iostream>
+#include <sstream>
+#include <cstring>
+
+#include <Kokkos_HostSpace.hpp>
+#include <impl/Kokkos_Error.hpp>
+#include <Kokkos_Atomic.hpp>
+
+//----------------------------------------------------------------------------
+//----------------------------------------------------------------------------
+
+namespace Kokkos {
+namespace {
+// Capacity of the callback table below.
+static const int QUERY_SPACE_IN_PARALLEL_MAX = 16 ;
+// Zero-argument callback; this file treats a non-zero result as "currently in parallel".
+typedef int (* QuerySpaceInParallelPtr )();
+// Registered callbacks, followed by the number of slots in use.
+QuerySpaceInParallelPtr s_in_parallel_query[ QUERY_SPACE_IN_PARALLEL_MAX ] ;
+int s_in_parallel_query_count = 0 ;
+
+} // namespace <empty>
+
+void HostSpace::register_in_parallel( int (*device_in_parallel)() )
+{
+  // Adds device_in_parallel to the callbacks polled by in_parallel().
+  // Throws for a NULL callback, when any callback reports a parallel region,
+  // or when a new (not yet registered) callback does not fit in the table.
+  if ( 0 == device_in_parallel ) {
+    Kokkos::Impl::throw_runtime_exception( std::string("Kokkos::HostSpace::register_in_parallel ERROR : given NULL" ) );
+  }
+
+  int i = -1 ; // stays -1 (always < count) when the new callback itself reports "in parallel"
+
+  if ( ! (device_in_parallel)() ) {
+    for ( i = 0 ; i < s_in_parallel_query_count && ! (*(s_in_parallel_query[i]))() ; ++i );
+  }
+
+  if ( i < s_in_parallel_query_count ) {
+    Kokkos::Impl::throw_runtime_exception( std::string("Kokkos::HostSpace::register_in_parallel_query ERROR : called in_parallel" ) );
+  }
+
+  // Search for an existing entry first: re-registering must stay a no-op even when the table is full.
+  for ( i = 0 ; i < s_in_parallel_query_count && s_in_parallel_query[i] != device_in_parallel ; ++i );
+  if ( i == s_in_parallel_query_count ) {
+    if ( QUERY_SPACE_IN_PARALLEL_MAX <= i ) {
+      Kokkos::Impl::throw_runtime_exception( std::string("Kokkos::HostSpace::register_in_parallel_query ERROR : exceeded maximum" ) );
+    }
+    s_in_parallel_query[s_in_parallel_query_count++] = device_in_parallel ;
+  }
+}
+
+int HostSpace::in_parallel()
+{
+  // Returns 1 as soon as a registered callback reports a parallel region
+  // (later callbacks are then not invoked), 0 when none does.
+  const int count = s_in_parallel_query_count ;
+  for ( int q = 0 ; q < count ; ++q ) {
+    if ( (*(s_in_parallel_query[q]))() ) { return 1 ; }
+  }
+  return 0 ;
+}
+
+} // namespace Kokkos
+
+/*--------------------------------------------------------------------------*/
+
+namespace Kokkos {
+
+/* Default constructor: picks the first mechanism compiled in, in the order INTEL_MM_ALLOC, POSIX_MMAP, POSIX_MEMALIGN, STD_MALLOC. */
+HostSpace::HostSpace()
+  : m_alloc_mech(
+#if defined( KOKKOS_INTEL_MM_ALLOC_AVAILABLE )
+      HostSpace::INTEL_MM_ALLOC
+#elif defined( KOKKOS_POSIX_MMAP_FLAGS )
+      HostSpace::POSIX_MMAP
+#elif defined( KOKKOS_POSIX_MEMALIGN_AVAILABLE )
+      HostSpace::POSIX_MEMALIGN
+#else
+      HostSpace::STD_MALLOC
+#endif
+    )
+{}
+
+/* Construct with an explicitly requested mechanism; throws when that mechanism was not compiled in. */
+HostSpace::HostSpace( const HostSpace::AllocationMechanism & arg_alloc_mech )
+  : m_alloc_mech( HostSpace::STD_MALLOC )
+{
+  if ( arg_alloc_mech == STD_MALLOC ) {
+    m_alloc_mech = HostSpace::STD_MALLOC ;
+  }
+#if defined( KOKKOS_INTEL_MM_ALLOC_AVAILABLE )
+  else if ( arg_alloc_mech == HostSpace::INTEL_MM_ALLOC ) {
+    m_alloc_mech = HostSpace::INTEL_MM_ALLOC ;
+  }
+#endif
+#if defined( KOKKOS_POSIX_MEMALIGN_AVAILABLE ) // independent guards, as in allocate(): an #elif chain hid POSIX_MMAP
+  else if ( arg_alloc_mech == HostSpace::POSIX_MEMALIGN ) {
+    m_alloc_mech = HostSpace::POSIX_MEMALIGN ;
+  }
+#endif
+#if defined( KOKKOS_POSIX_MMAP_FLAGS )
+  else if ( arg_alloc_mech == HostSpace::POSIX_MMAP ) {
+    m_alloc_mech = HostSpace::POSIX_MMAP ;
+  }
+#endif
+  else {
+    const char * const mech =
+      ( arg_alloc_mech == HostSpace::INTEL_MM_ALLOC ) ? "INTEL_MM_ALLOC" : (
+      ( arg_alloc_mech == HostSpace::POSIX_MEMALIGN ) ? "POSIX_MEMALIGN" : (
+      ( arg_alloc_mech == HostSpace::POSIX_MMAP     ) ? "POSIX_MMAP" : "" ));
+    std::string msg("Kokkos::HostSpace ");
+    msg.append(mech);
+    msg.append(" is not available" );
+    Kokkos::Impl::throw_runtime_exception( msg );
+  }
+}
+
+void * HostSpace::allocate( const size_t arg_alloc_size ) const
+{
+  static_assert( sizeof(void*) == sizeof(uintptr_t)
+               , "Error sizeof(void*) != sizeof(uintptr_t)" );
+
+  static_assert( Kokkos::Impl::is_integral_power_of_two( Kokkos::Impl::MEMORY_ALIGNMENT )
+               , "Memory alignment must be power of two" );
+  // Purpose: return host memory aligned to MEMORY_ALIGNMENT using m_alloc_mech, or report to std::cerr and throw.
+  constexpr uintptr_t alignment = Kokkos::Impl::MEMORY_ALIGNMENT ;
+  constexpr uintptr_t alignment_mask = alignment - 1 ;
+
+  void * ptr = 0 ;
+  // A zero-byte request skips every mechanism, leaves ptr NULL and therefore ends in the failure branch.
+  if ( arg_alloc_size ) {
+
+    if ( m_alloc_mech == STD_MALLOC ) {
+      // Over-allocate and round up to guarantee proper alignment.
+      size_t size_padded = arg_alloc_size + sizeof(void*) + alignment ;
+
+      void * alloc_ptr = malloc( size_padded );
+
+      if (alloc_ptr) {
+        uintptr_t address = reinterpret_cast<uintptr_t>(alloc_ptr);
+        // Resulting layout: [ raw block ... | saved alloc_ptr | aligned user memory ... ]
+        // offset enough to record the alloc_ptr
+        address += sizeof(void *);
+        uintptr_t rem = address % alignment;
+        uintptr_t offset = rem ? (alignment - rem) : 0u;
+        address += offset;
+        ptr = reinterpret_cast<void *>(address);
+        // record the alloc'd pointer
+        address -= sizeof(void *);
+        *reinterpret_cast<void **>(address) = alloc_ptr;
+      }
+    }
+
+#if defined( KOKKOS_INTEL_MM_ALLOC_AVAILABLE )
+    else if ( m_alloc_mech == INTEL_MM_ALLOC ) {
+      ptr = _mm_malloc( arg_alloc_size , alignment );
+    }
+#endif
+
+#if defined( KOKKOS_POSIX_MEMALIGN_AVAILABLE )
+    else if ( m_alloc_mech == POSIX_MEMALIGN ) {
+      if ( 0 != posix_memalign( & ptr, alignment , arg_alloc_size ) ) { ptr = 0 ; } // do not trust ptr after a failure
+    }
+#endif
+
+#if defined( KOKKOS_POSIX_MMAP_FLAGS )
+    else if ( m_alloc_mech == POSIX_MMAP ) {
+      constexpr size_t use_huge_pages = (1u << 27);
+      constexpr int    prot  = PROT_READ | PROT_WRITE ;
+      const int flags = arg_alloc_size < use_huge_pages
+                      ? KOKKOS_POSIX_MMAP_FLAGS
+                      : KOKKOS_POSIX_MMAP_FLAGS_HUGE ;
+      // Requests of at least 2^27 bytes use the huge-page flag set.
+      // read write access to private memory
+
+      ptr = mmap( NULL /* address hint, if NULL OS kernel chooses address */
+                , arg_alloc_size /* size in bytes */
+                , prot           /* memory protection */
+                , flags          /* visibility of updates */
+                , -1             /* file descriptor */
+                ,  0             /* offset */
+                );
+
+/* Associated reallocation:
+       ptr = mremap( old_ptr , old_size , new_size , MREMAP_MAYMOVE );
+*/
+    }
+#endif
+  }
+  // Fail on NULL, on the all-ones address (the usual value of mmap's MAP_FAILED), and on misaligned addresses.
+  if ( ( ptr == 0 ) || ( reinterpret_cast<uintptr_t>(ptr) == ~uintptr_t(0) )
+       || ( reinterpret_cast<uintptr_t>(ptr) & alignment_mask ) ) {
+    std::ostringstream msg ;
+    msg << "Kokkos::HostSpace::allocate[ " ;
+    switch( m_alloc_mech ) {
+    case STD_MALLOC: msg << "STD_MALLOC" ; break ;
+    case POSIX_MEMALIGN: msg << "POSIX_MEMALIGN" ; break ;
+    case POSIX_MMAP: msg << "POSIX_MMAP" ; break ;
+    case INTEL_MM_ALLOC: msg << "INTEL_MM_ALLOC" ; break ;
+    }
+    msg << " ]( " << arg_alloc_size << " ) FAILED" ;
+    if ( ptr == NULL ) { msg << " NULL" ; }
+    else { msg << " NOT ALIGNED " << ptr ; }
+
+    std::cerr << msg.str() << std::endl ;
+    std::cerr.flush();
+
+    Kokkos::Impl::throw_runtime_exception( msg.str() );
+  }
+
+  return ptr;
+}
+
+
+void HostSpace::deallocate( void * const arg_alloc_ptr , const size_t arg_alloc_size ) const
+{
+  if ( arg_alloc_ptr ) {
+    // Release with the counterpart of whatever m_alloc_mech used in allocate(); NULL is ignored.
+    if ( m_alloc_mech == STD_MALLOC ) {
+      void * alloc_ptr = *(reinterpret_cast<void **>(arg_alloc_ptr) -1); // raw malloc pointer stored just below the aligned address
+      free( alloc_ptr );
+    }    
+
+#if defined( KOKKOS_INTEL_MM_ALLOC_AVAILABLE )
+    else if ( m_alloc_mech == INTEL_MM_ALLOC ) {
+      _mm_free( arg_alloc_ptr );
+    }
+#endif
+
+#if defined( KOKKOS_POSIX_MEMALIGN_AVAILABLE )
+    else if ( m_alloc_mech == POSIX_MEMALIGN ) {
+      free( arg_alloc_ptr );
+    }
+#endif
+
+#if defined( KOKKOS_POSIX_MMAP_FLAGS )
+    else if ( m_alloc_mech == POSIX_MMAP ) {
+      munmap( arg_alloc_ptr , arg_alloc_size ); // the only branch that uses arg_alloc_size
+    }
+#endif
+
+  }
+}
+
+} // namespace Kokkos
+
+//----------------------------------------------------------------------------
+//----------------------------------------------------------------------------
+
+namespace Kokkos {
+namespace Experimental {
+namespace Impl {
+
+SharedAllocationRecord< void , void >  // root record given to each HostSpace record's base constructor and to print_records()
+SharedAllocationRecord< Kokkos::HostSpace , void >::s_root_record ;
+
+void SharedAllocationRecord< Kokkos::HostSpace , void >::
+deallocate( SharedAllocationRecord< void , void > * arg_rec )
+{
+  SharedAllocationRecord * const host_rec = static_cast<SharedAllocationRecord*>(arg_rec);
+  delete host_rec ; // runs ~SharedAllocationRecord(), which hands the memory back to m_space
+}
+
+SharedAllocationRecord< Kokkos::HostSpace , void >::
+~SharedAllocationRecord()
+{
+  // Return the block held by the base record (pointer and byte count) to the owning space.
+  typedef SharedAllocationRecord< void , void > base_type ;
+  m_space.deallocate( base_type::m_alloc_ptr , base_type::m_alloc_size );
+}
+
+SharedAllocationRecord< Kokkos::HostSpace , void >::
+SharedAllocationRecord( const Kokkos::HostSpace & arg_space
+                      , const std::string       & arg_label
+                      , const size_t              arg_alloc_size
+                      , const SharedAllocationRecord< void , void >::function_type arg_dealloc
+                      )
+  // Allocates [ SharedAllocationHeader , arg_alloc_size bytes of user memory ]
+  // from arg_space and passes that block plus arg_dealloc to the base record.
+  : SharedAllocationRecord< void , void >
+      ( & SharedAllocationRecord< Kokkos::HostSpace , void >::s_root_record
+      , reinterpret_cast<SharedAllocationHeader*>( arg_space.allocate( sizeof(SharedAllocationHeader) + arg_alloc_size ) )
+      , sizeof(SharedAllocationHeader) + arg_alloc_size
+      , arg_dealloc
+      )
+  , m_space( arg_space )
+{
+  // Fill in the Header information
+  RecordBase::m_alloc_ptr->m_record = static_cast< SharedAllocationRecord< void , void > * >( this );
+
+  // strncpy writes no terminator when arg_label has maximum_label_length or more
+  // characters, so force one (assumes m_label holds maximum_label_length chars).
+  strncpy( RecordBase::m_alloc_ptr->m_label , arg_label.c_str() , SharedAllocationHeader::maximum_label_length );
+  RecordBase::m_alloc_ptr->m_label[ SharedAllocationHeader::maximum_label_length - 1 ] = (char) 0 ;
+}
+
+//----------------------------------------------------------------------------
+
+void * SharedAllocationRecord< Kokkos::HostSpace , void >::
+allocate_tracked( const Kokkos::HostSpace & arg_space
+                , const std::string & arg_alloc_label
+                , const size_t arg_alloc_size )
+{
+  // A zero-byte request yields a NULL pointer and creates no record.
+  if ( 0 == arg_alloc_size ) { return (void *) 0 ; }
+
+  // Create the record, take a reference on it, and return its data() pointer.
+  SharedAllocationRecord * const record = allocate( arg_space , arg_alloc_label , arg_alloc_size );
+  RecordBase::increment( record );
+  void * const user_ptr = record->data();
+  return user_ptr ;
+}
+
+void SharedAllocationRecord< Kokkos::HostSpace , void >::
+deallocate_tracked( void * const arg_alloc_ptr )
+{
+  // NULL is accepted and ignored.
+  if ( 0 == arg_alloc_ptr ) { return ; }
+  // Look up the record that owns this pointer and release one reference on it.
+  SharedAllocationRecord * const record = get_record( arg_alloc_ptr );
+  RecordBase::decrement( record );
+}
+
+void * SharedAllocationRecord< Kokkos::HostSpace , void >::
+reallocate_tracked( void * const arg_alloc_ptr
+                  , const size_t arg_alloc_size )
+{
+  // Allocate a replacement record with the old label in the old space, copy as
+  // many bytes as both records hold, then move the reference from old to new.
+  SharedAllocationRecord * const old_rec = get_record( arg_alloc_ptr );
+  SharedAllocationRecord * const new_rec = allocate( old_rec->m_space , old_rec->get_label() , arg_alloc_size );
+
+  const auto copy_bytes = std::min( old_rec->size() , new_rec->size() );
+  Kokkos::Impl::DeepCopy<HostSpace,HostSpace>( new_rec->data() , old_rec->data() , copy_bytes );
+  RecordBase::increment( new_rec );
+  RecordBase::decrement( old_rec );
+  return new_rec->data();
+}
+
+SharedAllocationRecord< Kokkos::HostSpace , void > *
+SharedAllocationRecord< Kokkos::HostSpace , void >::get_record( void * alloc_ptr )
+{
+  typedef SharedAllocationHeader  Header ;
+  typedef SharedAllocationRecord< Kokkos::HostSpace , void >  RecordHost ;
+  // Maps a user pointer to its record via the header in front of it; throws if the two do not match.
+  SharedAllocationHeader const * const head   = alloc_ptr ? Header::get_header( alloc_ptr ) : (SharedAllocationHeader *)0 ;
+  RecordHost                   * const record = head ? static_cast< RecordHost * >( head->m_record ) : (RecordHost *) 0 ;
+  // A header whose m_record is NULL must be rejected before record is dereferenced.
+  if ( ! alloc_ptr || ! record || record->m_alloc_ptr != head ) {
+    Kokkos::Impl::throw_runtime_exception( std::string("Kokkos::Experimental::Impl::SharedAllocationRecord< Kokkos::HostSpace , void >::get_record ERROR" ) );
+  }
+
+  return record ;
+}
+
+// Forwards to print_host_accessible_records() with the name "HostSpace", this space's root record and detail; space is unused.
+void SharedAllocationRecord< Kokkos::HostSpace , void >::
+print_records( std::ostream & s , const Kokkos::HostSpace & space , bool detail )
+{
+  SharedAllocationRecord< void , void >::print_host_accessible_records( s , "HostSpace" , & s_root_record , detail );
+}
+
+} // namespace Impl
+} // namespace Experimental
+} // namespace Kokkos
+
+/*--------------------------------------------------------------------------*/
+/*--------------------------------------------------------------------------*/
+
+namespace Kokkos {
+namespace Experimental {
+namespace Impl {
+
+template< class >
+struct ViewOperatorBoundsErrorAbort ;  // primary template: declared only, no definition here
+// HostSpace specialization: apply() takes a rank, eight dimensions (n0..n7) and eight indices (i0..i7).
+template<>
+struct ViewOperatorBoundsErrorAbort< Kokkos::HostSpace > {
+ static void apply( const size_t rank
+                  , const size_t n0 , const size_t n1
+                  , const size_t n2 , const size_t n3
+                  , const size_t n4 , const size_t n5
+                  , const size_t n6 , const size_t n7
+                  , const size_t i0 , const size_t i1
+                  , const size_t i2 , const size_t i3
+                  , const size_t i4 , const size_t i5
+                  , const size_t i6 , const size_t i7 );
+};
+
+void ViewOperatorBoundsErrorAbort< Kokkos::HostSpace >::
+apply( const size_t rank
+     , const size_t n0 , const size_t n1
+     , const size_t n2 , const size_t n3
+     , const size_t n4 , const size_t n5
+     , const size_t n6 , const size_t n7
+     , const size_t i0 , const size_t i1
+     , const size_t i2 , const size_t i3
+     , const size_t i4 , const size_t i5
+     , const size_t i6 , const size_t i7 )
+{
+  // Throws a runtime exception naming the rank, the eight dimensions (n0..n7)
+  // and the eight indices (i0..i7).  The text is streamed rather than formatted
+  // with "%lu", which is only correct where size_t happens to be unsigned long.
+  std::ostringstream msg ;
+  msg << "View operator bounds error : rank(" << rank << ") dim("
+      << n0 << ',' << n1 << ',' << n2 << ',' << n3 << ',' << n4 << ',' << n5 << ',' << n6 << ',' << n7 << ") index("
+      << i0 << ',' << i1 << ',' << i2 << ',' << i3 << ',' << i4 << ',' << i5 << ',' << i6 << ',' << i7 << ')' ;
+  Kokkos::Impl::throw_runtime_exception( msg.str() );
+}
+
+} // namespace Impl
+} // namespace Experimental
+} // namespace Kokkos
+
+/*--------------------------------------------------------------------------*/
+/*--------------------------------------------------------------------------*/
+
+namespace Kokkos {
+namespace {
+  const unsigned HOST_SPACE_ATOMIC_MASK = 0xFFFF;  // keeps 16 bits of the shifted address; the table has MASK+1 entries
+  const unsigned HOST_SPACE_ATOMIC_XOR_MASK = 0x5A39;  // XOR-ed into the masked index by lock/unlock
+  static int HOST_SPACE_ATOMIC_LOCKS[HOST_SPACE_ATOMIC_MASK+1];  // lock_address writes 1 over 0, unlock_address writes 0
+}
+
+namespace Impl {
+void init_lock_array_host_space() {
+  static int is_initialized = 0;
+  if ( is_initialized ) return ;  // clear the table once only: a later call must not wipe held locks
+  for(int i = 0; i < static_cast<int> (HOST_SPACE_ATOMIC_MASK+1); i++)
+    HOST_SPACE_ATOMIC_LOCKS[i] = 0;
+  is_initialized = 1;  // previously never set, so every call re-zeroed the whole table
+}
+
+bool lock_address_host_space(void* ptr) {
+  // Map the address to a table slot; true when the compare-exchange (0 -> 1) reports a previous value of 0.
+  const size_t slot = (( size_t(ptr) >> 2 ) & HOST_SPACE_ATOMIC_MASK) ^ HOST_SPACE_ATOMIC_XOR_MASK ;
+  return 0 == atomic_compare_exchange( &HOST_SPACE_ATOMIC_LOCKS[slot] , 0 , 1);
+}
+
+void unlock_address_host_space(void* ptr) {
+  // Same slot computation as lock_address_host_space(); the slot is overwritten with 0.
+  const size_t slot = (( size_t(ptr) >> 2 ) & HOST_SPACE_ATOMIC_MASK) ^ HOST_SPACE_ATOMIC_XOR_MASK ;
+  atomic_exchange( &HOST_SPACE_ATOMIC_LOCKS[slot] , 0);
+}
+
+}
+}
diff --git a/lib/kokkos/core/src/impl/Kokkos_Memory_Fence.hpp b/lib/kokkos/core/src/impl/Kokkos_Memory_Fence.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..eb3da7501ebeeda048e0e8c78e81f20fb60060fa
--- /dev/null
+++ b/lib/kokkos/core/src/impl/Kokkos_Memory_Fence.hpp
@@ -0,0 +1,107 @@
+/*
+//@HEADER
+// ************************************************************************
+// 
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+// 
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+// 
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact  H. Carter Edwards (hcedwar@sandia.gov)
+// 
+// ************************************************************************
+//@HEADER
+*/
+
+#if defined( KOKKOS_ATOMIC_HPP ) && ! defined( KOKKOS_MEMORY_FENCE )
+#define KOKKOS_MEMORY_FENCE
+namespace Kokkos {
+
+//----------------------------------------------------------------------------
+
+KOKKOS_FORCEINLINE_FUNCTION
+void memory_fence()  // full fence; the primitive is chosen at compile time from the KOKKOS_ATOMICS_USE_* macros
+{
+#if defined( KOKKOS_ATOMICS_USE_CUDA )
+  __threadfence();
+#elif defined( KOKKOS_ATOMICS_USE_GCC ) || \
+      ( defined( KOKKOS_COMPILER_NVCC ) && defined( KOKKOS_ATOMICS_USE_INTEL ) )
+  __sync_synchronize();  // also used for the Intel-atomics configuration when the compiler is NVCC
+#elif defined( KOKKOS_ATOMICS_USE_INTEL )
+  _mm_mfence();
+#elif defined( KOKKOS_ATOMICS_USE_OMP31 )
+  #pragma omp flush
+#elif defined( KOKKOS_ATOMICS_USE_WINDOWS )
+  MemoryBarrier();
+#else  // none of the atomics backends above is selected: fail the build
+ #error "Error: memory_fence() not defined"
+#endif
+}
+
+//////////////////////////////////////////////////////
+// store_fence()
+//
+// If possible use a store fence on the architecture, if not run a full memory fence
+
+KOKKOS_FORCEINLINE_FUNCTION
+void store_fence()
+{
+#if ! ( defined( KOKKOS_ENABLE_ASM ) && defined( KOKKOS_USE_ISA_X86_64 ) )
+  // No inline-assembly x86-64 path is enabled: use the full fence instead.
+  memory_fence();
+#else
+  // x86-64 "sfence"; the "memory" clobber tells the compiler memory is affected.
+  asm volatile ( "sfence" ::: "memory" );
+#endif
+}
+
+//////////////////////////////////////////////////////
+// load_fence()
+//
+// If possible use a load fence on the architecture, if not run a full memory fence
+
+KOKKOS_FORCEINLINE_FUNCTION
+void load_fence()
+{
+#if ! ( defined( KOKKOS_ENABLE_ASM ) && defined( KOKKOS_USE_ISA_X86_64 ) )
+  // No inline-assembly x86-64 path is enabled: use the full fence instead.
+  memory_fence();
+#else
+  // x86-64 "lfence"; the "memory" clobber tells the compiler memory is affected.
+  asm volatile ( "lfence" ::: "memory" );
+#endif
+}
+
+} // namespace Kokkos
+
+#endif
+
+
diff --git a/lib/kokkos/core/src/impl/Kokkos_PhysicalLayout.hpp b/lib/kokkos/core/src/impl/Kokkos_PhysicalLayout.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..556c96d863a2b3d19a5f8c7941f3257dbca34f85
--- /dev/null
+++ b/lib/kokkos/core/src/impl/Kokkos_PhysicalLayout.hpp
@@ -0,0 +1,73 @@
+/*
+//@HEADER
+// ************************************************************************
+// 
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+// 
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+// 
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact  H. Carter Edwards (hcedwar@sandia.gov)
+// 
+// ************************************************************************
+//@HEADER
+*/
+
+#ifndef KOKKOS_PHYSICAL_LAYOUT_HPP
+#define KOKKOS_PHYSICAL_LAYOUT_HPP
+
+
+#include <Kokkos_View.hpp>
+namespace Kokkos {
+namespace Impl {
+
+
+
+struct PhysicalLayout { // layout tag, rank and strides captured from a View at construction
+  enum LayoutType {Left,Right,Scalar,Error};
+  LayoutType layout_type;
+  int rank;
+  long long int stride[8]; //distance between two neighboring elements in a given dimension
+  // LayoutLeft maps to Left, LayoutRight to Right; any other array_layout is recorded as Error.
+  template< class T , class L , class D , class M >
+  PhysicalLayout( const View<T,L,D,M> & view )
+    : layout_type( ! is_same< typename View<T,L,D,M>::array_layout , LayoutLeft >::value
+                   ? ( is_same< typename View<T,L,D,M>::array_layout , LayoutRight >::value ? Right : Error ) : Left )
+    , rank( view.Rank )
+  {
+    for ( int d = 0 ; d < 8 ; ++d ) { stride[d] = 0 ; } // zero everything before the view writes its strides
+    view.stride( stride );
+  }
+};
+
+}
+}
+#endif
diff --git a/lib/kokkos/core/src/impl/Kokkos_Profiling_DeviceInfo.hpp b/lib/kokkos/core/src/impl/Kokkos_Profiling_DeviceInfo.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..8ea1e816cdab4fc29679ee8df8800cf2a59f026e
--- /dev/null
+++ b/lib/kokkos/core/src/impl/Kokkos_Profiling_DeviceInfo.hpp
@@ -0,0 +1,57 @@
+/*
+ //@HEADER
+ // ************************************************************************
+ //
+ //                        Kokkos v. 2.0
+ //              Copyright (2014) Sandia Corporation
+ //
+ // Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+ // the U.S. Government retains certain rights in this software.
+ //
+ // Redistribution and use in source and binary forms, with or without
+ // modification, are permitted provided that the following conditions are
+ // met:
+ //
+ // 1. Redistributions of source code must retain the above copyright
+ // notice, this list of conditions and the following disclaimer.
+ //
+ // 2. Redistributions in binary form must reproduce the above copyright
+ // notice, this list of conditions and the following disclaimer in the
+ // documentation and/or other materials provided with the distribution.
+ //
+ // 3. Neither the name of the Corporation nor the names of the
+ // contributors may be used to endorse or promote products derived from
+ // this software without specific prior written permission.
+ //
+ // THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+ // EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ // IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ // PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+ // CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ // EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ // PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ // PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ // LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ // NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ // SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ //
+ // Questions? Contact  H. Carter Edwards (hcedwar@sandia.gov)
+ //
+ // ************************************************************************
+ //@HEADER
+*/
+
+#ifndef KOKKOSP_DEVICE_INFO_HPP
+#define KOKKOSP_DEVICE_INFO_HPP
+
+namespace Kokkos {
+namespace Profiling {
+
+    struct KokkosPDeviceInfo {  // pointed to by the last parameter of the initFunction hook type
+        uint32_t deviceID;  // NOTE(review): this header includes nothing itself, so uint32_t must come from the includer
+    };
+
+}
+}
+
+#endif
diff --git a/lib/kokkos/core/src/impl/Kokkos_Profiling_Interface.cpp b/lib/kokkos/core/src/impl/Kokkos_Profiling_Interface.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..91faed170abbeb6d552b6247c74afdaa1596e038
--- /dev/null
+++ b/lib/kokkos/core/src/impl/Kokkos_Profiling_Interface.cpp
@@ -0,0 +1,186 @@
+/*
+ //@HEADER
+ // ************************************************************************
+ //
+ //                        Kokkos v. 2.0
+ //              Copyright (2014) Sandia Corporation
+ //
+ // Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+ // the U.S. Government retains certain rights in this software.
+ //
+ // Redistribution and use in source and binary forms, with or without
+ // modification, are permitted provided that the following conditions are
+ // met:
+ //
+ // 1. Redistributions of source code must retain the above copyright
+ // notice, this list of conditions and the following disclaimer.
+ //
+ // 2. Redistributions in binary form must reproduce the above copyright
+ // notice, this list of conditions and the following disclaimer in the
+ // documentation and/or other materials provided with the distribution.
+ //
+ // 3. Neither the name of the Corporation nor the names of the
+ // contributors may be used to endorse or promote products derived from
+ // this software without specific prior written permission.
+ //
+ // THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+ // EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ // IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ // PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+ // CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ // EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ // PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ // PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ // LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ // NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ // SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ //
+ // Questions? Contact  H. Carter Edwards (hcedwar@sandia.gov)
+ //
+ // ************************************************************************
+ //@HEADER
+ */
+
+#include <impl/Kokkos_Profiling_Interface.hpp>
+
+#if (KOKKOS_ENABLE_PROFILING)
+#include <string.h>
+
+namespace Kokkos {
+  namespace Profiling {
+    bool profileLibraryLoaded() {
+        return initProfileLibrary != NULL;  // true while the kokkosp_init_library hook is set
+    }
+
+    void beginParallelFor(const std::string& kernelPrefix, const uint32_t devID, uint64_t* kernelID) {
+        if(NULL == beginForCallee) return;  // no begin-parallel-for hook is set
+        // Fence first, then pass the kernel name, device id and kernelID pointer to the hook.
+        Kokkos::fence();
+        (*beginForCallee)(kernelPrefix.c_str(), devID, kernelID);
+    }
+
+    void endParallelFor(const uint64_t kernelID) {
+        if(NULL == endForCallee) return;  // no end-parallel-for hook is set
+        // Fence first, then hand the kernelID to the hook.
+        Kokkos::fence();
+        (*endForCallee)(kernelID);
+    }
+
+    void beginParallelScan(const std::string& kernelPrefix, const uint32_t devID, uint64_t* kernelID) {
+        if(NULL == beginScanCallee) return;  // no begin-parallel-scan hook is set
+        // Fence first, then pass the kernel name, device id and kernelID pointer to the hook.
+        Kokkos::fence();
+        (*beginScanCallee)(kernelPrefix.c_str(), devID, kernelID);
+    }
+
+    void endParallelScan(const uint64_t kernelID) {
+        if(NULL == endScanCallee) return;  // no end-parallel-scan hook is set
+        // Fence first, then hand the kernelID to the hook.
+        Kokkos::fence();
+        (*endScanCallee)(kernelID);
+    }
+    
+    void beginParallelReduce(const std::string& kernelPrefix, const uint32_t devID, uint64_t* kernelID) {
+        if(NULL == beginReduceCallee) return;  // no begin-parallel-reduce hook is set
+        // Fence first, then pass the kernel name, device id and kernelID pointer to the hook.
+        Kokkos::fence();
+        (*beginReduceCallee)(kernelPrefix.c_str(), devID, kernelID);
+    }
+    
+    void endParallelReduce(const uint64_t kernelID) {
+        if(NULL == endReduceCallee) return;  // no end-parallel-reduce hook is set
+        // Fence first, then hand the kernelID to the hook.
+        Kokkos::fence();
+        (*endReduceCallee)(kernelID);
+    }
+    
+    // Load the first library named in KOKKOS_PROFILE_LIBRARY (if set), resolve its kokkosp_*
+    // hooks and call its init hook. Only the first call does any work.
+    void initialize() {
+
+        // Make sure the body runs only once
+        static int is_initialized = 0;
+        if(is_initialized) return;
+        is_initialized = 1;
+
+        // If we do not find a profiling library in the environment then exit
+        // early.
+        const char* envProfileLibrary = getenv("KOKKOS_PROFILE_LIBRARY");
+        if( NULL == envProfileLibrary ) {
+            return ;
+        }
+
+        // Use the first non-empty ';'-separated entry, exactly the token strtok(copy, ";")
+        // would return. Parsing a std::string needs no manual buffer, so there is no
+        // unchecked malloc result to dereference and nothing to free.
+        const std::string envProfileList(envProfileLibrary);
+        const std::string::size_type nameBegin = envProfileList.find_first_not_of(';');
+
+        if( std::string::npos != nameBegin ) {
+            // substr clamps its count, so a missing trailing ';' (nameEnd == npos) takes the rest.
+            const std::string::size_type nameEnd = envProfileList.find(';', nameBegin);
+            const std::string profileLibraryName = envProfileList.substr(nameBegin, nameEnd - nameBegin);
+
+            void* firstProfileLibrary = dlopen(profileLibraryName.c_str(), RTLD_NOW | RTLD_GLOBAL);
+
+            if(NULL == firstProfileLibrary) {
+                std::cerr << "Error: Unable to load KokkosP library: " <<
+                profileLibraryName << std::endl;
+            } else {
+                std::cout << "KokkosP: Library Loaded: " << profileLibraryName << std::endl;
+
+                // dlsym returns a pointer to an object, while we want to assign to pointer to function
+                // A direct cast will give warnings hence, we have to workaround the issue by casting pointer to pointers.
+                // A symbol the library does not export leaves its hook NULL, which every caller checks.
+                auto p1 = dlsym(firstProfileLibrary, "kokkosp_begin_parallel_for");
+                beginForCallee = *((beginFunction*) &p1);
+                auto p2 = dlsym(firstProfileLibrary, "kokkosp_begin_parallel_scan");
+                beginScanCallee = *((beginFunction*) &p2);
+                auto p3 = dlsym(firstProfileLibrary, "kokkosp_begin_parallel_reduce");
+                beginReduceCallee = *((beginFunction*) &p3);
+
+                auto p4 = dlsym(firstProfileLibrary, "kokkosp_end_parallel_scan");
+                endScanCallee = *((endFunction*) &p4);
+                auto p5 = dlsym(firstProfileLibrary, "kokkosp_end_parallel_for");
+                endForCallee = *((endFunction*) &p5);
+                auto p6 = dlsym(firstProfileLibrary, "kokkosp_end_parallel_reduce");
+                endReduceCallee = *((endFunction*) &p6);
+
+                auto p7 = dlsym(firstProfileLibrary, "kokkosp_init_library");
+                initProfileLibrary = *((initFunction*) &p7);
+                auto p8 = dlsym(firstProfileLibrary, "kokkosp_finalize_library");
+                finalizeProfileLibrary = *((finalizeFunction*) &p8);
+            }
+        }
+
+        if(NULL != initProfileLibrary) {
+            (*initProfileLibrary)(0, (uint64_t) KOKKOSP_INTERFACE_VERSION, (uint32_t) 0, NULL);
+        }
+    }
+
+    void finalize() {
+      // Only the first call does anything; later calls return immediately.
+      static int is_finalized = 0;
+      if(is_finalized) return;
+      is_finalized = 1;
+
+      // Without a finalize hook nothing is called and the other hooks stay as they are.
+      if(NULL == finalizeProfileLibrary) return;
+
+      (*finalizeProfileLibrary)();
+
+      // The library has been told to finalize: clear every hook so that no
+      // further profiling call is forwarded to it.
+      beginForCallee = NULL;
+      endForCallee = NULL;
+      beginScanCallee = NULL;
+      endScanCallee = NULL;
+      beginReduceCallee = NULL;
+      endReduceCallee = NULL;
+      initProfileLibrary = NULL;
+      finalizeProfileLibrary = NULL;
+    }
+  }
+}
+
+#endif
diff --git a/lib/kokkos/core/src/impl/Kokkos_Profiling_Interface.hpp b/lib/kokkos/core/src/impl/Kokkos_Profiling_Interface.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..4f01256335cd82962d1744a9895374c170a5cb8b
--- /dev/null
+++ b/lib/kokkos/core/src/impl/Kokkos_Profiling_Interface.hpp
@@ -0,0 +1,118 @@
+/*
+ //@HEADER
+ // ************************************************************************
+ //
+ //                        Kokkos v. 2.0
+ //              Copyright (2014) Sandia Corporation
+ //
+ // Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+ // the U.S. Government retains certain rights in this software.
+ //
+ // Redistribution and use in source and binary forms, with or without
+ // modification, are permitted provided that the following conditions are
+ // met:
+ //
+ // 1. Redistributions of source code must retain the above copyright
+ // notice, this list of conditions and the following disclaimer.
+ //
+ // 2. Redistributions in binary form must reproduce the above copyright
+ // notice, this list of conditions and the following disclaimer in the
+ // documentation and/or other materials provided with the distribution.
+ //
+ // 3. Neither the name of the Corporation nor the names of the
+ // contributors may be used to endorse or promote products derived from
+ // this software without specific prior written permission.
+ //
+ // THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+ // EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ // IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ // PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+ // CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ // EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ // PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ // PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ // LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ // NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ // SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ //
+ // Questions? Contact  H. Carter Edwards (hcedwar@sandia.gov)
+ //
+ // ************************************************************************
+ //@HEADER
+ */
+
+#ifndef KOKKOSP_INTERFACE_HPP
+#define KOKKOSP_INTERFACE_HPP
+
+#include <cstddef>
+#include <Kokkos_Core_fwd.hpp>
+#include <Kokkos_Macros.hpp>
+#include <string>
+
+#if (KOKKOS_ENABLE_PROFILING)
+#include <impl/Kokkos_Profiling_DeviceInfo.hpp>
+#include <dlfcn.h>
+#include <iostream>
+#include <stdlib.h>
+#endif
+
+#define KOKKOSP_INTERFACE_VERSION 20150628
+
+#if (KOKKOS_ENABLE_PROFILING)
+namespace Kokkos {
+  namespace Profiling {
+
+    typedef void (*initFunction)(const int,  // type of the kokkosp_init_library hook
+	const uint64_t,  // initialize() passes KOKKOSP_INTERFACE_VERSION in this position
+	const uint32_t,
+	KokkosPDeviceInfo*);
+    typedef void (*finalizeFunction)();  // type of the kokkosp_finalize_library hook
+    typedef void (*beginFunction)(const char*, const uint32_t, uint64_t*);  // kokkosp_begin_parallel_{for,scan,reduce}
+    typedef void (*endFunction)(uint64_t);  // kokkosp_end_parallel_{for,scan,reduce}
+    // NOTE(review): these hooks are 'static' in a header, so each translation unit that includes it has its own copies.
+    static initFunction initProfileLibrary = NULL;
+    static finalizeFunction finalizeProfileLibrary = NULL;
+    static beginFunction beginForCallee = NULL;
+    static beginFunction beginScanCallee = NULL;
+    static beginFunction beginReduceCallee = NULL;
+    static endFunction endForCallee = NULL;
+    static endFunction endScanCallee = NULL;
+    static endFunction endReduceCallee = NULL;
+
+    bool profileLibraryLoaded();  // true while initProfileLibrary is non-NULL
+    // Each begin*/end* function fences and then forwards to its hook when that hook is non-NULL.
+    void beginParallelFor(const std::string& kernelPrefix, const uint32_t devID, uint64_t* kernelID);
+    void endParallelFor(const uint64_t kernelID);
+    void beginParallelScan(const std::string& kernelPrefix, const uint32_t devID, uint64_t* kernelID);
+    void endParallelScan(const uint64_t kernelID);
+    void beginParallelReduce(const std::string& kernelPrefix, const uint32_t devID, uint64_t* kernelID);
+    void endParallelReduce(const uint64_t kernelID);
+
+    void initialize();  // loads the library named by KOKKOS_PROFILE_LIBRARY and resolves the hooks
+    void finalize();  // calls the finalize hook, if set, and then clears every hook
+
+    //Define finalize_fake inline to get rid of warnings for unused static variables
+    inline void finalize_fake() {
+      // Touches every static hook of the including translation unit; when no
+      // finalize hook is set it does nothing.
+      if(NULL == finalizeProfileLibrary) return;
+
+      (*finalizeProfileLibrary)();
+
+      // Clear all hooks so that nothing further is forwarded after finalization.
+      beginForCallee = NULL;
+      endForCallee = NULL;
+      beginScanCallee = NULL;
+      endScanCallee = NULL;
+      beginReduceCallee = NULL;
+      endReduceCallee = NULL;
+      initProfileLibrary = NULL;
+      finalizeProfileLibrary = NULL;
+    }
+
+
+  }
+}
+
+#endif
+#endif
diff --git a/lib/kokkos/core/src/impl/Kokkos_Serial.cpp b/lib/kokkos/core/src/impl/Kokkos_Serial.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..562c7afc6de5e3b6913671e52abc5157dc61c6d5
--- /dev/null
+++ b/lib/kokkos/core/src/impl/Kokkos_Serial.cpp
@@ -0,0 +1,119 @@
+/*
+//@HEADER
+// ************************************************************************
+// 
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+// 
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+// 
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact  H. Carter Edwards (hcedwar@sandia.gov)
+// 
+// ************************************************************************
+//@HEADER
+*/
+
+#include <stdlib.h>
+#include <sstream>
+#include <Kokkos_Serial.hpp>
+#include <impl/Kokkos_Traits.hpp>
+#include <impl/Kokkos_Error.hpp>
+
+#if defined( KOKKOS_HAVE_SERIAL )
+
+/*--------------------------------------------------------------------------*/
+
+namespace Kokkos {
+namespace Impl {
+namespace SerialImpl {
+
+// Starts with no scratch buffer and both end offsets at zero.
+Sentinel::Sentinel() : m_scratch(0), m_reduce_end(0), m_shared_end(0) {}
+
+// Frees the scratch buffer, if there is one, and zeroes the bookkeeping.
+Sentinel::~Sentinel()
+{
+  if ( 0 != m_scratch ) { free( m_scratch ); m_scratch = 0 ; }
+  m_shared_end = m_reduce_end = 0 ;
+}
+
+// The one Sentinel: a function-local static handed to every caller by reference.
+Sentinel & Sentinel::singleton() {
+  static Sentinel instance ; return instance ;
+}
+
+// Round n up to the next multiple of 256; values that are already multiples are unchanged.
+inline unsigned align( unsigned n )
+{
+  const unsigned low_bits = 0x0100u - 1u ; // 255: the bits below the alignment
+  return ( n + low_bits ) & ~low_bits ;
+}
+
+} // namespace SerialImpl
+
+// m_space is given the address m_reduce_end bytes into the Sentinel scratch buffer, plus arg_shared_size.
+SerialTeamMember::SerialTeamMember( int arg_league_rank
+                                  , int arg_league_size
+                                  , int arg_shared_size )
+  : m_space( static_cast<char *>( SerialImpl::Sentinel::singleton().m_scratch )
+               + SerialImpl::Sentinel::singleton().m_reduce_end
+           , arg_shared_size )
+  , m_league_rank( arg_league_rank )
+  , m_league_size( arg_league_size ) {}
+
+} // namespace Impl
+
+// Grow (never shrink) the Serial scratch buffer to fit the aligned reduce and shared sizes; returns it.
+void * Serial::scratch_memory_resize( unsigned reduce_size , unsigned shared_size )
+{
+  static Impl::SerialImpl::Sentinel & s = Impl::SerialImpl::Sentinel::singleton();
+  reduce_size = Impl::SerialImpl::align( reduce_size );
+  shared_size = Impl::SerialImpl::align( shared_size );
+
+  if ( ( s.m_reduce_end < reduce_size ) ||
+       ( s.m_shared_end < s.m_reduce_end + shared_size ) ) {
+    if ( s.m_scratch ) { free( s.m_scratch ); }  // old contents are discarded, not copied
+    if ( s.m_reduce_end < reduce_size ) s.m_reduce_end = reduce_size ;
+    if ( s.m_shared_end < s.m_reduce_end + shared_size ) s.m_shared_end = s.m_reduce_end + shared_size ;
+    s.m_scratch = malloc( s.m_shared_end );
+    if ( ! s.m_scratch ) {  // forget the sizes so later calls retry instead of returning NULL as "big enough"
+      s.m_reduce_end = s.m_shared_end = 0 ;
+      Kokkos::Impl::throw_runtime_exception( "Kokkos::Serial::scratch_memory_resize ERROR: malloc failed" );
+    }
+  }
+  return s.m_scratch ;
+}
+
+} // namespace Kokkos
+
+#endif // defined( KOKKOS_HAVE_SERIAL )
+
+
diff --git a/lib/kokkos/core/src/impl/Kokkos_Serial_Task.cpp b/lib/kokkos/core/src/impl/Kokkos_Serial_Task.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..e8bdbde6c60f182f588617dda2a9c2f32530694c
--- /dev/null
+++ b/lib/kokkos/core/src/impl/Kokkos_Serial_Task.cpp
@@ -0,0 +1,147 @@
+/*
+//@HEADER
+// ************************************************************************
+// 
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+// 
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+// 
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact  H. Carter Edwards (hcedwar@sandia.gov)
+// 
+// ************************************************************************
+//@HEADER
+*/
+
+#include <Kokkos_Core.hpp>
+
+#if defined( KOKKOS_HAVE_SERIAL ) && defined( KOKKOS_ENABLE_TASKPOLICY )
+
+#include <impl/Kokkos_TaskQueue_impl.hpp>
+
+//----------------------------------------------------------------------------
+//----------------------------------------------------------------------------
+
+namespace Kokkos {
+namespace Impl {
+
+template class TaskQueue< Kokkos::Serial > ; // explicit instantiation of TaskQueue for the Serial execution space
+
+void TaskQueueSpecialization< Kokkos::Serial >::execute
+  ( TaskQueue< Kokkos::Serial > * const queue )
+{
+  using execution_space = Kokkos::Serial ;
+  using queue_type      = TaskQueue< execution_space > ;
+  using task_root_type  = TaskBase< execution_space , void , void > ;
+  using Member          = TaskExec< execution_space > ;
+
+  // EndTag cast to a task pointer serves as the "no task" marker below.
+  task_root_type * const no_task = (task_root_type *) task_root_type::EndTag ;
+
+  Member exec ;
+
+  // Keep going for as long as the queue reports ready tasks.
+  while ( 0 < queue->m_ready_count ) {
+    // Visit m_ready[i][j] with i as the outer index (k/2) and j the inner
+    // index (k%2), stopping at the first pop that yields a task.
+    task_root_type * task = no_task ;
+    for ( int k = 0 ; k < 2 * queue_type::NumQueue && no_task == task ; ++k ) {
+      task = queue_type::pop_task( & queue->m_ready[ k / 2 ][ k % 2 ] );
+    }
+
+    if ( no_task == task ) {
+      // Nothing could be popped; that is an error if the count is still non-zero.
+      if ( 0 != queue->m_ready_count ) {
+        Kokkos::abort("TaskQueue<Serial>::execute ERROR: ready_count");
+      }
+      continue ;
+    }
+
+    // pop_task resulted in lock == task->m_next
+    // In the executing state
+    (*task->m_apply)( task , & exec );
+
+#if 0
+  printf( "TaskQueue<Serial>::executed: 0x%lx { 0x%lx 0x%lx %d %d %d }\n"
+        , uintptr_t(task)
+        , uintptr_t(task->m_wait)
+        , uintptr_t(task->m_next)
+        , task->m_task_type
+        , task->m_priority
+        , task->m_ref_count );
+#endif
+
+    // If a respawn then re-enqueue otherwise the task is complete
+    // and all tasks waiting on this task are updated.
+    queue->complete( task );
+  }
+}
+
+void TaskQueueSpecialization< Kokkos::Serial > ::
+  iff_single_thread_recursive_execute(
+    TaskQueue< Kokkos::Serial > * const queue )
+{
+  using execution_space = Kokkos::Serial ;
+  using queue_type      = TaskQueue< execution_space > ;
+  using task_root_type  = TaskBase< execution_space , void , void > ;
+  using Member          = TaskExec< execution_space > ;
+
+  // EndTag cast to a task pointer serves as the "no task" marker below.
+  task_root_type * const no_task = (task_root_type *) task_root_type::EndTag ;
+
+  Member exec ;
+
+  // Run ready tasks one at a time and return as soon as a full pass over the
+  // ready lists pops nothing; m_ready_count is not consulted here.
+  for (;;) {
+
+    task_root_type * task = no_task ;
+
+    // Visit m_ready[i][j] with i as the outer index (k/2) and j the inner
+    // index (k%2), stopping at the first pop that yields a task.
+    for ( int k = 0 ; k < 2 * queue_type::NumQueue && no_task == task ; ++k ) {
+      task = queue_type::pop_task( & queue->m_ready[ k / 2 ][ k % 2 ] );
+    }
+
+    if ( no_task == task ) { return ; }
+
+    // Apply the task with the single Member instance created above.
+    (*task->m_apply)( task , & exec );
+
+    // Hand the executed task back to the queue.
+    queue->complete( task );
+  }
+}
+
+}} /* namespace Kokkos::Impl */
+
+#endif /* #if defined( KOKKOS_HAVE_SERIAL ) && defined( KOKKOS_ENABLE_TASKPOLICY ) */
+
diff --git a/lib/kokkos/core/src/impl/Kokkos_Serial_Task.hpp b/lib/kokkos/core/src/impl/Kokkos_Serial_Task.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..48a110c5f1583cd4943a011f3d33bd25e3cd00f2
--- /dev/null
+++ b/lib/kokkos/core/src/impl/Kokkos_Serial_Task.hpp
@@ -0,0 +1,271 @@
+/*
+//@HEADER
+// ************************************************************************
+// 
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+// 
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+// 
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact  H. Carter Edwards (hcedwar@sandia.gov)
+// 
+// ************************************************************************
+//@HEADER
+*/
+
+#ifndef KOKKOS_IMPL_SERIAL_TASK_HPP
+#define KOKKOS_IMPL_SERIAL_TASK_HPP
+
+#if defined( KOKKOS_ENABLE_TASKPOLICY )
+
+//----------------------------------------------------------------------------
+//----------------------------------------------------------------------------
+
+namespace Kokkos {
+namespace Impl {
+
+//----------------------------------------------------------------------------
+
+template<>
+class TaskQueueSpecialization< Kokkos::Serial >
+{
+public:
+  // Task-queue hooks specialized for the Kokkos::Serial execution space.
+  using execution_space = Kokkos::Serial ;
+  using memory_space    = Kokkos::HostSpace ;
+  using queue_type      = Kokkos::Impl::TaskQueue< execution_space > ;
+  using task_base_type  = Kokkos::Impl::TaskBase< execution_space , void , void > ;
+
+  static
+  void iff_single_thread_recursive_execute( queue_type * const );
+
+  static
+  void execute( queue_type * const );
+  // Writes the apply function of TaskBase< Serial , FunctorType::value_type , FunctorType > into *ptr.
+  template< typename FunctorType >
+  static
+  void proc_set_apply( task_base_type::function_type * ptr )
+    {
+      using TaskType = TaskBase< Kokkos::Serial
+                               , typename FunctorType::value_type
+                               , FunctorType
+                               > ;
+       *ptr = TaskType::apply ;
+    }
+};
+
+extern template class TaskQueue< Kokkos::Serial > ;
+
+//----------------------------------------------------------------------------
+
+template<>
+class TaskExec< Kokkos::Serial >
+{
+public:
+  // Team interface of a Serial task: a team of one member (rank 0), so the barrier body is empty.
+  KOKKOS_INLINE_FUNCTION void team_barrier() const {}
+  KOKKOS_INLINE_FUNCTION int team_rank() const { return 0 ; }
+  KOKKOS_INLINE_FUNCTION int team_size() const { return 1 ; }
+};
+
+template<typename iType>
+struct TeamThreadRangeBoundariesStruct<iType, TaskExec< Kokkos::Serial > >
+{
+  typedef iType index_type;
+  const iType start ;
+  const iType end ;
+  enum {increment = 1};
+  // Start index, end bound and unit step of a range, plus the task's execution handle (non-const reference).
+  TaskExec< Kokkos::Serial > & thread;
+
+  KOKKOS_INLINE_FUNCTION
+  TeamThreadRangeBoundariesStruct
+    // Range beginning at 0 with arg_count as its end bound.
+    ( TaskExec< Kokkos::Serial > & arg_thread, const iType& arg_count)
+    : start(0)
+    , end(arg_count)
+    , thread(arg_thread)
+    {}
+
+  KOKKOS_INLINE_FUNCTION
+  TeamThreadRangeBoundariesStruct
+    // Range with explicit bounds arg_start and arg_end.
+    ( TaskExec< Kokkos::Serial > & arg_thread
+    , const iType& arg_start
+    , const iType & arg_end
+    )
+    : start( arg_start )
+    , end(   arg_end)
+    , thread( arg_thread )
+    {}
+};
+
+}} /* namespace Kokkos::Impl */
+
+//----------------------------------------------------------------------------
+//----------------------------------------------------------------------------
+
+namespace Kokkos {
+/*
+template<typename iType>
+KOKKOS_INLINE_FUNCTION
+Impl::TeamThreadRangeBoundariesStruct<iType,Impl::TaskExec< Kokkos::Serial > >
+TeamThreadRange( const Impl::TaskExec< Kokkos::Serial > & thread
+               , const iType & count )
+{
+  return Impl::TeamThreadRangeBoundariesStruct<iType,Impl::TaskExec< Kokkos::Serial > >(thread,count);
+}
+*/
+//TODO const issue omp
+template<typename iType>
+KOKKOS_INLINE_FUNCTION
+Impl::TeamThreadRangeBoundariesStruct<iType,Impl::TaskExec< Kokkos::Serial > >
+TeamThreadRange( Impl::TaskExec< Kokkos::Serial > & thread , const iType & count )
+{
+  using range_type = Impl::TeamThreadRangeBoundariesStruct<iType,Impl::TaskExec< Kokkos::Serial > > ; // boundaries built from a count
+  return range_type( thread , count );
+}
+/*
+template<typename iType>
+KOKKOS_INLINE_FUNCTION
+Impl::TeamThreadRangeBoundariesStruct<iType,Impl:: TaskExec< Kokkos::Serial > >
+TeamThreadRange( const Impl:: TaskExec< Kokkos::Serial > & thread, const iType & start , const iType & end )
+{
+  return Impl::TeamThreadRangeBoundariesStruct<iType,Impl:: TaskExec< Kokkos::Serial > >(thread,start,end);
+}
+*/
+//TODO const issue omp
+template<typename iType>
+KOKKOS_INLINE_FUNCTION
+Impl::TeamThreadRangeBoundariesStruct<iType,Impl:: TaskExec< Kokkos::Serial > >
+TeamThreadRange( Impl:: TaskExec< Kokkos::Serial > & thread, const iType & start , const iType & end )
+{ // Overload taking an explicit 'start' and 'end'; forwards them unchanged to the boundaries struct.
+  return Impl::TeamThreadRangeBoundariesStruct<iType,Impl:: TaskExec< Kokkos::Serial > >(thread,start,end);
+}
+
+  /** \brief  Inter-thread parallel_for. Executes lambda(iType i) for each i=0..N-1.
+   *
+   * The range i=0..N-1 is mapped to all threads of the calling thread team.
+   * This functionality requires C++11 support.*/
+template<typename iType, class Lambda>
+KOKKOS_INLINE_FUNCTION
+void parallel_for(const Impl::TeamThreadRangeBoundariesStruct<iType,Impl:: TaskExec< Kokkos::Serial > >& loop_boundaries, const Lambda& lambda) {
+  const iType stop = loop_boundaries.end ;  // plain sequential loop: indices are visited in increasing order
+  for( iType idx = loop_boundaries.start; idx < stop; idx += loop_boundaries.increment) { lambda(idx); }
+}
+
+template< typename iType, class Lambda, typename ValueType >
+KOKKOS_INLINE_FUNCTION
+void parallel_reduce
+  (const Impl::TeamThreadRangeBoundariesStruct<iType,Impl::TaskExec< Kokkos::Serial > >& loop_boundaries,
+   const Lambda & lambda,
+   ValueType& initialized_result)
+{
+  // Accumulate into a local copy seeded from the caller's value, then
+  // write the final value back once the whole range has been visited.
+  ValueType accum = initialized_result;
+  iType idx = loop_boundaries.start;
+  while ( idx < loop_boundaries.end ) { lambda(idx, accum); idx += loop_boundaries.increment; }
+
+  initialized_result = accum;
+}
+
+template< typename iType, class Lambda, typename ValueType, class JoinType >
+KOKKOS_INLINE_FUNCTION
+void parallel_reduce
+  (const Impl::TeamThreadRangeBoundariesStruct<iType,Impl::TaskExec< Kokkos::Serial > >& loop_boundaries,
+   const Lambda & lambda,
+   const JoinType & join,
+   ValueType& initialized_result)
+{
+  // 'join' is accepted but never called here: lambda folds each index directly into one local value.
+  ValueType accum = initialized_result;
+  for( iType idx = loop_boundaries.start; idx < loop_boundaries.end; idx += loop_boundaries.increment) {
+    lambda(idx, accum);
+  }
+  initialized_result = accum;
+}
+// Placeholder for a future function: the body is empty, so lambda is never invoked and initialized_result is left unchanged.
+template< typename iType, class Lambda, typename ValueType >
+KOKKOS_INLINE_FUNCTION
+void parallel_reduce
+  (const Impl::ThreadVectorRangeBoundariesStruct<iType,Impl::TaskExec< Kokkos::Serial > >& loop_boundaries,
+   const Lambda & lambda,
+   ValueType& initialized_result)
+{
+}
+// Placeholder for a future function: the body is empty, so neither lambda nor join is invoked and initialized_result is left unchanged.
+template< typename iType, class Lambda, typename ValueType, class JoinType >
+KOKKOS_INLINE_FUNCTION
+void parallel_reduce
+  (const Impl::ThreadVectorRangeBoundariesStruct<iType,Impl::TaskExec< Kokkos::Serial > >& loop_boundaries,
+   const Lambda & lambda,
+   const JoinType & join,
+   ValueType& initialized_result)
+{
+}
+
+template< typename ValueType, typename iType, class Lambda >
+KOKKOS_INLINE_FUNCTION
+void parallel_scan
+  (const Impl::TeamThreadRangeBoundariesStruct<iType,Impl::TaskExec< Kokkos::Serial > >& loop_boundaries,
+   const Lambda & lambda)
+{
+  // Two calls per index: first with 'false' to collect that index's contribution,
+  // then with 'true' and the running total of all earlier contributions.
+  ValueType running = 0 ;
+  ValueType prefix, contribution;
+  for( iType idx = loop_boundaries.start; idx < loop_boundaries.end; idx += loop_boundaries.increment) {
+    contribution = 0;
+    lambda(idx,contribution,false);
+    prefix = running;
+    lambda(idx,prefix,true);
+    running += contribution;
+  }
+}
+
+// Placeholder for a future function: the body is empty, so lambda is never invoked.
+template< typename iType, class Lambda, typename ValueType >
+KOKKOS_INLINE_FUNCTION
+void parallel_scan
+  (const Impl::ThreadVectorRangeBoundariesStruct<iType,Impl::TaskExec< Kokkos::Serial > >& loop_boundaries,
+   const Lambda & lambda)
+{
+}
+
+} /* namespace Kokkos */
+
+//----------------------------------------------------------------------------
+//----------------------------------------------------------------------------
+
+#endif /* #if defined( KOKKOS_ENABLE_TASKPOLICY ) */
+#endif /* #ifndef KOKKOS_IMPL_SERIAL_TASK_HPP */
+
diff --git a/lib/kokkos/core/src/impl/Kokkos_Serial_TaskPolicy.cpp b/lib/kokkos/core/src/impl/Kokkos_Serial_TaskPolicy.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..1577df07cd74f2634f9f98cc94d3825062ad3ff6
--- /dev/null
+++ b/lib/kokkos/core/src/impl/Kokkos_Serial_TaskPolicy.cpp
@@ -0,0 +1,348 @@
+/*
+//@HEADER
+// ************************************************************************
+// 
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+// 
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+// 
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact  H. Carter Edwards (hcedwar@sandia.gov)
+// 
+// ************************************************************************
+//@HEADER
+*/
+
+// Experimental unified task-data parallel manycore LDRD
+
+#include <impl/Kokkos_Serial_TaskPolicy.hpp>
+
+#if defined( KOKKOS_HAVE_SERIAL ) && defined( KOKKOS_ENABLE_TASKPOLICY )
+
+#include <stdlib.h>
+#include <stdexcept>
+#include <iostream>
+#include <sstream>
+#include <string>
+
+//----------------------------------------------------------------------------
+
+namespace Kokkos {
+namespace Experimental {
+
+TaskPolicy< Kokkos::Serial >::member_type &
+TaskPolicy< Kokkos::Serial >::member_single()
+{
+  // Function-local static: constructed on the first call, and the same object is returned on every call.
+  static member_type single_member(0,1,0); return single_member ;
+}
+
+} // namespace Experimental
+} // namespace Kokkos
+
+namespace Kokkos {
+namespace Experimental {
+namespace Impl {
+
+typedef TaskMember<  Kokkos::Serial , void , void > Task ;
+
+//----------------------------------------------------------------------------
+
+namespace {
+
+inline
+unsigned padded_sizeof_derived( unsigned sizeof_derived )
+{ // Round sizeof_derived up to the next multiple of sizeof(Task*).
+  const size_t remainder = sizeof_derived % sizeof(Task*) ;
+  return sizeof_derived + ( remainder ? sizeof(Task*) - remainder : 0 );
+}
+
+} // namespace
+
+void Task::deallocate( void * ptr ) { free( ptr ); } // release raw storage with free()
+
+// Storage for one task: the derived object padded to a multiple of sizeof(Task*), followed by
+// arg_dependence_capacity Task* slots.  Throws std::runtime_error if malloc fails.
+void * Task::allocate( const unsigned arg_sizeof_derived , const unsigned arg_dependence_capacity )
+{
+  void * const ptr = malloc( padded_sizeof_derived( arg_sizeof_derived ) + arg_dependence_capacity * sizeof(Task*) );
+  if ( 0 == ptr ) { throw std::runtime_error("TaskMember< Serial >::allocate ERROR: malloc failed"); }
+  return ptr ;
+}
+
+Task::~TaskMember()
+{
+  // Intentionally empty: the destructor body performs no work.
+}
+
+Task::TaskMember( const Task::function_verify_type   arg_verify
+                , const Task::function_dealloc_type  arg_dealloc
+                , const Task::function_apply_type    arg_apply
+                , const unsigned                     arg_sizeof_derived
+                , const unsigned                     arg_dependence_capacity
+                ) // Starts in the CONSTRUCTING state with no waiters, no dependences and a zero reference count.
+  : m_dealloc( arg_dealloc )
+  , m_verify(  arg_verify )
+  , m_apply(   arg_apply )
+  , m_dep( (Task **)( ((unsigned char *) this) + padded_sizeof_derived( arg_sizeof_derived ) ) ) // array placed right after the padded derived object
+  , m_wait( 0 )
+  , m_next( 0 )
+  , m_dep_capacity( arg_dependence_capacity )
+  , m_dep_size( 0 )
+  , m_ref_count( 0 )
+  , m_state( TASK_STATE_CONSTRUCTING )
+{
+  for ( unsigned i = 0 ; i < arg_dependence_capacity ; ++i ) m_dep[i] = 0 ; // every dependence slot starts null
+}
+
+Task::TaskMember( const Task::function_dealloc_type  arg_dealloc
+                , const Task::function_apply_type    arg_apply
+                , const unsigned                     arg_sizeof_derived
+                , const unsigned                     arg_dependence_capacity
+                ) // Same initial state as the overload taking arg_verify, but with the verifier fixed (see m_verify).
+  : m_dealloc( arg_dealloc )
+  , m_verify(  & Task::verify_type<void> ) // no verifier argument: the void instantiation is stored
+  , m_apply(   arg_apply )
+  , m_dep( (Task **)( ((unsigned char *) this) + padded_sizeof_derived( arg_sizeof_derived ) ) ) // array placed right after the padded derived object
+  , m_wait( 0 )
+  , m_next( 0 )
+  , m_dep_capacity( arg_dependence_capacity )
+  , m_dep_size( 0 )
+  , m_ref_count( 0 )
+  , m_state( TASK_STATE_CONSTRUCTING )
+{
+  for ( unsigned i = 0 ; i < arg_dependence_capacity ; ++i ) m_dep[i] = 0 ; // every dependence slot starts null
+}
+
+//----------------------------------------------------------------------------
+
+void Task::throw_error_add_dependence() const
+{
+  // Report the offending state and dependence count on stderr, then raise the error.
+  static const char what[] = "TaskMember< Serial >::add_dependence ERROR" ;
+  std::cerr << what << " state(" << m_state << ")"
+            << " dep_size(" << m_dep_size << ")" << std::endl ;
+  throw std::runtime_error( what );
+}
+
+void Task::throw_error_verify_type()
+{ // Unconditionally throws std::runtime_error with a fixed message.
+  throw std::runtime_error("TaskMember< Serial >::verify_type ERROR");
+}
+
+//----------------------------------------------------------------------------
+
+#if defined( KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST )
+
+void Task::assign( Task ** const lhs , Task * rhs , const bool no_throw )
+{ // Rebind *lhs to rhs while maintaining reference counts; a task whose count reaches zero is deallocated.
+  static const char msg_error_header[]      = "Kokkos::Experimental::Impl::TaskManager<Kokkos::Serial>::assign ERROR" ;
+  static const char msg_error_count[]       = ": negative reference count" ;
+  static const char msg_error_complete[]    = ": destroy task that is not complete" ;
+  static const char msg_error_dependences[] = ": destroy task that has dependences" ;
+  static const char msg_error_exception[]   = ": caught internal exception" ;
+  // Stays null unless one of the checks below records an error.
+  const char * msg_error = 0 ;
+
+  try {
+    // Drop the reference currently held through *lhs, if any.
+    if ( *lhs ) {
+
+      const int count = --((**lhs).m_ref_count);
+
+      if ( 0 == count ) {
+
+        // Reference count at zero, delete it
+
+        // Should only be deallocating a completed task
+        if ( (**lhs).m_state == Kokkos::Experimental::TASK_STATE_COMPLETE ) {
+
+          // A completed task should not have dependences...
+          for ( int i = 0 ; i < (**lhs).m_dep_size && 0 == msg_error ; ++i ) {
+            if ( (**lhs).m_dep[i] ) msg_error = msg_error_dependences ;
+          }
+        }
+        else {
+          msg_error = msg_error_complete ;
+        }
+
+        if ( 0 == msg_error ) {
+          // Get deletion function and apply it
+          const Task::function_dealloc_type d = (**lhs).m_dealloc ;
+
+          (*d)( *lhs );
+        }
+      }
+      else if ( count <= 0 ) { // zero was handled above, so this branch means the count went negative
+        msg_error = msg_error_count ;
+      }
+    }
+    // Take a reference on rhs (skipped once an error is recorded); *lhs is rebound either way.
+    if ( 0 == msg_error && rhs ) { ++( rhs->m_ref_count ); }
+
+    *lhs = rhs ;
+  }
+  catch( ... ) {
+    if ( 0 == msg_error ) msg_error = msg_error_exception ;
+  }
+  // Report a recorded error: print to stderr when no_throw is set, otherwise throw std::runtime_error.
+  if ( 0 != msg_error ) {
+    if ( no_throw ) {
+      std::cerr << msg_error_header << msg_error << std::endl ;
+      std::cerr.flush();
+    }
+    else {
+      std::string msg(msg_error_header);
+      msg.append(msg_error);
+      throw std::runtime_error( msg );
+    }
+  }
+}
+#endif
+
+namespace {
+// s_ready: head of the list (linked through m_next) of tasks ready to execute; null when empty.
+Task * s_ready = 0 ;
+Task * s_denied = reinterpret_cast<Task*>( ~((uintptr_t)0) );
+// s_denied: all-ones sentinel stored in a completed task's m_wait so no further waiter is attached to it.
+}
+
+void Task::schedule()
+{ // Put this task into the WAITING state, queued behind its first incomplete dependence or on the ready list.
+  // Execute ready tasks in case the task being scheduled
+  // is dependent upon a waiting and ready task.
+
+  Task::execute_ready_tasks();
+
+  // spawning   : Constructing -> Waiting
+  // respawning : Executing    -> Waiting
+  // updating   : Waiting      -> Waiting
+
+  // Must not be in a dependence linked list:  0 == t->m_next
+
+  const bool ok_state = TASK_STATE_COMPLETE != m_state ;
+  const bool ok_list  = 0 == m_next ;
+  // If either precondition fails, the else branch at the bottom throws std::runtime_error.
+  if ( ok_state && ok_list ) {
+
+    if ( TASK_STATE_CONSTRUCTING == m_state ) {
+      // Initial scheduling increment,
+      // matched by decrement when task is complete.
+      ++m_ref_count ;
+    }
+
+    // Will be waiting for execution upon return from this function
+
+    m_state = Kokkos::Experimental::TASK_STATE_WAITING ;
+
+    // Insert this task into another dependence that is not complete
+    // (a dependence whose m_wait equals s_denied has already completed and is skipped).
+    int i = 0 ;
+    for ( ; i < m_dep_size ; ++i ) {
+      Task * const y = m_dep[i] ;
+      if ( y && s_denied != ( m_next = y->m_wait ) ) {
+        y->m_wait = this ; // CAS( & y->m_wait , m_next , this );
+        break ;
+      }
+    }
+    if ( i == m_dep_size ) {
+      // All dependences are complete, insert into the ready list
+      m_next  = s_ready ;
+      s_ready = this ; // CAS( & s_ready , m_next = s_ready , this );
+    }
+  }
+  else {
+    throw std::runtime_error(std::string("Kokkos::Experimental::Impl::Task spawn or respawn state error"));
+  }
+}
+
+void Task::execute_ready_tasks()
+{ // Pop and run tasks from the s_ready list until the list is empty.
+  while ( s_ready ) {
+
+    // Remove this task from the ready list
+
+    // Task * task ;
+    // while ( ! CAS( & s_ready , task = s_ready , s_ready->m_next ) );
+
+    Task * task = s_ready ;
+
+    s_ready = task->m_next ;
+
+    task->m_next = 0 ;
+
+    // precondition: task->m_state = TASK_STATE_WAITING
+    // precondition: task->m_dep[i]->m_state == TASK_STATE_COMPLETE  for all i
+    // precondition: does not exist T such that T->m_wait = task
+    // precondition: does not exist T such that T->m_next = task
+
+    task->m_state = Kokkos::Experimental::TASK_STATE_EXECUTING ;
+    // Run the task body; a state other than EXECUTING afterwards means the body rescheduled the task.
+    (*task->m_apply)( task );
+
+    if ( task->m_state == Kokkos::Experimental::TASK_STATE_EXECUTING ) {
+      // task did not respawn itself
+      task->m_state = Kokkos::Experimental::TASK_STATE_COMPLETE ;
+
+      // release dependences:
+      for ( int i = 0 ; i < task->m_dep_size ; ++i ) {
+        assign( task->m_dep + i , 0 );
+      }
+
+      // Stop other tasks from adding themselves to 'task->m_wait' ;
+
+      Task * x ;
+      // CAS( & task->m_wait , x = task->m_wait , s_denied );
+      x = task->m_wait ; task->m_wait = s_denied ;
+
+      // update tasks waiting on this task
+      while ( x ) {
+        Task * const next = x->m_next ;
+        // Unlink x first: schedule() rejects a task whose m_next is non-null.
+        x->m_next = 0 ;
+
+        x->schedule(); // could happen concurrently
+
+        x = next ;
+      }
+
+      // Decrement to match the initial scheduling increment
+      assign( & task , 0 ); // deallocates the task if this was its last reference
+    }
+  }
+}
+
+} // namespace Impl
+} // namespace Experimental
+} // namespace Kokkos
+
+#endif /* #if defined( KOKKOS_HAVE_SERIAL ) && defined( KOKKOS_ENABLE_TASKPOLICY ) */
+
diff --git a/lib/kokkos/core/src/impl/Kokkos_Serial_TaskPolicy.hpp b/lib/kokkos/core/src/impl/Kokkos_Serial_TaskPolicy.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..a333f948ae18e3e3622d06551dd935aff0d77707
--- /dev/null
+++ b/lib/kokkos/core/src/impl/Kokkos_Serial_TaskPolicy.hpp
@@ -0,0 +1,677 @@
+/*
+//@HEADER
+// ************************************************************************
+// 
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+// 
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+// 
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact  H. Carter Edwards (hcedwar@sandia.gov)
+// 
+// ************************************************************************
+//@HEADER
+*/
+
+// Experimental unified task-data parallel manycore LDRD
+
+#ifndef KOKKOS_EXPERIMENTAL_SERIAL_TASKPOLICY_HPP
+#define KOKKOS_EXPERIMENTAL_SERIAL_TASKPOLICY_HPP
+
+#include <Kokkos_Macros.hpp>
+
+#if defined( KOKKOS_HAVE_SERIAL )
+
+#include <string>
+#include <typeinfo>
+#include <stdexcept>
+
+#include <Kokkos_Serial.hpp>
+#include <Kokkos_TaskPolicy.hpp>
+#include <Kokkos_View.hpp>
+
+#if defined( KOKKOS_ENABLE_TASKPOLICY )
+
+#include <impl/Kokkos_FunctorAdapter.hpp>
+
+//----------------------------------------------------------------------------
+/*  Inheritance structure to allow static_cast from the task root type
+ *  and a task's FunctorType.
+ *
+ *    task_root_type == TaskMember< Space , void , void >
+ *
+ *    TaskMember< PolicyType , ResultType , FunctorType >
+ *      : TaskMember< PolicyType::Space , ResultType , FunctorType >
+ *      { ... };
+ *
+ *    TaskMember< Space , ResultType , FunctorType >
+ *      : TaskMember< Space , ResultType , void >
+ *      , FunctorType
+ *      { ... };
+ *
+ *  when ResultType != void
+ *
+ *    TaskMember< Space , ResultType , void >
+ *      : TaskMember< Space , void , void >
+ *      { ... };
+ *
+ */
+//----------------------------------------------------------------------------
+
+namespace Kokkos {
+namespace Experimental {
+namespace Impl {
+
+/** \brief  Base class for all tasks in the Serial execution space */
+template<>
+class TaskMember< Kokkos::Serial , void , void >
+{
+public:
+
+  typedef void         (* function_apply_type)  ( TaskMember * );
+  typedef void         (* function_dealloc_type)( TaskMember * );
+  typedef TaskMember * (* function_verify_type) ( TaskMember * );
+
+private:
+
+  const function_dealloc_type  m_dealloc ; ///< Deallocation
+  const function_verify_type   m_verify ;  ///< Result type verification
+  const function_apply_type    m_apply ;   ///< Apply function
+  TaskMember ** const          m_dep ;     ///< Dependences
+  TaskMember *                 m_wait ;    ///< Linked list of tasks waiting on this task
+  TaskMember *                 m_next ;    ///< Linked list of tasks waiting on a different task
+  const int                    m_dep_capacity ; ///< Capacity of dependences
+  int                          m_dep_size ;     ///< Actual count of dependences
+  int                          m_ref_count ;    ///< Reference count
+  int                          m_state ;        ///< State of the task
+
+  // size = 6 Pointers + 4 ints
+
+  TaskMember() /* = delete */ ;
+  TaskMember( const TaskMember & ) /* = delete */ ;
+  TaskMember & operator = ( const TaskMember & ) /* = delete */ ;
+
+  static void * allocate( const unsigned arg_sizeof_derived , const unsigned arg_dependence_capacity );
+  static void   deallocate( void * );
+
+  void throw_error_add_dependence() const ;
+  static void throw_error_verify_type();
+
+  template < class DerivedTaskType >
+  static
+  void deallocate( TaskMember * t )
+    { // Destroy the object as its derived type, then release the raw storage.
+      DerivedTaskType * const derived = static_cast< DerivedTaskType * >(t);
+      derived->~DerivedTaskType();
+      deallocate( static_cast< void * >( derived ) );
+    }
+
+protected :
+
+  ~TaskMember();
+
+  // Used by TaskMember< Serial , ResultType , void >
+  TaskMember( const function_verify_type   arg_verify
+            , const function_dealloc_type  arg_dealloc
+            , const function_apply_type    arg_apply
+            , const unsigned               arg_sizeof_derived
+            , const unsigned               arg_dependence_capacity
+            );
+
+  // Used for TaskMember< Serial , void , void >
+  TaskMember( const function_dealloc_type  arg_dealloc
+            , const function_apply_type    arg_apply
+            , const unsigned               arg_sizeof_derived
+            , const unsigned               arg_dependence_capacity
+            );
+
+public:
+
+  template< typename ResultType >
+  KOKKOS_FUNCTION static
+  TaskMember * verify_type( TaskMember * t )
+    {
+      // A void ResultType or a null task is accepted without checking.
+      enum { check_type = ! Kokkos::Impl::is_same< ResultType , void >::value };
+
+      if ( ! check_type || 0 == t ) { return t ; }
+
+      // The task matches only if its stored verifier is this very instantiation.
+      const function_verify_type expected = & TaskMember::template verify_type< ResultType > ;
+
+      if ( expected == t->m_verify ) { return t ; }
+
+      // Mismatch: report it on the host; the result is a null pointer.
+#if defined( KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST )
+      throw_error_verify_type();
+#endif
+      return 0 ;
+    }
+
+  //----------------------------------------
+  /*  Inheritance Requirements on task types:
+   *    typedef  FunctorType::value_type  value_type ;
+   *    class DerivedTaskType
+   *      : public TaskMember< Serial , value_type , FunctorType >
+   *      { ... };
+   *    class TaskMember< Serial , value_type , FunctorType >
+   *      : public TaskMember< Serial , value_type , void >
+   *      , public FunctorType
+   *      { ... };
+   *  If value_type != void
+   *    class TaskMember< Serial , value_type , void >
+   *      : public TaskMember< Serial , void , void >
+   *
+   *  Allocate space for DerivedTaskType followed by TaskMember*[ dependence_capacity ]
+   *
+   */
+
+  /** \brief  Allocate and construct a single-thread task */
+  template< class DerivedTaskType >
+  static
+  TaskMember * create( const typename DerivedTaskType::functor_type &  arg_functor
+                     , const unsigned                                  arg_dependence_capacity
+                     )
+    {
+      using functor_type = typename DerivedTaskType::functor_type ;
+      using value_type   = typename functor_type::value_type ;
+
+      // Raw storage sized for the derived task plus its dependence slots.
+      void * const storage = allocate( sizeof(DerivedTaskType) , arg_dependence_capacity );
+      // Construct in place, wiring in the type-specific deallocate and apply_single functions.
+      return static_cast< TaskMember * >(
+        new( storage ) DerivedTaskType( & TaskMember::template deallocate< DerivedTaskType >
+                                      , & TaskMember::template apply_single< functor_type , value_type >
+                                      , sizeof(DerivedTaskType)
+                                      , arg_dependence_capacity
+                                      , arg_functor ) );
+    }
+
+  /** \brief  Allocate and construct a data parallel task */
+  template< class DerivedTaskType >
+  static
+  TaskMember * create( const typename DerivedTaskType::policy_type &   arg_policy
+                     , const typename DerivedTaskType::functor_type &  arg_functor
+                     , const unsigned                                  arg_dependence_capacity
+                     )
+    {
+      // Raw storage sized for the derived task plus its dependence slots.
+      void * const storage = allocate( sizeof(DerivedTaskType) , arg_dependence_capacity );
+
+      // Construct in place; this overload also forwards the policy to the derived task.
+      return static_cast< TaskMember * >(
+        new( storage ) DerivedTaskType( & TaskMember::template deallocate< DerivedTaskType >
+                                      , sizeof(DerivedTaskType)
+                                      , arg_dependence_capacity
+                                      , arg_policy
+                                      , arg_functor ) );
+    }
+
+  /** \brief  Allocate and construct a thread-team task */
+  template< class DerivedTaskType >
+  static
+  TaskMember * create_team( const typename DerivedTaskType::functor_type &  arg_functor
+                          , const unsigned                                  arg_dependence_capacity
+                          )
+    {
+      using functor_type = typename DerivedTaskType::functor_type ;
+      using value_type   = typename functor_type::value_type ;
+
+      // Raw storage sized for the derived task plus its dependence slots.
+      void * const storage = allocate( sizeof(DerivedTaskType) , arg_dependence_capacity );
+      // Construct in place, wiring in the type-specific deallocate and apply_team functions.
+      return static_cast< TaskMember * >(
+        new( storage ) DerivedTaskType( & TaskMember::template deallocate< DerivedTaskType >
+                                      , & TaskMember::template apply_team< functor_type , value_type >
+                                      , sizeof(DerivedTaskType)
+                                      , arg_dependence_capacity
+                                      , arg_functor ) );
+    }
+
+  void schedule();
+  static void execute_ready_tasks();
+
+  //----------------------------------------
+
+  typedef FutureValueTypeIsVoidError get_result_type ;
+  // get() on this void-result base returns a default-constructed FutureValueTypeIsVoidError object.
+  KOKKOS_INLINE_FUNCTION
+  get_result_type get() const { return get_result_type() ; }
+  // Current task state, converted from the stored int to Kokkos::Experimental::TaskState.
+  KOKKOS_INLINE_FUNCTION
+  Kokkos::Experimental::TaskState get_state() const { return Kokkos::Experimental::TaskState( m_state ); }
+
+  //----------------------------------------
+
+#if defined( KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST )
+  static // host memory space active: declaration only, no inline body
+  void assign( TaskMember ** const lhs , TaskMember * const rhs , const bool no_throw = false );
+#else
+  KOKKOS_INLINE_FUNCTION static // otherwise: inline stub with an empty body
+  void assign( TaskMember ** const lhs , TaskMember * const rhs , const bool no_throw = false ) {}
+#endif
+
+  KOKKOS_INLINE_FUNCTION
+  TaskMember * get_dependence( int i ) const // dependence i; null unless the task is EXECUTING and 0 <= i < m_dep_size
+    { return ( Kokkos::Experimental::TASK_STATE_EXECUTING == m_state && 0 <= i && i < m_dep_size ) ? m_dep[i] : (TaskMember*) 0 ; }
+
+  KOKKOS_INLINE_FUNCTION
+  int get_dependence() const // number of dependences currently recorded
+    { return m_dep_size ; }
+
+  KOKKOS_INLINE_FUNCTION
+  void clear_dependence()
+    { // Reset every recorded dependence slot to null through assign(), then mark the list empty.
+      for ( int slot = 0 ; slot < m_dep_size ; ++slot ) { assign( m_dep + slot , 0 ); }
+      m_dep_size = 0 ;
+    }
+
+  KOKKOS_INLINE_FUNCTION
+  void add_dependence( TaskMember * before )
+    {
+      // Appending is allowed only while CONSTRUCTING or EXECUTING and only
+      // while m_dep_size is below m_dep_capacity; otherwise report the error.
+      const bool appendable =
+        ( Kokkos::Experimental::TASK_STATE_CONSTRUCTING == m_state ||
+          Kokkos::Experimental::TASK_STATE_EXECUTING    == m_state ) &&
+        m_dep_size < m_dep_capacity ;
+      if ( ! appendable ) { throw_error_add_dependence(); return ; }
+      assign( m_dep + m_dep_size , before );
+      ++m_dep_size ;
+    }
+
+  //----------------------------------------
+
+  template< class FunctorType , class ResultType >
+  KOKKOS_INLINE_FUNCTION static
+  void apply_single( typename Kokkos::Impl::enable_if< ! Kokkos::Impl::is_same< ResultType , void >::value , TaskMember * >::type t )
+    {
+      // Overload selected when ResultType is not void.
+      // The concrete task type derives from both
+      //   TaskMember< Kokkos::Serial , ResultType , void >   (holds m_result)
+      //   FunctorType                                        (the functor)
+      // so the root pointer is downcast to it to reach both sub-objects.
+      typedef TaskMember< Kokkos::Serial , ResultType , FunctorType > task_type ;
+
+      task_type & task = * static_cast< task_type * >( t );
+      FunctorType & functor = static_cast< FunctorType & >( task );
+      Kokkos::Impl::FunctorApply< FunctorType , void , ResultType & >::apply( functor , & task.m_result );
+    }
+
+  template< class FunctorType , class ResultType >
+  KOKKOS_INLINE_FUNCTION static
+  void apply_single( typename Kokkos::Impl::enable_if< Kokkos::Impl::is_same< ResultType , void >::value , TaskMember * >::type t )
+    {
+      // Overload selected when ResultType is void: the functor is applied
+      // without a result argument.
+      //
+      // The concrete task type derives from FunctorType (as well as from
+      // TaskMember< Kokkos::Serial , ResultType , void >), so the root
+      // pointer is downcast to it to reach the functor sub-object.
+      typedef TaskMember< Kokkos::Serial , ResultType , FunctorType > task_type ;
+
+      task_type & task = * static_cast< task_type * >( t );
+      Kokkos::Impl::FunctorApply< FunctorType , void , void >::apply( static_cast< FunctorType & >( task ) );
+    }
+
+  //----------------------------------------
+
+  template< class FunctorType , class ResultType >
+  static
+  void apply_team( typename Kokkos::Impl::enable_if< ! Kokkos::Impl::is_same< ResultType , void >::value , TaskMember * >::type t )
+    {
+      // Overload selected when ResultType is not void.
+      //
+      // The concrete task type derives from both
+      //   TaskMember< Kokkos::Serial , ResultType , void >   (holds m_result)
+      //   FunctorType                                        (the functor)
+      typedef TaskMember< Kokkos::Serial , ResultType , FunctorType > task_type ;
+
+      task_type & task = * static_cast< task_type * >( t );
+
+      // The functor receives a SerialTeamMember built with arguments (0,1,0).
+      task.FunctorType::apply( Kokkos::Impl::SerialTeamMember(0,1,0) , task.m_result );
+    }
+
+  template< class FunctorType , class ResultType >
+  static
+  void apply_team( typename Kokkos::Impl::enable_if< Kokkos::Impl::is_same< ResultType , void >::value , TaskMember * >::type t )
+    {
+      // Overload selected when ResultType is void: the functor's apply()
+      // receives only the team member.
+      //
+      // The concrete task type derives from FunctorType (as well as from
+      // TaskMember< Kokkos::Serial , ResultType , void >).
+      typedef TaskMember< Kokkos::Serial , ResultType , FunctorType > task_type ;
+
+      task_type & task = * static_cast< task_type * >( t );
+
+      // The functor receives a SerialTeamMember built with arguments (0,1,0).
+      task.FunctorType::apply( Kokkos::Impl::SerialTeamMember(0,1,0) );
+    }
+};
+
+//----------------------------------------------------------------------------
+/** \brief  Base class for tasks with a result value in the Serial execution space.
+ *
+ *  The FunctorType must be void because this class is accessed by the
+ *  Future class for the task and result value.
+ *
+ *  Must be derived from TaskMember<S,void,void> 'root class' so the Future class
+ *  can correctly static_cast from the 'root class' to this class.
+ */
+template < class ResultType >
+class TaskMember< Kokkos::Serial , ResultType , void >
+  : public TaskMember< Kokkos::Serial , void , void >
+{
+public:
+
+  // Storage for the task's result; value-initialized by the constructor.
+  ResultType  m_result ;
+
+  using get_result_type = const ResultType & ;
+
+  // Read-only access to the stored result.
+  KOKKOS_INLINE_FUNCTION
+  get_result_type get() const { return m_result ; }
+
+protected:
+  using task_root_type        = TaskMember< Kokkos::Serial , void , void > ;
+  using function_dealloc_type = task_root_type::function_dealloc_type ;
+  using function_apply_type   = task_root_type::function_apply_type ;
+
+  // Forwards the hooks and sizes to the root task, passing the root's
+  // verify_type< ResultType > as the first argument.
+  inline
+  TaskMember( const function_dealloc_type  arg_dealloc
+            , const function_apply_type    arg_apply
+            , const unsigned               arg_sizeof_derived
+            , const unsigned               arg_dependence_capacity
+            )
+    : task_root_type( & task_root_type::template verify_type< ResultType >
+                    , arg_dealloc , arg_apply , arg_sizeof_derived , arg_dependence_capacity )
+    , m_result()
+    {}
+};
+
+template< class ResultType , class FunctorType >
+class TaskMember< Kokkos::Serial , ResultType , FunctorType >
+  : public TaskMember< Kokkos::Serial , ResultType , void >
+  , public FunctorType
+{
+public:
+  using functor_type = FunctorType ;
+
+  using task_root_type        = TaskMember< Kokkos::Serial , void , void > ;
+  using task_base_type        = TaskMember< Kokkos::Serial , ResultType , void > ;
+  using function_dealloc_type = task_root_type::function_dealloc_type ;
+  using function_apply_type   = task_root_type::function_apply_type ;
+
+  // Passes the hooks and sizes to the result-holding base class and
+  // constructs the FunctorType base sub-object from arg_functor.
+  inline
+  TaskMember( const function_dealloc_type  arg_dealloc
+            , const function_apply_type    arg_apply
+            , const unsigned               arg_sizeof_derived
+            , const unsigned               arg_dependence_capacity
+            , const functor_type &         arg_functor )
+    : task_base_type( arg_dealloc , arg_apply , arg_sizeof_derived , arg_dependence_capacity )
+    , functor_type( arg_functor )
+    {}
+};
+
+} /* namespace Impl */
+} /* namespace Experimental */
+} /* namespace Kokkos */
+
+//----------------------------------------------------------------------------
+//----------------------------------------------------------------------------
+
+namespace Kokkos {
+namespace Experimental {
+
+template<>
+class TaskPolicy< Kokkos::Serial >
+{
+public:
+  // Task policy specialization for the Kokkos::Serial execution space.
+  typedef Kokkos::Serial                  execution_space ;
+  typedef Kokkos::Impl::SerialTeamMember  member_type ;
+
+private:
+  // Root task type; get_task_root() converts functor pointers to it.
+  typedef Impl::TaskMember< execution_space , void , void > task_root_type ;
+  // Functor pointer -> derived task -> root task.  Valid only if f is the functor base of such a task.
+  template< class FunctorType >
+  static inline
+  const task_root_type * get_task_root( const FunctorType * f )
+    {
+      typedef Impl::TaskMember< execution_space , typename FunctorType::value_type , FunctorType > task_type ;
+      return static_cast< const task_root_type * >( static_cast< const task_type * >(f) );
+    }
+  // Non-const overload of the conversion above.
+  template< class FunctorType >
+  static inline
+  task_root_type * get_task_root( FunctorType * f )
+    {
+      typedef Impl::TaskMember< execution_space , typename FunctorType::value_type , FunctorType > task_type ;
+      return static_cast< task_root_type * >( static_cast< task_type * >(f) );
+    }
+  // Dependence capacity used when a create call is given the ~0u sentinel.
+  unsigned m_default_dependence_capacity ;
+
+public:
+
+  // Stubbed out for now: always returns 0.
+  KOKKOS_INLINE_FUNCTION
+  int allocated_task_count() const { return 0 ; }
+  // Stores only the default dependence capacity; the three unnamed arguments are ignored.
+  TaskPolicy
+    ( const unsigned /* arg_task_max_count */
+    , const unsigned /* arg_task_max_size */
+    , const unsigned arg_task_default_dependence_capacity = 4
+    , const unsigned /* arg_task_team_size */ = 0
+    )
+    : m_default_dependence_capacity( arg_task_default_dependence_capacity )
+    {}
+  // NOTE(review): the defaulted default constructor appears to leave m_default_dependence_capacity unset - confirm.
+  KOKKOS_FUNCTION TaskPolicy() = default ;
+  KOKKOS_FUNCTION TaskPolicy( TaskPolicy && rhs ) = default ;
+  KOKKOS_FUNCTION TaskPolicy( const TaskPolicy & rhs ) = default ;
+  KOKKOS_FUNCTION TaskPolicy & operator = ( TaskPolicy && rhs ) = default ;
+  KOKKOS_FUNCTION TaskPolicy & operator = ( const TaskPolicy & rhs ) = default ;
+
+  //----------------------------------------
+  // Calls schedule() on the future's task (only under the HOST macro) and returns f; 'priority' is unused.
+  template< class ValueType >
+  KOKKOS_INLINE_FUNCTION
+  const Future< ValueType , execution_space > &
+    spawn( const Future< ValueType , execution_space > & f 
+         , const bool priority = false ) const
+      {
+#if defined( KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST )
+        f.m_task->schedule();
+#endif
+        return f ;
+      }
+
+  //----------------------------------------
+  // Create single-thread task
+  // dependence_capacity == ~0u selects m_default_dependence_capacity; without the HOST macro the Future gets no argument.
+  template< class FunctorType >
+  KOKKOS_INLINE_FUNCTION
+  Future< typename FunctorType::value_type , execution_space >
+  task_create( const FunctorType & functor
+             , const unsigned dependence_capacity = ~0u ) const
+    {
+      typedef typename FunctorType::value_type value_type ;
+      typedef Impl::TaskMember< execution_space , value_type , FunctorType >  task_type ;
+      return Future< value_type , execution_space >(
+#if defined( KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST )
+        task_root_type::create< task_type >(
+          functor , ( ~0u == dependence_capacity ? m_default_dependence_capacity : dependence_capacity ) )
+#endif
+        );
+    }
+  // Forwards to task_create().
+  template< class FunctorType >
+  KOKKOS_INLINE_FUNCTION
+  Future< typename FunctorType::value_type , execution_space >
+  proc_create( const FunctorType & functor
+             , const unsigned dependence_capacity = ~0u ) const
+    { return task_create( functor , dependence_capacity ); }
+  // Team counterpart of task_create(): builds the task with task_root_type::create_team.
+  template< class FunctorType >
+  KOKKOS_INLINE_FUNCTION
+  Future< typename FunctorType::value_type , execution_space >
+  task_create_team( const FunctorType & functor
+                  , const unsigned dependence_capacity = ~0u ) const
+    {
+      typedef typename FunctorType::value_type value_type ;
+      typedef Impl::TaskMember< execution_space , value_type , FunctorType >  task_type ;
+      return Future< value_type , execution_space >(
+#if defined( KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST )
+        task_root_type::create_team< task_type >(
+          functor , ( ~0u == dependence_capacity ? m_default_dependence_capacity : dependence_capacity ) )
+#endif
+        );
+    }
+  // Forwards to task_create_team().
+  template< class FunctorType >
+  KOKKOS_INLINE_FUNCTION
+  Future< typename FunctorType::value_type , execution_space >
+  proc_create_team( const FunctorType & functor
+                  , const unsigned dependence_capacity = ~0u ) const
+    { return task_create_team( functor , dependence_capacity ); }
+
+  //----------------------------------------
+  // Add dependence: 'after' gains 'before' (HOST macro only); enabled only when both futures use this execution space.
+  template< class A1 , class A2 , class A3 , class A4 >
+  KOKKOS_INLINE_FUNCTION
+  void add_dependence( const Future<A1,A2> & after
+                     , const Future<A3,A4> & before
+                     , typename Kokkos::Impl::enable_if
+                        < Kokkos::Impl::is_same< typename Future<A1,A2>::execution_space , execution_space >::value
+                          &&
+                          Kokkos::Impl::is_same< typename Future<A3,A4>::execution_space , execution_space >::value
+                        >::type * = 0
+                      ) const
+    {
+#if defined( KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST )
+      after.m_task->add_dependence( before.m_task );
+#endif
+    }
+
+  //----------------------------------------
+  // Functions for an executing task functor to query dependences,
+  // set new dependences, and respawn itself.
+  // Future<void> built from the i-th dependence of the task that owns task_functor.
+  template< class FunctorType >
+  KOKKOS_INLINE_FUNCTION
+  Future< void , execution_space >
+  get_dependence( const FunctorType * task_functor , int i ) const
+    {
+      return Future<void,execution_space>(
+#if defined( KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST )
+        get_task_root(task_functor)->get_dependence(i)
+#endif
+        );
+    }
+  // Dependence count of the task that owns task_functor; 0 without the HOST macro.
+  template< class FunctorType >
+  KOKKOS_INLINE_FUNCTION
+  int get_dependence( const FunctorType * task_functor ) const
+#if defined( KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST )
+    { return get_task_root(task_functor)->get_dependence(); }
+#else
+    { return 0 ; }
+#endif
+  // Clears the dependences of the task that owns task_functor; empty body without the HOST macro.
+  template< class FunctorType >
+  KOKKOS_INLINE_FUNCTION
+  void clear_dependence( FunctorType * task_functor ) const
+#if defined( KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST )
+    { get_task_root(task_functor)->clear_dependence(); }
+#else
+    {}
+#endif
+  // Adds 'before' as a dependence of the task that owns task_functor; empty body without the HOST macro.
+  template< class FunctorType , class A3 , class A4 >
+  KOKKOS_INLINE_FUNCTION
+  void add_dependence( FunctorType * task_functor
+                     , const Future<A3,A4> & before
+                     , typename Kokkos::Impl::enable_if
+                        < Kokkos::Impl::is_same< typename Future<A3,A4>::execution_space , execution_space >::value
+                        >::type * = 0
+                      ) const
+#if defined( KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST )
+    { get_task_root(task_functor)->add_dependence( before.m_task ); }
+#else
+    {}
+#endif
+  // Calls schedule() on the task that owns task_functor (HOST macro only); 'priority' is unused.
+  template< class FunctorType >
+  KOKKOS_INLINE_FUNCTION
+  void respawn( FunctorType * task_functor 
+              , const bool priority = false ) const
+    {
+#if defined( KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST )
+      get_task_root(task_functor)->schedule();
+#endif
+    }
+  // Same body as respawn(): calls schedule() on the owning task (HOST macro only).
+  template< class FunctorType >
+  KOKKOS_INLINE_FUNCTION
+  void respawn_needing_memory( FunctorType * task_functor ) const
+    {
+#if defined( KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST )
+      get_task_root(task_functor)->schedule();
+#endif
+    }
+
+  //----------------------------------------
+  // Declared only; no definition appears in this part of the header.
+  static member_type & member_single();
+};
+
+inline  // Calls the Serial root task's execute_ready_tasks(); the policy argument is unused.
+void wait( TaskPolicy< Kokkos::Serial > & )
+{ typedef Impl::TaskMember< Kokkos::Serial , void , void > root_task ; root_task::execute_ready_tasks(); }
+
+} /* namespace Experimental */
+} // namespace Kokkos
+
+//----------------------------------------------------------------------------
+
+#endif /* #if defined( KOKKOS_ENABLE_TASKPOLICY ) */
+#endif /* defined( KOKKOS_HAVE_SERIAL ) */
+#endif /* #define KOKKOS_EXPERIMENTAL_SERIAL_TASK_HPP */
+
diff --git a/lib/kokkos/core/src/impl/Kokkos_Shape.cpp b/lib/kokkos/core/src/impl/Kokkos_Shape.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..da12db1f381e790e46604f8a15280d2a07f5152a
--- /dev/null
+++ b/lib/kokkos/core/src/impl/Kokkos_Shape.cpp
@@ -0,0 +1,178 @@
+/*
+//@HEADER
+// ************************************************************************
+// 
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+// 
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+// 
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact  H. Carter Edwards (hcedwar@sandia.gov)
+// 
+// ************************************************************************
+//@HEADER
+*/
+
+
+#include <sstream>
+#include <impl/Kokkos_Error.hpp>
+#include <impl/Kokkos_Shape.hpp>
+
+//----------------------------------------------------------------------------
+//----------------------------------------------------------------------------
+
+namespace Kokkos {
+namespace Impl {
+
+void assert_counts_are_equal_throw(
+  const size_t x_count ,
+  const size_t y_count )
+{
+  // Compose "<function>( x != y )" and pass it to throw_runtime_exception().
+  std::ostringstream text ;
+  text << "Kokkos::Impl::assert_counts_are_equal_throw( " ;
+  text << x_count ;
+  text << " != " << y_count << " )" ;
+  throw_runtime_exception( text.str() );
+}
+
+// Formats the scalar size, the rank and the leading extents of two
+// mismatched shapes into one message and passes that message to
+// throw_runtime_exception().
+void assert_shapes_are_equal_throw(
+  const unsigned x_scalar_size ,
+  const unsigned x_rank ,
+  const size_t   x_N0 , const unsigned x_N1 ,
+  const unsigned x_N2 , const unsigned x_N3 ,
+  const unsigned x_N4 , const unsigned x_N5 ,
+  const unsigned x_N6 , const unsigned x_N7 ,
+
+  const unsigned y_scalar_size ,
+  const unsigned y_rank ,
+  const size_t   y_N0 , const unsigned y_N1 ,
+  const unsigned y_N2 , const unsigned y_N3 ,
+  const unsigned y_N4 , const unsigned y_N5 ,
+  const unsigned y_N6 , const unsigned y_N7 )
+{
+  // Gather each operand's extents so one loop can print them.  The unsigned
+  // extents widen to size_t, which leaves their decimal text unchanged.
+  const size_t x_dim[8] = { x_N0 , x_N1 , x_N2 , x_N3 , x_N4 , x_N5 , x_N6 , x_N7 };
+  const size_t y_dim[8] = { y_N0 , y_N1 , y_N2 , y_N3 , y_N4 , y_N5 , y_N6 , y_N7 };
+
+  std::ostringstream msg ;
+
+  msg << "Kokkos::Impl::assert_shape_are_equal_throw( {"
+      << " scalar_size(" << x_scalar_size
+      << ") rank(" << x_rank
+      << ") dimension(" ;
+
+  // Print only the first 'rank' extents, and never more than eight.
+  for ( unsigned k = 0 ; k < 8u && k < x_rank ; ++k ) { msg << " " << x_dim[k] ; }
+
+  msg << " ) } != { "
+      << " scalar_size(" << y_scalar_size
+      << ") rank(" << y_rank
+      << ") dimension(" ;
+
+  // Same rule for the second operand.
+  for ( unsigned k = 0 ; k < 8u && k < y_rank ; ++k ) { msg << " " << y_dim[k] ; }
+
+  msg << " ) } )" ;
+
+  throw_runtime_exception( msg.str() );
+}
+
+// Bounds-failure report for Kokkos::HostSpace: formats the shape extents
+// and the supplied indices into one message and passes it to
+// throw_runtime_exception().  Only the first 'rank' extents and the first
+// 'arg_rank' indices are printed.
+void AssertShapeBoundsAbort< Kokkos::HostSpace >::apply(
+  const size_t rank ,
+  const size_t n0 , const size_t n1 ,
+  const size_t n2 , const size_t n3 ,
+  const size_t n4 , const size_t n5 ,
+  const size_t n6 , const size_t n7 ,
+
+  const size_t arg_rank ,
+  const size_t i0 , const size_t i1 ,
+  const size_t i2 , const size_t i3 ,
+  const size_t i4 , const size_t i5 ,
+  const size_t i6 , const size_t i7 )
+{
+  // Address the extents and the indices by position so the per-rank
+  // printing becomes a bounded loop.
+  const size_t extent[8] = { n0 , n1 , n2 , n3 , n4 , n5 , n6 , n7 };
+  const size_t idx[8]    = { i0 , i1 , i2 , i3 , i4 , i5 , i6 , i7 };
+
+  std::ostringstream msg ;
+
+  // A rank above eight still prints at most the eight stored values.
+  msg << "Kokkos::Impl::AssertShapeBoundsAbort( shape = {" ;
+  for ( size_t k = 0 ; k < 8u && k < rank ; ++k ) { msg << " " << extent[k] ; }
+
+  // The indices follow the same rule, limited by arg_rank.
+  msg << " } index = {" ;
+  for ( size_t k = 0 ; k < 8u && k < arg_rank ; ++k ) { msg << " " << idx[k] ; }
+
+  msg << " } )" ;
+
+  throw_runtime_exception( msg.str() );
+}
+
+// Formats the first 'x_rank' extents of a shape together with the count N0
+// and passes the message to throw_runtime_exception().
+void assert_shape_effective_rank1_at_leastN_throw(
+  const size_t x_rank , const size_t x_N0 ,
+  const size_t x_N1 ,   const size_t x_N2 ,
+  const size_t x_N3 ,   const size_t x_N4 ,
+  const size_t x_N5 ,   const size_t x_N6 ,
+  const size_t x_N7 ,
+  const size_t N0 )
+{
+  // Address the extents by position so they can be printed in a loop.
+  const size_t x_dim[8] = { x_N0 , x_N1 , x_N2 , x_N3 , x_N4 , x_N5 , x_N6 , x_N7 };
+
+  std::ostringstream msg ;
+
+  msg << "Kokkos::Impl::assert_shape_effective_rank1_at_leastN_throw( shape = {" ;
+
+  // Print only the first 'x_rank' extents, and never more than eight.
+  for ( size_t k = 0 ; k < 8u && k < x_rank ; ++k ) { msg << " " << x_dim[k] ; }
+  msg << " } N = " << N0 << " )" ;
+
+  throw_runtime_exception( msg.str() );
+}
+
+
+
+}
+}
+
diff --git a/lib/kokkos/core/src/impl/Kokkos_Shape.hpp b/lib/kokkos/core/src/impl/Kokkos_Shape.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..9749e0a1ff73107b97435862f737d96439fcb9d3
--- /dev/null
+++ b/lib/kokkos/core/src/impl/Kokkos_Shape.hpp
@@ -0,0 +1,917 @@
+/*
+//@HEADER
+// ************************************************************************
+// 
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+// 
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+// 
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact  H. Carter Edwards (hcedwar@sandia.gov)
+// 
+// ************************************************************************
+//@HEADER
+*/
+
+#ifndef KOKKOS_SHAPE_HPP
+#define KOKKOS_SHAPE_HPP
+
+#include <typeinfo>
+#include <utility>
+#include <Kokkos_Core_fwd.hpp>
+#include <impl/Kokkos_Traits.hpp>
+#include <impl/Kokkos_StaticAssert.hpp>
+
+//----------------------------------------------------------------------------
+//----------------------------------------------------------------------------
+
+namespace Kokkos {
+namespace Impl {
+
+//----------------------------------------------------------------------------
+/** \brief  The shape of a Kokkos array with dynamic and static dimensions.
+ *          Dynamic dimensions are member values and static dimensions are
+ *          'static const' values.
+ *
+ *  The upper bound on the array rank is eight (parameters s0 through s7).
+ */
+template< unsigned ScalarSize ,
+          unsigned Rank ,
+          unsigned s0  = 1 ,   // s0..s7 each default to 1 when omitted
+          unsigned s1  = 1 ,
+          unsigned s2  = 1 ,
+          unsigned s3  = 1 ,
+          unsigned s4  = 1 ,
+          unsigned s5  = 1 ,
+          unsigned s6  = 1 ,
+          unsigned s7  = 1 >
+struct Shape ;                 // forward declaration only
+
+//----------------------------------------------------------------------------
+/** \brief  Shape equality: true when the scalar size, the rank, and all
+ *          eight dimensions (N0..N7) are equal.
+ */
+template< unsigned xSize , unsigned xRank ,
+          unsigned xN0 , unsigned xN1 , unsigned xN2 , unsigned xN3 ,
+          unsigned xN4 , unsigned xN5 , unsigned xN6 , unsigned xN7 ,
+
+          unsigned ySize , unsigned yRank ,
+          unsigned yN0 , unsigned yN1 , unsigned yN2 , unsigned yN3 ,
+          unsigned yN4 , unsigned yN5 , unsigned yN6 , unsigned yN7 >
+KOKKOS_INLINE_FUNCTION
+bool operator == ( const Shape<xSize,xRank,xN0,xN1,xN2,xN3,xN4,xN5,xN6,xN7> & x ,
+                   const Shape<ySize,yRank,yN0,yN1,yN2,yN3,yN4,yN5,yN6,yN7> & y )
+{
+  // Scalar size and rank are template arguments; a mismatch in either one
+  // decides the result before any dimension is compared.
+  enum { size_matches = ( xSize == ySize ) , rank_matches = ( xRank == yRank ) };
+  if ( ! ( size_matches && rank_matches ) ) { return false ; }
+  return size_t( x.N0 )   == size_t( y.N0 ) &&
+         unsigned( x.N1 ) == unsigned( y.N1 ) &&
+         unsigned( x.N2 ) == unsigned( y.N2 ) &&
+         unsigned( x.N3 ) == unsigned( y.N3 ) &&
+         unsigned( x.N4 ) == unsigned( y.N4 ) &&
+         unsigned( x.N5 ) == unsigned( y.N5 ) &&
+         unsigned( x.N6 ) == unsigned( y.N6 ) &&
+         unsigned( x.N7 ) == unsigned( y.N7 ) ;
+}
+
+// Shape inequality: the negation of the Shape operator == overload.
+template< unsigned xSize , unsigned xRank ,
+          unsigned xN0 , unsigned xN1 , unsigned xN2 , unsigned xN3 ,
+          unsigned xN4 , unsigned xN5 , unsigned xN6 , unsigned xN7 ,
+          unsigned ySize ,unsigned yRank ,
+          unsigned yN0 , unsigned yN1 , unsigned yN2 , unsigned yN3 ,
+          unsigned yN4 , unsigned yN5 , unsigned yN6 , unsigned yN7 >
+KOKKOS_INLINE_FUNCTION
+bool operator != ( const Shape<xSize,xRank,xN0,xN1,xN2,xN3,xN4,xN5,xN6,xN7> & x ,
+                   const Shape<ySize,yRank,yN0,yN1,yN2,yN3,yN4,yN5,yN6,yN7> & y )
+{ const bool equal = operator == ( x , y ); return ! equal ; }
+
+//----------------------------------------------------------------------------
+
+// Failure path, declared here without a body: called by
+// assert_counts_are_equal() when the two counts differ.
+void assert_counts_are_equal_throw( const size_t x_count , const size_t y_count );
+
+// Checks that two counts match; on a mismatch passes both values to
+// assert_counts_are_equal_throw().
+inline
+void assert_counts_are_equal( const size_t x_count , const size_t y_count )
+{
+  if ( x_count == y_count ) { return ; }
+
+  assert_counts_are_equal_throw( x_count , y_count );
+}
+
+void assert_shapes_are_equal_throw(   // declaration only: first shape's scalar size, rank, extents N0..N7
+  const unsigned x_scalar_size ,
+  const unsigned x_rank ,
+  const size_t   x_N0 , const unsigned x_N1 ,
+  const unsigned x_N2 , const unsigned x_N3 ,
+  const unsigned x_N4 , const unsigned x_N5 ,
+  const unsigned x_N6 , const unsigned x_N7 ,
+  // second shape, same layout: scalar size, rank, then extents N0..N7
+  const unsigned y_scalar_size ,
+  const unsigned y_rank ,
+  const size_t   y_N0 , const unsigned y_N1 ,
+  const unsigned y_N2 , const unsigned y_N3 ,
+  const unsigned y_N4 , const unsigned y_N5 ,
+  const unsigned y_N6 , const unsigned y_N7 );
+
+template< unsigned xSize , unsigned xRank ,
+          unsigned xN0 , unsigned xN1 , unsigned xN2 , unsigned xN3 ,
+          unsigned xN4 , unsigned xN5 , unsigned xN6 , unsigned xN7 ,
+
+          unsigned ySize , unsigned yRank ,
+          unsigned yN0 , unsigned yN1 , unsigned yN2 , unsigned yN3 ,
+          unsigned yN4 , unsigned yN5 , unsigned yN6 , unsigned yN7 >
+inline
+void assert_shapes_are_equal(
+  const Shape<xSize,xRank,xN0,xN1,xN2,xN3,xN4,xN5,xN6,xN7> & x ,
+  const Shape<ySize,yRank,yN0,yN1,yN2,yN3,yN4,yN5,yN6,yN7> & y )
+{
+  typedef Shape<xSize,xRank,xN0,xN1,xN2,xN3,xN4,xN5,xN6,xN7> lhs_shape ;
+  typedef Shape<ySize,yRank,yN0,yN1,yN2,yN3,yN4,yN5,yN6,yN7> rhs_shape ;
+
+  // Shapes that do not compare unequal need no report.
+  if ( ! ( x != y ) ) { return ; }
+  assert_shapes_are_equal_throw(
+    lhs_shape::scalar_size, lhs_shape::rank, x.N0, x.N1, x.N2, x.N3, x.N4, x.N5, x.N6, x.N7,
+    rhs_shape::scalar_size, rhs_shape::rank, y.N0, y.N1, y.N2, y.N3, y.N4, y.N5, y.N6, y.N7 );
+}
+
+template< unsigned xSize , unsigned xRank ,
+          unsigned xN0 , unsigned xN1 , unsigned xN2 , unsigned xN3 ,
+          unsigned xN4 , unsigned xN5 , unsigned xN6 , unsigned xN7 ,
+
+          unsigned ySize , unsigned yRank ,
+          unsigned yN0 , unsigned yN1 , unsigned yN2 , unsigned yN3 ,
+          unsigned yN4 , unsigned yN5 , unsigned yN6 , unsigned yN7 >
+void assert_shapes_equal_dimension(
+  const Shape<xSize,xRank,xN0,xN1,xN2,xN3,xN4,xN5,xN6,xN7> & x ,
+  const Shape<ySize,yRank,yN0,yN1,yN2,yN3,yN4,yN5,yN6,yN7> & y )
+{
+  typedef Shape<xSize,xRank,xN0,xN1,xN2,xN3,xN4,xN5,xN6,xN7> lhs_shape ;
+  typedef Shape<ySize,yRank,yN0,yN1,yN2,yN3,yN4,yN5,yN6,yN7> rhs_shape ;
+
+  // Rank and every dimension must agree; scalar_size is not compared.
+  const bool same_extents = unsigned( x.rank ) == unsigned( y.rank ) &&
+                            size_t( x.N0 )   == size_t( y.N0 ) &&
+                            unsigned( x.N1 ) == unsigned( y.N1 ) &&
+                            unsigned( x.N2 ) == unsigned( y.N2 ) &&
+                            unsigned( x.N3 ) == unsigned( y.N3 ) &&
+                            unsigned( x.N4 ) == unsigned( y.N4 ) &&
+                            unsigned( x.N5 ) == unsigned( y.N5 ) &&
+                            unsigned( x.N6 ) == unsigned( y.N6 ) &&
+                            unsigned( x.N7 ) == unsigned( y.N7 ) ;
+  if ( same_extents ) { return ; }
+  assert_shapes_are_equal_throw(
+    lhs_shape::scalar_size, lhs_shape::rank, x.N0, x.N1, x.N2, x.N3, x.N4, x.N5, x.N6, x.N7,
+    rhs_shape::scalar_size, rhs_shape::rank, y.N0, y.N1, y.N2, y.N3, y.N4, y.N5, y.N6, y.N7 );
+}
+
+//----------------------------------------------------------------------------
+
+template< class ShapeType > struct assert_shape_is_rank_zero ;   // primary template: declared only
+template< class ShapeType > struct assert_shape_is_rank_one ;    // primary template: declared only
+
+// Shapes of rank 0 satisfy the rank-zero check.
+template< unsigned Size >
+struct assert_shape_is_rank_zero< Shape<Size,0> > : true_type {};
+
+// Shapes of rank 1 (any s0) satisfy the rank-one check.
+template< unsigned Size , unsigned s0 >
+struct assert_shape_is_rank_one< Shape<Size,1,s0> > : true_type {};
+
+//----------------------------------------------------------------------------
+
+/** \brief  Array bounds assertion templated on the execution space
+ *          to allow device-specific abort code.
+ */
+template< class Space >
+struct AssertShapeBoundsAbort ;
+
+// Kokkos::HostSpace specialization: apply() is only declared here.  Its
+// arguments are the shape (rank, extents n0..n7) followed by the index
+// tuple that was checked (arg_rank, indices i0..i7).
+template<>
+struct AssertShapeBoundsAbort< Kokkos::HostSpace >
+{
+  static void apply( const size_t rank ,
+                     const size_t n0 , const size_t n1 , const size_t n2 , const size_t n3 ,
+                     const size_t n4 , const size_t n5 , const size_t n6 , const size_t n7 ,
+                     const size_t arg_rank ,
+                     const size_t i0 , const size_t i1 , const size_t i2 , const size_t i3 ,
+                     const size_t i4 , const size_t i5 , const size_t i6 , const size_t i7 );
+};
+
+// Generic definition, used by any space lacking its own specialization:
+// forwards every argument, unchanged, to the Kokkos::HostSpace version.
+template< class ExecutionSpace >
+struct AssertShapeBoundsAbort
+{
+  KOKKOS_INLINE_FUNCTION
+  static void apply( const size_t rank ,
+                     const size_t n0 , const size_t n1 , const size_t n2 , const size_t n3 ,
+                     const size_t n4 , const size_t n5 , const size_t n6 , const size_t n7 ,
+                     const size_t arg_rank ,
+                     const size_t i0 , const size_t i1 , const size_t i2 , const size_t i3 ,
+                     const size_t i4 , const size_t i5 , const size_t i6 , const size_t i7 )
+    {
+      // Name the host specialization once, then delegate to it.
+      typedef AssertShapeBoundsAbort< Kokkos::HostSpace > host_abort ;
+
+      host_abort::apply( rank ,    n0 , n1 , n2 , n3 , n4 , n5 , n6 , n7 ,
+                         arg_rank, i0 , i1 , i2 , i3 , i4 , i5 , i6 , i7 );
+
+    }
+};
+
+template< class ShapeType >
+KOKKOS_INLINE_FUNCTION
+void assert_shape_bounds( const ShapeType & shape ,
+                          const size_t arg_rank ,
+                          const size_t i0 ,
+                          const size_t i1 = 0 ,
+                          const size_t i2 = 0 ,
+                          const size_t i3 = 0 ,
+                          const size_t i4 = 0 ,
+                          const size_t i5 = 0 ,
+                          const size_t i6 = 0 ,
+                          const size_t i7 = 0 )
+{
+  // A violation is either fewer indices than the shape's rank or any
+  // index at or beyond its dimension.
+  const bool violated = ! ( ShapeType::rank <= arg_rank ) ||
+                        size_t(shape.N0) <= i0 ||
+                        size_t(shape.N1) <= i1 ||
+                        size_t(shape.N2) <= i2 ||
+                        size_t(shape.N3) <= i3 ||
+                        size_t(shape.N4) <= i4 ||
+                        size_t(shape.N5) <= i5 ||
+                        size_t(shape.N6) <= i6 ||
+                        size_t(shape.N7) <= i7 ;
+
+  if ( violated ) {
+    typedef AssertShapeBoundsAbort< Kokkos::Impl::ActiveExecutionMemorySpace > abort_type ;
+    abort_type::apply( ShapeType::rank ,
+                       shape.N0 , shape.N1 , shape.N2 , shape.N3 ,
+                       shape.N4 , shape.N5 , shape.N6 , shape.N7 ,
+                       arg_rank , i0 , i1 , i2 , i3 , i4 , i5 , i6 , i7 );
+  }
+}
+
+#if defined( KOKKOS_ENABLE_DEBUG_BOUNDS_CHECK ) /* checking on: KOKKOS_ASSERT_SHAPE_BOUNDS_k expands to assert_shape_bounds(S,k,...) with the trailing ';' included */
+#define KOKKOS_ASSERT_SHAPE_BOUNDS_1( S , I0 ) assert_shape_bounds(S,1,I0);
+#define KOKKOS_ASSERT_SHAPE_BOUNDS_2( S , I0 , I1 ) assert_shape_bounds(S,2,I0,I1);
+#define KOKKOS_ASSERT_SHAPE_BOUNDS_3( S , I0 , I1 , I2 ) assert_shape_bounds(S,3,I0,I1,I2);
+#define KOKKOS_ASSERT_SHAPE_BOUNDS_4( S , I0 , I1 , I2 , I3 ) assert_shape_bounds(S,4,I0,I1,I2,I3);
+#define KOKKOS_ASSERT_SHAPE_BOUNDS_5( S , I0 , I1 , I2 , I3 , I4 ) assert_shape_bounds(S,5,I0,I1,I2,I3,I4);
+#define KOKKOS_ASSERT_SHAPE_BOUNDS_6( S , I0 , I1 , I2 , I3 , I4 , I5 ) assert_shape_bounds(S,6,I0,I1,I2,I3,I4,I5);
+#define KOKKOS_ASSERT_SHAPE_BOUNDS_7( S , I0 , I1 , I2 , I3 , I4 , I5 , I6 ) assert_shape_bounds(S,7,I0,I1,I2,I3,I4,I5,I6);
+#define KOKKOS_ASSERT_SHAPE_BOUNDS_8( S , I0 , I1 , I2 , I3 , I4 , I5 , I6 , I7 ) assert_shape_bounds(S,8,I0,I1,I2,I3,I4,I5,I6,I7);
+#else /* checking off: every macro expands to nothing, so its arguments are never evaluated */
+#define KOKKOS_ASSERT_SHAPE_BOUNDS_1( S , I0 ) /* */
+#define KOKKOS_ASSERT_SHAPE_BOUNDS_2( S , I0 , I1 ) /* */
+#define KOKKOS_ASSERT_SHAPE_BOUNDS_3( S , I0 , I1 , I2 ) /* */
+#define KOKKOS_ASSERT_SHAPE_BOUNDS_4( S , I0 , I1 , I2 , I3 ) /* */
+#define KOKKOS_ASSERT_SHAPE_BOUNDS_5( S , I0 , I1 , I2 , I3 , I4 ) /* */
+#define KOKKOS_ASSERT_SHAPE_BOUNDS_6( S , I0 , I1 , I2 , I3 , I4 , I5 ) /* */
+#define KOKKOS_ASSERT_SHAPE_BOUNDS_7( S , I0 , I1 , I2 , I3 , I4 , I5 , I6 ) /* */
+#define KOKKOS_ASSERT_SHAPE_BOUNDS_8( S , I0 , I1 , I2 , I3 , I4 , I5 , I6 , I7 ) /* */
+#endif /* defined( KOKKOS_ENABLE_DEBUG_BOUNDS_CHECK ) */
+
+
+//----------------------------------------------------------------------------
+//----------------------------------------------------------------------------
+// Specialization and optimization for the Rank 0 shape: no dynamic extents, every N0..N7 is the constant 1.
+
+template < unsigned ScalarSize >
+struct Shape< ScalarSize , 0, 1,1,1,1, 1,1,1,1 >
+{
+  enum { scalar_size   = ScalarSize };
+  enum { rank_dynamic = 0 };
+  enum { rank         = 0 };
+
+  enum { N0 = 1 };
+  enum { N1 = 1 };
+  enum { N2 = 1 };
+  enum { N3 = 1 };
+  enum { N4 = 1 };
+  enum { N5 = 1 };
+  enum { N6 = 1 };
+  enum { N7 = 1 };
+
+  KOKKOS_INLINE_FUNCTION
+  static
+  void assign( Shape & , // no-op: nothing is stored, so the shape and all eight values are ignored
+               unsigned = 0 , unsigned = 0 , unsigned = 0 , unsigned = 0 ,
+               unsigned = 0 , unsigned = 0 , unsigned = 0 , unsigned = 0 )
+  {}
+};
+
+//----------------------------------------------------------------------------
+
+template< unsigned R > struct assign_shape_dimension ; // primary template is only declared; the macro below defines it for R = 0..7
+// Each generated specialization has a constructor that stores n into shape.N<R>. NOTE(review): the Impl::enable_if parameter type presumably restricts it to R < ShapeType::rank_dynamic - confirm against Impl::enable_if.
+#define KOKKOS_ASSIGN_SHAPE_DIMENSION( R ) \
+template<> \
+struct assign_shape_dimension< R > \
+{ \
+  template< class ShapeType > \
+  KOKKOS_INLINE_FUNCTION \
+  assign_shape_dimension( ShapeType & shape \
+                        , typename Impl::enable_if<( R < ShapeType::rank_dynamic ), size_t >::type n \
+                        ) { shape.N ## R = n ; } \
+};
+
+KOKKOS_ASSIGN_SHAPE_DIMENSION(0)
+KOKKOS_ASSIGN_SHAPE_DIMENSION(1)
+KOKKOS_ASSIGN_SHAPE_DIMENSION(2)
+KOKKOS_ASSIGN_SHAPE_DIMENSION(3)
+KOKKOS_ASSIGN_SHAPE_DIMENSION(4)
+KOKKOS_ASSIGN_SHAPE_DIMENSION(5)
+KOKKOS_ASSIGN_SHAPE_DIMENSION(6)
+KOKKOS_ASSIGN_SHAPE_DIMENSION(7)
+
+#undef KOKKOS_ASSIGN_SHAPE_DIMENSION /* generator macro is local to this header section */
+
+//----------------------------------------------------------------------------
+// All-static dimension array: primary Shape template, chosen when no more specialized form matches; N0..N7 are the constants s0..s7.
+
+template < unsigned ScalarSize ,
+           unsigned Rank ,
+           unsigned s0 ,
+           unsigned s1 ,
+           unsigned s2 ,
+           unsigned s3 ,
+           unsigned s4 ,
+           unsigned s5 ,
+           unsigned s6 ,
+           unsigned s7 >
+struct Shape {
+
+  enum { scalar_size   = ScalarSize };
+  enum { rank_dynamic = 0 }; // no extent is stored at run time
+  enum { rank         = Rank };
+
+  enum { N0 = s0 };
+  enum { N1 = s1 };
+  enum { N2 = s2 };
+  enum { N3 = s3 };
+  enum { N4 = s4 };
+  enum { N5 = s5 };
+  enum { N6 = s6 };
+  enum { N7 = s7 };
+
+  KOKKOS_INLINE_FUNCTION
+  static
+  void assign( Shape & , // no-op: all extents are compile-time constants, so every argument is ignored
+               unsigned = 0 , unsigned = 0 , unsigned = 0 , unsigned = 0 ,
+               unsigned = 0 , unsigned = 0 , unsigned = 0 , unsigned = 0 )
+  {}
+};
+
+// 1 == dynamic_rank <= rank <= 8 : a leading static extent of 0 marks N0 as a run-time member; N1..N7 stay constants.
+template < unsigned ScalarSize ,
+           unsigned Rank ,
+           unsigned s1 ,
+           unsigned s2 ,
+           unsigned s3 ,
+           unsigned s4 ,
+           unsigned s5 ,
+           unsigned s6 ,
+           unsigned s7 >
+struct Shape< ScalarSize , Rank , 0,s1,s2,s3, s4,s5,s6,s7 >
+{
+  enum { scalar_size   = ScalarSize };
+  enum { rank_dynamic = 1 };
+  enum { rank         = Rank };
+
+  size_t N0 ; // size_t rather than unsigned: with a single dynamic extent, allow N0 > 2^32 (where size_t is wide enough)
+
+  enum { N1 = s1 };
+  enum { N2 = s2 };
+  enum { N3 = s3 };
+  enum { N4 = s4 };
+  enum { N5 = s5 };
+  enum { N6 = s6 };
+  enum { N7 = s7 };
+
+  KOKKOS_INLINE_FUNCTION
+  static
+  void assign( Shape & s , // stores n0 into s.N0; the seven remaining arguments are ignored
+               size_t n0 , unsigned = 0 , unsigned = 0 , unsigned = 0 ,
+               unsigned = 0 , unsigned = 0 , unsigned = 0 , unsigned = 0 )
+  { s.N0 = n0 ; }
+};
+
+// 2 == dynamic_rank <= rank <= 8 : N0 and N1 are run-time members (unsigned); N2..N7 are the constants s2..s7.
+template < unsigned ScalarSize , unsigned Rank ,
+           unsigned s2 ,
+           unsigned s3 ,
+           unsigned s4 ,
+           unsigned s5 ,
+           unsigned s6 ,
+           unsigned s7 >
+struct Shape< ScalarSize , Rank , 0,0,s2,s3, s4,s5,s6,s7 >
+{
+  enum { scalar_size   = ScalarSize };
+  enum { rank_dynamic = 2 };
+  enum { rank         = Rank };
+
+  unsigned N0 ;
+  unsigned N1 ;
+
+  enum { N2 = s2 };
+  enum { N3 = s3 };
+  enum { N4 = s4 };
+  enum { N5 = s5 };
+  enum { N6 = s6 };
+  enum { N7 = s7 };
+
+  KOKKOS_INLINE_FUNCTION
+  static
+  void assign( Shape & s , // stores n0, n1 into s.N0, s.N1; the remaining arguments are ignored
+               unsigned n0 , unsigned n1 , unsigned = 0 , unsigned = 0 ,
+               unsigned = 0 , unsigned = 0 , unsigned = 0 , unsigned = 0 )
+  { s.N0 = n0 ; s.N1 = n1 ; }
+};
+
+// 3 == dynamic_rank <= rank <= 8 : N0..N2 are run-time members (unsigned); N3..N7 are the constants s3..s7.
+template < unsigned ScalarSize , unsigned Rank ,
+           unsigned s3 ,
+           unsigned s4 ,
+           unsigned s5 ,
+           unsigned s6 ,
+           unsigned s7 >
+struct Shape< ScalarSize , Rank , 0,0,0,s3, s4,s5,s6,s7 >
+{
+  enum { scalar_size   = ScalarSize };
+  enum { rank_dynamic = 3 };
+  enum { rank         = Rank };
+
+  unsigned N0 ;
+  unsigned N1 ;
+  unsigned N2 ;
+
+  enum { N3 = s3 };
+  enum { N4 = s4 };
+  enum { N5 = s5 };
+  enum { N6 = s6 };
+  enum { N7 = s7 };
+
+  KOKKOS_INLINE_FUNCTION
+  static
+  void assign( Shape & s , // stores n0..n2 into s.N0..s.N2; the remaining arguments are ignored
+               unsigned n0 , unsigned n1 , unsigned n2 , unsigned = 0 ,
+               unsigned = 0 , unsigned = 0 , unsigned = 0 , unsigned = 0 )
+  { s.N0 = n0 ; s.N1 = n1 ; s.N2 = n2 ; }
+};
+
+// 4 == dynamic_rank <= rank <= 8 : N0..N3 are run-time members (unsigned); N4..N7 are the constants s4..s7.
+template < unsigned ScalarSize , unsigned Rank ,
+           unsigned s4 ,
+           unsigned s5 ,
+           unsigned s6 ,
+           unsigned s7 >
+struct Shape< ScalarSize , Rank, 0,0,0,0, s4,s5,s6,s7 >
+{
+  enum { scalar_size   = ScalarSize };
+  enum { rank_dynamic = 4 };
+  enum { rank         = Rank };
+
+  unsigned N0 ;
+  unsigned N1 ;
+  unsigned N2 ;
+  unsigned N3 ;
+
+  enum { N4 = s4 };
+  enum { N5 = s5 };
+  enum { N6 = s6 };
+  enum { N7 = s7 };
+
+  KOKKOS_INLINE_FUNCTION
+  static
+  void assign( Shape & s , // stores n0..n3 into s.N0..s.N3; the remaining arguments are ignored
+               unsigned n0 , unsigned n1 , unsigned n2 , unsigned n3 ,
+               unsigned = 0 , unsigned = 0 , unsigned = 0 , unsigned = 0 )
+  { s.N0 = n0 ; s.N1 = n1 ; s.N2 = n2 ; s.N3 = n3 ; }
+};
+
+// 5 == dynamic_rank <= rank <= 8 : N0..N4 are run-time members (unsigned); N5..N7 are the constants s5..s7.
+template < unsigned ScalarSize , unsigned Rank ,
+           unsigned s5 ,
+           unsigned s6 ,
+           unsigned s7 >
+struct Shape< ScalarSize , Rank , 0,0,0,0, 0,s5,s6,s7 >
+{
+  enum { scalar_size   = ScalarSize };
+  enum { rank_dynamic = 5 };
+  enum { rank         = Rank };
+
+  unsigned N0 ;
+  unsigned N1 ;
+  unsigned N2 ;
+  unsigned N3 ;
+  unsigned N4 ;
+
+  enum { N5 = s5 };
+  enum { N6 = s6 };
+  enum { N7 = s7 };
+
+  KOKKOS_INLINE_FUNCTION
+  static
+  void assign( Shape & s , // stores n0..n4 into s.N0..s.N4; the remaining arguments are ignored
+               unsigned n0 , unsigned n1 , unsigned n2 , unsigned n3 ,
+               unsigned n4 , unsigned = 0 , unsigned = 0 , unsigned = 0 )
+  { s.N0 = n0 ; s.N1 = n1 ; s.N2 = n2 ; s.N3 = n3 ; s.N4 = n4 ; }
+};
+
+// 6 == dynamic_rank <= rank <= 8 : N0..N5 are run-time members (unsigned); N6 and N7 are the constants s6, s7.
+template < unsigned ScalarSize , unsigned Rank ,
+           unsigned s6 ,
+           unsigned s7 >
+struct Shape< ScalarSize , Rank , 0,0,0,0, 0,0,s6,s7 >
+{
+  enum { scalar_size   = ScalarSize };
+  enum { rank_dynamic = 6 };
+  enum { rank         = Rank };
+
+  unsigned N0 ;
+  unsigned N1 ;
+  unsigned N2 ;
+  unsigned N3 ;
+  unsigned N4 ;
+  unsigned N5 ;
+
+  enum { N6 = s6 };
+  enum { N7 = s7 };
+
+  KOKKOS_INLINE_FUNCTION
+  static
+  void assign( Shape & s , // stores n0..n5 into s.N0..s.N5; the last two arguments are ignored
+               unsigned n0 , unsigned n1 , unsigned n2 , unsigned n3 ,
+               unsigned n4 , unsigned n5 = 0 , unsigned = 0 , unsigned = 0 ) // NOTE(review): n5 has a default although N5 is dynamic here, unlike the sibling specializations - confirm intended
+  {
+    s.N0 = n0 ; s.N1 = n1 ; s.N2 = n2 ; s.N3 = n3 ;
+    s.N4 = n4 ; s.N5 = n5 ;
+  }
+};
+
+// 7 == dynamic_rank <= rank <= 8 : N0..N6 are run-time members (unsigned); only N7 is the constant s7.
+template < unsigned ScalarSize , unsigned Rank ,
+           unsigned s7 >
+struct Shape< ScalarSize , Rank , 0,0,0,0, 0,0,0,s7 >
+{
+  enum { scalar_size   = ScalarSize };
+  enum { rank_dynamic = 7 };
+  enum { rank         = Rank };
+
+  unsigned N0 ;
+  unsigned N1 ;
+  unsigned N2 ;
+  unsigned N3 ;
+  unsigned N4 ;
+  unsigned N5 ;
+  unsigned N6 ;
+
+  enum { N7 = s7 };
+
+  KOKKOS_INLINE_FUNCTION
+  static
+  void assign( Shape & s , // stores n0..n6 into s.N0..s.N6; the last argument is ignored
+               unsigned n0 , unsigned n1 , unsigned n2 , unsigned n3 ,
+               unsigned n4 , unsigned n5 , unsigned n6 , unsigned = 0 )
+  {
+    s.N0 = n0 ; s.N1 = n1 ; s.N2 = n2 ; s.N3 = n3 ;
+    s.N4 = n4 ; s.N5 = n5 ; s.N6 = n6 ;
+  }
+};
+
+// 8 == dynamic_rank <= rank <= 8 : rank is fixed at 8 and all of N0..N7 are run-time members (unsigned).
+template < unsigned ScalarSize >
+struct Shape< ScalarSize , 8 , 0,0,0,0, 0,0,0,0 >
+{
+  enum { scalar_size   = ScalarSize };
+  enum { rank_dynamic = 8 };
+  enum { rank         = 8 };
+
+  unsigned N0 ;
+  unsigned N1 ;
+  unsigned N2 ;
+  unsigned N3 ;
+  unsigned N4 ;
+  unsigned N5 ;
+  unsigned N6 ;
+  unsigned N7 ;
+
+  KOKKOS_INLINE_FUNCTION
+  static
+  void assign( Shape & s , // stores n0..n7 into s.N0..s.N7; no argument has a default
+               unsigned n0 , unsigned n1 , unsigned n2 , unsigned n3 ,
+               unsigned n4 , unsigned n5 , unsigned n6 , unsigned n7 )
+  {
+    s.N0 = n0 ; s.N1 = n1 ; s.N2 = n2 ; s.N3 = n3 ;
+    s.N4 = n4 ; s.N5 = n5 ; s.N6 = n6 ; s.N7 = n7 ;
+  }
+};
+
+//----------------------------------------------------------------------------
+
+template< class ShapeType , unsigned N , // ShapeInsert<S,N>::type: a Shape with rank S::rank + 1 and extent N placed right after S's dynamic extents
+          unsigned R = ShapeType::rank_dynamic > // R selects the specialization; defaults to S's dynamic rank
+struct ShapeInsert ; // declared only; specializations below cover R = 0..7, and each drops S's old N7 slot
+
+template< class ShapeType , unsigned N >
+struct ShapeInsert< ShapeType , N , 0 > // no dynamic extents: N becomes extent 0, old N0..N6 move to slots 1..7
+{
+  typedef Shape< ShapeType::scalar_size ,
+                 ShapeType::rank + 1 ,
+                 N ,
+                 ShapeType::N0 ,
+                 ShapeType::N1 ,
+                 ShapeType::N2 ,
+                 ShapeType::N3 ,
+                 ShapeType::N4 ,
+                 ShapeType::N5 ,
+                 ShapeType::N6 > type ;
+};
+
+template< class ShapeType , unsigned N >
+struct ShapeInsert< ShapeType , N , 1 > // one dynamic extent (0), then N, then old N1..N6
+{
+  typedef Shape< ShapeType::scalar_size ,
+                 ShapeType::rank + 1 ,
+                 0 ,
+                 N ,
+                 ShapeType::N1 ,
+                 ShapeType::N2 ,
+                 ShapeType::N3 ,
+                 ShapeType::N4 ,
+                 ShapeType::N5 ,
+                 ShapeType::N6 > type ;
+};
+
+template< class ShapeType , unsigned N >
+struct ShapeInsert< ShapeType , N , 2 > // two dynamic extents, then N, then old N2..N6
+{
+  typedef Shape< ShapeType::scalar_size ,
+                 ShapeType::rank + 1 ,
+                 0 ,
+                 0 ,
+                 N ,
+                 ShapeType::N2 ,
+                 ShapeType::N3 ,
+                 ShapeType::N4 ,
+                 ShapeType::N5 ,
+                 ShapeType::N6 > type ;
+};
+
+template< class ShapeType , unsigned N >
+struct ShapeInsert< ShapeType , N , 3 > // three dynamic extents, then N, then old N3..N6
+{
+  typedef Shape< ShapeType::scalar_size ,
+                 ShapeType::rank + 1 ,
+                 0 ,
+                 0 ,
+                 0 ,
+                 N ,
+                 ShapeType::N3 ,
+                 ShapeType::N4 ,
+                 ShapeType::N5 ,
+                 ShapeType::N6 > type ;
+};
+
+template< class ShapeType , unsigned N >
+struct ShapeInsert< ShapeType , N , 4 > // four dynamic extents, then N, then old N4..N6
+{
+  typedef Shape< ShapeType::scalar_size ,
+                 ShapeType::rank + 1 ,
+                 0 ,
+                 0 ,
+                 0 ,
+                 0 ,
+                 N ,
+                 ShapeType::N4 ,
+                 ShapeType::N5 ,
+                 ShapeType::N6 > type ;
+};
+
+template< class ShapeType , unsigned N >
+struct ShapeInsert< ShapeType , N , 5 > // five dynamic extents, then N, then old N5, N6
+{
+  typedef Shape< ShapeType::scalar_size ,
+                 ShapeType::rank + 1 ,
+                 0 ,
+                 0 ,
+                 0 ,
+                 0 ,
+                 0 ,
+                 N ,
+                 ShapeType::N5 ,
+                 ShapeType::N6 > type ;
+};
+
+template< class ShapeType , unsigned N >
+struct ShapeInsert< ShapeType , N , 6 > // six dynamic extents, then N, then old N6
+{
+  typedef Shape< ShapeType::scalar_size ,
+                 ShapeType::rank + 1 ,
+                 0 ,
+                 0 ,
+                 0 ,
+                 0 ,
+                 0 ,
+                 0 ,
+                 N ,
+                 ShapeType::N6 > type ;
+};
+
+template< class ShapeType , unsigned N >
+struct ShapeInsert< ShapeType , N , 7 > // seven dynamic extents, then N as the last extent
+{
+  typedef Shape< ShapeType::scalar_size ,
+                 ShapeType::rank + 1 ,
+                 0 ,
+                 0 ,
+                 0 ,
+                 0 ,
+                 0 ,
+                 0 ,
+                 0 ,
+                 N > type ;
+};
+
+//----------------------------------------------------------------------------
+
+template< class DstShape , class SrcShape , // ShapeCompatible<Dst,Src>::value: compile-time compatibility flag for the two shape types
+          unsigned DstRankDynamic   = DstShape::rank_dynamic , // selects which static extents are compared
+          bool     DstRankDynamicOK = unsigned(DstShape::rank_dynamic) >= unsigned(SrcShape::rank_dynamic) > // Dst must have at least as many dynamic extents as Src
+struct ShapeCompatible { enum { value = false }; }; // fallback: used when DstRankDynamicOK is false (or no specialization matches)
+
+template< class DstShape , class SrcShape >
+struct ShapeCompatible< DstShape , SrcShape , 8 , true > // Dst has 8 dynamic extents: only scalar_size is compared
+{
+  enum { value = unsigned(DstShape::scalar_size) == unsigned(SrcShape::scalar_size) };
+};
+
+template< class DstShape , class SrcShape >
+struct ShapeCompatible< DstShape , SrcShape , 7 , true > // scalar_size and static extent N7 must match
+{
+  enum { value = unsigned(DstShape::scalar_size) == unsigned(SrcShape::scalar_size) &&
+                 unsigned(DstShape::N7) == unsigned(SrcShape::N7) };
+};
+
+template< class DstShape , class SrcShape >
+struct ShapeCompatible< DstShape , SrcShape , 6 , true > // scalar_size and static extents N6..N7 must match
+{
+  enum { value = unsigned(DstShape::scalar_size) == unsigned(SrcShape::scalar_size) &&
+                 unsigned(DstShape::N6) == unsigned(SrcShape::N6) &&
+                 unsigned(DstShape::N7) == unsigned(SrcShape::N7) };
+};
+
+template< class DstShape , class SrcShape >
+struct ShapeCompatible< DstShape , SrcShape , 5 , true > // scalar_size and static extents N5..N7 must match
+{
+  enum { value = unsigned(DstShape::scalar_size) == unsigned(SrcShape::scalar_size) &&
+                 unsigned(DstShape::N5) == unsigned(SrcShape::N5) &&
+                 unsigned(DstShape::N6) == unsigned(SrcShape::N6) &&
+                 unsigned(DstShape::N7) == unsigned(SrcShape::N7) };
+};
+
+template< class DstShape , class SrcShape >
+struct ShapeCompatible< DstShape , SrcShape , 4 , true > // scalar_size and static extents N4..N7 must match
+{
+  enum { value = unsigned(DstShape::scalar_size) == unsigned(SrcShape::scalar_size) &&
+                 unsigned(DstShape::N4) == unsigned(SrcShape::N4) &&
+                 unsigned(DstShape::N5) == unsigned(SrcShape::N5) &&
+                 unsigned(DstShape::N6) == unsigned(SrcShape::N6) &&
+                 unsigned(DstShape::N7) == unsigned(SrcShape::N7) };
+};
+
+template< class DstShape , class SrcShape >
+struct ShapeCompatible< DstShape , SrcShape , 3 , true > // scalar_size and static extents N3..N7 must match
+{
+  enum { value = unsigned(DstShape::scalar_size) == unsigned(SrcShape::scalar_size) &&
+                 unsigned(DstShape::N3) == unsigned(SrcShape::N3) &&
+                 unsigned(DstShape::N4) == unsigned(SrcShape::N4) &&
+                 unsigned(DstShape::N5) == unsigned(SrcShape::N5) &&
+                 unsigned(DstShape::N6) == unsigned(SrcShape::N6) &&
+                 unsigned(DstShape::N7) == unsigned(SrcShape::N7) };
+};
+
+template< class DstShape , class SrcShape >
+struct ShapeCompatible< DstShape , SrcShape , 2 , true > // scalar_size and static extents N2..N7 must match
+{
+  enum { value = unsigned(DstShape::scalar_size) == unsigned(SrcShape::scalar_size) &&
+                 unsigned(DstShape::N2) == unsigned(SrcShape::N2) &&
+                 unsigned(DstShape::N3) == unsigned(SrcShape::N3) &&
+                 unsigned(DstShape::N4) == unsigned(SrcShape::N4) &&
+                 unsigned(DstShape::N5) == unsigned(SrcShape::N5) &&
+                 unsigned(DstShape::N6) == unsigned(SrcShape::N6) &&
+                 unsigned(DstShape::N7) == unsigned(SrcShape::N7) };
+};
+
+template< class DstShape , class SrcShape >
+struct ShapeCompatible< DstShape , SrcShape , 1 , true > // scalar_size and static extents N1..N7 must match
+{
+  enum { value = unsigned(DstShape::scalar_size) == unsigned(SrcShape::scalar_size) &&
+                 unsigned(DstShape::N1) == unsigned(SrcShape::N1) &&
+                 unsigned(DstShape::N2) == unsigned(SrcShape::N2) &&
+                 unsigned(DstShape::N3) == unsigned(SrcShape::N3) &&
+                 unsigned(DstShape::N4) == unsigned(SrcShape::N4) &&
+                 unsigned(DstShape::N5) == unsigned(SrcShape::N5) &&
+                 unsigned(DstShape::N6) == unsigned(SrcShape::N6) &&
+                 unsigned(DstShape::N7) == unsigned(SrcShape::N7) };
+};
+
+template< class DstShape , class SrcShape >
+struct ShapeCompatible< DstShape , SrcShape , 0 , true > // both all-static: scalar_size and every extent N0..N7 must match
+{
+  enum { value = unsigned(DstShape::scalar_size) == unsigned(SrcShape::scalar_size) &&
+                 unsigned(DstShape::N0) == unsigned(SrcShape::N0) &&
+                 unsigned(DstShape::N1) == unsigned(SrcShape::N1) &&
+                 unsigned(DstShape::N2) == unsigned(SrcShape::N2) &&
+                 unsigned(DstShape::N3) == unsigned(SrcShape::N3) &&
+                 unsigned(DstShape::N4) == unsigned(SrcShape::N4) &&
+                 unsigned(DstShape::N5) == unsigned(SrcShape::N5) &&
+                 unsigned(DstShape::N6) == unsigned(SrcShape::N6) &&
+                 unsigned(DstShape::N7) == unsigned(SrcShape::N7) };
+};
+
+} /* namespace Impl */
+} /* namespace Kokkos */
+
+//----------------------------------------------------------------------------
+//----------------------------------------------------------------------------
+
+namespace Kokkos {
+namespace Impl {
+
+template< unsigned ScalarSize , unsigned Rank ,
+          unsigned s0 , unsigned s1 , unsigned s2 , unsigned s3 ,
+          unsigned s4 , unsigned s5 , unsigned s6 , unsigned s7 ,
+          typename iType >
+KOKKOS_INLINE_FUNCTION
+size_t dimension( // returns extent N_r of 'shape' for r in 0..7, and 1 for any other r
+  const Shape<ScalarSize,Rank,s0,s1,s2,s3,s4,s5,s6,s7> & shape ,
+  const iType & r ) // r: index of the requested extent; any type comparable with an integer literal
+{
+  return 0 == r ? shape.N0 : (
+         1 == r ? shape.N1 : (
+         2 == r ? shape.N2 : (
+         3 == r ? shape.N3 : (
+         4 == r ? shape.N4 : (
+         5 == r ? shape.N5 : (
+         6 == r ? shape.N6 : (
+         7 == r ? shape.N7 : 1 ))))))); // out-of-range r falls through to 1
+}
+
+template< unsigned ScalarSize , unsigned Rank ,
+          unsigned s0 , unsigned s1 , unsigned s2 , unsigned s3 ,
+          unsigned s4 , unsigned s5 , unsigned s6 , unsigned s7 >
+KOKKOS_INLINE_FUNCTION
+size_t cardinality_count( // returns the product of all eight extents N0 * ... * N7 of 'shape'
+  const Shape<ScalarSize,Rank,s0,s1,s2,s3,s4,s5,s6,s7> & shape )
+{
+  return size_t(shape.N0) * shape.N1 * shape.N2 * shape.N3 * // leading size_t cast makes the whole product size_t arithmetic
+         shape.N4 * shape.N5 * shape.N6 * shape.N7 ;
+}
+
+//----------------------------------------------------------------------------
+
+} /* namespace Impl */
+} /* namespace Kokkos */
+
+#endif /* #ifndef KOKKOS_CORESHAPE_HPP */
+
diff --git a/lib/kokkos/core/src/impl/Kokkos_Singleton.hpp b/lib/kokkos/core/src/impl/Kokkos_Singleton.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..86bc94ab0be9e8cfd00ea5a95cebc906bd3aa312
--- /dev/null
+++ b/lib/kokkos/core/src/impl/Kokkos_Singleton.hpp
@@ -0,0 +1,55 @@
+/*
+//@HEADER
+// ************************************************************************
+// 
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+// 
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+// 
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact  H. Carter Edwards (hcedwar@sandia.gov)
+// 
+// ************************************************************************
+//@HEADER
+*/
+
+#ifndef KOKKOS_SINGLETON_HPP
+#define KOKKOS_SINGLETON_HPP
+
+#include <Kokkos_Macros.hpp>
+#include <cstddef>
+
+namespace Kokkos { namespace Impl {
+
+
+}} // namespace Kokkos::Impl
+
+#endif // KOKKOS_SINGLETON_HPP
diff --git a/lib/kokkos/core/src/impl/Kokkos_StaticAssert.hpp b/lib/kokkos/core/src/impl/Kokkos_StaticAssert.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..25e2ec9dc1849db862d9cb0d01bfd817c584b3b8
--- /dev/null
+++ b/lib/kokkos/core/src/impl/Kokkos_StaticAssert.hpp
@@ -0,0 +1,79 @@
+/*
+//@HEADER
+// ************************************************************************
+// 
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+// 
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+// 
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact  H. Carter Edwards (hcedwar@sandia.gov)
+// 
+// ************************************************************************
+//@HEADER
+*/
+
+#ifndef KOKKOS_STATICASSERT_HPP
+#define KOKKOS_STATICASSERT_HPP
+
+namespace Kokkos {
+namespace Impl {
+
+template < bool , class T = void >
+struct StaticAssert ; // declared only: in this header just the 'true' case below is defined
+
+template< class T >
+struct StaticAssert< true , T > { // condition holds: expose T as 'type' and a true 'value'
+  typedef T type ;
+  static const bool value = true ;
+};
+
+template < class A , class B >
+struct StaticAssertSame ; // declared only: no definition is given here for two different types
+
+template < class A >
+struct StaticAssertSame<A,A> { typedef A type ; }; // both arguments are the same type A: 'type' names it
+
+template < class A , class B >
+struct StaticAssertAssignable ; // declared only: defined below for <A,A> and <const A,A>, nothing else in this header
+
+template < class A >
+struct StaticAssertAssignable<A,A> { typedef A type ; }; // identical types: 'type' is A
+
+template < class A >
+struct StaticAssertAssignable< const A , A > { typedef const A type ; }; // const A paired with A: 'type' is const A
+
+} // namespace Impl
+} // namespace Kokkos
+
+#endif /* KOKKOS_STATICASSERT_HPP */
+
+
diff --git a/lib/kokkos/core/src/impl/Kokkos_Synchronic.hpp b/lib/kokkos/core/src/impl/Kokkos_Synchronic.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..b2aea14df44ea55b8c86a70c9907792b51525918
--- /dev/null
+++ b/lib/kokkos/core/src/impl/Kokkos_Synchronic.hpp
@@ -0,0 +1,693 @@
+/*
+
+Copyright (c) 2014, NVIDIA Corporation
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without modification,
+are permitted provided that the following conditions are met:
+
+1. Redistributions of source code must retain the above copyright notice, this
+list of conditions and the following disclaimer.
+
+2. Redistributions in binary form must reproduce the above copyright notice,
+this list of conditions and the following disclaimer in the documentation
+and/or other materials provided with the distribution.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT,
+INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE
+OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED
+OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+#ifndef KOKKOS_SYNCHRONIC_HPP
+#define KOKKOS_SYNCHRONIC_HPP
+
+#include <impl/Kokkos_Synchronic_Config.hpp>
+
+#include <atomic>
+#include <chrono>
+#include <thread>
+#include <functional>
+#include <algorithm>
+
+namespace Kokkos {
+namespace Impl {
+
+enum notify_hint { // argument of notify(): how many waiters a modifying operation should wake
+  notify_all, // wake every waiter; this is the default argument throughout this header
+  notify_one, // wake a single waiter
+  notify_none // skip the wake-up call entirely
+};
+enum expect_hint { // argument of the expect_update* / load_when_* functions
+  expect_urgent, // the default argument throughout this header
+  expect_delay // NOTE(review): the wait implementations visible in this header never read the hint - confirm intent
+};
+
+namespace Details {
+
+template <class S, class T>
+bool __synchronic_spin_wait_for_update(S const& arg, T const& nval, int attempts) noexcept {
+  // Polls arg up to 'attempts' times: the first __SYNCHRONIC_SPIN_RELAX(attempts) polls relax, the rest yield.
+  int poll = 0, relaxed_polls = __SYNCHRONIC_SPIN_RELAX(attempts);
+  for(; poll < relaxed_polls; ++poll) {
+    if(__builtin_expect(arg.load(std::memory_order_relaxed) != nval,1)) return true;
+    __synchronic_relax();
+  }
+  for(; poll < attempts; ++poll) {
+    if(__builtin_expect(arg.load(std::memory_order_relaxed) != nval,1)) return true;
+    __synchronic_yield();
+  }
+  // Every poll still observed nval: report that no update was seen.
+  return false;
+}
+
+// Randomized exponential backoff: each step waits a random fraction of a window that doubles up to 'maximum' microseconds.
+struct __exponential_backoff {
+  __exponential_backoff(int arg_maximum=512) : maximum(arg_maximum), microseconds(8), x(123456789), y(362436069), z(521288629) { }
+  // Waits for about 'time': sleeps above 75us, yields above 25us, otherwise only runs the relax primitive.
+  static inline void sleep_for(std::chrono::microseconds const& time) {
+    auto t = time.count();
+    if(__builtin_expect(t > 75,0))
+      portable_sleep(time);
+    else if(__builtin_expect(t > 25,0))
+      __synchronic_yield();
+    else
+      __synchronic_relax();
+  }
+  // Performs one wait of the length chosen by step().
+  void sleep_for_step() { sleep_for(step()); }
+  // Returns a random duration within the current window, then resets the window to 8us (about 5% of calls) or doubles it.
+  std::chrono::microseconds step() {
+    float const f = ranfu();
+    int const t = int(microseconds * f);
+    if(__builtin_expect(f >= 0.95f,0))
+      microseconds = 8;
+    else
+      microseconds = (std::min)(microseconds<<1,maximum); // double the window, capped at 'maximum'
+    return std::chrono::microseconds(t);
+  }
+private :
+  int maximum, microseconds;
+  unsigned x, y, z; // generator state; unsigned so the shifts below are well defined
+  // Xor-shift pseudo-random step over the three state words; returns the new z.
+  unsigned xorshf96() {
+    x ^= x << 16; x ^= x >> 5; x ^= x << 1;
+    unsigned const t = x; x = y; y = z; z = t ^ x ^ y;
+    return z;
+  }
+  // Pseudo-random float in [0,1]: masks the sign bit off an unsigned int and divides by the largest masked value.
+  float ranfu() { return (float)(xorshf96()&(~0U>>1)) / (float)(~0U>>1); }
+};
+
+template <class T, class Enable = void>
+struct __synchronic_base {
+// Portable fallback: waiters poll the atomic (spin, then randomized backoff), so notify() has nothing to wake.
+protected:
+  std::atomic<T> atom;
+
+  void notify(notify_hint = notify_all) noexcept {
+  }
+  void notify(notify_hint = notify_all) volatile noexcept {
+  }
+
+public :
+  __synchronic_base() noexcept = default;
+  constexpr __synchronic_base(T v) noexcept : atom(v) { }
+  __synchronic_base(const __synchronic_base&) = delete;
+  ~__synchronic_base() { }
+  __synchronic_base& operator=(const __synchronic_base&) = delete;
+  __synchronic_base& operator=(const __synchronic_base&) volatile = delete;
+  // expect_update: returns once a relaxed load of the atomic differs from val; the hint argument is unused.
+  void expect_update(T val, expect_hint = expect_urgent) const noexcept {
+    if(__synchronic_spin_wait_for_update(atom, val, __SYNCHRONIC_SPIN_COUNT_A))
+      return;
+    __exponential_backoff b;
+    while(atom.load(std::memory_order_relaxed) == val) {
+      __do_backoff(b);
+      if(__synchronic_spin_wait_for_update(atom, val, __SYNCHRONIC_SPIN_COUNT_B))
+        return;
+    }
+  }
+  void expect_update(T val, expect_hint = expect_urgent) const volatile noexcept {
+    if(__synchronic_spin_wait_for_update(atom, val, __SYNCHRONIC_SPIN_COUNT_A))
+      return;
+    __exponential_backoff b;
+    while(atom.load(std::memory_order_relaxed) == val) {
+      __do_backoff(b);
+      if(__synchronic_spin_wait_for_update(atom, val, __SYNCHRONIC_SPIN_COUNT_B))
+        return;
+    }
+  }
+  // expect_update_until: like expect_update, but also returns once Clock::now() has reached 'then'.
+  template <class Clock, class Duration>
+  void expect_update_until(T val, std::chrono::time_point<Clock,Duration> const& then, expect_hint = expect_urgent) const {
+    if(__synchronic_spin_wait_for_update(atom, val, __SYNCHRONIC_SPIN_COUNT_A))
+      return;
+    __exponential_backoff b;
+    auto remains = then - Clock::now(); // measured on the deadline's own clock, kept at full resolution
+    while(remains > std::chrono::milliseconds::zero() && atom.load(std::memory_order_relaxed) == val) {
+      __do_backoff(b);
+      if(__synchronic_spin_wait_for_update(atom, val, __SYNCHRONIC_SPIN_COUNT_B))
+        return;
+      remains = then - Clock::now();
+    }
+  }
+  template <class Clock, class Duration>
+  void expect_update_until(T val, std::chrono::time_point<Clock,Duration> const& then, expect_hint = expect_urgent) const volatile {
+    if(__synchronic_spin_wait_for_update(atom, val, __SYNCHRONIC_SPIN_COUNT_A))
+      return;
+    __exponential_backoff b;
+    auto remains = then - Clock::now(); // measured on the deadline's own clock, kept at full resolution
+    while(remains > std::chrono::milliseconds::zero() && atom.load(std::memory_order_relaxed) == val) {
+      __do_backoff(b);
+      if(__synchronic_spin_wait_for_update(atom, val, __SYNCHRONIC_SPIN_COUNT_B))
+        return;
+      remains = then - Clock::now();
+    }
+  }
+};
+
+#ifdef __SYNCHRONIC_COMPATIBLE
+template <class T>
+struct __synchronic_base<T, typename std::enable_if<__SYNCHRONIC_COMPATIBLE(T)>::type> {
+// OS-assisted base: waiters block in __synchronic_wait*, and notify() wakes them only while 'count' is non-zero.
+public:
+  std::atomic<T> atom;
+
+  void notify(notify_hint hint = notify_all) noexcept {
+    if(__builtin_expect(hint == notify_none,1))
+      return;
+    auto const x = count.fetch_add(0,std::memory_order_acq_rel);
+    if(__builtin_expect(x,0)) {
+      if(__builtin_expect(hint == notify_all,1))
+        __synchronic_wake_all(&atom);
+      else
+        __synchronic_wake_one(&atom);
+    }
+  }
+  void notify(notify_hint hint = notify_all) volatile noexcept {
+    if(__builtin_expect(hint == notify_none,1))
+      return;
+    auto const x = count.fetch_add(0,std::memory_order_acq_rel);
+    if(__builtin_expect(x,0)) {
+      if(__builtin_expect(hint == notify_all,1))
+        __synchronic_wake_all_volatile(&atom);
+      else
+        __synchronic_wake_one_volatile(&atom);
+    }
+  }
+
+public :
+  __synchronic_base() noexcept : count(0) { }
+  constexpr __synchronic_base(T v) noexcept : atom(v), count(0) { }
+  __synchronic_base(const __synchronic_base&) = delete;
+  ~__synchronic_base() { }
+  __synchronic_base& operator=(const __synchronic_base&) = delete;
+  __synchronic_base& operator=(const __synchronic_base&) volatile = delete;
+  // expect_update: spins briefly, then waits in __synchronic_wait until the atomic no longer equals val.
+  void expect_update(T val, expect_hint = expect_urgent) const noexcept {
+    if(__builtin_expect(__synchronic_spin_wait_for_update(atom, val,__SYNCHRONIC_SPIN_COUNT_A),1))
+      return;
+    while(__builtin_expect(atom.load(std::memory_order_relaxed) == val,1)) {
+      count.fetch_add(1,std::memory_order_release);
+      __synchronic_wait(&atom,val);
+      count.fetch_add(-1,std::memory_order_acquire);
+    }
+  }
+  void expect_update(T val, expect_hint = expect_urgent) const volatile noexcept {
+    if(__builtin_expect(__synchronic_spin_wait_for_update(atom, val,__SYNCHRONIC_SPIN_COUNT_A),1))
+      return;
+    while(__builtin_expect(atom.load(std::memory_order_relaxed) == val,1)) {
+      count.fetch_add(1,std::memory_order_release);
+      __synchronic_wait_volatile(&atom,val);
+      count.fetch_add(-1,std::memory_order_acquire);
+    }
+  }
+  // expect_update_until: like expect_update, but also returns once Clock::now() has reached 'then'.
+  template <class Clock, class Duration>
+  void expect_update_until(T val, std::chrono::time_point<Clock,Duration> const& then, expect_hint = expect_urgent) const {
+    if(__builtin_expect(__synchronic_spin_wait_for_update(atom, val,__SYNCHRONIC_SPIN_COUNT_A),1))
+      return;
+    auto remains = then - Clock::now(); // measured on the deadline's own clock, kept at full resolution
+    while(__builtin_expect(remains > std::chrono::milliseconds::zero() && atom.load(std::memory_order_relaxed) == val,1)) {
+      count.fetch_add(1,std::memory_order_release);
+      __synchronic_wait_timed(&atom,val,remains);
+      count.fetch_add(-1,std::memory_order_acquire);
+      remains = then - Clock::now();
+    }
+  }
+  template <class Clock, class Duration>
+  void expect_update_until(T val, std::chrono::time_point<Clock,Duration> const& then, expect_hint = expect_urgent) const volatile {
+    if(__builtin_expect(__synchronic_spin_wait_for_update(atom, val,__SYNCHRONIC_SPIN_COUNT_A),1))
+      return;
+    auto remains = then - Clock::now(); // measured on the deadline's own clock, kept at full resolution
+    while(__builtin_expect(remains > std::chrono::milliseconds::zero() && atom.load(std::memory_order_relaxed) == val,1)) {
+      count.fetch_add(1,std::memory_order_release);
+      __synchronic_wait_timed_volatile(&atom,val,remains);
+      count.fetch_add(-1,std::memory_order_acquire);
+      remains = then - Clock::now();
+    }
+  }
+private:
+  mutable std::atomic<int> count; // incremented around each wait call above; notify() skips the wake-up when it reads zero
+};
+#endif
+
+template <class T, class Enable = void>
+struct __synchronic : public __synchronic_base<T> {
+  // Generic case: adds no operations of its own; copying is disabled and value construction forwards to the base.
+  __synchronic(__synchronic const&) = delete;
+  __synchronic& operator=(__synchronic const&) = delete;
+  __synchronic& operator=(__synchronic const&) volatile = delete;
+  __synchronic() noexcept = default;
+  constexpr __synchronic(T v) noexcept : __synchronic_base<T>(v) { }
+};
+
+template <class T>
+struct __synchronic<T,typename std::enable_if<std::is_integral<T>::value>::type> : public __synchronic_base<T> {
+  // Integral values: every modifying operation forwards to the std::atomic member and then calls notify().
+  T fetch_add(T v, std::memory_order m = std::memory_order_seq_cst, notify_hint n = notify_all) volatile noexcept {
+    auto const t = this->atom.fetch_add(v,m);
+    this->notify(n);
+    return t;
+  }
+  T fetch_add(T v, std::memory_order m = std::memory_order_seq_cst, notify_hint n = notify_all) noexcept {
+    auto const t = this->atom.fetch_add(v,m);
+    this->notify(n);
+    return t;
+  }
+  T fetch_sub(T v, std::memory_order m = std::memory_order_seq_cst, notify_hint n = notify_all) volatile noexcept {
+    auto const t = this->atom.fetch_sub(v,m);
+    this->notify(n);
+    return t;
+  }
+  T fetch_sub(T v, std::memory_order m = std::memory_order_seq_cst, notify_hint n = notify_all) noexcept {
+    auto const t = this->atom.fetch_sub(v,m);
+    this->notify(n);
+    return t;
+  }
+  T fetch_and(T v, std::memory_order m = std::memory_order_seq_cst, notify_hint n = notify_all) volatile noexcept {
+    auto const t = this->atom.fetch_and(v,m);
+    this->notify(n);
+    return t;
+  }
+  T fetch_and(T v, std::memory_order m = std::memory_order_seq_cst, notify_hint n = notify_all) noexcept {
+    auto const t = this->atom.fetch_and(v,m);
+    this->notify(n);
+    return t;
+  }
+  T fetch_or(T v, std::memory_order m = std::memory_order_seq_cst, notify_hint n = notify_all) volatile noexcept {
+    auto const t = this->atom.fetch_or(v,m);
+    this->notify(n);
+    return t;
+  }
+  T fetch_or(T v, std::memory_order m = std::memory_order_seq_cst, notify_hint n = notify_all) noexcept {
+    auto const t = this->atom.fetch_or(v,m);
+    this->notify(n);
+    return t;
+  }
+  T fetch_xor(T v, std::memory_order m = std::memory_order_seq_cst, notify_hint n = notify_all) volatile noexcept {
+    auto const t = this->atom.fetch_xor(v,m);
+    this->notify(n);
+    return t;
+  }
+  T fetch_xor(T v, std::memory_order m = std::memory_order_seq_cst, notify_hint n = notify_all) noexcept {
+    auto const t = this->atom.fetch_xor(v,m);
+    this->notify(n);
+    return t;
+  }
+
+  __synchronic() noexcept = default;
+  constexpr __synchronic(T v) noexcept : __synchronic_base<T>(v) { }
+  __synchronic(const __synchronic&) = delete;
+  __synchronic& operator=(const __synchronic&) = delete;
+  __synchronic& operator=(const __synchronic&) volatile = delete;
+  // Operator forms always notify with the default hint (notify_all) and return what the std::atomic operator returns.
+  T operator=(T v) volatile noexcept {
+    auto const t = this->atom = v;
+    this->notify();
+    return t;
+  }
+  T operator=(T v) noexcept {
+    auto const t = this->atom = v;
+    this->notify();
+    return t;
+  }
+  T operator++(int) volatile noexcept { // postfix: returns the value held before the increment
+    auto const t = this->atom++;
+    this->notify();
+    return t;
+  }
+  T operator++(int) noexcept { // postfix: returns the value held before the increment
+    auto const t = this->atom++;
+    this->notify();
+    return t;
+  }
+  T operator--(int) volatile noexcept { // postfix: returns the value held before the decrement
+    auto const t = this->atom--;
+    this->notify();
+    return t;
+  }
+  T operator--(int) noexcept { // postfix: returns the value held before the decrement
+    auto const t = this->atom--;
+    this->notify();
+    return t;
+  }
+  T operator++() volatile noexcept { // prefix: returns the incremented value
+    auto const t = ++this->atom;
+    this->notify();
+    return t;
+  }
+  T operator++() noexcept { // prefix: returns the incremented value
+    auto const t = ++this->atom;
+    this->notify();
+    return t;
+  }
+  T operator--() volatile noexcept { // prefix: returns the decremented value
+    auto const t = --this->atom;
+    this->notify();
+    return t;
+  }
+  T operator--() noexcept { // prefix: returns the decremented value
+    auto const t = --this->atom;
+    this->notify();
+    return t;
+  }
+  T operator+=(T v) volatile noexcept {
+    auto const t = this->atom += v;
+    this->notify();
+    return t;
+  }
+  T operator+=(T v) noexcept {
+    auto const t = this->atom += v;
+    this->notify();
+    return t;
+  }
+  T operator-=(T v) volatile noexcept {
+    auto const t = this->atom -= v;
+    this->notify();
+    return t;
+  }
+  T operator-=(T v) noexcept {
+    auto const t = this->atom -= v;
+    this->notify();
+    return t;
+  }
+  T operator&=(T v) volatile noexcept {
+    auto const t = this->atom &= v;
+    this->notify();
+    return t;
+  }
+  T operator&=(T v) noexcept {
+    auto const t = this->atom &= v;
+    this->notify();
+    return t;
+  }
+  T operator|=(T v) volatile noexcept {
+    auto const t = this->atom |= v;
+    this->notify();
+    return t;
+  }
+  T operator|=(T v) noexcept {
+    auto const t = this->atom |= v;
+    this->notify();
+    return t;
+  }
+  T operator^=(T v) volatile noexcept {
+    auto const t = this->atom ^= v;
+    this->notify();
+    return t;
+  }
+  T operator^=(T v) noexcept {
+    auto const t = this->atom ^= v;
+    this->notify();
+    return t;
+  }
+};
+
+template <class T>
+struct __synchronic<T*> : public __synchronic_base<T*> {
+  // Pointer values: arithmetic takes a ptrdiff_t; each operation forwards to the std::atomic member, then calls notify().
+  T* fetch_add(ptrdiff_t v, std::memory_order m = std::memory_order_seq_cst, notify_hint n = notify_all) volatile noexcept {
+    auto const t = this->atom.fetch_add(v,m);
+    this->notify(n);
+    return t;
+  }
+  T* fetch_add(ptrdiff_t v, std::memory_order m = std::memory_order_seq_cst, notify_hint n = notify_all) noexcept {
+    auto const t = this->atom.fetch_add(v,m);
+    this->notify(n);
+    return t;
+  }
+  T* fetch_sub(ptrdiff_t v, std::memory_order m = std::memory_order_seq_cst, notify_hint n = notify_all) volatile noexcept {
+    auto const t = this->atom.fetch_sub(v,m);
+    this->notify(n);
+    return t;
+  }
+  T* fetch_sub(ptrdiff_t v, std::memory_order m = std::memory_order_seq_cst, notify_hint n = notify_all) noexcept {
+    auto const t = this->atom.fetch_sub(v,m);
+    this->notify(n);
+    return t;
+  }
+
+  __synchronic() noexcept = default;
+  constexpr __synchronic(T* v) noexcept : __synchronic_base<T*>(v) { }
+  __synchronic(const __synchronic&) = delete;
+  __synchronic& operator=(const __synchronic&) = delete;
+  __synchronic& operator=(const __synchronic&) volatile = delete;
+  // Operator forms always notify with the default hint (notify_all) and return what the std::atomic operator returns.
+  T* operator=(T* v) volatile noexcept {
+    auto const t = this->atom = v;
+    this->notify();
+    return t;
+  }
+  T* operator=(T* v) noexcept {
+    auto const t = this->atom = v;
+    this->notify();
+    return t;
+  }
+  T* operator++(int) volatile noexcept { // postfix: returns the pointer held before the increment
+    auto const t = this->atom++;
+    this->notify();
+    return t;
+  }
+  T* operator++(int) noexcept { // postfix: returns the pointer held before the increment
+    auto const t = this->atom++;
+    this->notify();
+    return t;
+  }
+  T* operator--(int) volatile noexcept { // postfix: returns the pointer held before the decrement
+    auto const t = this->atom--;
+    this->notify();
+    return t;
+  }
+  T* operator--(int) noexcept { // postfix: returns the pointer held before the decrement
+    auto const t = this->atom--;
+    this->notify();
+    return t;
+  }
+  T* operator++() volatile noexcept { // prefix: returns the incremented pointer
+    auto const t = ++this->atom;
+    this->notify();
+    return t;
+  }
+  T* operator++() noexcept { // prefix: returns the incremented pointer
+    auto const t = ++this->atom;
+    this->notify();
+    return t;
+  }
+  T* operator--() volatile noexcept { // prefix: returns the decremented pointer
+    auto const t = --this->atom;
+    this->notify();
+    return t;
+  }
+  T* operator--() noexcept { // prefix: returns the decremented pointer
+    auto const t = --this->atom;
+    this->notify();
+    return t;
+  }
+  T* operator+=(ptrdiff_t v) volatile noexcept {
+    auto const t = this->atom += v;
+    this->notify();
+    return t;
+  }
+  T* operator+=(ptrdiff_t v) noexcept {
+    auto const t = this->atom += v;
+    this->notify();
+    return t;
+  }
+  T* operator-=(ptrdiff_t v) volatile noexcept {
+    auto const t = this->atom -= v;
+    this->notify();
+    return t;
+  }
+  T* operator-=(ptrdiff_t v) noexcept {
+    auto const t = this->atom -= v;
+    this->notify();
+    return t;
+  }
+};
+
+} //namespace Details
+
+template <class T>
+struct synchronic : public Details::__synchronic<T> {
+  // Atomic-like wrapper: stores and read-modify-write operations notify waiters; load_when_* / expect_update_* wait.
+  bool is_lock_free() const volatile noexcept { return this->atom.is_lock_free(); }
+  bool is_lock_free() const noexcept { return this->atom.is_lock_free(); }
+  void store(T v, std::memory_order m = std::memory_order_seq_cst, notify_hint n = notify_all) volatile noexcept {
+    this->atom.store(v,m);
+    this->notify(n);
+  }
+  void store(T v, std::memory_order m = std::memory_order_seq_cst, notify_hint n = notify_all) noexcept {
+    this->atom.store(v,m);
+    this->notify(n);
+  }
+  T load(std::memory_order m = std::memory_order_seq_cst) const volatile noexcept { return this->atom.load(m); }
+  T load(std::memory_order m = std::memory_order_seq_cst) const noexcept { return this->atom.load(m); }
+
+  operator T() const volatile noexcept { return (T)this->atom; }
+  operator T() const noexcept { return (T)this->atom; }
+  // exchange / compare_exchange_*: forward to the std::atomic member and call notify(n) whether or not the exchange succeeded.
+  T exchange(T v, std::memory_order m = std::memory_order_seq_cst, notify_hint n = notify_all) volatile noexcept {
+    auto const t = this->atom.exchange(v,m);
+    this->notify(n);
+    return t;
+  }
+  T exchange(T v, std::memory_order m = std::memory_order_seq_cst, notify_hint n = notify_all) noexcept {
+    auto const t = this->atom.exchange(v,m);
+    this->notify(n);
+    return t;
+  }
+  bool compare_exchange_weak(T& r, T v, std::memory_order m1, std::memory_order m2, notify_hint n = notify_all) volatile noexcept {
+    auto const t = this->atom.compare_exchange_weak(r,v,m1,m2);
+    this->notify(n);
+    return t;
+  }
+  bool compare_exchange_weak(T& r, T v, std::memory_order m1, std::memory_order m2, notify_hint n = notify_all) noexcept {
+    auto const t = this->atom.compare_exchange_weak(r,v,m1, m2);
+    this->notify(n);
+    return t;
+  }
+  bool compare_exchange_strong(T& r, T v, std::memory_order m1, std::memory_order m2, notify_hint n = notify_all) volatile noexcept {
+    auto const t = this->atom.compare_exchange_strong(r,v,m1,m2);
+    this->notify(n);
+    return t;
+  }
+  bool compare_exchange_strong(T& r, T v, std::memory_order m1, std::memory_order m2, notify_hint n = notify_all) noexcept {
+    auto const t = this->atom.compare_exchange_strong(r,v,m1,m2);
+    this->notify(n);
+    return t;
+  }
+  bool compare_exchange_weak(T& r, T v, std::memory_order m = std::memory_order_seq_cst, notify_hint n = notify_all) volatile noexcept {
+    auto const t = this->atom.compare_exchange_weak(r,v,m);
+    this->notify(n);
+    return t;
+  }
+  bool compare_exchange_weak(T& r, T v, std::memory_order m = std::memory_order_seq_cst, notify_hint n = notify_all) noexcept {
+    auto const t = this->atom.compare_exchange_weak(r,v,m);
+    this->notify(n);
+    return t;
+  }
+  bool compare_exchange_strong(T& r, T v, std::memory_order m = std::memory_order_seq_cst, notify_hint n = notify_all) volatile noexcept {
+    auto const t = this->atom.compare_exchange_strong(r,v,m);
+    this->notify(n);
+    return t;
+  }
+  bool compare_exchange_strong(T& r, T v, std::memory_order m = std::memory_order_seq_cst, notify_hint n = notify_all) noexcept {
+    auto const t = this->atom.compare_exchange_strong(r,v,m);
+    this->notify(n);
+    return t;
+  }
+
+  synchronic() noexcept = default;
+  constexpr synchronic(T val) noexcept : Details::__synchronic<T>(val) { }
+  synchronic(const synchronic&) = delete;
+  ~synchronic() { }
+  synchronic& operator=(const synchronic&) = delete;
+  synchronic& operator=(const synchronic&) volatile = delete;
+  T operator=(T val) noexcept {
+    return Details::__synchronic<T>::operator=(val);
+  }
+  T operator=(T val) volatile noexcept {
+    return Details::__synchronic<T>::operator=(val);
+  }
+  // load_when_not_equal waits until the value differs from val; load_when_equal waits until it equals val; both then load(order).
+  T load_when_not_equal(T val, std::memory_order order = std::memory_order_seq_cst, expect_hint h = expect_urgent) const noexcept {
+    Details::__synchronic<T>::expect_update(val,h);
+    return load(order);
+  }
+  T load_when_not_equal(T val, std::memory_order order = std::memory_order_seq_cst, expect_hint h = expect_urgent) const volatile noexcept {
+    Details::__synchronic<T>::expect_update(val,h);
+    return load(order);
+  }
+  T load_when_equal(T val, std::memory_order order = std::memory_order_seq_cst, expect_hint h = expect_urgent) const noexcept {
+    for(T nval = load(std::memory_order_relaxed); nval != val; nval = load(std::memory_order_relaxed))
+      Details::__synchronic<T>::expect_update(nval,h);
+    return load(order);
+  }
+  T load_when_equal(T val, std::memory_order order = std::memory_order_seq_cst, expect_hint h = expect_urgent) const volatile noexcept {
+    for(T nval = load(std::memory_order_relaxed); nval != val; nval = load(std::memory_order_relaxed))
+      Details::__synchronic<T>::expect_update(nval,h); // qualified: an unqualified call cannot see the dependent base's member
+    return load(order);
+  }
+  template <class Rep, class Period>
+  void expect_update_for(T val, std::chrono::duration<Rep,Period> const& delta, expect_hint h = expect_urgent) const {
+    Details::__synchronic<T>::expect_update_until(val, std::chrono::high_resolution_clock::now() + delta,h);
+  }
+  template < class Rep, class Period>
+  void expect_update_for(T val, std::chrono::duration<Rep,Period> const& delta, expect_hint h = expect_urgent) const volatile {
+    Details::__synchronic<T>::expect_update_until(val, std::chrono::high_resolution_clock::now() + delta,h);
+  }
+};
+
+#include <inttypes.h> // NOTE(review): included inside namespace Kokkos::Impl - consider moving it above the namespace
+// Convenience aliases, one per builtin and <inttypes.h> integer type.
+typedef synchronic<char> synchronic_char;
+typedef synchronic<signed char> synchronic_schar;
+typedef synchronic<unsigned char> synchronic_uchar;
+typedef synchronic<short> synchronic_short;
+typedef synchronic<unsigned short> synchronic_ushort;
+typedef synchronic<int> synchronic_int;
+typedef synchronic<unsigned int> synchronic_uint;
+typedef synchronic<long> synchronic_long;
+typedef synchronic<unsigned long> synchronic_ulong;
+typedef synchronic<long long> synchronic_llong;
+typedef synchronic<unsigned long long> synchronic_ullong;
+//typedef synchronic<char16_t> synchronic_char16_t;
+//typedef synchronic<char32_t> synchronic_char32_t;
+typedef synchronic<wchar_t> synchronic_wchar_t;
+
+typedef synchronic<int_least8_t> synchronic_int_least8_t;
+typedef synchronic<uint_least8_t> synchronic_uint_least8_t;
+typedef synchronic<int_least16_t> synchronic_int_least16_t;
+typedef synchronic<uint_least16_t> synchronic_uint_least16_t;
+typedef synchronic<int_least32_t> synchronic_int_least32_t;
+typedef synchronic<uint_least32_t> synchronic_uint_least32_t;
+typedef synchronic<int_least64_t> synchronic_int_least64_t;
+typedef synchronic<uint_least64_t> synchronic_uint_least64_t;
+typedef synchronic<int_fast8_t> synchronic_int_fast8_t;
+typedef synchronic<uint_fast8_t> synchronic_uint_fast8_t;
+typedef synchronic<int_fast16_t> synchronic_int_fast16_t;
+typedef synchronic<uint_fast16_t> synchronic_uint_fast16_t;
+typedef synchronic<int_fast32_t> synchronic_int_fast32_t;
+typedef synchronic<uint_fast32_t> synchronic_uint_fast32_t;
+typedef synchronic<int_fast64_t> synchronic_int_fast64_t;
+typedef synchronic<uint_fast64_t> synchronic_uint_fast64_t;
+typedef synchronic<intptr_t> synchronic_intptr_t;
+typedef synchronic<uintptr_t> synchronic_uintptr_t;
+typedef synchronic<size_t> synchronic_size_t;
+typedef synchronic<ptrdiff_t> synchronic_ptrdiff_t;
+typedef synchronic<intmax_t> synchronic_intmax_t;
+typedef synchronic<uintmax_t> synchronic_uintmax_t;
+
+}
+}
+
+#endif // KOKKOS_SYNCHRONIC_HPP
diff --git a/lib/kokkos/core/src/impl/Kokkos_Synchronic_Config.hpp b/lib/kokkos/core/src/impl/Kokkos_Synchronic_Config.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..0a6dd6e715edad752f56756ccdc6fba3d43e30fb
--- /dev/null
+++ b/lib/kokkos/core/src/impl/Kokkos_Synchronic_Config.hpp
@@ -0,0 +1,169 @@
+/*
+
+Copyright (c) 2014, NVIDIA Corporation
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without modification,
+are permitted provided that the following conditions are met:
+
+1. Redistributions of source code must retain the above copyright notice, this
+list of conditions and the following disclaimer.
+
+2. Redistributions in binary form must reproduce the above copyright notice,
+this list of conditions and the following disclaimer in the documentation
+and/or other materials provided with the distribution.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT,
+INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE
+OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED
+OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+#ifndef KOKKOS_SYNCHRONIC_CONFIG_H
+#define KOKKOS_SYNCHRONIC_CONFIG_H
+
+#include <thread>
+#include <chrono>
+
+namespace Kokkos {
+namespace Impl {
+
+//the default yield function used inside the implementation is the Standard one
+#define __synchronic_yield std::this_thread::yield
+#define __synchronic_relax __synchronic_yield // relax defaults to the same yield; the Linux section below redefines both
+
+#if defined(_MSC_VER)
+    //this is a handy GCC optimization that I use inside the implementation
+    #define __builtin_expect(condition,common) condition // on MSVC the hint is dropped and only the condition is kept
+    #if _MSC_VER <= 1800
+        //using certain keywords that VC++ temporarily doesn't support
+        #define _ALLOW_KEYWORD_MACROS
+        #define noexcept // expands to nothing
+        #define constexpr // expands to nothing
+    #endif
+    //yes, I define multiple assignment operators
+    #pragma warning(disable:4522)
+    //I don't understand how Windows is so bad at timing functions, but is OK
+    //with straight-up yield loops
+    #define __do_backoff(b) __synchronic_yield()
+#else // every compiler other than MSVC
+#define __do_backoff(b) b.sleep_for_step() // one wait step of the backoff object b
+#endif // _MSC_VER
+
+//certain platforms have efficient support for spin-waiting built into the operating system
+#if defined(__linux__) || (defined(_WIN32_WINNT) && _WIN32_WINNT >= 0x0602)
+#if defined(_WIN32_WINNT)
+#include <winsock2.h>
+#include <Windows.h>
+    //the combination of WaitOnAddress and WakeByAddressAll is supported on Windows 8.1+
+    #define __synchronic_wait(x,v) WaitOnAddress((PVOID)x,(PVOID)&v,sizeof(v),-1)
+    #define __synchronic_wait_timed(x,v,t) WaitOnAddress((PVOID)x,(PVOID)&v,sizeof(v),std::chrono::duration_cast<std::chrono::milliseconds>(t).count())
+    #define __synchronic_wake_one(x) WakeByAddressSingle((PVOID)x)
+    #define __synchronic_wake_all(x) WakeByAddressAll((PVOID)x)
+    #define __synchronic_wait_volatile(x,v) WaitOnAddress((PVOID)x,(PVOID)&v,sizeof(v),-1)
+    #define __synchronic_wait_timed_volatile(x,v,t) WaitOnAddress((PVOID)x,(PVOID)&v,sizeof(v),std::chrono::duration_cast<std::chrono::milliseconds>(t).count())
+    #define __synchronic_wake_one_volatile(x) WakeByAddressSingle((PVOID)x)
+    #define __synchronic_wake_all_volatile(x) WakeByAddressAll((PVOID)x)
+    #define __SYNCHRONIC_COMPATIBLE(x) (std::is_pod<x>::value && (sizeof(x) <= 8))
+
+    inline void native_sleep(unsigned long microseconds)
+    {
+      // What to do if microseconds is < 1000?
+      Sleep(microseconds / 1000);
+    }
+
+    inline void native_yield()
+    {
+      SwitchToThread();
+    }
+#elif defined(__linux__)
+    #include <chrono>
+    #include <time.h>
+    #include <unistd.h>
+    #include <pthread.h>
+    #include <linux/futex.h>
+    #include <sys/syscall.h>
+    #include <climits>
+    #include <cassert>
+    template < class Rep, class Period>
+    inline timespec to_timespec(std::chrono::duration<Rep,Period> const& delta) {
+      std::chrono::seconds const whole = std::chrono::duration_cast<std::chrono::seconds>(delta);
+      struct timespec ts;
+      ts.tv_sec = static_cast<long>(whole.count());
+      ts.tv_nsec = static_cast<long>(std::chrono::duration_cast<std::chrono::nanoseconds>(delta - whole).count()); // sub-second remainder only
+      return ts;
+    }
+    inline long futex(void const volatile* addr1, int op, int val1) { // volatile-qualified so the *_volatile macros can pass &atom
+        return syscall(SYS_futex, addr1, op, val1, 0, 0, 0);
+    }
+    inline long futex(void const volatile* addr1, int op, int val1, struct timespec timeout) {
+        return syscall(SYS_futex, addr1, op, val1, &timeout, 0, 0);
+    }
+    inline void native_sleep(unsigned long microseconds)
+    {
+      usleep(microseconds);
+    }
+    inline void native_yield()
+    {
+      sched_yield(); // POSIX call, also used for __synchronic_yield below; pthread_yield is non-standard
+    }
+
+    //the combination of SYS_futex(WAIT) and SYS_futex(WAKE) is supported on all recent Linux distributions
+    #define __synchronic_wait(x,v) futex(x, FUTEX_WAIT_PRIVATE, v)
+    #define __synchronic_wait_timed(x,v,t) futex(x, FUTEX_WAIT_PRIVATE, v, to_timespec(t))
+    #define __synchronic_wake_one(x) futex(x, FUTEX_WAKE_PRIVATE, 1)
+    #define __synchronic_wake_all(x) futex(x, FUTEX_WAKE_PRIVATE, INT_MAX)
+    #define __synchronic_wait_volatile(x,v) futex(x, FUTEX_WAIT, v)
+    #define __synchronic_wait_timed_volatile(x,v,t) futex(x, FUTEX_WAIT, v, to_timespec(t))
+    #define __synchronic_wake_one_volatile(x) futex(x, FUTEX_WAKE, 1)
+    #define __synchronic_wake_all_volatile(x) futex(x, FUTEX_WAKE, INT_MAX)
+    #define __SYNCHRONIC_COMPATIBLE(x) (std::is_integral<x>::value && (sizeof(x) <= 4))
+
+    //the yield function on Linux is better replaced by sched_yield, which is tuned for spin-waiting
+    #undef __synchronic_yield
+    #define __synchronic_yield sched_yield
+
+    //for extremely short wait times, just let another hyper-thread run
+    #undef __synchronic_relax
+    #define __synchronic_relax() asm volatile("rep; nop" ::: "memory")
+
+#endif
+#endif
+
+#if defined(_GLIBCXX_USE_NANOSLEEP) || !(defined(__linux__) || (defined(_WIN32_WINNT) && _WIN32_WINNT >= 0x0602)) // no native_sleep on other platforms
+inline void portable_sleep(std::chrono::microseconds const& time)
+{ std::this_thread::sleep_for(time); }
+#else
+inline void portable_sleep(std::chrono::microseconds const& time)
+{ native_sleep(time.count()); }
+#endif
+
+#if defined(_GLIBCXX_USE_SCHED_YIELD) || !(defined(__linux__) || (defined(_WIN32_WINNT) && _WIN32_WINNT >= 0x0602)) // no native_yield on other platforms
+inline void portable_yield()
+{ std::this_thread::yield(); }
+#else
+inline void portable_yield()
+{ native_yield(); }
+#endif
+
+//this is the number of times we initially spin, on the first wait attempt
+#define __SYNCHRONIC_SPIN_COUNT_A 16
+
+//this is how we decide when to yield instead of relaxing: of 'c' total spin attempts, the first c/8 relax and the rest yield
+//#define __SYNCHRONIC_SPIN_YIELD(c) true
+#define __SYNCHRONIC_SPIN_RELAX(c) (c>>3)
+
+//this is the number of times we normally spin, on every subsequent wait attempt
+#define __SYNCHRONIC_SPIN_COUNT_B 8
+
+}
+}
+
+#endif // KOKKOS_SYNCHRONIC_CONFIG_H
diff --git a/lib/kokkos/core/src/impl/Kokkos_Synchronic_n3998.hpp b/lib/kokkos/core/src/impl/Kokkos_Synchronic_n3998.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..facc8d6d8e67a4828aa94bd75fb7590f454b41f6
--- /dev/null
+++ b/lib/kokkos/core/src/impl/Kokkos_Synchronic_n3998.hpp
@@ -0,0 +1,162 @@
+/*
+
+Copyright (c) 2014, NVIDIA Corporation
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without modification,
+are permitted provided that the following conditions are met:
+
+1. Redistributions of source code must retain the above copyright notice, this
+list of conditions and the following disclaimer.
+
+2. Redistributions in binary form must reproduce the above copyright notice,
+this list of conditions and the following disclaimer in the documentation
+and/or other materials provided with the distribution.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT,
+INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE
+OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED
+OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+#ifndef KOKKOS_SYNCHRONIC_N3998_HPP
+#define KOKKOS_SYNCHRONIC_N3998_HPP
+
+#include <impl/Kokkos_Synchronic.hpp>
+#include <functional>
+
+/*
+In the section below, a synchronization point represents a point at which a
+thread may block until a given synchronization condition has been reached or
+at which it may notify other threads that a synchronization condition has
+been achieved.
+*/
+namespace Kokkos { namespace Impl {
+
+    /*
+    A latch maintains an internal counter that is initialized when the latch
+    is created. The synchronization condition is reached when the counter is
+    decremented to 0. Threads may block at a synchronization point waiting
+    for the condition to be reached. When the condition is reached, any such
+    blocked threads will be released.
+    */
+    struct latch {
+        latch(int val) : count(val), released(false) { } // val: number of arrivals that release the latch
+        latch(const latch&) = delete;
+        latch& operator=(const latch&) = delete;
+        ~latch( ) { }
+        void arrive( ) { __arrive( ); } // count one arrival without waiting
+        void arrive_and_wait( ) { // count one arrival, then wait unless this arrival released the latch
+            if(!__arrive( ))
+                wait( );
+        }
+        void wait( ) { // returns only once 'released' has been observed true
+            while(!released.load_when_not_equal(false,std::memory_order_acquire))
+                ;
+        }
+        bool try_wait( ) { // non-blocking query of the released flag
+            return released.load(std::memory_order_acquire);
+        }
+    private:
+        bool __arrive( ) { // true only for the arrival that takes the counter from 1 to 0
+            // acq_rel, not release: the final arrival must acquire the earlier arrivals'
+            // decrements so their prior writes happen-before the 'released' store below
+            if(count.fetch_add(-1,std::memory_order_acq_rel)!=1)
+                return false;
+            released.store(true,std::memory_order_release);
+            return true;
+        }
+        std::atomic<int> count; // arrivals still outstanding
+        synchronic<bool> released; // set to true by the releasing arrival
+    };
+
+    /*
+    A barrier is created with an initial value representing the number of threads
+    that can arrive at the synchronization point. When that many threads have
+    arrived, the synchronization condition is reached and the threads are
+    released. The barrier will then reset, and may be reused for a new cycle, in
+    which the same set of threads may arrive again at the synchronization point.
+    The same set of threads shall arrive at the barrier in each cycle, otherwise
+    the behaviour is undefined.
+    */
+    struct barrier {
+        barrier(int val) : expected(val), arrived(0), nexpected(val), epoch(0) { } // val: arrivals that complete one cycle
+        barrier(const barrier&) = delete;
+        barrier& operator=(const barrier&) = delete;
+        ~barrier() { }
+        void arrive_and_wait() { // arrive, then wait until the epoch differs from the one read on entry
+            int const myepoch = epoch.load(std::memory_order_relaxed);
+            if(!__arrive(myepoch)) // the completing arrival returns without waiting
+                while(epoch.load_when_not_equal(myepoch,std::memory_order_acquire) == myepoch)
+                    ;
+        }
+        void arrive_and_drop() { // arrive without waiting and lower the count used for later cycles
+            nexpected.fetch_add(-1,std::memory_order_relaxed);
+            __arrive(epoch.load(std::memory_order_relaxed));
+        }
+    private:
+        bool __arrive(int const myepoch) { // true only for the arrival that completes the cycle
+            int const myresult = arrived.fetch_add(1,std::memory_order_acq_rel) + 1;
+            if(__builtin_expect(myresult == expected,0)) { // NOTE(review): plain-int 'expected' is read here by every arriver and written below by the completing one; confirm this cannot race
+                expected = nexpected.load(std::memory_order_relaxed); // adopt the count left by arrive_and_drop()
+                arrived.store(0,std::memory_order_relaxed); // reset the arrival counter for the next cycle
+                epoch.store(myepoch+1,std::memory_order_release); // advance the epoch that waiters compare against
+                return true;
+            }
+            return false;
+        }
+        int expected; // arrivals required in the current cycle
+        std::atomic<int> arrived, nexpected; // arrivals so far this cycle / arrivals required after the next reset
+        synchronic<int> epoch; // cycle counter, incremented on each completion
+    };
+
+    /*
+    A notifying barrier behaves as a barrier, but is constructed with a callable
+    completion function that is invoked after all threads have arrived at the
+    synchronization point, and before the synchronization condition is reached.
+    The completion may modify the set of threads that arrives at the barrier in
+    each cycle.
+    */
+    struct notifying_barrier {
+        template <typename T>
+        notifying_barrier(int val, T && f) : expected(val), arrived(0), nexpected(val), epoch(0), completion(std::forward<T>(f)) { } // val: arrivals per cycle; f: stored as std::function<int()>
+        notifying_barrier(const notifying_barrier&) = delete;
+        notifying_barrier& operator=(const notifying_barrier&) = delete;
+        ~notifying_barrier( ) { }
+        void arrive_and_wait() { // arrive, then wait until the epoch differs from the one read on entry
+            int const myepoch = epoch.load(std::memory_order_relaxed);
+            if(!__arrive(myepoch)) // the completing arrival returns without waiting
+                while(epoch.load_when_not_equal(myepoch,std::memory_order_acquire) == myepoch)
+                    ;
+        }
+        void arrive_and_drop() { // arrive without waiting and lower the fallback count used for later cycles
+            nexpected.fetch_add(-1,std::memory_order_relaxed);
+            __arrive(epoch.load(std::memory_order_relaxed));
+        }
+    private:
+        bool __arrive(int const myepoch) { // true only for the arrival that completes the cycle
+            int const myresult = arrived.fetch_add(1,std::memory_order_acq_rel) + 1;
+            if(__builtin_expect(myresult == expected,0)) { // NOTE(review): plain-int 'expected' is read here by every arriver and written below by the completing one; confirm this cannot race
+                int const newexpected = completion(); // runs on the completing thread, before the epoch is advanced
+                expected = newexpected ? newexpected : nexpected.load(std::memory_order_relaxed); // nonzero result overrides; zero falls back to nexpected
+                arrived.store(0,std::memory_order_relaxed); // reset the arrival counter for the next cycle
+                epoch.store(myepoch+1,std::memory_order_release); // advance the epoch that waiters compare against
+                return true;
+            }
+            return false;
+        }
+        int expected; // arrivals required in the current cycle
+        std::atomic<int> arrived, nexpected; // arrivals so far this cycle / fallback count for later cycles
+        synchronic<int> epoch; // cycle counter, incremented on each completion
+        std::function<int()> completion; // invoked once per completed cycle
+    };
+}}
+
+#endif //KOKKOS_SYNCHRONIC_N3998_HPP
diff --git a/lib/kokkos/core/src/impl/Kokkos_Tags.hpp b/lib/kokkos/core/src/impl/Kokkos_Tags.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..0bc2864ff1d9079f47ec4369f25388794aa52f71
--- /dev/null
+++ b/lib/kokkos/core/src/impl/Kokkos_Tags.hpp
@@ -0,0 +1,198 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+//
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact  H. Carter Edwards (hcedwar@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#ifndef KOKKOS_TAGS_HPP
+#define KOKKOS_TAGS_HPP
+
+#include <impl/Kokkos_Traits.hpp>
+#include <Kokkos_Core_fwd.hpp>
+#include <type_traits>
+
+//----------------------------------------------------------------------------
+//----------------------------------------------------------------------------
+
+/** KOKKOS_HAVE_TYPE( Type )
+ *
+ * defines a meta-function that checks whether a type exposes a nested typedef
+ * or type alias named Type
+ *
+ * e.g.
+ *   KOKKOS_HAVE_TYPE( array_layout );
+ *   struct Foo { using array_layout = void; };
+ *   have_array_layout<Foo>::value == 1;
+ */
+#define KOKKOS_HAVE_TYPE( Type )                                                \
+template <typename T>                                                           \
+struct have_##Type {                                                            \
+  template <typename U> static std::false_type have_type(...);                  \
+  template <typename U> static std::true_type  have_type( typename U::Type* );  \
+  using type = decltype(have_type<T>(nullptr));                                 \
+  static constexpr bool value = type::value;                                    \
+}
+
+/** KOKKOS_IS_CONCEPT( Concept )
+ *
+ * defines a meta-function that checks whether a type matches the given Kokkos
+ * concept, i.e. whether it exposes a nested type named Concept that is the type itself
+ *
+ * e.g.
+ *   KOKKOS_IS_CONCEPT( array_layout );
+ *   struct Foo { using array_layout = Foo; };
+ *   is_array_layout<Foo>::value == 1;
+ */
+#define KOKKOS_IS_CONCEPT( Concept )                                            \
+template <typename T>                                                           \
+struct is_##Concept {                                                           \
+  template <typename U> static std::false_type have_concept(...);               \
+  template <typename U> static auto have_concept( typename U::Concept* )        \
+                          ->typename std::is_same<T, typename U::Concept>::type;\
+  using type = decltype(have_concept<T>(nullptr));                              \
+  static constexpr bool value = type::value;                                    \
+}
+
+//----------------------------------------------------------------------------
+//----------------------------------------------------------------------------
+
+namespace Kokkos { namespace Impl {
+
+template <typename T>
+using is_void = std::is_same<void,T>; // true only when T is exactly void
+
+// is_memory_space<T>::value : true only when T::memory_space exists and is T
+KOKKOS_IS_CONCEPT( memory_space );
+
+// is_memory_traits<T>::value : true only when T::memory_traits exists and is T
+KOKKOS_IS_CONCEPT( memory_traits );
+
+// is_execution_space<T>::value : true only when T::execution_space exists and is T
+KOKKOS_IS_CONCEPT( execution_space );
+
+// is_execution_policy<T>::value : true only when T::execution_policy exists and is T
+KOKKOS_IS_CONCEPT( execution_policy );
+
+// is_array_layout<T>::value : true only when T::array_layout exists and is T
+KOKKOS_IS_CONCEPT( array_layout );
+
+// is_iteration_pattern<T>::value : true only when T::iteration_pattern exists and is T
+KOKKOS_IS_CONCEPT( iteration_pattern );
+
+// is_schedule_type<T>::value : true only when T::schedule_type exists and is T
+KOKKOS_IS_CONCEPT( schedule_type );
+
+// is_index_type<T>::value : true only when T::index_type exists and is T
+KOKKOS_IS_CONCEPT( index_type );
+
+}} // namespace Kokkos::Impl
+
+
+//----------------------------------------------------------------------------
+//----------------------------------------------------------------------------
+
+namespace Kokkos {
+/// Pairs an execution space with a memory space; both are checked at compile time.
+template< class ExecutionSpace , class MemorySpace >
+struct Device {
+  using execution_space = ExecutionSpace ;
+  using memory_space    = MemorySpace ;
+  using device_type     = Device< ExecutionSpace , MemorySpace > ; ///< this very type
+  static_assert( Impl::is_execution_space< execution_space >::value ,
+                 "Execution space is not valid" );
+  static_assert( Impl::is_memory_space< memory_space >::value ,
+                 "Memory space is not valid" );
+};
+}
+
+//----------------------------------------------------------------------------
+//----------------------------------------------------------------------------
+
+namespace Kokkos {
+namespace Impl {
+
+// Primary template: a type is not a space unless the specialization below matches.
+template< class C , class Enable = void >
+struct is_space : public Impl::false_type {};
+
+// Matches when C is its own execution_space, its own memory_space, or
+// exactly Device< C::execution_space , C::memory_space >.
+template< class C >
+struct is_space
+  < C
+  , typename Impl::enable_if<
+      ( Impl::is_same< C , typename C::execution_space >::value
+     || Impl::is_same< C , typename C::memory_space >::value
+     || Impl::is_same< C , Device< typename C::execution_space
+                                 , typename C::memory_space > >::value
+      ) >::type
+  >
+  : public Impl::true_type
+{
+  using execution_space = typename C::execution_space ;
+  using memory_space    = typename C::memory_space ;
+
+  // host_memory_space: keep memory_space when it is HostSpace (or, with CUDA
+  // enabled, CudaUVMSpace or CudaHostPinnedSpace); otherwise use HostSpace.
+  using host_memory_space =
+      typename Impl::if_c< ( Impl::is_same< memory_space , HostSpace >::value
+#ifdef KOKKOS_HAVE_CUDA
+                          || Impl::is_same< memory_space , CudaUVMSpace >::value
+                          || Impl::is_same< memory_space , CudaHostPinnedSpace >::value
+#endif
+                           )
+                         , memory_space , HostSpace >::type ;
+
+  // host_execution_space: with CUDA enabled, Cuda is replaced by
+  // DefaultHostExecutionSpace; every other execution space is kept as is.
+#ifdef KOKKOS_HAVE_CUDA
+  using host_execution_space =
+      typename Impl::if_c< Impl::is_same< execution_space , Cuda >::value
+                         , DefaultHostExecutionSpace , execution_space >::type ;
+#else
+  using host_execution_space = execution_space ;
+#endif
+
+  using host_mirror_space = Device< host_execution_space , host_memory_space > ;
+};
+}
+}
+
+#endif
diff --git a/lib/kokkos/core/src/impl/Kokkos_TaskQueue.hpp b/lib/kokkos/core/src/impl/Kokkos_TaskQueue.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..663bb1985d3636e84e236660b1c58fda5579cccc
--- /dev/null
+++ b/lib/kokkos/core/src/impl/Kokkos_TaskQueue.hpp
@@ -0,0 +1,499 @@
+/*
+//@HEADER
+// ************************************************************************
+// 
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+// 
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+// 
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact  H. Carter Edwards (hcedwar@sandia.gov)
+// 
+// ************************************************************************
+//@HEADER
+*/
+
+// Experimental unified task-data parallel manycore LDRD
+
+#ifndef KOKKOS_IMPL_TASKQUEUE_HPP
+#define KOKKOS_IMPL_TASKQUEUE_HPP
+
+#if defined( KOKKOS_ENABLE_TASKPOLICY )
+
+#include <string>
+#include <typeinfo>
+#include <stdexcept>
+
+//----------------------------------------------------------------------------
+//----------------------------------------------------------------------------
+
+namespace Kokkos {
+
+template< typename > class TaskPolicy ;
+
+template< typename Arg1 = void , typename Arg2 = void > class Future ;
+
+} /* namespace Kokkos */
+
+namespace Kokkos {
+namespace Impl {
+
+template< typename , typename , typename > class TaskBase ;
+template< typename > class TaskExec ;
+
+} /* namespace Impl */
+} /* namespace Kokkos */
+
+//----------------------------------------------------------------------------
+//----------------------------------------------------------------------------
+
+namespace Kokkos {
+namespace Impl {
+
+template< typename Space >
+class TaskQueueSpecialization ;
+
+/** \brief  Manage task allocation, deallocation, and scheduling.
+ *
+ *  Task execution is deferred to the TaskQueueSpecialization.
+ *  All other aspects of task management have shared implementation.
+ */
+template< typename ExecSpace >
+class TaskQueue {
+private:
+
+  friend class TaskQueueSpecialization< ExecSpace > ;
+  friend class Kokkos::TaskPolicy< ExecSpace > ;
+
+  using execution_space = ExecSpace ;
+  using specialization  = TaskQueueSpecialization< execution_space > ;
+  using memory_space    = typename specialization::memory_space ;
+  using device_type     = Kokkos::Device< execution_space , memory_space > ;
+  using memory_pool     = Kokkos::Experimental::MemoryPool< device_type > ;
+  using task_root_type  = Kokkos::Impl::TaskBase<execution_space,void,void> ;
+
+  struct Destroy {
+    TaskQueue * m_queue ;
+    void destroy_shared_allocation();
+  };
+
+  //----------------------------------------
+
+  enum : int { NumQueue = 3 };
+
+  // Queue is organized as [ priority ][ type ]
+
+  memory_pool               m_memory ;
+  task_root_type * volatile m_ready[ NumQueue ][ 2 ];
+  long                      m_accum_alloc ; // Accumulated number of allocations
+  int                       m_count_alloc ; // Current number of allocations
+  int                       m_max_alloc ;   // Maximum number of allocations
+  int                       m_ready_count ; // Number of ready or executing
+
+  //----------------------------------------
+
+  ~TaskQueue();
+  TaskQueue() = delete ;
+  TaskQueue( TaskQueue && ) = delete ;
+  TaskQueue( TaskQueue const & ) = delete ;
+  TaskQueue & operator = ( TaskQueue && ) = delete ;
+  TaskQueue & operator = ( TaskQueue const & ) = delete ;
+
+  TaskQueue
+    ( const memory_space & arg_space
+    , unsigned const arg_memory_pool_capacity
+    , unsigned const arg_memory_pool_superblock_capacity_log2
+    );
+
+  // Schedule a task
+  //   Precondition:
+  //     task is not executing
+  //     task->m_next is the dependence or zero
+  //   Postcondition:
+  //     task->m_next is linked list membership
+  KOKKOS_FUNCTION
+  void schedule( task_root_type * const );
+
+  // Complete a task
+  //   Precondition:
+  //     task is not executing
+  //     task->m_next == LockTag  =>  task is complete
+  //     task->m_next != LockTag  =>  task is respawn
+  //   Postcondition:
+  //     task->m_wait == LockTag  =>  task is complete
+  //     task->m_wait != LockTag  =>  task is waiting
+  KOKKOS_FUNCTION
+  void complete( task_root_type * );
+
+  KOKKOS_FUNCTION
+  static bool push_task( task_root_type * volatile * const
+                       , task_root_type * const );
+
+  KOKKOS_FUNCTION
+  static task_root_type * pop_task( task_root_type * volatile * const );
+
+  KOKKOS_FUNCTION static
+  void decrement( task_root_type * task );
+
+public:
+
+  // If and only if the execution space is a single thread
+  // then execute ready tasks.
+  KOKKOS_INLINE_FUNCTION
+  void iff_single_thread_recursive_execute()
+    {
+#if defined( KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST ) // the body is empty when this macro is not defined
+      specialization::iff_single_thread_recursive_execute( this ); // the decision itself is made by the specialization
+#endif
+    }
+
+  void execute() { specialization::execute( this ); } // task execution is delegated to TaskQueueSpecialization
+
+  // Point *lhs at rhs, adding a reference to rhs and dropping the one *lhs held (either may be null)
+  template< typename LV , typename RV >
+  KOKKOS_FUNCTION static
+  void assign( TaskBase< execution_space,LV,void> ** const lhs
+             , TaskBase< execution_space,RV,void> *  const rhs )
+    {
+      using task_lhs = TaskBase< execution_space,LV,void> ;
+#if 0
+  {
+    printf( "assign( 0x%lx { 0x%lx %d %d } , 0x%lx { 0x%lx %d %d } )\n"
+          , uintptr_t( lhs ? *lhs : 0 )
+          , uintptr_t( lhs && *lhs ? (*lhs)->m_next : 0 )
+          , int( lhs && *lhs ? (*lhs)->m_task_type : 0 )
+          , int( lhs && *lhs ? (*lhs)->m_ref_count : 0 )
+          , uintptr_t(rhs)
+          , uintptr_t( rhs ? rhs->m_next : 0 )
+          , int( rhs ? rhs->m_task_type : 0 )
+          , int( rhs ? rhs->m_ref_count : 0 )
+          );
+    fflush( stdout );
+  }
+#endif
+      // Take the new reference before dropping the old one: if *lhs already
+      // refers to rhs, the count can then never touch zero in between.
+      if ( rhs ) { Kokkos::atomic_fetch_add( &(rhs->m_ref_count) , 1 ); }
+      if ( *lhs ) decrement( *lhs );
+
+      // Force write of *lhs
+      *static_cast< task_lhs * volatile * >(lhs) = rhs ;
+
+      Kokkos::memory_fence();
+    }
+
+  KOKKOS_FUNCTION
+  size_t allocate_block_size( size_t n ); ///< Actual block size allocated
+
+  KOKKOS_FUNCTION
+  void * allocate( size_t n ); ///< Allocate from the memory pool
+
+  KOKKOS_FUNCTION
+  void deallocate( void * p , size_t n ); ///< Deallocate to the memory pool
+};
+
+} /* namespace Impl */
+} /* namespace Kokkos */
+
+//----------------------------------------------------------------------------
+//----------------------------------------------------------------------------
+
+namespace Kokkos {
+namespace Impl {
+
+template<>
+class TaskBase< void , void , void > { // holds the constants that TaskBase< ExecSpace , void , void > re-exports
+public:
+  enum : int16_t   { TaskTeam = 0 , TaskSingle = 1 , Aggregate = 2 }; // task kinds (see m_task_type)
+  enum : uintptr_t { LockTag = ~uintptr_t(0) , EndTag = ~uintptr_t(1) }; // pointer sentinels: all bits set / all bits set except the lowest
+};
+
+/** \brief  Base class for task management, access, and execution.
+ *
+ *  Inheritance structure to allow static_cast from the task root type
+ *  and a task's FunctorType.
+ *
+ *    // Enable a Future to access result data
+ *    TaskBase< Space , ResultType , void >
+ *      : TaskBase< Space , void , void >
+ *      { ... };
+ *
+ *    // Enable a functor to access the base class
+ *    TaskBase< Space , ResultType , FunctorType >
+ *      : TaskBase< Space , ResultType , void >
+ *      , FunctorType
+ *      { ... };
+ *
+ *
+ *  States of a task:
+ *
+ *    Constructing State, NOT IN a linked list
+ *      m_wait == 0
+ *      m_next == 0
+ *
+ *    Scheduling transition : Constructing -> Waiting
+ *      before:
+ *        m_wait == 0
+ *        m_next == this task's initial dependence, 0 if none
+ *      after:
+ *        m_wait == EndTag
+ *        m_next == EndTag
+ *
+ *    Waiting State, IN a linked list
+ *      m_apply != 0
+ *      m_queue != 0
+ *      m_ref_count > 0
+ *      m_wait == head of linked list of tasks waiting on this task
+ *      m_next == next of linked list of tasks
+ *
+ *    transition : Waiting -> Executing
+ *      before:
+ *        m_next == EndTag
+ *      after:
+ *        m_next == LockTag
+ *
+ *    Executing State, NOT IN a linked list
+ *      m_apply != 0
+ *      m_queue != 0
+ *      m_ref_count > 0
+ *      m_wait == head of linked list of tasks waiting on this task
+ *      m_next == LockTag
+ *
+ *    Respawn transition : Executing -> Executing-Respawn
+ *      before:
+ *        m_next == LockTag
+ *      after:
+ *        m_next == this task's updated dependence, 0 if none
+ *
+ *    Executing-Respawn State, NOT IN a linked list
+ *      m_apply != 0
+ *      m_queue != 0
+ *      m_ref_count > 0
+ *      m_wait == head of linked list of tasks waiting on this task
+ *      m_next == this task's updated dependence, 0 if none
+ *
+ *    transition : Executing -> Complete
+ *      before:
+ *        m_wait == head of linked list
+ *      after:
+ *        m_wait == LockTag
+ *
+ *    Complete State, NOT IN a linked list
+ *      m_wait == LockTag: cannot add dependence
+ *      m_next == LockTag: not a member of a wait queue
+ *
+ */
+template< typename ExecSpace >
+class TaskBase< ExecSpace , void , void >
+{
+public:
+  // Re-export the shared task-kind and sentinel constants under this type's scope.
+  enum : int16_t   { TaskTeam   = TaskBase<void,void,void>::TaskTeam
+                   , TaskSingle = TaskBase<void,void,void>::TaskSingle
+                   , Aggregate  = TaskBase<void,void,void>::Aggregate };
+
+  enum : uintptr_t { LockTag = TaskBase<void,void,void>::LockTag
+                   , EndTag  = TaskBase<void,void,void>::EndTag };
+
+  using execution_space = ExecSpace ;
+  using queue_type      = TaskQueue< execution_space > ;
+
+  template< typename > friend class Kokkos::TaskPolicy ;
+
+  typedef void (* function_type) ( TaskBase * , void * ); ///< Signature of the pointer stored in m_apply
+
+  // sizeof(TaskBase) == 48 with 8-byte pointers: 4 pointers + 3 int32_t + 2 int16_t
+
+  function_type  m_apply ;     ///< Apply function pointer
+  queue_type   * m_queue ;     ///< Queue in which this task resides
+  TaskBase     * m_wait ;      ///< Linked list of tasks waiting on this
+  TaskBase     * m_next ;      ///< Waiting linked-list next
+  int32_t        m_ref_count ; ///< Reference count
+  int32_t        m_alloc_size ;///< Allocation size
+  int32_t        m_dep_count ; ///< Aggregate's number of dependences
+  int16_t        m_task_type ; ///< Type of task
+  int16_t        m_priority ;  ///< Priority of runnable task
+
+  TaskBase( TaskBase && ) = delete ;
+  TaskBase( const TaskBase & ) = delete ;
+  TaskBase & operator = ( TaskBase && ) = delete ;
+  TaskBase & operator = ( const TaskBase & ) = delete ;
+
+  KOKKOS_INLINE_FUNCTION ~TaskBase() = default ;
+
+  // Starts in the "Constructing" state: null pointers, zero counts, TaskSingle kind, priority 1.
+  KOKKOS_INLINE_FUNCTION
+  constexpr TaskBase() noexcept
+    : m_apply(0)
+    , m_queue(0)
+    , m_wait(0)
+    , m_next(0)
+    , m_ref_count(0)
+    , m_alloc_size(0)
+    , m_dep_count(0)
+    , m_task_type( TaskSingle )
+    , m_priority( 1 /* TaskRegularPriority */ )
+    {}
+
+  //----------------------------------------
+
+  KOKKOS_INLINE_FUNCTION
+  TaskBase ** aggregate_dependences() // address just past this object; NOTE(review): presumably an Aggregate task stores its m_dep_count dependence pointers there - confirm against the allocator
+    { return reinterpret_cast<TaskBase**>( this + 1 ); }
+
+  using get_return_type = void ;
+
+  KOKKOS_INLINE_FUNCTION
+  get_return_type get() const {} // a task without a result type has nothing to return
+};
+
+// Adds storage for a task's result on top of the root task record.
+template < typename ExecSpace , typename ResultType >
+class TaskBase< ExecSpace , ResultType , void >
+  : public TaskBase< ExecSpace , void , void >
+{
+private:
+  static_assert( sizeof(TaskBase<ExecSpace,void,void>) == 48
+               , "TaskBase<ExecSpace,void,void> is expected to occupy exactly 48 bytes" );
+  TaskBase( TaskBase && ) = delete ;
+  TaskBase( const TaskBase & ) = delete ;
+  TaskBase & operator = ( TaskBase && ) = delete ;
+  TaskBase & operator = ( const TaskBase & ) = delete ;
+
+public:
+
+  ResultType   m_result ; ///< Result slot, value-initialized by the constructor
+
+  KOKKOS_INLINE_FUNCTION ~TaskBase() = default ;
+
+  KOKKOS_INLINE_FUNCTION
+  TaskBase()
+    : TaskBase< ExecSpace , void , void >()
+    , m_result()
+    {}
+
+  using get_return_type = ResultType const & ;
+
+  KOKKOS_INLINE_FUNCTION
+  get_return_type get() const { return m_result ; } // read-only reference to the stored result
+};
+
+
+template< typename ExecSpace , typename ResultType , typename FunctorType >
+class TaskBase
+  : public TaskBase< ExecSpace , ResultType , void >
+  , public FunctorType
+{
+private:
+
+  TaskBase() = delete ;
+  TaskBase( TaskBase && ) = delete ;
+  TaskBase( const TaskBase & ) = delete ;
+  TaskBase & operator = ( TaskBase && ) = delete ;
+  TaskBase & operator = ( const TaskBase & ) = delete ;
+
+public:
+
+  using root_type    = TaskBase< ExecSpace , void , void > ;
+  using base_type    = TaskBase< ExecSpace , ResultType , void > ;
+  using member_type  = TaskExec< ExecSpace > ;
+  using functor_type = FunctorType ;
+  using result_type  = ResultType ;
+
+  template< typename Type >
+  KOKKOS_INLINE_FUNCTION static
+  void apply_functor
+    ( Type * const task
+    , typename std::enable_if
+        < std::is_same< typename Type::result_type , void >::value
+        , member_type * const 
+        >::type member
+    )
+    {
+      using fType = typename Type::functor_type ;
+      static_cast<fType*>(task)->operator()( *member );
+    }
+
+  template< typename Type >
+  KOKKOS_INLINE_FUNCTION static
+  void apply_functor
+    ( Type * const task
+    , typename std::enable_if
+        < ! std::is_same< typename Type::result_type , void >::value
+        , member_type * const 
+        >::type member
+    )
+    {
+      using fType = typename Type::functor_type ;
+      static_cast<fType*>(task)->operator()( *member , task->m_result );
+    }
+
+  // Entry point with the root_type::function_type signature: run the functor, destroy it unless it respawned.
+  KOKKOS_FUNCTION static
+  void apply( root_type * root , void * exec )
+    {
+      // root_type* sentinel: m_next is a root_type*, so the tag value needs no derived-to-base conversion
+      root_type   * const lock   = reinterpret_cast< root_type * >( root_type::LockTag );
+      TaskBase    * const task   = static_cast< TaskBase * >( root );
+      member_type * const member = reinterpret_cast< member_type * >( exec );
+
+      // No 'template' keyword: it requires a template argument list, and Type is deduced from 'task'.
+      TaskBase::apply_functor( task , member );
+
+      // Task may be serial or team. If team then must synchronize before
+      // querying task->m_next, and only one thread calls the destructor.
+      member->team_barrier();
+
+      if ( 0 == member->team_rank() && lock == task->m_next ) {
+        // Did not respawn: destroy the functor to free memory. The task itself
+        // cannot be destroyed until its dependences have been processed.
+        static_cast<functor_type*>(task)->~functor_type();
+      }
+    }
+
+  KOKKOS_INLINE_FUNCTION
+  TaskBase( FunctorType const & arg_functor )
+    : base_type()
+    , FunctorType( arg_functor )
+    {}
+
+  KOKKOS_INLINE_FUNCTION
+  ~TaskBase() {}
+};
+
+} /* namespace Impl */
+} /* namespace Kokkos */
+
+//----------------------------------------------------------------------------
+//----------------------------------------------------------------------------
+
+#endif /* #if defined( KOKKOS_ENABLE_TASKPOLICY ) */
+#endif /* #ifndef KOKKOS_IMPL_TASKQUEUE_HPP */
+
diff --git a/lib/kokkos/core/src/impl/Kokkos_TaskQueue_impl.hpp b/lib/kokkos/core/src/impl/Kokkos_TaskQueue_impl.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..70a880d4a2e341a9f8e78df97c57531ca53492f6
--- /dev/null
+++ b/lib/kokkos/core/src/impl/Kokkos_TaskQueue_impl.hpp
@@ -0,0 +1,569 @@
+/*
+//@HEADER
+// ************************************************************************
+// 
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+// 
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+// 
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact  H. Carter Edwards (hcedwar@sandia.gov)
+// 
+// ************************************************************************
+//@HEADER
+*/
+
+#if defined( KOKKOS_ENABLE_TASKPOLICY )
+
+namespace Kokkos {
+namespace Impl {
+
+//----------------------------------------------------------------------------
+
+// Teardown hook: run the TaskQueue destructor in place on m_queue.
+// Only the destructor is invoked here; this call releases no memory itself.
+template< typename ExecSpace >
+void TaskQueue< ExecSpace >::Destroy::destroy_shared_allocation()
+{ m_queue->~TaskQueue(); }
+
+//----------------------------------------------------------------------------
+
+// Build an empty queue: arguments go to the memory pool, counters start at 0.
+template< typename ExecSpace >
+TaskQueue< ExecSpace >::TaskQueue
+  ( const TaskQueue< ExecSpace >::memory_space & arg_space
+  , unsigned const arg_memory_pool_capacity
+  , unsigned const arg_memory_pool_superblock_capacity_log2
+  )
+  : m_memory( arg_space , arg_memory_pool_capacity , arg_memory_pool_superblock_capacity_log2 )
+  , m_ready()
+  , m_accum_alloc(0)
+  , m_count_alloc(0) // allocate()/deallocate() update this; it must start at zero
+  , m_max_alloc(0)
+  , m_ready_count(0)
+{
+  for ( int i = 0 ; i < NumQueue ; ++i ) { // EndTag marks an empty ready queue
+    m_ready[i][0] = (task_root_type *) task_root_type::EndTag ;
+    m_ready[i][1] = (task_root_type *) task_root_type::EndTag ;
+  }
+}
+
+//----------------------------------------------------------------------------
+
+template< typename ExecSpace >
+TaskQueue< ExecSpace >::~TaskQueue()
+{
+  // A queue may only be destroyed once drained: every ready list must
+  // hold EndTag and no task may be counted as ready or executing.
+  task_root_type * const empty = (task_root_type *) task_root_type::EndTag ;
+  for ( int q = 0 ; q < NumQueue ; ++q ) {
+    for ( int t = 0 ; t < 2 ; ++t ) {
+      if ( empty != m_ready[q][t] ) {
+        Kokkos::abort("TaskQueue::~TaskQueue ERROR: has ready tasks");
+      }
+    }
+  }
+  if ( m_ready_count != 0 ) {
+    Kokkos::abort("TaskQueue::~TaskQueue ERROR: has ready or executing tasks");
+  }
+}
+
+//----------------------------------------------------------------------------
+
+template< typename ExecSpace >
+KOKKOS_FUNCTION
+void TaskQueue< ExecSpace >::decrement
+  ( TaskQueue< ExecSpace >::task_root_type * task )
+{
+  const int count = Kokkos::atomic_fetch_add(&(task->m_ref_count),-1);
+
+#if 0
+  if ( 1 == count ) {
+    printf( "decrement-destroy( 0x%lx { 0x%lx %d %d } )\n"
+          , uintptr_t( task )
+          , uintptr_t( task->m_next )
+          , int( task->m_task_type )
+          , int( task->m_ref_count )
+          );
+  }
+#endif
+  // 'count' is the reference count as it was before this decrement.
+  if ( count <= 1 ) {
+    if ( 1 == count && task->m_next == (task_root_type *) task_root_type::LockTag ) {
+      // Last reference dropped and the task is complete: free its memory.
+      task->m_queue->deallocate( task , task->m_alloc_size );
+    } else {
+      Kokkos::abort("TaskPolicy task has negative reference count or is incomplete" );
+    }
+  }
+}
+
+//----------------------------------------------------------------------------
+
+// Thin forwarder: the result is whatever the memory pool reports
+// from m_memory.allocate_block_size( n ).
+template< typename ExecSpace >
+KOKKOS_FUNCTION
+size_t TaskQueue< ExecSpace >::allocate_block_size( size_t n )
+{ return m_memory.allocate_block_size( n ); }
+
+template< typename ExecSpace >
+KOKKOS_FUNCTION
+void * TaskQueue< ExecSpace >::allocate( size_t n )
+{
+  // A null result from the pool is returned as is, statistics untouched.
+  void * const ptr = m_memory.allocate(n);
+  if ( ! ptr ) return ptr ;
+
+  Kokkos::atomic_increment( & m_accum_alloc );
+  Kokkos::atomic_increment( & m_count_alloc );
+  // NOTE(review): the maximum below is updated with a plain read and write.
+  if ( m_max_alloc < m_count_alloc ) m_max_alloc = m_count_alloc ;
+
+  return ptr ;
+}
+
+// Hand 'p' and 'n' back to the memory pool, then atomically decrement m_count_alloc.
+template< typename ExecSpace > KOKKOS_FUNCTION
+void TaskQueue< ExecSpace >::deallocate( void * p , size_t n )
+{
+  m_memory.deallocate( p , n );
+  Kokkos::atomic_decrement( & m_count_alloc );
+}
+
+//----------------------------------------------------------------------------
+
+template< typename ExecSpace >
+KOKKOS_FUNCTION
+bool TaskQueue< ExecSpace >::push_task
+  ( TaskQueue< ExecSpace >::task_root_type * volatile * const queue
+  , TaskQueue< ExecSpace >::task_root_type * const task
+  )
+{
+  // Push 'task' onto a concurrently pushed and popped queue: a linked
+  // list whose links are the 'm_next' members.  Returns true once the
+  // compare-exchange installs 'task' as the new head (retrying as needed);
+  // returns false, with 'task->m_next' reset to zero, if the queue is locked.
+
+#if 0
+  printf( "push_task( 0x%lx { 0x%lx } 0x%lx { 0x%lx 0x%lx %d %d %d } )\n"
+        , uintptr_t(queue)
+        , uintptr_t(*queue)
+        , uintptr_t(task)
+        , uintptr_t(task->m_wait)
+        , uintptr_t(task->m_next)
+        , task->m_task_type
+        , task->m_priority
+        , task->m_ref_count );
+#endif
+
+  task_root_type * const zero = (task_root_type *) 0 ;
+  task_root_type * const lock = (task_root_type *) task_root_type::LockTag ;
+
+  task_root_type * volatile * const next = & task->m_next ;
+
+  if ( zero != *next ) {
+    Kokkos::abort("TaskQueue::push_task ERROR: already a member of another queue" );
+  }
+
+  task_root_type * y = *queue ;
+
+  while ( lock != y ) {
+
+    *next = y ;
+
+    // Do not proceed until '*next' has been stored.
+    Kokkos::memory_fence();
+
+    task_root_type * const x = y ;
+
+    y = Kokkos::atomic_compare_exchange(queue,y,task);
+
+    if ( x == y ) return true ; // head was unchanged: 'task' is now the head
+  }
+
+  // The queue is locked so the push failed.  Restore 'task->m_next' to
+  // zero because 'task' is still not a member of any queue.
+
+  *next = zero ;
+
+  // Do not proceed until '*next' has been stored.
+  Kokkos::memory_fence();
+
+  return false ;
+}
+
+//----------------------------------------------------------------------------
+
+template< typename ExecSpace >
+KOKKOS_FUNCTION
+typename TaskQueue< ExecSpace >::task_root_type *
+TaskQueue< ExecSpace >::pop_task
+  ( TaskQueue< ExecSpace >::task_root_type * volatile * const queue )
+{
+  // Pop the head task from a concurrently pushed and popped queue (a linked
+  // list linked through 'm_next').  Returns EndTag if the queue is empty.
+
+  task_root_type * const zero = (task_root_type *) 0 ;
+  task_root_type * const lock = (task_root_type *) task_root_type::LockTag ;
+  task_root_type * const end  = (task_root_type *) task_root_type::EndTag ;
+
+  // *queue is
+  //   end   => an empty queue
+  //   lock  => a locked queue
+  //   other => a valid task at the head of the queue
+
+  // Retry until the lock is acquired or the queue is empty.
+
+  task_root_type * task = *queue ;
+
+  while ( end != task ) {
+
+    // The only possible values for the queue are
+    // (1) lock, (2) end, or (3) a valid task.
+    // Thus zero will never appear in the queue.
+    //
+    // If the queue is locked then use zero as the expected value:
+    // the CAS is guaranteed to fail and merely re-reads the queue head.
+
+    if ( lock == task ) task = 0 ;
+
+    task_root_type * const x = task ;
+
+    task = Kokkos::atomic_compare_exchange(queue,task,lock);
+
+    if ( x == task ) break ; // CAS succeeded and queue is locked
+  }
+
+  if ( end != task ) {
+
+    // This thread has locked the queue and removed 'task' from the queue.
+    // Extract the next entry of the queue from 'task->m_next'
+    // and mark 'task' as popped from a queue by setting
+    // 'task->m_next = lock'.
+
+    task_root_type * const next =
+      Kokkos::atomic_exchange( & task->m_next , lock );
+
+    // Place the next entry in the head of the queue,
+    // which also unlocks the queue.
+
+    task_root_type * const unlock =
+      Kokkos::atomic_exchange( queue , next );
+
+    if ( next == zero || next == lock || lock != unlock ) {
+      Kokkos::abort("TaskQueue::pop_task ERROR");
+    }
+  }
+
+#if 0
+  if ( end != task ) {
+    printf( "pop_task( 0x%lx 0x%lx { 0x%lx 0x%lx %d %d %d } )\n"
+          , uintptr_t(queue)
+          , uintptr_t(task)
+          , uintptr_t(task->m_wait)
+          , uintptr_t(task->m_next)
+          , int(task->m_task_type)
+          , int(task->m_priority)
+          , int(task->m_ref_count) );
+  }
+#endif
+
+  return task ;
+}
+
+//----------------------------------------------------------------------------
+
+template< typename ExecSpace >
+KOKKOS_FUNCTION
+void TaskQueue< ExecSpace >::schedule
+  ( TaskQueue< ExecSpace >::task_root_type * const task )
+{
+  // Schedule a runnable or when_all task upon construction / spawn
+  // and upon completion of other tasks that 'task' is waiting on.
+
+  // Precondition on runnable task state:
+  //   task is either constructing or executing
+  //
+  //   Constructing state:
+  //     task->m_wait == 0
+  //     task->m_next == dependence
+  //   Executing-respawn state:
+  //     task->m_wait == head of linked list
+  //     task->m_next == dependence
+  //
+  //  Task state transition:
+  //     Constructing      ->  Waiting
+  //     Executing-respawn ->  Waiting
+  //
+  //  Postcondition on task state:
+  //     task->m_wait == head of linked list
+  //     task->m_next == member of linked list
+
+#if 0
+  printf( "schedule( 0x%lx { 0x%lx 0x%lx %d %d %d }\n"
+        , uintptr_t(task)
+        , uintptr_t(task->m_wait)
+        , uintptr_t(task->m_next)
+        , task->m_task_type
+        , task->m_priority
+        , task->m_ref_count );
+#endif
+
+  task_root_type * const zero = (task_root_type *) 0 ;
+  task_root_type * const lock = (task_root_type *) task_root_type::LockTag ;
+  task_root_type * const end  = (task_root_type *) task_root_type::EndTag ;
+
+  //----------------------------------------
+  {
+    // If Constructing then task->m_wait == 0
+    // Change to waiting by task->m_wait = EndTag
+
+    task_root_type * const init =
+      Kokkos::atomic_compare_exchange( & task->m_wait , zero , end );
+
+    // Precondition
+
+    if ( lock == init ) {
+      Kokkos::abort("TaskQueue::schedule ERROR: task is complete");
+    }
+
+    // if ( init == 0 ) Constructing       ->  Waiting
+    // else             Executing-Respawn  ->  Waiting
+  }
+  //----------------------------------------
+
+  if ( task_root_type::Aggregate != task->m_task_type ) {
+
+    // Scheduling a runnable task which may have a dependency 'dep'.
+    // Extract dependence, if any, from task->m_next.
+    // If 'dep' is not null then attempt to push 'task'
+    // into the wait queue of 'dep'.
+    // If the push succeeds then 'task' may be
+    // processed or executed by another thread at any time.
+    // If the push fails then 'dep' is complete and 'task'
+    // is ready to execute.
+
+    task_root_type * dep = Kokkos::atomic_exchange( & task->m_next , zero );
+
+    const bool is_ready = 
+      ( 0 == dep ) || ( ! push_task( & dep->m_wait , task ) );
+
+    // Reference count for dep was incremented when assigned
+    // to task->m_next so that if it completed prior to the
+    // above push_task dep would not be destroyed.
+    // dep reference count can now be decremented,
+    // which may deallocate the task.
+    TaskQueue::assign( & dep , (task_root_type *)0 );
+
+    if ( is_ready ) {
+
+      // No dependence or 'dep' is complete so push task into ready queue.
+      // Increment the ready count before pushing into ready queue
+      // to track number of ready + executing tasks.
+      // The ready count will be decremented when the task is complete.
+
+      Kokkos::atomic_increment( & m_ready_count );
+
+      task_root_type * volatile * const queue =
+        & m_ready[ task->m_priority ][ task->m_task_type ];
+
+      // A push_task fails if the ready queue is locked.
+      // A ready queue is only locked during a push or pop;
+      // i.e., it is never permanently locked.
+      // Retry push to ready queue until it succeeds.
+      // When the push succeeds then 'task' may be
+      // processed or executed by another thread at any time.
+
+      while ( ! push_task( queue , task ) );
+    }
+  }
+  //----------------------------------------
+  else {
+    // Scheduling a 'when_all' task with multiple dependences.
+    // This scheduling may be called when the 'when_all' is
+    // (1) created or
+    // (2) being removed from a completed task's wait list.
+
+    task_root_type ** const aggr = task->aggregate_dependences();
+
+    // Assume the 'when_all' is complete until a dependence is
+    // found that is not complete.
+
+    bool is_complete = true ;
+
+    for ( int i = task->m_dep_count ; 0 < i && is_complete ; ) {
+
+      --i ;
+
+      // Loop dependences looking for an incomplete task.
+      // Add this task to the incomplete task's wait queue.
+
+      // Remove a task 'x' from the dependence list.
+      // The reference count of 'x' was incremented when
+      // it was assigned into the dependence list.
+
+      task_root_type * x = Kokkos::atomic_exchange( aggr + i , zero );
+
+      if ( x ) {
+
+        // If x->m_wait is not locked then push succeeds
+        // and the aggregate is not complete.
+        // If the push succeeds then this when_all 'task' may be
+        // processed by another thread at any time.
+        // For example, 'x' may be completed by another
+        // thread and then re-schedule this when_all 'task'.
+
+        is_complete = ! push_task( & x->m_wait , task );
+
+        // Decrement reference count which had been incremented
+        // when 'x' was added to the dependence list.
+
+        TaskQueue::assign( & x , zero );
+      }
+    }
+
+    if ( is_complete ) {
+      // The when_all 'task' was not added to a wait queue because
+      // all dependences were complete so this aggregate is complete.
+      // Complete the when_all 'task' to schedule other tasks
+      // that are waiting for the when_all 'task' to complete.
+
+      task->m_next = lock ;
+
+      complete( task );
+
+      // '*task' may have been deleted upon completion
+    }
+  }
+  //----------------------------------------
+  // Postcondition:
+  //   A runnable 'task' was pushed into a wait or ready queue.
+  //   An aggregate 'task' was either pushed to a wait queue
+  //   or completed.
+  // Concurrent execution may have already popped 'task'
+  // from a queue and processed it as appropriate.
+}
+
+//----------------------------------------------------------------------------
+
+template< typename ExecSpace >
+KOKKOS_FUNCTION
+void TaskQueue< ExecSpace >::complete
+  ( TaskQueue< ExecSpace >::task_root_type * task )
+{
+  // Complete a runnable task that has finished executing
+  // or a when_all task when all of its dependences are complete.
+
+  task_root_type * const zero = (task_root_type *) 0 ;
+  task_root_type * const lock = (task_root_type *) task_root_type::LockTag ;
+  task_root_type * const end  = (task_root_type *) task_root_type::EndTag ;
+
+#if 0
+  printf( "complete( 0x%lx { 0x%lx 0x%lx %d %d %d }\n"
+        , uintptr_t(task)
+        , uintptr_t(task->m_wait)
+        , uintptr_t(task->m_next)
+        , task->m_task_type
+        , task->m_priority
+        , task->m_ref_count );
+  fflush( stdout );
+#endif
+
+  const bool runnable = task_root_type::Aggregate != task->m_task_type ;
+
+  //----------------------------------------
+
+  if ( runnable && lock != task->m_next ) {
+    // A runnable task has finished executing and requested respawn.
+    // Schedule the task for subsequent execution.
+
+    schedule( task );
+  }
+  //----------------------------------------
+  else {
+    // Is either an aggregate or a runnable task that executed
+    // and did not respawn.  Transition this task to complete.
+
+    // If 'task' is an aggregate then any of the runnable tasks that
+    // it depends upon may be attempting to complete this 'task'.
+    // Must only transition a task once to complete status.
+    // This is controlled by atomically locking the wait queue.
+
+    // Stop other tasks from adding themselves to this task's wait queue
+    // by locking the head of this task's wait queue.
+
+    task_root_type * x = Kokkos::atomic_exchange( & task->m_wait , lock );
+
+    if ( x != (task_root_type *) lock ) {
+
+      // This thread has transitioned this 'task' to complete.
+      // 'task' is no longer in a queue and is not executing
+      // so decrement the reference count from 'task's creation.
+      // If no other references to this 'task' then it will be deleted.
+
+      TaskQueue::assign( & task , zero );
+
+      // This thread has exclusive access to the wait list so
+      // the concurrency-safe pop_task function is not needed.
+      // Schedule the tasks that have been waiting on the input 'task',
+      // which may have been deleted.
+
+      while ( x != end ) {
+
+        // Set x->m_next = zero  <=  no dependence
+
+        task_root_type * const next =
+          (task_root_type *) Kokkos::atomic_exchange( & x->m_next , zero );
+
+        schedule( x );
+
+        x = next ;
+      }
+    }
+  }
+
+  if ( runnable ) {
+    // A runnable task was popped from a ready queue and executed.
+    // If respawned into a ready queue then the ready count was incremented
+    // so decrement whether respawned or not.
+    // 'runnable' was captured at entry, so 'task' is not read again here.
+    Kokkos::atomic_decrement( & m_ready_count );
+  }
+}
+
+//----------------------------------------------------------------------------
+
+} /* namespace Impl */
+} /* namespace Kokkos */
+
+
+#endif /* #if defined( KOKKOS_ENABLE_TASKPOLICY ) */
+
diff --git a/lib/kokkos/core/src/impl/Kokkos_Timer.hpp b/lib/kokkos/core/src/impl/Kokkos_Timer.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..1f14e42874bda3c43f5f18bced120d73366abd40
--- /dev/null
+++ b/lib/kokkos/core/src/impl/Kokkos_Timer.hpp
@@ -0,0 +1,118 @@
+/*
+//@HEADER
+// ************************************************************************
+// 
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+// 
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+// 
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact  H. Carter Edwards (hcedwar@sandia.gov)
+// 
+// ************************************************************************
+//@HEADER
+*/
+
+#ifndef KOKKOS_IMPLWALLTIME_HPP
+#define KOKKOS_IMPLWALLTIME_HPP
+
+#include <stddef.h>
+
+#ifdef _MSC_VER
+#undef KOKKOS_USE_LIBRT
+#include <gettimeofday.c>
+#else
+#ifdef KOKKOS_USE_LIBRT
+#include <ctime>
+#else
+#include <sys/time.h>
+#endif
+#endif
+
+namespace Kokkos {
+namespace Impl {
+
+/** \brief  Time since construction */
+
+class Timer {
+public:
+  // Begin timing at construction.
+  Timer() { reset(); }
+
+  ~Timer() {}
+
+  // Restart timing: store the current clock reading in m_old.
+  void reset() {
+    #ifdef KOKKOS_USE_LIBRT
+      clock_gettime(CLOCK_REALTIME, &m_old);
+    #else
+      gettimeofday( & m_old , ((struct timezone *) NULL ) );
+    #endif
+  }
+
+  // Seconds elapsed since the reading stored in m_old was taken.
+  double seconds() const
+  {
+    #ifdef KOKKOS_USE_LIBRT
+      struct timespec now ;
+      clock_gettime(CLOCK_REALTIME, &now);
+
+      return ( (double) ( now.tv_sec  - m_old.tv_sec ) ) +
+             ( (double) ( now.tv_nsec - m_old.tv_nsec ) * 1.0e-9 );
+    #else
+      struct timeval now ;
+      ::gettimeofday( & now , ((struct timezone *) NULL ) );
+
+      return ( (double) ( now.tv_sec  - m_old.tv_sec ) ) +
+             ( (double) ( now.tv_usec - m_old.tv_usec ) * 1.0e-6 );
+    #endif
+  }
+
+private:
+  // Not copyable: copy operations are private and have no definition here.
+  Timer( const Timer & );
+  Timer & operator = ( const Timer & );
+  // Clock reading captured by the most recent reset().
+  #ifdef KOKKOS_USE_LIBRT
+  struct timespec m_old ;
+  #else
+  struct timeval m_old ;
+  #endif
+};
+
+} // namespace Impl
+
+  using Kokkos::Impl::Timer ;
+
+} // namespace Kokkos
+
+#endif /* #ifndef KOKKOS_IMPLWALLTIME_HPP */
+
diff --git a/lib/kokkos/core/src/impl/Kokkos_Traits.hpp b/lib/kokkos/core/src/impl/Kokkos_Traits.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..278f715bc917adf0dc2d8b93cfde7549c6febf7a
--- /dev/null
+++ b/lib/kokkos/core/src/impl/Kokkos_Traits.hpp
@@ -0,0 +1,501 @@
+/*
+//@HEADER
+// ************************************************************************
+// 
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+// 
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+// 
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact  H. Carter Edwards (hcedwar@sandia.gov)
+// 
+// ************************************************************************
+//@HEADER
+*/
+
+#ifndef KOKKOSTRAITS_HPP
+#define KOKKOSTRAITS_HPP
+
+#include <stddef.h>
+#include <stdint.h>
+#include <Kokkos_Macros.hpp>
+#include <string>
+#include <type_traits>
+
+namespace Kokkos {
+namespace Impl {
+
+//----------------------------------------------------------------------------
+// Help with C++11 variadic argument packs
+
+template< unsigned Index , typename ... Types >
+struct get_type { using type = void ; };
+// Index 0 selects the head of the pack.
+template< typename Head , typename ... Tail >
+struct get_type< 0 , Head , Tail ... >
+{ using type = Head ; };
+// Any other index drops the head and recurses with Index - 1.
+template< unsigned Index , typename Head , typename ... Tail >
+struct get_type< Index , Head , Tail ... >
+{ using type = typename get_type< Index - 1 , Tail ... >::type ; };
+
+
+template< typename Query , typename ... Tail >
+struct has_type { enum { value = false }; };
+// Recursive case: compare Query with the head of the pack, then recurse.
+template< typename Query , typename Head , typename ... Tail >
+struct has_type<Query,Head,Tail...>
+{
+private:
+
+  using rest = has_type<Query,Tail...> ;
+
+  enum { head_matches = std::is_same<Query,Head>::value };
+
+  static_assert( ! ( head_matches && rest::value )
+               , "Error: more than one member of the argument pack matches the type" );
+
+public:
+  // True when a member of the pack is the same type as Query.
+  enum { value = head_matches || rest::value };
+
+};
+
+
+// Base case (empty pack): nothing satisfies Condition, so 'type' is the fallback.
+template< typename Fallback , template< typename > class Condition , typename ... Rest >
+struct has_condition
+{
+  enum { value = false };
+  using type = Fallback ;
+};
+
+// Recursive case: test the head of the pack, then the remainder.
+template< typename Fallback
+        , template< typename > class Condition
+        , typename Head
+        , typename ... Rest >
+struct has_condition< Fallback , Condition , Head , Rest... >
+{
+private:
+
+  using rest = has_condition< Fallback , Condition , Rest... > ;
+
+  enum { head_ok = Condition<Head>::value };
+
+  static_assert( ! ( head_ok && rest::value )
+               , "Error: more than one member of the argument pack satisfies condition" );
+
+public:
+
+  enum { value = head_ok || rest::value };
+
+  // The pack member satisfying Condition, or Fallback when none does.
+  using type =
+    typename std::conditional< head_ok , Head , typename rest::type >::type ;
+};
+
+
+// True when every type in the pack is integral or an enum (true if empty).
+template< typename ... Types >
+struct are_integral { enum { value = true }; };
+
+template< typename Head , typename ... Tail >
+struct are_integral<Head,Tail...> {
+  // Enums are accepted alongside std::is_integral types because
+  // a simple enum value is automatically convertible to an
+  // integral value.
+  enum { value =
+    ( std::is_enum<Head>::value || std::is_integral<Head>::value ) &&
+    are_integral<Tail...>::value };
+};
+
+//----------------------------------------------------------------------------
+/* C++11 conformal compile-time type traits utilities.
+ * Prefer to use C++11 when portably available.
+ */
+//----------------------------------------------------------------------------
+// C++11 Helpers:
+
+// Kokkos' own counterpart of C++11 std::integral_constant: wraps the constant 'v'.
+template < class T , T v >
+struct integral_constant
+{
+  // 'value' is an enum: 'static const T' caused an unresolved linker symbol in debug.
+  enum { value = T(v) };
+  typedef T value_type;
+  typedef integral_constant<T,v> type;
+  KOKKOS_INLINE_FUNCTION operator T() const { return v ; } // const: const objects convert too
+};
+
+using false_type = integral_constant<bool,false> ;
+using true_type  = integral_constant<bool,true> ;
+
+//----------------------------------------------------------------------------
+// C++11 Type relationships:
+
+template< typename A , typename B > struct is_same      : public false_type {};
+template< typename A >              struct is_same<A,A> : public true_type {};
+
+//----------------------------------------------------------------------------
+// C++11 Type properties:
+
+// Unlike std::is_const, this is_const is also true for a reference to const.
+template< class U > struct is_const              : public false_type {};
+template< class U > struct is_const< const U >   : public true_type {};
+template< class U > struct is_const< const U & > : public true_type {};
+template< class U >              struct is_array         : public false_type {};
+template< class U >              struct is_array< U[] >  : public true_type {};
+template< class U , unsigned N > struct is_array< U[N] > : public true_type {};
+
+//----------------------------------------------------------------------------
+// C++11 Type transformations:
+
+template< class U > struct remove_const              { using type = U ; };
+template< class U > struct remove_const< const U >   { using type = U ; };
+template< class U > struct remove_const< const U & > { using type = U & ; };
+// add_const: a reference gains const on the referred-to type.
+template< class U > struct add_const              { using type = const U ; };
+template< class U > struct add_const< U & >       { using type = const U & ; };
+template< class U > struct add_const< const U >   { using type = const U ; };
+template< class U > struct add_const< const U & > { using type = const U & ; };
+// remove_reference: strips an lvalue reference; constness of the referent is kept.
+template< class U > struct remove_reference              { using type = U ; };
+template< class U > struct remove_reference< U & >       { using type = U ; };
+template< class U > struct remove_reference< const U & > { using type = const U ; };
+// remove_extent: strips one array dimension.
+template< class U >              struct remove_extent       { using type = U ; };
+template< class U >              struct remove_extent<U[]>  { using type = U ; };
+template< class U , unsigned N > struct remove_extent<U[N]> { using type = U ; };
+
+//----------------------------------------------------------------------------
+// C++11 Other type generators:
+
+// Select T when the flag is true, otherwise F.
+template< bool , typename T , typename F >
+struct condition { using type = F ; };
+template< typename T , typename F >
+struct condition<true,T,F> { using type = T ; };
+
+// Declared only: 'type' exists solely in the 'true' specialization below.
+template< bool , typename = void >
+struct enable_if ;
+template< typename T >
+struct enable_if< true , T > { using type = T ; };
+
+//----------------------------------------------------------------------------
+
+} // namespace Impl
+} // namespace Kokkos
+
+//----------------------------------------------------------------------------
+//----------------------------------------------------------------------------
+// Other traits
+
+namespace Kokkos {
+namespace Impl {
+
+//----------------------------------------------------------------------------
+
+// Always yields T; the first (unnamed) parameter is ignored.
+template< class , class T = void > struct enable_if_type { using type = T ; };
+
+//----------------------------------------------------------------------------
+
+// Shorthand integral_constant wrappers for bool, unsigned and int values.
+template< bool Flag >
+struct bool_ : public integral_constant<bool,Flag> {};
+
+template< unsigned Value >
+struct unsigned_ : public integral_constant<unsigned,Value> {};
+
+template< int Value >
+struct int_ : public integral_constant<int,Value> {};
+using true_  = bool_<true> ;
+using false_ = bool_<false> ;
+//----------------------------------------------------------------------------
+// if_
+
+// Compile-time selection, general (false) case: 'type' is FalseType.
+template < bool Cond , typename TrueType , typename FalseType>
+struct if_c
+{
+  enum { value = Cond };
+
+  using type = FalseType ;
+  // 'type' with reference and const removed, and its const counterpart.
+  using value_type = typename remove_const< typename remove_reference<type>::type >::type ;
+  using const_value_type = typename add_const<value_type>::type ;
+
+  // One-argument select: pass a value of the selected type straight through.
+  static KOKKOS_INLINE_FUNCTION
+  const_value_type & select( const_value_type & arg ) { return arg ; }
+
+  static KOKKOS_INLINE_FUNCTION
+  value_type & select( value_type & arg ) { return arg ; }
+
+  // Dereferences a null pointer; NOTE(review): presumably never meant to run - confirm.
+  template< class Other >
+  static KOKKOS_INLINE_FUNCTION
+  value_type & select( const Other & ) { value_type * ptr(0); return *ptr ; }
+
+  // Two-argument select: the false case returns the second argument.
+  template< class Other >
+  static KOKKOS_INLINE_FUNCTION
+  const_value_type & select( const Other & , const_value_type & arg ) { return arg ; }
+
+  template< class Other >
+  static KOKKOS_INLINE_FUNCTION
+  value_type & select( const Other & , value_type & arg ) { return arg ; }
+};
+
+// Type selector, Cond == true: 'type' is TrueType and select() keeps the first candidate.
+template < typename TrueType , typename FalseType >
+struct if_c< true , TrueType , FalseType > {
+  enum { value = true };
+  using type = TrueType ;
+
+  // 'type' stripped of reference and const, plus its const-qualified form.
+  using value_type =
+    typename remove_const< typename remove_reference< type >::type >::type ;
+  using const_value_type = typename add_const< value_type >::type ;
+
+  // One argument that binds to value_type: hand it straight back.
+  static KOKKOS_INLINE_FUNCTION
+  const_value_type & select( const_value_type & chosen ) { return chosen ; }
+  static KOKKOS_INLINE_FUNCTION
+  value_type & select( value_type & chosen ) { return chosen ; }
+
+  // One argument of any other type: the returned reference is formed from a
+  // null pointer, so it must never actually be read or written.
+  template< class Other >
+  static KOKKOS_INLINE_FUNCTION
+  value_type & select( const Other & ) { value_type * none(0); return *none ; }
+
+  // Two arguments ( true-candidate , false-candidate ): return the first.
+  template< class Other >
+  static KOKKOS_INLINE_FUNCTION
+  const_value_type & select( const_value_type & chosen , const Other & ) { return chosen ; }
+
+  template< class Other >
+  static KOKKOS_INLINE_FUNCTION
+  value_type & select( value_type & chosen , const Other & ) { return chosen ; }
+};
+
+// False condition with FalseType == void: both member types are void, no select().
+template< typename TrueType >
+struct if_c< false , TrueType , void > {
+  enum { value = false };
+
+  using type       = void ;
+  using value_type = void ;
+};
+
+// True condition with TrueType == void: both member types are void, no select().
+template< typename FalseType >
+struct if_c< true , void , FalseType > {
+  enum { value = true };
+
+  using type       = void ;
+  using value_type = void ;
+};
+
+template < typename Cond , typename TrueType , typename FalseType >  // Cond supplies ::value
+struct if_ : if_c< Cond::value , TrueType , FalseType > {};
+
+//----------------------------------------------------------------------------
+
+// Exact-match test of T against the integer types listed below.  The
+// fixed-width typedefs are listed as well, which allows aliased types.
+template< typename T >
+struct is_integral : integral_constant< bool ,
+  (
+    // fixed-width aliases
+    std::is_same< T , int8_t   >::value || std::is_same< T , uint8_t  >::value ||
+    std::is_same< T , int16_t  >::value || std::is_same< T , uint16_t >::value ||
+    std::is_same< T , int32_t  >::value || std::is_same< T , uint32_t >::value ||
+    std::is_same< T , int64_t  >::value || std::is_same< T , uint64_t >::value ||
+
+    // char and the signed built-in types
+    std::is_same< T , char                   >::value ||
+    std::is_same< T , short int              >::value ||
+    std::is_same< T , int                    >::value ||
+    std::is_same< T , long int               >::value ||
+    std::is_same< T , long long int          >::value ||
+    // unsigned built-in types
+    std::is_same< T , unsigned char          >::value ||
+    std::is_same< T , unsigned short int     >::value ||
+    std::is_same< T , unsigned int           >::value ||
+    std::is_same< T , unsigned long int      >::value ||
+    std::is_same< T , unsigned long long int >::value
+  )>
+{};
+//----------------------------------------------------------------------------
+
+// Trait flagging label types: character pointers, character arrays and
+// std::string are labels, every other type is not.
+template< typename Candidate >
+struct is_label : false_type {};
+
+// Pointers to char
+template<>
+struct is_label< char * > : true_type {};
+template<>
+struct is_label< const char * > : true_type {};
+
+// Character arrays of any extent
+template< int Extent >
+struct is_label< char[Extent] > : true_type {};
+template< int Extent >
+struct is_label< const char[Extent] > : true_type {};
+
+// std::string, with or without const
+template<>
+struct is_label< std::string > : true_type {};
+template<>
+struct is_label< const std::string > : true_type {};
+
+// These 'constexpr' functions can be used as
+// both regular functions and meta-functions.
+
+/**\brief  True when N == 2^k for some integral 'k' ( N == 0 yields false ) */
+KOKKOS_INLINE_FUNCTION
+constexpr bool is_integral_power_of_two( const size_t N )
+{ return ( N != 0 ) && ( ( N & ( N - 1 ) ) == 0 ); }
+
+/**\brief  Exponent 'k' of N = 2^k; N is not checked and is assumed to be a power of two. */
+KOKKOS_INLINE_FUNCTION
+constexpr unsigned integral_power_of_two_assume_valid( const size_t N )
+{ return N != 1 ? 1 + integral_power_of_two_assume_valid( N >> 1 ) : 0 ; }
+
+/**\brief  Exponent 'k' such that N = 2^k when one exists,
+ *         otherwise ~0u.
+ */
+KOKKOS_INLINE_FUNCTION
+constexpr unsigned integral_power_of_two( const size_t N )
+{ return ! is_integral_power_of_two(N) ? ~0u : integral_power_of_two_assume_valid(N) ; }
+
+//----------------------------------------------------------------------------
+
+// Compile-time test: N is a power of two when it is nonzero and N & (N-1) is zero.
+template < size_t N >
+struct is_power_of_two {
+  enum type { value = ( N != 0 ) && ( ( N & ( N - 1 ) ) == 0 ) };
+};
+
+// power_of_two<N>::value is the exponent k with N == 2^k; the general template is only declared.
+template < size_t N , bool IsPowerOfTwo = is_power_of_two<N>::value >
+struct power_of_two ;
+
+template < size_t N >  // recursive step: one more than the exponent of N/2
+struct power_of_two< N , true > {
+  enum type { value = power_of_two< ( N >> 1 ) , true >::value + 1 };
+};
+
+template <>            // anchor: 1 == 2^0
+struct power_of_two< 1 , true >
+{
+  enum type { value = 0 };
+};
+
+template <>            // anchor: 2 == 2^1
+struct power_of_two< 2 , true >
+{
+  enum type { value = 1 };
+};
+
+/** \brief  Exponent 'k' with N == 2^k when N is a power of two,
+ *          otherwise ~0u.
+ */
+static KOKKOS_FORCEINLINE_FUNCTION
+unsigned power_of_two_if_valid( const unsigned N )
+{
+  const bool single_bit = ( N != 0 ) && ( ( N & ( N - 1 ) ) == 0 );
+  if ( ! single_bit ) { return ~0u ; }
+
+#if defined( __CUDA_ARCH__ ) && defined( KOKKOS_HAVE_CUDA )
+  return __ffs(N) - 1 ;           // ffs counts bit positions from 1
+#elif defined( __GNUC__ ) || defined( __GNUG__ )
+  return __builtin_ffs(N) - 1 ;   // ffs counts bit positions from 1
+#elif defined( __INTEL_COMPILER )
+  return _bit_scan_forward(N);
+#else
+  unsigned shift = 0 ;            // portable fallback: walk up to the set bit
+  for ( unsigned mask = 1 ; ( N & mask ) == 0 ; mask <<= 1 ) { ++shift ; }
+  return shift ;
+#endif
+}
+
+//----------------------------------------------------------------------------
+
+// Nonzero 'v': value is a compile-time constant; the constructor argument is ignored.
+template< typename T , T v , bool NonZero = ( v != T(0) ) >
+struct integral_nonzero_constant {
+  using value_type = T ;
+  using type       = integral_nonzero_constant< T , v > ;
+  // An enum, because declaring 'static const T value = v' instead causes
+  // an unresolved linker symbol in debug.
+  enum { value = T(v) };
+  KOKKOS_INLINE_FUNCTION integral_nonzero_constant( const T & ) {}
+};
+
+// Zero constant: the value is a run-time member taken from the constructor argument.
+template< typename T , T zero >
+struct integral_nonzero_constant< T , zero , false > {
+  using value_type = T ;
+  using type       = integral_nonzero_constant< T , 0 > ;
+  const T value ;
+  KOKKOS_INLINE_FUNCTION integral_nonzero_constant( const T & init ) : value( init ) {}
+};
+
+//----------------------------------------------------------------------------
+
+template < class C >  // general case: C is not an integral_constant
+struct is_integral_constant : false_ {
+  enum { integral_value = 0 };
+  using integral_type = void ;
+};
+
+// Matches integral_constant<T,v> and exposes its value type and value.
+template < typename T , T v >
+struct is_integral_constant< integral_constant< T , v > > : true_ {
+  enum { integral_value = v };
+  using integral_type = T ;
+};
+
+} // namespace Impl
+} // namespace Kokkos
+
+//----------------------------------------------------------------------------
+//----------------------------------------------------------------------------
+
+#endif /* #ifndef KOKKOSTRAITS_HPP */
+
diff --git a/lib/kokkos/core/src/impl/Kokkos_ViewDefault.hpp b/lib/kokkos/core/src/impl/Kokkos_ViewDefault.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..94c8e13c1d445953fabc852aaece3fa8d07fa5eb
--- /dev/null
+++ b/lib/kokkos/core/src/impl/Kokkos_ViewDefault.hpp
@@ -0,0 +1,886 @@
+/*
+//@HEADER
+// ************************************************************************
+// 
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+// 
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+// 
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact  H. Carter Edwards (hcedwar@sandia.gov)
+// 
+// ************************************************************************
+//@HEADER
+*/
+
+#ifndef KOKKOS_VIEWDEFAULT_HPP
+#define KOKKOS_VIEWDEFAULT_HPP
+
+//----------------------------------------------------------------------------
+//----------------------------------------------------------------------------
+
+namespace Kokkos {
+namespace Impl {
+
+template<>
+struct ViewAssignment< ViewDefault , ViewDefault , void >
+{
+  typedef ViewDefault Specialize ;
+  // Every member below takes a destination and a source View that both use 'Specialize'.
+  //------------------------------------
+  /** \brief  Compatible value and shape and LayoutLeft/Right to LayoutStride*/
+  // Copies the offset map, management and data handle of 'src' into 'dst'.
+  template< class DT , class DL , class DD , class DM ,
+            class ST , class SL , class SD , class SM >
+  KOKKOS_INLINE_FUNCTION
+  ViewAssignment(       View<DT,DL,DD,DM,Specialize> & dst ,
+                  const View<ST,SL,SD,SM,Specialize> & src ,
+                  const typename enable_if<(
+                    ViewAssignable< ViewTraits<DT,DL,DD,DM> ,
+                                    ViewTraits<ST,SL,SD,SM> >::value
+                    ||
+                    ( ViewAssignable< ViewTraits<DT,DL,DD,DM> ,
+                                      ViewTraits<ST,SL,SD,SM> >::assignable_value
+                      &&
+                      ShapeCompatible< typename ViewTraits<DT,DL,DD,DM>::shape_type ,
+                                       typename ViewTraits<ST,SL,SD,SM>::shape_type >::value
+                      &&
+                      is_same< typename ViewTraits<DT,DL,DD,DM>::array_layout,LayoutStride>::value
+                      && (is_same< typename ViewTraits<ST,SL,SD,SM>::array_layout,LayoutLeft>::value ||
+                          is_same< typename ViewTraits<ST,SL,SD,SM>::array_layout,LayoutRight>::value))
+                  )>::type * = 0 )
+  {
+    dst.m_offset_map.assign( src.m_offset_map );
+
+    dst.m_management = src.m_management ;
+
+    dst.m_ptr_on_device = ViewDataManagement< ViewTraits<DT,DL,DD,DM> >::create_handle( src.m_ptr_on_device, src.m_tracker );
+    // Only a managed destination shares the source's tracker; otherwise it is reset and marked unmanaged.
+    if( dst.is_managed )
+      dst.m_tracker = src.m_tracker ;
+    else {
+      dst.m_tracker = AllocationTracker();
+      dst.m_management.set_unmanaged();
+    }
+  }
+
+
+  /** \brief  Assign 1D Strided View to LayoutLeft or LayoutRight if stride[0]==1 */
+  // The stride is checked at run time: a first stride other than 1 calls Kokkos::abort.
+  template< class DT , class DL , class DD , class DM ,
+            class ST , class SD , class SM >
+  KOKKOS_INLINE_FUNCTION
+  ViewAssignment(       View<DT,DL,DD,DM,Specialize> & dst ,
+                  const View<ST,LayoutStride,SD,SM,Specialize> & src ,
+                  const typename enable_if<(
+                    (
+                      ViewAssignable< ViewTraits<DT,DL,DD,DM> ,
+                                    ViewTraits<ST,LayoutStride,SD,SM> >::value
+                      ||
+                      ( ViewAssignable< ViewTraits<DT,DL,DD,DM> ,
+                                      ViewTraits<ST,LayoutStride,SD,SM> >::assignable_value
+                        &&
+                        ShapeCompatible< typename ViewTraits<DT,DL,DD,DM>::shape_type ,
+                                       typename ViewTraits<ST,LayoutStride,SD,SM>::shape_type >::value
+                      )
+                     )
+                     &&
+                      (View<DT,DL,DD,DM,Specialize>::rank==1)
+                     && (is_same< typename ViewTraits<DT,DL,DD,DM>::array_layout,LayoutLeft>::value ||
+                          is_same< typename ViewTraits<DT,DL,DD,DM>::array_layout,LayoutRight>::value)
+                  )>::type * = 0 )
+  {
+    size_t strides[8];
+    src.stride(strides);
+    if(strides[0]!=1) {
+      Kokkos::abort("Trying to assign strided 1D View to LayoutRight or LayoutLeft which is not stride-1");
+    }
+    dst.m_offset_map.assign( src.dimension_0(), 0, 0, 0, 0, 0, 0, 0, 0 );
+
+    dst.m_management = src.m_management ;
+
+    dst.m_ptr_on_device = ViewDataManagement< ViewTraits<DT,DL,DD,DM> >::create_handle( src.m_ptr_on_device, src.m_tracker );
+    // Only a managed destination shares the source's tracker; otherwise it is reset and marked unmanaged.
+    if( dst.is_managed )
+      dst.m_tracker = src.m_tracker ;
+    else {
+      dst.m_tracker = AllocationTracker();
+      dst.m_management.set_unmanaged();
+    }
+  }
+
+  //------------------------------------
+  /** \brief  Deep copy data from compatible value type, layout, rank, and specialization.
+   *          Check the dimensions and allocation lengths at runtime.
+   */
+  template< class DT , class DL , class DD , class DM ,
+            class ST , class SL , class SD , class SM >
+  inline static
+  void deep_copy( const View<DT,DL,DD,DM,Specialize> & dst ,
+                  const View<ST,SL,SD,SM,Specialize> & src ,
+                  const typename Impl::enable_if<(
+                    Impl::is_same< typename ViewTraits<DT,DL,DD,DM>::value_type ,
+                                   typename ViewTraits<ST,SL,SD,SM>::non_const_value_type >::value
+                    &&
+                    Impl::is_same< typename ViewTraits<DT,DL,DD,DM>::array_layout ,
+                                   typename ViewTraits<ST,SL,SD,SM>::array_layout >::value
+                    &&
+                    ( unsigned(ViewTraits<DT,DL,DD,DM>::rank) == unsigned(ViewTraits<ST,SL,SD,SM>::rank) )
+                  )>::type * = 0 )
+  {
+    typedef typename ViewTraits<DT,DL,DD,DM>::memory_space dst_memory_space ;
+    typedef typename ViewTraits<ST,SL,SD,SM>::memory_space src_memory_space ;
+    // Nothing is copied when both views already refer to the same pointer.
+    if ( dst.ptr_on_device() != src.ptr_on_device() ) {
+
+      Impl::assert_shapes_are_equal( dst.m_offset_map , src.m_offset_map );
+      // The byte count is computed from the destination's offset map.
+      const size_t nbytes = dst.m_offset_map.scalar_size * dst.m_offset_map.capacity();
+
+      DeepCopy< dst_memory_space , src_memory_space >( dst.ptr_on_device() , src.ptr_on_device() , nbytes );
+    }
+  }
+};
+
+} /* namespace Impl */
+} /* namespace Kokkos */
+
+//----------------------------------------------------------------------------
+//----------------------------------------------------------------------------
+
+namespace Kokkos {
+namespace Impl {
+
+// Functor default-constructing an array of Views; constructing the functor runs it.
+template< class ExecSpace , class DT , class DL, class DD, class DM, class DS >
+struct ViewDefaultConstruct< ExecSpace , Kokkos::View<DT,DL,DD,DM,DS> , true > {
+  Kokkos::View<DT,DL,DD,DM,DS> * const m_ptr ;
+  // Placement-new a default-constructed View into entry 'slot' of the array.
+  KOKKOS_FORCEINLINE_FUNCTION
+  void operator()( const typename ExecSpace::size_type& slot ) const
+    { new( m_ptr + slot ) Kokkos::View<DT,DL,DD,DM,DS>(); }
+  // Visits indices [0,count) with parallel_for, then fences ExecSpace.
+  ViewDefaultConstruct( Kokkos::View<DT,DL,DD,DM,DS> * first , size_t count )
+    : m_ptr( first )
+    {
+      Kokkos::RangePolicy< ExecSpace > all_slots( 0 , count );
+      parallel_for( all_slots , *this );
+      ExecSpace::fence();
+    }
+};
+
+template< class SrcDataType , class SrcArg1Type , class SrcArg2Type , class SrcArg3Type
+        , class SubArg0_type , class SubArg1_type , class SubArg2_type , class SubArg3_type
+        , class SubArg4_type , class SubArg5_type , class SubArg6_type , class SubArg7_type
+        >
+struct ViewSubview< View< SrcDataType , SrcArg1Type , SrcArg2Type , SrcArg3Type , Impl::ViewDefault >
+                  , SubArg0_type , SubArg1_type , SubArg2_type , SubArg3_type
+                  , SubArg4_type , SubArg5_type , SubArg6_type , SubArg7_type >
+{
+private:
+  // Compile-time deduction of the subview type; only the final 'type' is public.
+  typedef View< SrcDataType , SrcArg1Type , SrcArg2Type , SrcArg3Type , Impl::ViewDefault >  SrcViewType ;
+  // Vn is 1 when subview argument n is 'void'.
+  enum { V0 = Impl::is_same< SubArg0_type , void >::value ? 1 : 0 };
+  enum { V1 = Impl::is_same< SubArg1_type , void >::value ? 1 : 0 };
+  enum { V2 = Impl::is_same< SubArg2_type , void >::value ? 1 : 0 };
+  enum { V3 = Impl::is_same< SubArg3_type , void >::value ? 1 : 0 };
+  enum { V4 = Impl::is_same< SubArg4_type , void >::value ? 1 : 0 };
+  enum { V5 = Impl::is_same< SubArg5_type , void >::value ? 1 : 0 };
+  enum { V6 = Impl::is_same< SubArg6_type , void >::value ? 1 : 0 };
+  enum { V7 = Impl::is_same< SubArg7_type , void >::value ? 1 : 0 };
+
+  // The source view rank must be equal to the input argument rank
+  // Once a void argument is encountered all subsequent arguments must be void.
+  enum { InputRank =
+    Impl::StaticAssert<( SrcViewType::rank ==
+                         ( V0 ? 0 : (
+                           V1 ? 1 : (
+                           V2 ? 2 : (
+                           V3 ? 3 : (
+                           V4 ? 4 : (
+                           V5 ? 5 : (
+                           V6 ? 6 : (
+                           V7 ? 7 : 8 ))))))) ))
+                       &&
+                       ( SrcViewType::rank ==
+                         ( 8 - ( V0 + V1 + V2 + V3 + V4 + V5 + V6 + V7 ) ) )
+    >::value ? SrcViewType::rank : 0 };
+  // Rn is 1 when ViewOffsetRange reports subview argument n as a range.
+  enum { R0 = Impl::ViewOffsetRange< SubArg0_type >::is_range ? 1 : 0 };
+  enum { R1 = Impl::ViewOffsetRange< SubArg1_type >::is_range ? 1 : 0 };
+  enum { R2 = Impl::ViewOffsetRange< SubArg2_type >::is_range ? 1 : 0 };
+  enum { R3 = Impl::ViewOffsetRange< SubArg3_type >::is_range ? 1 : 0 };
+  enum { R4 = Impl::ViewOffsetRange< SubArg4_type >::is_range ? 1 : 0 };
+  enum { R5 = Impl::ViewOffsetRange< SubArg5_type >::is_range ? 1 : 0 };
+  enum { R6 = Impl::ViewOffsetRange< SubArg6_type >::is_range ? 1 : 0 };
+  enum { R7 = Impl::ViewOffsetRange< SubArg7_type >::is_range ? 1 : 0 };
+  // The output rank is the number of range arguments.
+  enum { OutputRank = unsigned(R0) + unsigned(R1) + unsigned(R2) + unsigned(R3)
+                    + unsigned(R4) + unsigned(R5) + unsigned(R6) + unsigned(R7) };
+
+  // Reverse: the range flag of the last input dimension ( 0 when InputRank is 0 )
+  enum { R0_rev = 0 == InputRank ? 0u : (
+                  1 == InputRank ? unsigned(R0) : (
+                  2 == InputRank ? unsigned(R1) : (
+                  3 == InputRank ? unsigned(R2) : (
+                  4 == InputRank ? unsigned(R3) : (
+                  5 == InputRank ? unsigned(R4) : (
+                  6 == InputRank ? unsigned(R5) : (
+                  7 == InputRank ? unsigned(R6) : unsigned(R7) ))))))) };
+
+  typedef typename SrcViewType::array_layout  SrcViewLayout ;
+
+  // Choose array layout, attempting to preserve original layout if at all possible.
+  typedef typename Impl::if_c<
+     ( // Same Layout IF
+       // OutputRank 0
+       ( OutputRank == 0 )
+       ||
+       // OutputRank 1 or 2, InputLayout Left, Interval 0
+       // because single stride one or second index has a stride.
+       ( OutputRank <= 2 && R0 && Impl::is_same<SrcViewLayout,LayoutLeft>::value )
+       ||
+       // OutputRank 1 or 2, InputLayout Right, Interval [InputRank-1]
+       // because single stride one or second index has a stride.
+       ( OutputRank <= 2 && R0_rev && Impl::is_same<SrcViewLayout,LayoutRight>::value )
+     ), SrcViewLayout , Kokkos::LayoutStride >::type OutputViewLayout ;
+
+  // Choose data type as a purely dynamic rank array to accommodate a runtime range.
+  typedef typename Impl::if_c< OutputRank == 0 , typename SrcViewType::value_type ,
+          typename Impl::if_c< OutputRank == 1 , typename SrcViewType::value_type *,
+          typename Impl::if_c< OutputRank == 2 , typename SrcViewType::value_type **,
+          typename Impl::if_c< OutputRank == 3 , typename SrcViewType::value_type ***,
+          typename Impl::if_c< OutputRank == 4 , typename SrcViewType::value_type ****,
+          typename Impl::if_c< OutputRank == 5 , typename SrcViewType::value_type *****,
+          typename Impl::if_c< OutputRank == 6 , typename SrcViewType::value_type ******,
+          typename Impl::if_c< OutputRank == 7 , typename SrcViewType::value_type *******,
+                                                 typename SrcViewType::value_type ********
+  >::type >::type >::type >::type >::type >::type >::type >::type  OutputData ;
+
+  // Choose space.
+  // If the source view's template arg1 or arg2 is a space then use it,
+  // otherwise use the source view's device_type.
+
+  typedef typename Impl::if_c< Impl::is_space< SrcArg1Type >::value , SrcArg1Type ,
+          typename Impl::if_c< Impl::is_space< SrcArg2Type >::value , SrcArg2Type , typename SrcViewType::device_type
+  >::type >::type OutputSpace ;
+
+public:
+
+  // If keeping the layout then match non-data type arguments
+  // else keep execution space and memory traits.
+  typedef typename
+    Impl::if_c< Impl::is_same< SrcViewLayout , OutputViewLayout >::value
+              , Kokkos::View< OutputData , SrcArg1Type , SrcArg2Type , SrcArg3Type , Impl::ViewDefault >
+              , Kokkos::View< OutputData , OutputViewLayout , OutputSpace
+                            , typename SrcViewType::memory_traits
+                            , Impl::ViewDefault >
+              >::type  type ;
+};
+
+} /* namespace Impl */
+} /* namespace Kokkos */
+
+//----------------------------------------------------------------------------
+//----------------------------------------------------------------------------
+
+namespace Kokkos {
+
+// Subview constructor for a rank 8 source view: one subview argument per dimension.
+template< class DstDataType , class DstArg1Type , class DstArg2Type , class DstArg3Type >
+template< class SrcDataType , class SrcArg1Type , class SrcArg2Type , class SrcArg3Type
+        , class SubArg0_type , class SubArg1_type , class SubArg2_type , class SubArg3_type
+        , class SubArg4_type , class SubArg5_type , class SubArg6_type , class SubArg7_type
+        >
+KOKKOS_INLINE_FUNCTION
+View< DstDataType , DstArg1Type , DstArg2Type , DstArg3Type , Impl::ViewDefault >::
+View( const View< SrcDataType , SrcArg1Type , SrcArg2Type , SrcArg3Type , Impl::ViewDefault > & src
+    , const SubArg0_type & arg0
+    , const SubArg1_type & arg1
+    , const SubArg2_type & arg2
+    , const SubArg3_type & arg3
+    , const SubArg4_type & arg4
+    , const SubArg5_type & arg5
+    , const SubArg6_type & arg6
+    , const SubArg7_type & arg7
+    )
+  : m_ptr_on_device( (typename traits::value_type*) NULL)
+  , m_offset_map()
+  , m_management()
+  , m_tracker()
+{
+  // Compile-time guard: this View type must be exactly the subview type
+  // deduced from the source view type and the subview argument types.
+  typedef typename
+    Impl::ViewSubview< View< SrcDataType , SrcArg1Type , SrcArg2Type , SrcArg3Type , Impl::ViewDefault >
+                     , SubArg0_type , SubArg1_type , SubArg2_type , SubArg3_type
+                     , SubArg4_type , SubArg5_type , SubArg6_type , SubArg7_type
+                     >::type DeducedSubview ;
+
+  enum { is_a_valid_subview_constructor =
+    Impl::StaticAssert<
+      Impl::is_same< View , DeducedSubview >::value
+    >::value
+  };
+
+  if ( ! is_a_valid_subview_constructor ) return ;
+
+  // One ViewOffsetRange per argument: supplies dimension() and begin() below.
+  typedef Impl::ViewOffsetRange< SubArg0_type > Sel0 ;
+  typedef Impl::ViewOffsetRange< SubArg1_type > Sel1 ;
+  typedef Impl::ViewOffsetRange< SubArg2_type > Sel2 ;
+  typedef Impl::ViewOffsetRange< SubArg3_type > Sel3 ;
+  typedef Impl::ViewOffsetRange< SubArg4_type > Sel4 ;
+  typedef Impl::ViewOffsetRange< SubArg5_type > Sel5 ;
+  typedef Impl::ViewOffsetRange< SubArg6_type > Sel6 ;
+  typedef Impl::ViewOffsetRange< SubArg7_type > Sel7 ;
+
+  // 'assign_subview' returns whether the subview offset_map
+  // introduces noncontiguity in the view.
+  const bool noncontiguous =
+    m_offset_map.assign_subview( src.m_offset_map
+                               , Sel0::dimension( src.m_offset_map.N0 , arg0 )
+                               , Sel1::dimension( src.m_offset_map.N1 , arg1 )
+                               , Sel2::dimension( src.m_offset_map.N2 , arg2 )
+                               , Sel3::dimension( src.m_offset_map.N3 , arg3 )
+                               , Sel4::dimension( src.m_offset_map.N4 , arg4 )
+                               , Sel5::dimension( src.m_offset_map.N5 , arg5 )
+                               , Sel6::dimension( src.m_offset_map.N6 , arg6 )
+                               , Sel7::dimension( src.m_offset_map.N7 , arg7 )
+                               );
+
+  // Zero capacity: pointer, management and tracker stay as initialized above.
+  if ( ! m_offset_map.capacity() ) return ;
+
+  // Take over the source's management, flagging noncontiguity when introduced.
+  m_management = src.m_management ;
+  if ( noncontiguous ) m_management.set_noncontiguous();
+
+  // Offset the source pointer by the source entry given by every argument's begin().
+  m_ptr_on_device = src.m_ptr_on_device +
+                    src.m_offset_map( Sel0::begin( arg0 )
+                                    , Sel1::begin( arg1 )
+                                    , Sel2::begin( arg2 )
+                                    , Sel3::begin( arg3 )
+                                    , Sel4::begin( arg4 )
+                                    , Sel5::begin( arg5 )
+                                    , Sel6::begin( arg6 )
+                                    , Sel7::begin( arg7 ) );
+  m_tracker = src.m_tracker ;
+}
+
+// Subview constructor for a rank 7 source view: one subview argument per dimension.
+template< class DstDataType , class DstArg1Type , class DstArg2Type , class DstArg3Type >
+template< class SrcDataType , class SrcArg1Type , class SrcArg2Type , class SrcArg3Type
+        , class SubArg0_type , class SubArg1_type , class SubArg2_type , class SubArg3_type
+        , class SubArg4_type , class SubArg5_type , class SubArg6_type
+        >
+KOKKOS_INLINE_FUNCTION
+View< DstDataType , DstArg1Type , DstArg2Type , DstArg3Type , Impl::ViewDefault >::
+View( const View< SrcDataType , SrcArg1Type , SrcArg2Type , SrcArg3Type , Impl::ViewDefault > & src
+    , const SubArg0_type & arg0
+    , const SubArg1_type & arg1
+    , const SubArg2_type & arg2
+    , const SubArg3_type & arg3
+    , const SubArg4_type & arg4
+    , const SubArg5_type & arg5
+    , const SubArg6_type & arg6
+    )
+  : m_ptr_on_device( (typename traits::value_type*) NULL)
+  , m_offset_map()
+  , m_management()
+  , m_tracker()
+{
+  // Compile-time guard: this View type must be exactly the subview type
+  // deduced from the source view type and the subview argument types.
+  typedef typename
+    Impl::ViewSubview< View< SrcDataType , SrcArg1Type , SrcArg2Type , SrcArg3Type , Impl::ViewDefault >
+                     , SubArg0_type , SubArg1_type , SubArg2_type , SubArg3_type
+                     , SubArg4_type , SubArg5_type , SubArg6_type , void
+                     >::type DeducedSubview ;
+
+  enum { is_a_valid_subview_constructor =
+    Impl::StaticAssert<
+      Impl::is_same< View , DeducedSubview >::value
+    >::value
+  };
+
+  if ( ! is_a_valid_subview_constructor ) return ;
+
+  // One ViewOffsetRange per argument: supplies dimension() and begin() below.
+  typedef Impl::ViewOffsetRange< SubArg0_type > Sel0 ;
+  typedef Impl::ViewOffsetRange< SubArg1_type > Sel1 ;
+  typedef Impl::ViewOffsetRange< SubArg2_type > Sel2 ;
+  typedef Impl::ViewOffsetRange< SubArg3_type > Sel3 ;
+  typedef Impl::ViewOffsetRange< SubArg4_type > Sel4 ;
+  typedef Impl::ViewOffsetRange< SubArg5_type > Sel5 ;
+  typedef Impl::ViewOffsetRange< SubArg6_type > Sel6 ;
+
+  // 'assign_subview' returns whether the subview offset_map
+  // introduces noncontiguity in the view.
+  const bool noncontiguous =
+    m_offset_map.assign_subview( src.m_offset_map
+                               , Sel0::dimension( src.m_offset_map.N0 , arg0 )
+                               , Sel1::dimension( src.m_offset_map.N1 , arg1 )
+                               , Sel2::dimension( src.m_offset_map.N2 , arg2 )
+                               , Sel3::dimension( src.m_offset_map.N3 , arg3 )
+                               , Sel4::dimension( src.m_offset_map.N4 , arg4 )
+                               , Sel5::dimension( src.m_offset_map.N5 , arg5 )
+                               , Sel6::dimension( src.m_offset_map.N6 , arg6 )
+                               , 0
+                               );
+
+  // Zero capacity: pointer, management and tracker stay as initialized above.
+  if ( ! m_offset_map.capacity() ) return ;
+
+  // Take over the source's management, flagging noncontiguity when introduced.
+  m_management = src.m_management ;
+  if ( noncontiguous ) m_management.set_noncontiguous();
+
+  // Offset the source pointer by the source entry given by every argument's begin().
+  m_ptr_on_device = src.m_ptr_on_device +
+                    src.m_offset_map( Sel0::begin( arg0 )
+                                    , Sel1::begin( arg1 )
+                                    , Sel2::begin( arg2 )
+                                    , Sel3::begin( arg3 )
+                                    , Sel4::begin( arg4 )
+                                    , Sel5::begin( arg5 )
+                                    , Sel6::begin( arg6 )
+                                    );
+  m_tracker = src.m_tracker ;
+}
+
+// Subview constructor for a rank 6 source view: one subview argument per dimension.
+template< class DstDataType , class DstArg1Type , class DstArg2Type , class DstArg3Type >
+template< class SrcDataType , class SrcArg1Type , class SrcArg2Type , class SrcArg3Type
+        , class SubArg0_type , class SubArg1_type , class SubArg2_type , class SubArg3_type
+        , class SubArg4_type , class SubArg5_type
+        >
+KOKKOS_INLINE_FUNCTION
+View< DstDataType , DstArg1Type , DstArg2Type , DstArg3Type , Impl::ViewDefault >::
+View( const View< SrcDataType , SrcArg1Type , SrcArg2Type , SrcArg3Type , Impl::ViewDefault > & src
+    , const SubArg0_type & arg0
+    , const SubArg1_type & arg1
+    , const SubArg2_type & arg2
+    , const SubArg3_type & arg3
+    , const SubArg4_type & arg4
+    , const SubArg5_type & arg5
+    )
+  : m_ptr_on_device( (typename traits::value_type*) NULL)
+  , m_offset_map()
+  , m_management()
+  , m_tracker()
+{
+  // Compile-time guard: this View type must be exactly the subview type
+  // deduced from the source view type and the subview argument types.
+  typedef typename
+    Impl::ViewSubview< View< SrcDataType , SrcArg1Type , SrcArg2Type , SrcArg3Type , Impl::ViewDefault >
+                     , SubArg0_type , SubArg1_type , SubArg2_type , SubArg3_type
+                     , SubArg4_type , SubArg5_type , void , void
+                     >::type DeducedSubview ;
+
+  enum { is_a_valid_subview_constructor =
+    Impl::StaticAssert<
+      Impl::is_same< View , DeducedSubview >::value
+    >::value
+  };
+
+  if ( ! is_a_valid_subview_constructor ) return ;
+
+  // One ViewOffsetRange per argument: supplies dimension() and begin() below.
+  typedef Impl::ViewOffsetRange< SubArg0_type > Sel0 ;
+  typedef Impl::ViewOffsetRange< SubArg1_type > Sel1 ;
+  typedef Impl::ViewOffsetRange< SubArg2_type > Sel2 ;
+  typedef Impl::ViewOffsetRange< SubArg3_type > Sel3 ;
+  typedef Impl::ViewOffsetRange< SubArg4_type > Sel4 ;
+  typedef Impl::ViewOffsetRange< SubArg5_type > Sel5 ;
+
+  // 'assign_subview' returns whether the subview offset_map
+  // introduces noncontiguity in the view.
+  const bool noncontiguous =
+    m_offset_map.assign_subview( src.m_offset_map
+                               , Sel0::dimension( src.m_offset_map.N0 , arg0 )
+                               , Sel1::dimension( src.m_offset_map.N1 , arg1 )
+                               , Sel2::dimension( src.m_offset_map.N2 , arg2 )
+                               , Sel3::dimension( src.m_offset_map.N3 , arg3 )
+                               , Sel4::dimension( src.m_offset_map.N4 , arg4 )
+                               , Sel5::dimension( src.m_offset_map.N5 , arg5 )
+                               , 0
+                               , 0
+                               );
+
+  // Zero capacity: pointer, management and tracker stay as initialized above.
+  if ( ! m_offset_map.capacity() ) return ;
+
+  // Take over the source's management, flagging noncontiguity when introduced.
+  m_management = src.m_management ;
+  if ( noncontiguous ) m_management.set_noncontiguous();
+
+  // Offset the source pointer by the source entry given by every argument's begin().
+  m_ptr_on_device = src.m_ptr_on_device +
+                    src.m_offset_map( Sel0::begin( arg0 )
+                                    , Sel1::begin( arg1 )
+                                    , Sel2::begin( arg2 )
+                                    , Sel3::begin( arg3 )
+                                    , Sel4::begin( arg4 )
+                                    , Sel5::begin( arg5 )
+                                    );
+  m_tracker = src.m_tracker ;
+}
+
+// Subview constructor: build this view as a subview of a Rank 5 source view
+template< class DstDataType , class DstArg1Type , class DstArg2Type , class DstArg3Type >
+template< class SrcDataType , class SrcArg1Type , class SrcArg2Type , class SrcArg3Type
+        , class SubArg0_type , class SubArg1_type , class SubArg2_type , class SubArg3_type
+        , class SubArg4_type
+        >
+KOKKOS_INLINE_FUNCTION
+View< DstDataType , DstArg1Type , DstArg2Type , DstArg3Type , Impl::ViewDefault >::
+View( const View< SrcDataType , SrcArg1Type , SrcArg2Type , SrcArg3Type , Impl::ViewDefault > & src
+    , const SubArg0_type & arg0
+    , const SubArg1_type & arg1
+    , const SubArg2_type & arg2
+    , const SubArg3_type & arg3
+    , const SubArg4_type & arg4
+    )
+  : m_ptr_on_device( static_cast< typename traits::value_type * >( NULL ) )
+  , m_offset_map()
+  , m_management()
+  , m_tracker()
+{
+  // The destination type must be exactly the subview type deduced
+  // from the source view type and the subview argument types;
+  // this is checked through the StaticAssert below.
+
+  typedef View< SrcDataType , SrcArg1Type , SrcArg2Type , SrcArg3Type , Impl::ViewDefault > SrcView ;
+  typedef Impl::ViewSubview< SrcView
+                           , SubArg0_type , SubArg1_type , SubArg2_type , SubArg3_type
+                           , SubArg4_type , void , void , void > DeducedSubview ;
+
+  enum { is_deduced_subview_type =
+    Impl::StaticAssert<
+      Impl::is_same< View , typename DeducedSubview::type >::value
+    >::value
+  };
+
+  if ( is_deduced_subview_type ) {
+
+    typedef Impl::ViewOffsetRange< SubArg0_type > Range0 ;
+    typedef Impl::ViewOffsetRange< SubArg1_type > Range1 ;
+    typedef Impl::ViewOffsetRange< SubArg2_type > Range2 ;
+    typedef Impl::ViewOffsetRange< SubArg3_type > Range3 ;
+    typedef Impl::ViewOffsetRange< SubArg4_type > Range4 ;
+
+    // Shape the offset map from the per-argument dimensions;
+    // 'assign_subview' reports whether the result is noncontiguous.
+    const bool subview_noncontiguous =
+      m_offset_map.assign_subview( src.m_offset_map
+                                 , Range0::dimension( src.m_offset_map.N0 , arg0 )
+                                 , Range1::dimension( src.m_offset_map.N1 , arg1 )
+                                 , Range2::dimension( src.m_offset_map.N2 , arg2 )
+                                 , Range3::dimension( src.m_offset_map.N3 , arg3 )
+                                 , Range4::dimension( src.m_offset_map.N4 , arg4 )
+                                 , 0
+                                 , 0
+                                 , 0
+                                 );
+
+    if ( 0 != m_offset_map.capacity() ) {
+
+      m_management = src.m_management ;
+
+      if ( subview_noncontiguous ) { m_management.set_noncontiguous(); }
+
+      m_ptr_on_device = src.m_ptr_on_device +
+                        src.m_offset_map( Range0::begin( arg0 )
+                                        , Range1::begin( arg1 )
+                                        , Range2::begin( arg2 )
+                                        , Range3::begin( arg3 )
+                                        , Range4::begin( arg4 )
+                                        );
+      m_tracker = src.m_tracker ;
+    }
+  }
+}
+
+// Subview constructor: build this view as a subview of a Rank 4 source view
+template< class DstDataType , class DstArg1Type , class DstArg2Type , class DstArg3Type >
+template< class SrcDataType , class SrcArg1Type , class SrcArg2Type , class SrcArg3Type
+        , class SubArg0_type , class SubArg1_type , class SubArg2_type , class SubArg3_type
+        >
+KOKKOS_INLINE_FUNCTION
+View< DstDataType , DstArg1Type , DstArg2Type , DstArg3Type , Impl::ViewDefault >::
+View( const View< SrcDataType , SrcArg1Type , SrcArg2Type , SrcArg3Type , Impl::ViewDefault > & src
+    , const SubArg0_type & arg0
+    , const SubArg1_type & arg1
+    , const SubArg2_type & arg2
+    , const SubArg3_type & arg3
+    )
+  : m_ptr_on_device( static_cast< typename traits::value_type * >( NULL ) )
+  , m_offset_map()
+  , m_management()
+  , m_tracker()
+{
+  // The destination type must be exactly the subview type deduced
+  // from the source view type and the subview argument types;
+  // this is checked through the StaticAssert below.
+
+  typedef View< SrcDataType , SrcArg1Type , SrcArg2Type , SrcArg3Type , Impl::ViewDefault > SrcView ;
+  typedef Impl::ViewSubview< SrcView
+                           , SubArg0_type , SubArg1_type , SubArg2_type , SubArg3_type
+                           , void , void , void , void > DeducedSubview ;
+
+  enum { is_deduced_subview_type =
+    Impl::StaticAssert<
+      Impl::is_same< View , typename DeducedSubview::type >::value
+    >::value
+  };
+
+  if ( is_deduced_subview_type ) {
+
+    typedef Impl::ViewOffsetRange< SubArg0_type > Range0 ;
+    typedef Impl::ViewOffsetRange< SubArg1_type > Range1 ;
+    typedef Impl::ViewOffsetRange< SubArg2_type > Range2 ;
+    typedef Impl::ViewOffsetRange< SubArg3_type > Range3 ;
+
+    // Shape the offset map from the per-argument dimensions;
+    // 'assign_subview' reports whether the result is noncontiguous.
+    const bool subview_noncontiguous =
+      m_offset_map.assign_subview( src.m_offset_map
+                                 , Range0::dimension( src.m_offset_map.N0 , arg0 )
+                                 , Range1::dimension( src.m_offset_map.N1 , arg1 )
+                                 , Range2::dimension( src.m_offset_map.N2 , arg2 )
+                                 , Range3::dimension( src.m_offset_map.N3 , arg3 )
+                                 , 0
+                                 , 0
+                                 , 0
+                                 , 0
+                                 );
+
+    if ( 0 != m_offset_map.capacity() ) {
+
+      m_management = src.m_management ;
+
+      if ( subview_noncontiguous ) { m_management.set_noncontiguous(); }
+
+      m_ptr_on_device = src.m_ptr_on_device +
+                        src.m_offset_map( Range0::begin( arg0 )
+                                        , Range1::begin( arg1 )
+                                        , Range2::begin( arg2 )
+                                        , Range3::begin( arg3 )
+                                        );
+      m_tracker = src.m_tracker ;
+    }
+  }
+}
+
+// Subview constructor: build this view as a subview of a Rank 3 source view
+template< class DstDataType , class DstArg1Type , class DstArg2Type , class DstArg3Type >
+template< class SrcDataType , class SrcArg1Type , class SrcArg2Type , class SrcArg3Type
+        , class SubArg0_type , class SubArg1_type , class SubArg2_type
+        >
+KOKKOS_INLINE_FUNCTION
+View< DstDataType , DstArg1Type , DstArg2Type , DstArg3Type , Impl::ViewDefault >::
+View( const View< SrcDataType , SrcArg1Type , SrcArg2Type , SrcArg3Type , Impl::ViewDefault > & src
+    , const SubArg0_type & arg0
+    , const SubArg1_type & arg1
+    , const SubArg2_type & arg2
+    )
+  : m_ptr_on_device( static_cast< typename traits::value_type * >( NULL ) )
+  , m_offset_map()
+  , m_management()
+  , m_tracker()
+{
+  // The destination type must be exactly the subview type deduced
+  // from the source view type and the subview argument types;
+  // this is checked through the StaticAssert below.
+
+  typedef View< SrcDataType , SrcArg1Type , SrcArg2Type , SrcArg3Type , Impl::ViewDefault > SrcView ;
+  typedef Impl::ViewSubview< SrcView , SubArg0_type , SubArg1_type , SubArg2_type
+                           , void , void , void , void , void > DeducedSubview ;
+
+  enum { is_deduced_subview_type =
+    Impl::StaticAssert<
+      Impl::is_same< View , typename DeducedSubview::type >::value
+    >::value
+  };
+
+  if ( is_deduced_subview_type ) {
+
+    typedef Impl::ViewOffsetRange< SubArg0_type > Range0 ;
+    typedef Impl::ViewOffsetRange< SubArg1_type > Range1 ;
+    typedef Impl::ViewOffsetRange< SubArg2_type > Range2 ;
+
+    // Shape the offset map from the per-argument dimensions;
+    // 'assign_subview' reports whether the result is noncontiguous.
+    const bool subview_noncontiguous =
+      m_offset_map.assign_subview( src.m_offset_map
+                                 , Range0::dimension( src.m_offset_map.N0 , arg0 )
+                                 , Range1::dimension( src.m_offset_map.N1 , arg1 )
+                                 , Range2::dimension( src.m_offset_map.N2 , arg2 )
+                                 , 0 , 0 , 0 , 0 , 0 );
+
+    if ( 0 != m_offset_map.capacity() ) {
+
+      m_management = src.m_management ;
+
+      if ( subview_noncontiguous ) { m_management.set_noncontiguous(); }
+
+      m_ptr_on_device = src.m_ptr_on_device +
+                        src.m_offset_map( Range0::begin( arg0 )
+                                        , Range1::begin( arg1 )
+                                        , Range2::begin( arg2 )
+                                        );
+      m_tracker = src.m_tracker ;
+    }
+  }
+}
+
+// Subview constructor: build this view as a subview of a Rank 2 source view
+template< class DstDataType , class DstArg1Type , class DstArg2Type , class DstArg3Type >
+template< class SrcDataType , class SrcArg1Type , class SrcArg2Type , class SrcArg3Type
+        , class SubArg0_type , class SubArg1_type
+        >
+KOKKOS_INLINE_FUNCTION
+View< DstDataType , DstArg1Type , DstArg2Type , DstArg3Type , Impl::ViewDefault >::
+View( const View< SrcDataType , SrcArg1Type , SrcArg2Type , SrcArg3Type , Impl::ViewDefault > & src
+    , const SubArg0_type & arg0
+    , const SubArg1_type & arg1
+    )
+  : m_ptr_on_device( static_cast< typename traits::value_type * >( NULL ) )
+  , m_offset_map()
+  , m_management()
+  , m_tracker()
+{
+  // The destination type must be exactly the subview type deduced
+  // from the source view type and the subview argument types;
+  // this is checked through the StaticAssert below.
+
+  typedef View< SrcDataType , SrcArg1Type , SrcArg2Type , SrcArg3Type , Impl::ViewDefault > SrcView ;
+  typedef Impl::ViewSubview< SrcView , SubArg0_type , SubArg1_type
+                           , void , void , void , void , void , void > DeducedSubview ;
+
+  enum { is_deduced_subview_type =
+    Impl::StaticAssert<
+      Impl::is_same< View , typename DeducedSubview::type >::value
+    >::value
+  };
+
+  if ( is_deduced_subview_type ) {
+
+    typedef Impl::ViewOffsetRange< SubArg0_type > Range0 ;
+    typedef Impl::ViewOffsetRange< SubArg1_type > Range1 ;
+
+    // Shape the offset map from the per-argument dimensions;
+    // 'assign_subview' reports whether the result is noncontiguous.
+    const bool subview_noncontiguous =
+      m_offset_map.assign_subview( src.m_offset_map
+                                 , Range0::dimension( src.m_offset_map.N0 , arg0 )
+                                 , Range1::dimension( src.m_offset_map.N1 , arg1 )
+                                 , 0 , 0 , 0 , 0 , 0 , 0 );
+
+    if ( 0 != m_offset_map.capacity() ) {
+
+      m_management = src.m_management ;
+
+      if ( subview_noncontiguous ) { m_management.set_noncontiguous(); }
+
+      m_ptr_on_device = src.m_ptr_on_device +
+                        src.m_offset_map( Range0::begin( arg0 )
+                                        , Range1::begin( arg1 )
+                                        );
+      m_tracker = src.m_tracker ;
+    }
+  }
+}
+
+// Subview constructor: build this view as a subview of a Rank 1 source view
+template< class DstDataType , class DstArg1Type , class DstArg2Type , class DstArg3Type >
+template< class SrcDataType , class SrcArg1Type , class SrcArg2Type , class SrcArg3Type
+        , class SubArg0_type
+        >
+KOKKOS_INLINE_FUNCTION
+View< DstDataType , DstArg1Type , DstArg2Type , DstArg3Type , Impl::ViewDefault >::
+View( const View< SrcDataType , SrcArg1Type , SrcArg2Type , SrcArg3Type , Impl::ViewDefault > & src
+    , const SubArg0_type & arg0
+    )
+  : m_ptr_on_device( static_cast< typename traits::value_type * >( NULL ) )
+  , m_offset_map()
+  , m_management()
+  , m_tracker()
+{
+  // The destination type must be exactly the subview type deduced
+  // from the source view type and the subview argument types;
+  // this is checked through the StaticAssert below.
+
+  typedef View< SrcDataType , SrcArg1Type , SrcArg2Type , SrcArg3Type , Impl::ViewDefault > SrcView ;
+  typedef Impl::ViewSubview< SrcView , SubArg0_type
+                           , void , void , void , void , void , void , void > DeducedSubview ;
+
+  enum { is_deduced_subview_type =
+    Impl::StaticAssert<
+      Impl::is_same< View , typename DeducedSubview::type >::value
+    >::value
+  };
+
+  if ( is_deduced_subview_type ) {
+
+    typedef Impl::ViewOffsetRange< SubArg0_type > Range0 ;
+
+    // Shape the offset map from the per-argument dimension;
+    // 'assign_subview' reports whether the result is noncontiguous.
+    const bool subview_noncontiguous =
+      m_offset_map.assign_subview( src.m_offset_map
+                                 , Range0::dimension( src.m_offset_map.N0 , arg0 )
+                                 , 0 , 0 , 0 , 0 , 0 , 0 , 0 );
+
+    if ( 0 != m_offset_map.capacity() ) {
+
+      m_management = src.m_management ;
+
+      if ( subview_noncontiguous ) { m_management.set_noncontiguous(); }
+
+      m_ptr_on_device = src.m_ptr_on_device +
+                        src.m_offset_map( Range0::begin( arg0 )
+                                        );
+      m_tracker = src.m_tracker ;
+    }
+  }
+}
+
+} /* namespace Kokkos */
+
+//----------------------------------------------------------------------------
+//----------------------------------------------------------------------------
+
+#endif /* #ifndef KOKKOS_VIEWDEFAULT_HPP */
+
diff --git a/lib/kokkos/core/src/impl/Kokkos_ViewOffset.hpp b/lib/kokkos/core/src/impl/Kokkos_ViewOffset.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..5748e722c0076e9f47a7c538bd4d2b6f7458e9b8
--- /dev/null
+++ b/lib/kokkos/core/src/impl/Kokkos_ViewOffset.hpp
@@ -0,0 +1,1341 @@
+/*
+//@HEADER
+// ************************************************************************
+// 
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+// 
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+// 
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact  H. Carter Edwards (hcedwar@sandia.gov)
+// 
+// ************************************************************************
+//@HEADER
+*/
+
+#ifndef KOKKOS_VIEWOFFSET_HPP
+#define KOKKOS_VIEWOFFSET_HPP
+
+#include <Kokkos_Pair.hpp>
+#include <Kokkos_Layout.hpp>
+#include <impl/Kokkos_Traits.hpp>
+#include <impl/Kokkos_Shape.hpp>
+
+//----------------------------------------------------------------------------
+//----------------------------------------------------------------------------
+
+namespace Kokkos { namespace Impl {
+
+template < class ShapeType , class LayoutType , typename Enable = void >
+struct ViewOffset ; // primary template is only declared here; 'Enable' selects the enable_if-guarded specializations below
+
+//----------------------------------------------------------------------------
+// LayoutLeft AND ( 1 >= rank OR 0 == rank_dynamic ) : no padding / striding
+template < class ShapeType >
+struct ViewOffset< ShapeType , LayoutLeft
+                 , typename enable_if<( 1 >= ShapeType::rank
+                                        ||
+                                        0 == ShapeType::rank_dynamic
+                                      )>::type >
+  : public ShapeType // the dimensions N0 .. N7 used below come from the shape base class
+{
+  typedef size_t     size_type ;
+  typedef ShapeType  shape_type ;
+  typedef LayoutLeft array_layout ;
+
+  enum { has_padding = false }; // no separate leading stride is stored: stride_1() is N0
+
+  template< unsigned R >
+  KOKKOS_INLINE_FUNCTION
+  void assign( size_t n ) // forwards to assign_shape_dimension<R> for dimension #R
+    { assign_shape_dimension<R>( *this , n ); }
+
+  // Rank 0 subview: nothing is assigned and no noncontiguity is introduced.
+  template< class S , class L >
+  KOKKOS_INLINE_FUNCTION
+  typename Impl::enable_if<( 0 == shape_type::rank &&
+                             Impl::is_same<L,LayoutLeft>::value
+                           ), bool >::type
+  assign_subview( const ViewOffset<S,L,void> &
+                , const size_t n0
+                , const size_t n1
+                , const size_t n2
+                , const size_t n3
+                , const size_t n4
+                , const size_t n5
+                , const size_t n6
+                , const size_t n7
+                )
+    {
+      return false ; // did not introduce noncontiguity
+    }
+
+  // This subview must be 1 == rank and 1 == rank_dynamic.
+  // The source dimension #0 must be non-zero and all other dimensions are zero.
+  // Returns whether the subview introduced noncontiguity (always false here).
+  template< class S , class L >
+  KOKKOS_INLINE_FUNCTION
+  typename Impl::enable_if<( 1 == shape_type::rank &&
+                             1 == shape_type::rank_dynamic &&
+                             1 <= S::rank &&
+                             Impl::is_same<L,LayoutLeft>::value
+                           ), bool >::type
+  assign_subview( const ViewOffset<S,L,void> &
+                , const size_t n0
+                , const size_t n1
+                , const size_t n2
+                , const size_t n3
+                , const size_t n4
+                , const size_t n5
+                , const size_t n6
+                , const size_t n7
+                )
+    {
+      // n1 .. n7 must be zero; they are not inspected here, only n0 is stored.
+      shape_type::N0 = n0 ;
+      return false ; // did not introduce noncontiguity
+    }
+
+
+  KOKKOS_INLINE_FUNCTION
+  void assign( size_t n0 , size_t n1 , size_t n2 , size_t n3
+             , size_t n4 , size_t n5 , size_t n6 , size_t n7
+             , size_t = 0 ) // trailing unnamed argument is accepted and ignored
+    { shape_type::assign( *this , n0, n1, n2, n3, n4, n5, n6, n7 ); }
+
+  template< class ShapeRHS >
+  KOKKOS_INLINE_FUNCTION
+  void assign( const ViewOffset< ShapeRHS , LayoutLeft > & rhs // same rank, no more dynamic dimensions than this shape
+             , typename enable_if<( int(ShapeRHS::rank) == int(shape_type::rank)
+                                    &&
+                                    int(ShapeRHS::rank_dynamic) <= int(shape_type::rank_dynamic)
+                                  )>::type * = 0 )
+    { shape_type::assign( *this , rhs.N0, rhs.N1, rhs.N2, rhs.N3, rhs.N4, rhs.N5, rhs.N6, rhs.N7 ); }
+
+  template< class ShapeRHS >
+  KOKKOS_INLINE_FUNCTION
+  void assign( const ViewOffset< ShapeRHS , LayoutRight > & rhs // only enabled when both sides are rank 1
+             , typename enable_if<( 1 == int(ShapeRHS::rank)
+                                    &&
+                                    1 == int(shape_type::rank)
+                                    &&
+                                    1 == int(shape_type::rank_dynamic)
+                                  )>::type * = 0 )
+    { shape_type::assign( *this , rhs.N0, rhs.N1, rhs.N2, rhs.N3, rhs.N4, rhs.N5, rhs.N6, rhs.N7 ); }
+
+  KOKKOS_INLINE_FUNCTION
+  void set_padding() {} // intentionally empty: this specialization has no padding
+
+  KOKKOS_INLINE_FUNCTION
+  size_type cardinality() const // product of all eight dimensions, accumulated in size_type
+    { return size_type(shape_type::N0) * shape_type::N1 * shape_type::N2 * shape_type::N3 * shape_type::N4 * shape_type::N5 * shape_type::N6 * shape_type::N7 ; }
+
+  KOKKOS_INLINE_FUNCTION
+  size_type capacity() const // same value as cardinality() in this specialization
+    { return size_type(shape_type::N0) * shape_type::N1 * shape_type::N2 * shape_type::N3 * shape_type::N4 * shape_type::N5 * shape_type::N6 * shape_type::N7 ; }
+
+  // Writes s[0] .. s[rank]; the [ rank ] entry is the total length, so 's' needs rank+1 entries.
+  template< typename iType >
+  KOKKOS_INLINE_FUNCTION
+  void stride( iType * const s ) const
+    {
+      s[0] = 1 ;
+      if ( 0 < shape_type::rank ) { s[1] = shape_type::N0 ; }
+      if ( 1 < shape_type::rank ) { s[2] = s[1] * shape_type::N1 ; }
+      if ( 2 < shape_type::rank ) { s[3] = s[2] * shape_type::N2 ; }
+      if ( 3 < shape_type::rank ) { s[4] = s[3] * shape_type::N3 ; }
+      if ( 4 < shape_type::rank ) { s[5] = s[4] * shape_type::N4 ; }
+      if ( 5 < shape_type::rank ) { s[6] = s[5] * shape_type::N5 ; }
+      if ( 6 < shape_type::rank ) { s[7] = s[6] * shape_type::N6 ; }
+      if ( 7 < shape_type::rank ) { s[8] = s[7] * shape_type::N7 ; }
+    }
+
+  KOKKOS_INLINE_FUNCTION size_type stride_0() const { return 1 ; }
+  KOKKOS_INLINE_FUNCTION size_type stride_1() const { return shape_type::N0 ; }
+  KOKKOS_INLINE_FUNCTION size_type stride_2() const { return size_type(shape_type::N0) * shape_type::N1 ; } // widen first, as cardinality() does
+  KOKKOS_INLINE_FUNCTION size_type stride_3() const { return size_type(shape_type::N0) * shape_type::N1 * shape_type::N2 ; }
+
+  KOKKOS_INLINE_FUNCTION
+  size_type stride_4() const // products below are accumulated in size_type so they cannot wrap in a narrower dimension type
+    { return size_type(shape_type::N0) * shape_type::N1 * shape_type::N2 * shape_type::N3 ; }
+
+  KOKKOS_INLINE_FUNCTION
+  size_type stride_5() const
+    { return size_type(shape_type::N0) * shape_type::N1 * shape_type::N2 * shape_type::N3 * shape_type::N4 ; }
+
+  KOKKOS_INLINE_FUNCTION
+  size_type stride_6() const
+    { return size_type(shape_type::N0) * shape_type::N1 * shape_type::N2 * shape_type::N3 * shape_type::N4 * shape_type::N5 ; }
+
+  KOKKOS_INLINE_FUNCTION
+  size_type stride_7() const
+    { return size_type(shape_type::N0) * shape_type::N1 * shape_type::N2 * shape_type::N3 * shape_type::N4 * shape_type::N5 * shape_type::N6 ; }
+
+  // rank 1 : the offset is the index itself
+  template< typename I0 >
+  KOKKOS_FORCEINLINE_FUNCTION
+  size_type operator()( I0 const & i0 ) const { return i0 ; }
+
+  // rank 2 : leftmost index is stride-one; each later index is scaled by the preceding dimensions
+  template < typename I0 , typename I1 >
+  KOKKOS_FORCEINLINE_FUNCTION
+  size_type operator()( I0 const & i0 , I1 const & i1 ) const
+    { return i0 + shape_type::N0 * i1 ; }
+
+  //rank 3
+  template <typename I0, typename I1, typename I2>
+  KOKKOS_FORCEINLINE_FUNCTION
+  size_type operator()( I0 const& i0
+                      , I1 const& i1
+                      , I2 const& i2
+                      ) const
+    {
+      return i0 + shape_type::N0 * (
+             i1 + shape_type::N1 * i2 );
+    }
+
+  //rank 4
+  template <typename I0, typename I1, typename I2, typename I3>
+  KOKKOS_FORCEINLINE_FUNCTION
+  size_type operator()( I0 const& i0, I1 const& i1, I2 const& i2, I3 const& i3 ) const
+    {
+      return i0 + shape_type::N0 * (
+             i1 + shape_type::N1 * (
+             i2 + shape_type::N2 * i3 ));
+    }
+
+  //rank 5
+  template < typename I0, typename I1, typename I2, typename I3
+            ,typename I4 >
+  KOKKOS_FORCEINLINE_FUNCTION
+  size_type operator()( I0 const& i0, I1 const& i1, I2 const& i2, I3 const& i3, I4 const& i4 ) const
+    {
+      return i0 + shape_type::N0 * (
+             i1 + shape_type::N1 * (
+             i2 + shape_type::N2 * (
+             i3 + shape_type::N3 * i4 )));
+    }
+
+  //rank 6
+  template < typename I0, typename I1, typename I2, typename I3
+            ,typename I4, typename I5 >
+  KOKKOS_FORCEINLINE_FUNCTION
+  size_type operator()( I0 const& i0, I1 const& i1, I2 const& i2, I3 const& i3, I4 const& i4, I5 const& i5 ) const
+    {
+      return i0 + shape_type::N0 * (
+             i1 + shape_type::N1 * (
+             i2 + shape_type::N2 * (
+             i3 + shape_type::N3 * (
+             i4 + shape_type::N4 * i5 ))));
+    }
+
+  //rank 7
+  template < typename I0, typename I1, typename I2, typename I3
+            ,typename I4, typename I5, typename I6 >
+  KOKKOS_FORCEINLINE_FUNCTION
+  size_type operator()( I0 const& i0, I1 const& i1, I2 const& i2, I3 const& i3, I4 const& i4, I5 const& i5, I6 const& i6) const
+  {
+    return i0 + shape_type::N0 * (
+           i1 + shape_type::N1 * (
+           i2 + shape_type::N2 * (
+           i3 + shape_type::N3 * (
+           i4 + shape_type::N4 * (
+           i5 + shape_type::N5 * i6 )))));
+  }
+
+  //rank 8
+  template < typename I0, typename I1, typename I2, typename I3
+            ,typename I4, typename I5, typename I6, typename I7 >
+  KOKKOS_FORCEINLINE_FUNCTION
+  size_type operator()( I0 const& i0, I1 const& i1, I2 const& i2, I3 const& i3, I4 const& i4, I5 const& i5, I6 const& i6, I7 const& i7) const
+  {
+    return i0 + shape_type::N0 * (
+           i1 + shape_type::N1 * (
+           i2 + shape_type::N2 * (
+           i3 + shape_type::N3 * (
+           i4 + shape_type::N4 * (
+           i5 + shape_type::N5 * (
+           i6 + shape_type::N6 * i7 ))))));
+  }
+};
+
+//----------------------------------------------------------------------------
+// LayoutLeft AND ( 1 < rank AND 0 < rank_dynamic ) : has padding / striding
+template < class ShapeType >
+struct ViewOffset< ShapeType , LayoutLeft
+                 , typename enable_if<( 1 < ShapeType::rank
+                                        &&
+                                        0 < ShapeType::rank_dynamic
+                                      )>::type >
+  : public ShapeType
+{
+  typedef size_t     size_type ;
+  typedef ShapeType  shape_type ;
+  typedef LayoutLeft array_layout ;
+
+  enum { has_padding = true };
+
+  size_type S0 ;
+
+  // This subview must be 2 == rank and 2 == rank_dynamic
+  // due to only having stride #0.
+  // The source dimension #0 must be non-zero for stride-one leading dimension.
+  // At most one subsequent dimension can be non-zero.
+  // Return whether the subview introduced noncontiguity.
+  template< class S , class L >
+  KOKKOS_INLINE_FUNCTION
+  typename Impl::enable_if<( 2 == shape_type::rank &&
+                             2 == shape_type::rank_dynamic &&
+                             2 <= S::rank &&
+                             Impl::is_same<L,LayoutLeft>::value
+                           ), bool >::type
+  assign_subview( const ViewOffset<S,L,void> & rhs
+                , const size_t n0
+                , const size_t n1
+                , const size_t n2
+                , const size_t n3
+                , const size_t n4
+                , const size_t n5
+                , const size_t n6
+                , const size_t n7
+                )
+    {
+      // N1 = second non-zero dimension
+      // S0 = stride for second non-zero dimension
+      shape_type::N0 = n0 ;
+      shape_type::N1 = 0 ;
+      S0 = 0 ;
+
+      if      (                n1 ) { shape_type::N1 = n1 ; S0 = rhs.stride_1(); }
+      else if ( 2 < S::rank && n2 ) { shape_type::N1 = n2 ; S0 = rhs.stride_2(); }
+      else if ( 3 < S::rank && n3 ) { shape_type::N1 = n3 ; S0 = rhs.stride_3(); }
+      else if ( 4 < S::rank && n4 ) { shape_type::N1 = n4 ; S0 = rhs.stride_4(); }
+      else if ( 5 < S::rank && n5 ) { shape_type::N1 = n5 ; S0 = rhs.stride_5(); }
+      else if ( 6 < S::rank && n6 ) { shape_type::N1 = n6 ; S0 = rhs.stride_6(); }
+      else if ( 7 < S::rank && n7 ) { shape_type::N1 = n7 ; S0 = rhs.stride_7(); }
+
+      // Noncontiguity is introduced if the first dimension changed or if
+      // source dimension #1 was dropped ( 0 == n1 ) in favor of a later one.
+      return ( size_t(shape_type::N0) != size_t(rhs.N0) ) || ( 0 == n1 );
+    }
+
+
+  template< unsigned R >
+  KOKKOS_INLINE_FUNCTION
+  void assign( size_t n )
+    { assign_shape_dimension<R>( *this , n ); }
+
+
+  KOKKOS_INLINE_FUNCTION
+  void assign( size_t n0 , size_t n1 , size_t n2 , size_t n3
+             , size_t n4 , size_t n5 , size_t n6 , size_t n7
+             , size_t = 0 )
+    { shape_type::assign( *this , n0, n1, n2, n3, n4, n5, n6, n7 ); S0 = shape_type::N0 ; }
+
+  template< class ShapeRHS >
+  KOKKOS_INLINE_FUNCTION
+  void assign( const ViewOffset< ShapeRHS , LayoutLeft > & rhs
+             , typename enable_if<( int(ShapeRHS::rank) == int(shape_type::rank)
+                                    &&
+                                    int(ShapeRHS::rank_dynamic) <= int(shape_type::rank_dynamic)
+                                    &&
+                                    int(ShapeRHS::rank_dynamic) == 0
+                                  )>::type * = 0 )
+    {
+      shape_type::assign( *this , rhs.N0, rhs.N1, rhs.N2, rhs.N3, rhs.N4, rhs.N5, rhs.N6, rhs.N7 );
+      S0 = shape_type::N0 ; // No padding when dynamic_rank == 0
+    }
+
+  template< class ShapeRHS >
+  KOKKOS_INLINE_FUNCTION
+  void assign( const ViewOffset< ShapeRHS , LayoutLeft > & rhs
+             , typename enable_if<( int(ShapeRHS::rank) == int(shape_type::rank)
+                                    &&
+                                    int(ShapeRHS::rank_dynamic) <= int(shape_type::rank_dynamic)
+                                    &&
+                                    int(ShapeRHS::rank_dynamic) > 0
+                                  )>::type * = 0 )
+    {
+      shape_type::assign( *this , rhs.N0, rhs.N1, rhs.N2, rhs.N3, rhs.N4, rhs.N5, rhs.N6, rhs.N7 );
+      S0 = rhs.S0 ; // possibly padding when dynamic rank > 0
+    }
+
+  KOKKOS_INLINE_FUNCTION
+  void set_padding()
+    {
+      enum { div   = MEMORY_ALIGNMENT / shape_type::scalar_size };
+      enum { mod   = MEMORY_ALIGNMENT % shape_type::scalar_size };
+      enum { align = 0 == mod ? div : 0 };
+
+      if ( align && MEMORY_ALIGNMENT_THRESHOLD * align < S0 ) {
+
+        const size_type count_mod = S0 % ( div ? div : 1 );
+
+        if ( count_mod ) { S0 += align - count_mod ; }
+      }
+    }
+
+  KOKKOS_INLINE_FUNCTION
+  size_type cardinality() const
+    { return size_type(shape_type::N0) * shape_type::N1 * shape_type::N2 * shape_type::N3 * shape_type::N4 * shape_type::N5 * shape_type::N6 * shape_type::N7 ; }
+
+  KOKKOS_INLINE_FUNCTION
+  size_type capacity() const
+    { return size_type(S0) * shape_type::N1 * shape_type::N2 * shape_type::N3 * shape_type::N4 * shape_type::N5 * shape_type::N6 * shape_type::N7 ; }
+
+  // Stride with [ rank ] as total length
+  template< typename iType >
+  KOKKOS_INLINE_FUNCTION
+  void stride( iType * const s ) const
+    {
+      s[0] = 1 ;
+      if ( 0 < shape_type::rank ) { s[1] = S0 ; }
+      if ( 1 < shape_type::rank ) { s[2] = s[1] * shape_type::N1 ; }
+      if ( 2 < shape_type::rank ) { s[3] = s[2] * shape_type::N2 ; }
+      if ( 3 < shape_type::rank ) { s[4] = s[3] * shape_type::N3 ; }
+      if ( 4 < shape_type::rank ) { s[5] = s[4] * shape_type::N4 ; }
+      if ( 5 < shape_type::rank ) { s[6] = s[5] * shape_type::N5 ; }
+      if ( 6 < shape_type::rank ) { s[7] = s[6] * shape_type::N6 ; }
+      if ( 7 < shape_type::rank ) { s[8] = s[7] * shape_type::N7 ; }
+    }
+
+  // Stride accessors. Dimension 0 is stride-one, dimension 1 uses the leading
+  // stride S0, and every later dimension multiplies in the preceding extent.
+  KOKKOS_INLINE_FUNCTION size_type stride_0() const
+    { return 1 ; }
+  KOKKOS_INLINE_FUNCTION size_type stride_1() const
+    { return S0 ; }
+  KOKKOS_INLINE_FUNCTION size_type stride_2() const
+    { return S0 * shape_type::N1 ; }
+  KOKKOS_INLINE_FUNCTION size_type stride_3() const
+    { return S0 * shape_type::N1 * shape_type::N2 ; }
+
+  KOKKOS_INLINE_FUNCTION size_type stride_4() const
+    { return S0 * shape_type::N1 * shape_type::N2 * shape_type::N3 ; }
+  KOKKOS_INLINE_FUNCTION size_type stride_5() const
+    { return S0 * shape_type::N1 * shape_type::N2 * shape_type::N3 * shape_type::N4 ; }
+
+  KOKKOS_INLINE_FUNCTION size_type stride_6() const
+    { return S0 * shape_type::N1 * shape_type::N2 * shape_type::N3 * shape_type::N4 * shape_type::N5 ; }
+  KOKKOS_INLINE_FUNCTION size_type stride_7() const
+    { return S0 * shape_type::N1 * shape_type::N2 * shape_type::N3 * shape_type::N4 * shape_type::N5 * shape_type::N6 ; }
+
+  // rank 2 : offset = i0 + S0 * i1  (i0 is stride-one, S0 is the leading stride)
+  template < typename I0 , typename I1 >
+  KOKKOS_FORCEINLINE_FUNCTION
+  size_type operator()( I0 const & i0 , I1 const & i1) const
+    { return i0 + S0 * i1 ; }
+
+  // rank 3 : offset = i0 + S0 * ( i1 + N1 * i2 )
+  template <typename I0, typename I1, typename I2>
+  KOKKOS_FORCEINLINE_FUNCTION
+  size_type operator()( I0 const& i0, I1 const& i1, I2 const& i2 ) const
+    {
+      return i0 + S0 * (
+             i1 + shape_type::N1 * i2 );
+    }
+
+  // rank 4 : same nesting, one more level using N2
+  template <typename I0, typename I1, typename I2, typename I3>
+  KOKKOS_FORCEINLINE_FUNCTION
+  size_type operator()( I0 const& i0, I1 const& i1, I2 const& i2, I3 const& i3 ) const
+    {
+      return i0 + S0 * (
+             i1 + shape_type::N1 * (
+             i2 + shape_type::N2 * i3 ));
+    }
+
+  // rank 5 : same nesting, through N3
+  template < typename I0, typename I1, typename I2, typename I3
+            ,typename I4 >
+  KOKKOS_FORCEINLINE_FUNCTION
+  size_type operator()( I0 const& i0, I1 const& i1, I2 const& i2, I3 const& i3, I4 const& i4 ) const
+    {
+      return i0 + S0 * (
+             i1 + shape_type::N1 * (
+             i2 + shape_type::N2 * (
+             i3 + shape_type::N3 * i4 )));
+    }
+
+  // rank 6 : same nesting, through N4
+  template < typename I0, typename I1, typename I2, typename I3
+            ,typename I4, typename I5 >
+  KOKKOS_FORCEINLINE_FUNCTION
+  size_type operator()( I0 const& i0, I1 const& i1, I2 const& i2, I3 const& i3, I4 const& i4, I5 const& i5 ) const
+    {
+      return i0 + S0 * (
+             i1 + shape_type::N1 * (
+             i2 + shape_type::N2 * (
+             i3 + shape_type::N3 * (
+             i4 + shape_type::N4 * i5 ))));
+    }
+
+  // rank 7 : same nesting, through N5
+  template < typename I0, typename I1, typename I2, typename I3
+            ,typename I4, typename I5, typename I6 >
+  KOKKOS_FORCEINLINE_FUNCTION
+  size_type operator()( I0 const& i0, I1 const& i1, I2 const& i2, I3 const& i3, I4 const& i4, I5 const& i5, I6 const& i6 ) const
+  {
+    return i0 + S0 * (
+           i1 + shape_type::N1 * (
+           i2 + shape_type::N2 * (
+           i3 + shape_type::N3 * (
+           i4 + shape_type::N4 * (
+           i5 + shape_type::N5 * i6 )))));
+  }
+
+  // rank 8 : same nesting, through N6
+  template < typename I0, typename I1, typename I2, typename I3
+            ,typename I4, typename I5, typename I6, typename I7 >
+  KOKKOS_FORCEINLINE_FUNCTION
+  size_type operator()( I0 const& i0, I1 const& i1, I2 const& i2, I3 const& i3, I4 const& i4, I5 const& i5, I6 const& i6, I7 const& i7 ) const
+  {
+    return i0 + S0 * (
+           i1 + shape_type::N1 * (
+           i2 + shape_type::N2 * (
+           i3 + shape_type::N3 * (
+           i4 + shape_type::N4 * (
+           i5 + shape_type::N5 * (
+           i6 + shape_type::N6 * i7 ))))));
+  }
+};
+
+//----------------------------------------------------------------------------
+// LayoutRight AND ( 1 >= rank OR 1 >= rank_dynamic ) : no padding / striding
+template < class ShapeType >
+struct ViewOffset< ShapeType , LayoutRight
+                 , typename enable_if<( 1 >= ShapeType::rank
+                                        ||
+                                        1 >= ShapeType::rank_dynamic
+                                      )>::type >
+  : public ShapeType
+{
+  typedef size_t       size_type;
+  typedef ShapeType    shape_type;
+  typedef LayoutRight  array_layout ;
+
+  enum { has_padding = false };
+
+  // This subview must be 0 == rank.
+  // There is no dimension to assign, so all arguments are ignored.
+  // Return whether the subview introduced noncontiguity (never, here).
+  template< class S , class L >
+  KOKKOS_INLINE_FUNCTION
+  typename Impl::enable_if<( 0 == shape_type::rank &&
+                             Impl::is_same<L,LayoutRight>::value
+                           ), bool >::type
+  assign_subview( const ViewOffset<S,L,void> &
+                , const size_t n0
+                , const size_t n1
+                , const size_t n2
+                , const size_t n3
+                , const size_t n4
+                , const size_t n5
+                , const size_t n6
+                , const size_t n7
+                )
+    { return false ; }
+
+  // This subview must be 1 == rank and 1 == rank_dynamic
+  // The source view's last dimension must be non-zero
+  // Return whether the subview introduced noncontiguity
+  template< class S , class L >
+  KOKKOS_INLINE_FUNCTION
+  typename Impl::enable_if<( 1 == shape_type::rank &&
+                             1 == shape_type::rank_dynamic &&
+                             1 <= S::rank &&
+                             Impl::is_same<L,LayoutRight>::value
+                           ), bool >::type
+  assign_subview( const ViewOffset<S,L,void> &
+                , const size_t n0
+                , const size_t n1
+                , const size_t n2
+                , const size_t n3
+                , const size_t n4
+                , const size_t n5
+                , const size_t n6
+                , const size_t n7
+                )
+    {
+      // Take the extent of the source's last dimension, n_(S::rank-1).
+      shape_type::N0 = S::rank == 1 ? n0 : (
+                       S::rank == 2 ? n1 : (
+                       S::rank == 3 ? n2 : (
+                       S::rank == 4 ? n3 : (
+                       S::rank == 5 ? n4 : (
+                       S::rank == 6 ? n5 : (
+                       S::rank == 7 ? n6 : n7 ))))));
+      return false ; // precondition (not checked): n0 .. n_(rank-2) are zero
+    }
+
+  template< unsigned R >
+  KOKKOS_INLINE_FUNCTION
+  void assign( size_t n )
+    { assign_shape_dimension<R>( *this , n ); }
+
+  KOKKOS_INLINE_FUNCTION
+  void assign( size_t n0 , size_t n1 , size_t n2 , size_t n3
+             , size_t n4 , size_t n5 , size_t n6 , size_t n7
+             , size_t = 0 )
+    { shape_type::assign( *this , n0, n1, n2, n3, n4, n5, n6, n7 ); }
+
+  template< class ShapeRHS >
+  KOKKOS_INLINE_FUNCTION
+  void assign( const ViewOffset< ShapeRHS , LayoutRight > & rhs
+             , typename enable_if<( int(ShapeRHS::rank) == int(shape_type::rank)
+                                    &&
+                                    int(ShapeRHS::rank_dynamic) <= int(shape_type::rank_dynamic)
+                                  )>::type * = 0 )
+    { shape_type::assign( *this , rhs.N0, rhs.N1, rhs.N2, rhs.N3, rhs.N4, rhs.N5, rhs.N6, rhs.N7 ); }
+
+  template< class ShapeRHS >
+  KOKKOS_INLINE_FUNCTION
+  void assign( const ViewOffset< ShapeRHS , LayoutLeft > & rhs
+             , typename enable_if<( 1 == int(ShapeRHS::rank)
+                                    &&
+                                    1 == int(shape_type::rank)
+                                    &&
+                                    1 == int(shape_type::rank_dynamic)
+                                  )>::type * = 0 )
+    { shape_type::assign( *this , rhs.N0, rhs.N1, rhs.N2, rhs.N3, rhs.N4, rhs.N5, rhs.N6, rhs.N7 ); }
+
+  // This specialization carries no padding (has_padding == false): nothing to do.
+  KOKKOS_INLINE_FUNCTION void set_padding() {}
+
+  // Number of logical entries: the product of all eight extents.
+  KOKKOS_INLINE_FUNCTION size_type cardinality() const
+    { return size_type(shape_type::N0) * shape_type::N1 * shape_type::N2 * shape_type::N3 * shape_type::N4 * shape_type::N5 * shape_type::N6 * shape_type::N7 ; }
+
+  // Without padding the allocation span equals the cardinality.
+  KOKKOS_INLINE_FUNCTION size_type capacity() const
+    { return size_type(shape_type::N0) * shape_type::N1 * shape_type::N2 * shape_type::N3 * shape_type::N4 * shape_type::N5 * shape_type::N6 * shape_type::N7 ; }
+
+  KOKKOS_INLINE_FUNCTION
+  size_type stride_R() const {
+      return size_type(shape_type::N1) * shape_type::N2 * shape_type::N3 *
+             shape_type::N4 * shape_type::N5 * shape_type::N6 * shape_type::N7 ;
+    }
+
+  // Fill s[0..rank]: s[k] is the stride of dimension k and s[rank] the total length.
+  template< typename iType >
+  KOKKOS_INLINE_FUNCTION
+  void stride( iType * const s ) const
+    {
+      size_type n = 1 ; // running product of the trailing extents
+      if ( 7 < shape_type::rank ) { s[7] = n ; n *= shape_type::N7 ; }
+      if ( 6 < shape_type::rank ) { s[6] = n ; n *= shape_type::N6 ; }
+      if ( 5 < shape_type::rank ) { s[5] = n ; n *= shape_type::N5 ; }
+      if ( 4 < shape_type::rank ) { s[4] = n ; n *= shape_type::N4 ; }
+      if ( 3 < shape_type::rank ) { s[3] = n ; n *= shape_type::N3 ; }
+      if ( 2 < shape_type::rank ) { s[2] = n ; n *= shape_type::N2 ; }
+      if ( 1 < shape_type::rank ) { s[1] = n ; n *= shape_type::N1 ; }
+      if ( 0 < shape_type::rank ) { s[0] = n ; }
+      s[shape_type::rank] = n * shape_type::N0 ;
+    }
+
+  KOKKOS_INLINE_FUNCTION
+  size_type stride_7() const { return 1 ; }
+
+  KOKKOS_INLINE_FUNCTION
+  size_type stride_6() const { return shape_type::N7 ; }
+
+  KOKKOS_INLINE_FUNCTION // products below are widened to size_type first, as in cardinality()
+  size_type stride_5() const { return size_type(shape_type::N7) * shape_type::N6 ; }
+
+  KOKKOS_INLINE_FUNCTION
+  size_type stride_4() const { return size_type(shape_type::N7) * shape_type::N6 * shape_type::N5 ; }
+
+  KOKKOS_INLINE_FUNCTION
+  size_type stride_3() const { return size_type(shape_type::N7) * shape_type::N6 * shape_type::N5 * shape_type::N4 ; }
+
+  KOKKOS_INLINE_FUNCTION
+  size_type stride_2() const { return size_type(shape_type::N7) * shape_type::N6 * shape_type::N5 * shape_type::N4 * shape_type::N3 ; }
+
+  KOKKOS_INLINE_FUNCTION
+  size_type stride_1() const { return size_type(shape_type::N7) * shape_type::N6 * shape_type::N5 * shape_type::N4 * shape_type::N3 * shape_type::N2 ; }
+
+  KOKKOS_INLINE_FUNCTION
+  size_type stride_0() const { return size_type(shape_type::N7) * shape_type::N6 * shape_type::N5 * shape_type::N4 * shape_type::N3 * shape_type::N2 * shape_type::N1 ; }
+
+  // rank 1
+  template <typename I0>
+  KOKKOS_FORCEINLINE_FUNCTION
+  size_type operator()( I0 const& i0) const
+    {
+      return i0 ;
+    }
+
+  // rank 2 : the last index is stride-one; higher ranks nest the same way
+  template <typename I0, typename I1>
+  KOKKOS_FORCEINLINE_FUNCTION
+  size_type operator()( I0 const& i0, I1 const& i1 ) const
+    {
+      return i1 + shape_type::N1 * i0 ;
+    }
+
+  template <typename I0, typename I1, typename I2>
+  KOKKOS_FORCEINLINE_FUNCTION
+  size_type operator()( I0 const& i0, I1 const& i1, I2 const& i2 ) const
+    {
+      return i2 + shape_type::N2 * (
+             i1 + shape_type::N1 * ( i0 ));
+    }
+
+  template <typename I0, typename I1, typename I2, typename I3>
+  KOKKOS_FORCEINLINE_FUNCTION
+  size_type operator()( I0 const& i0, I1 const& i1, I2 const& i2 , I3 const& i3 ) const
+    {
+      return i3 + shape_type::N3 * (
+             i2 + shape_type::N2 * (
+             i1 + shape_type::N1 * ( i0 )));
+    }
+
+  template < typename I0, typename I1, typename I2, typename I3
+            ,typename I4 >
+  KOKKOS_FORCEINLINE_FUNCTION
+  size_type operator()( I0 const& i0, I1 const& i1, I2 const& i2 , I3 const& i3, I4 const& i4 ) const
+    {
+      return i4 + shape_type::N4 * (
+             i3 + shape_type::N3 * (
+             i2 + shape_type::N2 * (
+             i1 + shape_type::N1 * ( i0 ))));
+    }
+
+  template < typename I0, typename I1, typename I2, typename I3
+            ,typename I4, typename I5 >
+  KOKKOS_FORCEINLINE_FUNCTION
+  size_type operator()( I0 const& i0, I1 const& i1, I2 const& i2 , I3 const& i3, I4 const& i4, I5 const& i5 ) const
+  {
+    return i5 + shape_type::N5 * (
+           i4 + shape_type::N4 * (
+           i3 + shape_type::N3 * (
+           i2 + shape_type::N2 * (
+           i1 + shape_type::N1 * ( i0 )))));
+  }
+
+  template < typename I0, typename I1, typename I2, typename I3
+            ,typename I4, typename I5, typename I6 >
+  KOKKOS_FORCEINLINE_FUNCTION
+  size_type operator()( I0 const& i0, I1 const& i1, I2 const& i2 , I3 const& i3, I4 const& i4, I5 const& i5, I6 const& i6 ) const
+  {
+    return i6 + shape_type::N6 * (
+           i5 + shape_type::N5 * (
+           i4 + shape_type::N4 * (
+           i3 + shape_type::N3 * (
+           i2 + shape_type::N2 * (
+           i1 + shape_type::N1 * ( i0 ))))));
+  }
+
+  template < typename I0, typename I1, typename I2, typename I3
+            ,typename I4, typename I5, typename I6, typename I7 >
+  KOKKOS_FORCEINLINE_FUNCTION
+  size_type operator()( I0 const& i0, I1 const& i1, I2 const& i2 , I3 const& i3, I4 const& i4, I5 const& i5, I6 const& i6, I7 const& i7 ) const
+  {
+    return i7 + shape_type::N7 * (
+           i6 + shape_type::N6 * (
+           i5 + shape_type::N5 * (
+           i4 + shape_type::N4 * (
+           i3 + shape_type::N3 * (
+           i2 + shape_type::N2 * (
+           i1 + shape_type::N1 * ( i0 )))))));
+  }
+};
+
+//----------------------------------------------------------------------------
+// LayoutRight AND ( 1 < rank AND 1 < rank_dynamic ) : has padding / striding
+template < class ShapeType >
+struct ViewOffset< ShapeType , LayoutRight
+                 , typename enable_if<( 1 < ShapeType::rank
+                                        &&
+                                        1 < ShapeType::rank_dynamic
+                                      )>::type >
+  : public ShapeType
+{
+  typedef size_t       size_type;
+  typedef ShapeType    shape_type;
+  typedef LayoutRight  array_layout ;
+
+  enum { has_padding = true };
+
+  size_type SR ; // stride of dimension 0; set_padding() may round it up
+
+  // This subview must be 2 == rank and 2 == rank_dynamic
+  // due to only having stride #(rank-1).
+  // The source dimension #(rank-1) must be non-zero for stride-one leading dimension.
+  // At most one prior dimension can be non-zero.
+  // Return whether the subview introduced noncontiguity.
+  template< class S , class L >
+  KOKKOS_INLINE_FUNCTION
+  typename Impl::enable_if<( 2 == shape_type::rank &&
+                             2 == shape_type::rank_dynamic &&
+                             2 <= S::rank &&
+                             Impl::is_same<L,LayoutRight>::value
+                           ), bool >::type
+  assign_subview( const ViewOffset<S,L,void> & rhs
+                , const size_t n0
+                , const size_t n1
+                , const size_t n2
+                , const size_t n3
+                , const size_t n4
+                , const size_t n5
+                , const size_t n6
+                , const size_t n7
+                )
+    {
+      // nR = requested extent of the source's last dimension, n_(S::rank-1)
+      const size_type nR = S::rank == 2 ? n1 : (
+                           S::rank == 3 ? n2 : (
+                           S::rank == 4 ? n3 : (
+                           S::rank == 5 ? n4 : (
+                           S::rank == 6 ? n5 : (
+                           S::rank == 7 ? n6 : n7 )))));
+
+      // N0 = first non-zero-dimension
+      // N1 = last non-zero dimension
+      // SR = stride for second non-zero dimension
+      shape_type::N0 = 0 ;
+      shape_type::N1 = nR ;
+      SR = 0 ;
+
+      if      (                n0 ) { shape_type::N0 = n0 ; SR = rhs.stride_0(); }
+      else if ( 2 < S::rank && n1 ) { shape_type::N0 = n1 ; SR = rhs.stride_1(); }
+      else if ( 3 < S::rank && n2 ) { shape_type::N0 = n2 ; SR = rhs.stride_2(); }
+      else if ( 4 < S::rank && n3 ) { shape_type::N0 = n3 ; SR = rhs.stride_3(); }
+      else if ( 5 < S::rank && n4 ) { shape_type::N0 = n4 ; SR = rhs.stride_4(); }
+      else if ( 6 < S::rank && n5 ) { shape_type::N0 = n5 ; SR = rhs.stride_5(); }
+      else if ( 7 < S::rank && n6 ) { shape_type::N0 = n6 ; SR = rhs.stride_6(); }
+
+      // Introduce noncontiguous if change the last dimension
+      // or take a range of a dimension other than the second-to-last dimension.
+
+      return 2 == S::rank ? ( size_t(shape_type::N1) != size_t(rhs.N1) || 0 == n0 ) : (
+             3 == S::rank ? ( size_t(shape_type::N1) != size_t(rhs.N2) || 0 == n1 ) : (
+             4 == S::rank ? ( size_t(shape_type::N1) != size_t(rhs.N3) || 0 == n2 ) : (
+             5 == S::rank ? ( size_t(shape_type::N1) != size_t(rhs.N4) || 0 == n3 ) : (
+             6 == S::rank ? ( size_t(shape_type::N1) != size_t(rhs.N5) || 0 == n4 ) : (
+             7 == S::rank ? ( size_t(shape_type::N1) != size_t(rhs.N6) || 0 == n5 ) : (
+                            ( size_t(shape_type::N1) != size_t(rhs.N7) || 0 == n6 ) ))))));
+    }
+
+  template< unsigned R >
+  KOKKOS_INLINE_FUNCTION
+  void assign( size_t n )
+    { assign_shape_dimension<R>( *this , n ); }
+
+  KOKKOS_INLINE_FUNCTION
+  void assign( size_t n0 , size_t n1 , size_t n2 , size_t n3
+             , size_t n4 , size_t n5 , size_t n6 , size_t n7
+             , size_t = 0 )
+    {
+      shape_type::assign( *this , n0, n1, n2, n3, n4, n5, n6, n7 );
+      SR = size_type(shape_type::N1) * shape_type::N2 * shape_type::N3 * shape_type::N4 * shape_type::N5 * shape_type::N6 * shape_type::N7 ;
+    }
+
+  template< class ShapeRHS >
+  KOKKOS_INLINE_FUNCTION
+  void assign( const ViewOffset< ShapeRHS , LayoutRight > & rhs
+             , typename enable_if<( int(ShapeRHS::rank) == int(shape_type::rank)
+                                    &&
+                                    int(ShapeRHS::rank_dynamic) <= int(shape_type::rank_dynamic)
+                                    &&
+                                    int(ShapeRHS::rank_dynamic) <= 1
+                                  )>::type * = 0 )
+    {
+      shape_type::assign( *this , rhs.N0, rhs.N1, rhs.N2, rhs.N3, rhs.N4, rhs.N5, rhs.N6, rhs.N7 );
+      SR = size_type(shape_type::N1) * shape_type::N2 * shape_type::N3 * shape_type::N4 * shape_type::N5 * shape_type::N6 * shape_type::N7 ; // widened like the assign() above
+    }
+
+  template< class ShapeRHS >
+  KOKKOS_INLINE_FUNCTION
+  void assign( const ViewOffset< ShapeRHS , LayoutRight > & rhs
+             , typename enable_if<( int(ShapeRHS::rank) == int(shape_type::rank)
+                                    &&
+                                    int(ShapeRHS::rank_dynamic) <= int(shape_type::rank_dynamic)
+                                    &&
+                                    int(ShapeRHS::rank_dynamic) > 1
+                                  )>::type * = 0 )
+    {
+      shape_type::assign( *this , rhs.N0, rhs.N1, rhs.N2, rhs.N3, rhs.N4, rhs.N5, rhs.N6, rhs.N7 );
+      SR = rhs.SR ; // rhs stores its own stride: copy it verbatim
+    }
+
+  KOKKOS_INLINE_FUNCTION
+  void set_padding()
+    {
+      enum { div   = MEMORY_ALIGNMENT / shape_type::scalar_size };
+      enum { mod   = MEMORY_ALIGNMENT % shape_type::scalar_size };
+      enum { align = 0 == mod ? div : 0 }; // zero unless scalar_size divides MEMORY_ALIGNMENT
+
+      if ( align && MEMORY_ALIGNMENT_THRESHOLD * align < SR ) {
+
+        const size_type count_mod = SR % ( div ? div : 1 );
+
+        if ( count_mod ) { SR += align - count_mod ; } // round SR up to a multiple of align
+      }
+    }
+
+  // Number of logical entries: the product of all eight extents.
+  KOKKOS_INLINE_FUNCTION size_type cardinality() const
+    { return size_type(shape_type::N0) * shape_type::N1 * shape_type::N2 * shape_type::N3 * shape_type::N4 * shape_type::N5 * shape_type::N6 * shape_type::N7 ; }
+
+  KOKKOS_INLINE_FUNCTION
+  size_type capacity() const { return shape_type::N0 * SR ; }
+
+  template< typename iType >
+  KOKKOS_INLINE_FUNCTION
+  void stride( iType * const s ) const
+    {
+      size_type n = 1 ; // running product of the trailing extents
+      if ( 7 < shape_type::rank ) { s[7] = n ; n *= shape_type::N7 ; }
+      if ( 6 < shape_type::rank ) { s[6] = n ; n *= shape_type::N6 ; }
+      if ( 5 < shape_type::rank ) { s[5] = n ; n *= shape_type::N5 ; }
+      if ( 4 < shape_type::rank ) { s[4] = n ; n *= shape_type::N4 ; }
+      if ( 3 < shape_type::rank ) { s[3] = n ; n *= shape_type::N3 ; }
+      if ( 2 < shape_type::rank ) { s[2] = n ; n *= shape_type::N2 ; }
+      if ( 1 < shape_type::rank ) { s[1] = n ; n *= shape_type::N1 ; }
+      if ( 0 < shape_type::rank ) { s[0] = SR ; } // dimension 0 uses the stored stride
+      s[shape_type::rank] = SR * shape_type::N0 ;
+    }
+
+  KOKKOS_INLINE_FUNCTION
+  size_type stride_7() const { return 1 ; }
+
+  KOKKOS_INLINE_FUNCTION
+  size_type stride_6() const { return shape_type::N7 ; }
+
+  KOKKOS_INLINE_FUNCTION // products below are widened to size_type first, as in cardinality()
+  size_type stride_5() const { return size_type(shape_type::N7) * shape_type::N6 ; }
+
+  KOKKOS_INLINE_FUNCTION
+  size_type stride_4() const { return size_type(shape_type::N7) * shape_type::N6 * shape_type::N5 ; }
+
+  KOKKOS_INLINE_FUNCTION
+  size_type stride_3() const { return size_type(shape_type::N7) * shape_type::N6 * shape_type::N5 * shape_type::N4 ; }
+
+  KOKKOS_INLINE_FUNCTION
+  size_type stride_2() const { return size_type(shape_type::N7) * shape_type::N6 * shape_type::N5 * shape_type::N4 * shape_type::N3 ; }
+
+  KOKKOS_INLINE_FUNCTION
+  size_type stride_1() const { return size_type(shape_type::N7) * shape_type::N6 * shape_type::N5 * shape_type::N4 * shape_type::N3 * shape_type::N2 ; }
+
+  KOKKOS_INLINE_FUNCTION
+  size_type stride_0() const { return SR ; }
+
+  // rank 2 : i1 is stride-one, i0 is scaled by the stored stride SR
+  template <typename I0, typename I1>
+  KOKKOS_FORCEINLINE_FUNCTION
+  size_type operator()( I0 const& i0, I1 const& i1 ) const
+    {
+      return i1 + i0 * SR ;
+    }
+
+  template <typename I0, typename I1, typename I2>
+  KOKKOS_FORCEINLINE_FUNCTION
+  size_type operator()( I0 const& i0, I1 const& i1, I2 const& i2 ) const
+    {
+      return i2 + shape_type::N2 * ( i1 ) +
+             i0 * SR ;
+    }
+
+  template <typename I0, typename I1, typename I2, typename I3>
+  KOKKOS_FORCEINLINE_FUNCTION
+  size_type operator()( I0 const& i0, I1 const& i1, I2 const& i2 , I3 const& i3 ) const
+    {
+      return i3 + shape_type::N3 * (
+             i2 + shape_type::N2 * ( i1 )) +
+             i0 * SR ;
+    }
+
+  template < typename I0, typename I1, typename I2, typename I3
+            ,typename I4 >
+  KOKKOS_FORCEINLINE_FUNCTION
+  size_type operator()( I0 const& i0, I1 const& i1, I2 const& i2 , I3 const& i3, I4 const& i4 ) const
+    {
+      return i4 + shape_type::N4 * (
+             i3 + shape_type::N3 * (
+             i2 + shape_type::N2 * ( i1 ))) +
+             i0 * SR ;
+    }
+
+  template < typename I0, typename I1, typename I2, typename I3
+            ,typename I4, typename I5 >
+  KOKKOS_FORCEINLINE_FUNCTION
+  size_type operator()( I0 const& i0, I1 const& i1, I2 const& i2 , I3 const& i3, I4 const& i4, I5 const& i5 ) const
+  {
+    return i5 + shape_type::N5 * (
+           i4 + shape_type::N4 * (
+           i3 + shape_type::N3 * (
+           i2 + shape_type::N2 * ( i1 )))) +
+           i0 * SR ;
+  }
+
+  template < typename I0, typename I1, typename I2, typename I3
+            ,typename I4, typename I5, typename I6 >
+  KOKKOS_FORCEINLINE_FUNCTION
+  size_type operator()( I0 const& i0, I1 const& i1, I2 const& i2 , I3 const& i3, I4 const& i4, I5 const& i5, I6 const& i6 ) const
+  {
+    return i6 + shape_type::N6 * (
+           i5 + shape_type::N5 * (
+           i4 + shape_type::N4 * (
+           i3 + shape_type::N3 * (
+           i2 + shape_type::N2 * ( i1 ))))) +
+           i0 * SR ;
+  }
+
+  template < typename I0, typename I1, typename I2, typename I3
+            ,typename I4, typename I5, typename I6, typename I7 >
+  KOKKOS_FORCEINLINE_FUNCTION
+  size_type operator()( I0 const& i0, I1 const& i1, I2 const& i2 , I3 const& i3, I4 const& i4, I5 const& i5, I6 const& i6, I7 const& i7 ) const
+  {
+    return i7 + shape_type::N7 * (
+           i6 + shape_type::N6 * (
+           i5 + shape_type::N5 * (
+           i4 + shape_type::N4 * (
+           i3 + shape_type::N3 * (
+           i2 + shape_type::N2 * ( i1 )))))) +
+           i0 * SR ;
+  }
+};
+
+//----------------------------------------------------------------------------
+// LayoutStride : every dimension's stride is stored explicitly
+template < class ShapeType >
+struct ViewOffset< ShapeType , LayoutStride
+                 , typename enable_if<( 0 < ShapeType::rank )>::type >
+  : public ShapeType
+{
+  typedef size_t        size_type;
+  typedef ShapeType     shape_type;
+  typedef LayoutStride  array_layout ;
+
+  size_type S[ shape_type::rank + 1 ]; // S[k] = stride of dimension k ; S[rank] = total length
+
+  template< class SType , class L >
+  KOKKOS_INLINE_FUNCTION
+  bool assign_subview( const ViewOffset<SType,L,void> & rhs
+                     , const size_type n0
+                     , const size_type n1
+                     , const size_type n2
+                     , const size_type n3
+                     , const size_type n4
+                     , const size_type n5
+                     , const size_type n6
+                     , const size_type n7
+                     )
+    {
+      shape_type::assign( *this, 0,0,0,0, 0,0,0,0 );
+
+      for ( int i = 0 ; i < int(shape_type::rank+1) ; ++i ) { S[i] = 0 ; }
+
+      // preconditions:
+      //  shape_type::rank <= rhs.rank
+      //  shape_type::rank == count of nonzero( rhs_dim[i] )
+      size_type dim[8] = { n0 , n1 , n2 , n3 , n4 , n5 , n6 , n7 };
+      size_type str[ SType::rank + 1 ];
+
+      rhs.stride( str );
+
+      // contract the zero-dimensions
+      int r = 0 ; // count of non-zero dimensions kept so far
+      for ( int i = 0 ; i < int(SType::rank) ; ++i ) {
+        if ( 0 != dim[i] ) {
+          dim[r] = dim[i] ;
+          str[r] = str[i] ;
+          ++r ;
+        }
+      }
+
+      if ( int(shape_type::rank) == r ) {
+        // The shape is non-zero
+        for ( int i = 0 ; i < int(shape_type::rank) ; ++i ) {
+          const size_type cap = dim[i] * ( S[i] = str[i] );
+          if ( S[ shape_type::rank ] < cap ) S[ shape_type::rank ] = cap ; // total length = largest extent*stride
+        }
+        // set the contracted nonzero dimensions
+        shape_type::assign( *this, dim[0], dim[1], dim[2], dim[3], dim[4], dim[5], dim[6], dim[7] );
+      }
+
+      return true ; // definitely noncontiguous
+    }
+
+  template< unsigned R >
+  KOKKOS_INLINE_FUNCTION
+  void assign( size_t n )
+    { assign_shape_dimension<R>( *this , n ); }
+
+  template< class ShapeRHS , class Layout >
+  KOKKOS_INLINE_FUNCTION
+  void assign( const ViewOffset<ShapeRHS,Layout> & rhs
+             , typename enable_if<( int(ShapeRHS::rank) == int(shape_type::rank) )>::type * = 0 )
+    {
+      rhs.stride(S);
+      shape_type::assign( *this, rhs.N0, rhs.N1, rhs.N2, rhs.N3, rhs.N4, rhs.N5, rhs.N6, rhs.N7 );
+    }
+
+  KOKKOS_INLINE_FUNCTION
+  void assign( const LayoutStride & layout )
+  {
+    size_type max = 0 ; // largest dimension*stride seen; becomes the total length
+    for ( int i = 0 ; i < shape_type::rank ; ++i ) {
+      S[i] = layout.stride[i] ;
+      const size_type m = layout.dimension[i] * S[i] ;
+      if ( max < m ) { max = m ; }
+    }
+    S[ shape_type::rank ] = max ;
+    shape_type::assign( *this, layout.dimension[0], layout.dimension[1],
+                               layout.dimension[2], layout.dimension[3],
+                               layout.dimension[4], layout.dimension[5],
+                               layout.dimension[6], layout.dimension[7] );
+  }
+
+  KOKKOS_INLINE_FUNCTION
+  void assign( size_t s0 , size_t s1 , size_t s2 , size_t s3
+             , size_t s4 , size_t s5 , size_t s6 , size_t s7
+             , size_t s8 )
+    {
+      const size_t str[9] = { s0, s1, s2, s3, s4, s5, s6, s7, s8 };
+
+      // Last argument is the total length.
+      // Total length must be non-zero.
+      // All strides must be non-zero and less than total length.
+      bool ok = 0 < str[ shape_type::rank ] ;
+
+      for ( int i = 0 ; ( i < shape_type::rank ) &&
+                        ( ok = 0 < str[i] && str[i] < str[ shape_type::rank ] ); ++i );
+
+      if ( ok ) {
+        size_t dim[8] = { 1,1,1,1,1,1,1,1 };
+        int iorder[9] = { 0,0,0,0,0,0,0,0,0 };
+
+        // Ordering of strides smallest to largest.
+        for ( int i = 1 ; i < shape_type::rank ; ++i ) {
+          int j = i ;
+          for ( ; 0 < j && str[i] < str[ iorder[j-1] ] ; --j ) {
+            iorder[j] = iorder[j-1] ;
+          }
+          iorder[j] = i ;
+        }
+
+        // Last argument is the total length.
+        iorder[ shape_type::rank ] = shape_type::rank ;
+
+        // Determine dimension associated with each stride.
+        // Guarantees non-overlap by truncating dimension
+        // if ( 0 != str[ iorder[i+1] ] % str[ iorder[i] ] )
+        for ( int i = 0 ; i < shape_type::rank ; ++i ) {
+          dim[ iorder[i] ] = str[ iorder[i+1] ] / str[ iorder[i] ] ;
+        }
+
+        // Assign dimensions and strides:
+        shape_type::assign( *this, dim[0], dim[1], dim[2], dim[3], dim[4], dim[5], dim[6], dim[7] );
+        for ( int i = 0 ; i <= shape_type::rank ; ++i ) { S[i] = str[i] ; }
+      }
+      else {
+        shape_type::assign(*this,0,0,0,0,0,0,0,0);
+        for ( int i = 0 ; i <= shape_type::rank ; ++i ) { S[i] = 0 ; }
+      }
+    }
+
+  KOKKOS_INLINE_FUNCTION
+  void set_padding() {}
+
+  KOKKOS_INLINE_FUNCTION // widened to size_type first, as in the other ViewOffset specializations
+  size_type cardinality() const
+    { return size_type(shape_type::N0) * shape_type::N1 * shape_type::N2 * shape_type::N3 * shape_type::N4 * shape_type::N5 * shape_type::N6 * shape_type::N7 ; }
+
+  KOKKOS_INLINE_FUNCTION
+  size_type capacity() const { return S[ shape_type::rank ]; }
+
+  template< typename iType >
+  KOKKOS_INLINE_FUNCTION
+  void stride( iType * const s ) const
+    { for ( int i = 0 ; i <= shape_type::rank ; ++i ) { s[i] = S[i] ; } }
+
+  KOKKOS_INLINE_FUNCTION
+  size_type stride_0() const { return S[0] ; }
+
+  KOKKOS_INLINE_FUNCTION
+  size_type stride_1() const { return S[1] ; }
+
+  KOKKOS_INLINE_FUNCTION
+  size_type stride_2() const { return S[2] ; }
+
+  KOKKOS_INLINE_FUNCTION
+  size_type stride_3() const { return S[3] ; }
+
+  KOKKOS_INLINE_FUNCTION
+  size_type stride_4() const { return S[4] ; }
+
+  KOKKOS_INLINE_FUNCTION
+  size_type stride_5() const { return S[5] ; }
+
+  KOKKOS_INLINE_FUNCTION
+  size_type stride_6() const { return S[6] ; }
+
+  KOKKOS_INLINE_FUNCTION
+  size_type stride_7() const { return S[7] ; }
+
+  // rank 1 : each overload is enabled only for its own rank and an integral I0
+  template <typename I0 >
+  KOKKOS_FORCEINLINE_FUNCTION
+  typename std::enable_if< (std::is_integral<I0>::value) && (shape_type::rank==1),size_type>::type
+    operator()( I0 const& i0) const
+    {
+      return i0 * S[0] ;
+    }
+
+  // rank 2
+  template <typename I0, typename I1>
+  KOKKOS_FORCEINLINE_FUNCTION
+  typename std::enable_if< (std::is_integral<I0>::value) && (shape_type::rank==2),size_type>::type
+    operator()( I0 const& i0, I1 const& i1 ) const
+    {
+      return i0 * S[0] + i1 * S[1] ;
+    }
+
+  template <typename I0, typename I1, typename I2>
+  KOKKOS_FORCEINLINE_FUNCTION
+  typename std::enable_if< (std::is_integral<I0>::value) && (shape_type::rank==3),size_type>::type
+    operator()( I0 const& i0, I1 const& i1, I2 const& i2 ) const
+    {
+      return i0 * S[0] + i1 * S[1] + i2 * S[2] ;
+    }
+
+  template <typename I0, typename I1, typename I2, typename I3>
+  KOKKOS_FORCEINLINE_FUNCTION
+  typename std::enable_if< (std::is_integral<I0>::value) && (shape_type::rank==4),size_type>::type
+    operator()( I0 const& i0, I1 const& i1, I2 const& i2 , I3 const& i3 ) const
+    {
+      return i0 * S[0] + i1 * S[1] + i2 * S[2] + i3 * S[3] ;
+    }
+
+  template < typename I0, typename I1, typename I2, typename I3
+            ,typename I4 >
+  KOKKOS_FORCEINLINE_FUNCTION
+  typename std::enable_if< (std::is_integral<I0>::value) && (shape_type::rank==5),size_type>::type
+    operator()( I0 const& i0, I1 const& i1, I2 const& i2 , I3 const& i3, I4 const& i4 ) const
+    {
+      return i0 * S[0] + i1 * S[1] + i2 * S[2] + i3 * S[3] + i4 * S[4] ;
+    }
+
+  template < typename I0, typename I1, typename I2, typename I3
+            ,typename I4, typename I5 >
+  KOKKOS_FORCEINLINE_FUNCTION
+  typename std::enable_if< (std::is_integral<I0>::value) && (shape_type::rank==6),size_type>::type
+    operator()( I0 const& i0, I1 const& i1, I2 const& i2 , I3 const& i3, I4 const& i4, I5 const& i5 ) const
+    {
+      return i0 * S[0] + i1 * S[1] + i2 * S[2] + i3 * S[3] + i4 * S[4] + i5 * S[5] ;
+    }
+
+  template < typename I0, typename I1, typename I2, typename I3
+            ,typename I4, typename I5, typename I6 >
+  KOKKOS_FORCEINLINE_FUNCTION
+  typename std::enable_if< (std::is_integral<I0>::value) && (shape_type::rank==7),size_type>::type
+    operator()( I0 const& i0, I1 const& i1, I2 const& i2 , I3 const& i3, I4 const& i4, I5 const& i5, I6 const& i6 ) const
+    {
+      return i0 * S[0] + i1 * S[1] + i2 * S[2] + i3 * S[3] + i4 * S[4] + i5 * S[5] + i6 * S[6] ;
+    }
+
+  template < typename I0, typename I1, typename I2, typename I3
+            ,typename I4, typename I5, typename I6, typename I7 >
+  KOKKOS_FORCEINLINE_FUNCTION
+  typename std::enable_if< (std::is_integral<I0>::value) && (shape_type::rank==8),size_type>::type
+    operator()( I0 const& i0, I1 const& i1, I2 const& i2 , I3 const& i3, I4 const& i4, I5 const& i5, I6 const& i6, I7 const& i7 ) const
+    {
+      return i0 * S[0] + i1 * S[1] + i2 * S[2] + i3 * S[3] + i4 * S[4] + i5 * S[5] + i6 * S[6] + i7 * S[7] ;
+    }
+};
+
+//----------------------------------------------------------------------------
+
+template< class T >
+struct ViewOffsetRange {
+  // General case: T is a single index, not a range; it must be an integral type.
+  enum { OK_integral_type = Impl::StaticAssert< Impl::is_integral<T>::value >::value };
+
+  enum { is_range = false };
+
+  // A single index contributes no extent: the dimension is always zero.
+  KOKKOS_INLINE_FUNCTION static size_t dimension( size_t const , T const & ) { return 0 ; }
+
+  // The index value itself is the starting offset.
+  KOKKOS_INLINE_FUNCTION static size_t begin( T const & i ) { return size_t(i) ; }
+};
+
+// A void argument is not a range; only the flag is provided.
+template<> struct ViewOffsetRange<void> {
+  enum { is_range = false };
+};
+
+template<>
+struct ViewOffsetRange< Kokkos::ALL > {
+  enum { is_range = true };
+
+  // ALL keeps the entire source extent n.
+  KOKKOS_INLINE_FUNCTION static size_t dimension( size_t const n , ALL const & ) { return n ; }
+
+  // ALL always starts at offset zero.
+  KOKKOS_INLINE_FUNCTION static size_t begin( ALL const & ) { return 0 ; }
+};
+
+template< typename iType >
+struct ViewOffsetRange< std::pair<iType,iType> > {
+  // The bounds of the range must be of an integral type.
+  enum { OK_integral_type = Impl::StaticAssert< Impl::is_integral<iType>::value >::value };
+  enum { is_range = true };
+
+  // Length of [first,second) within a source extent n; zero when the
+  // range is empty, reversed, or ends past n.
+  KOKKOS_INLINE_FUNCTION static
+  size_t dimension( size_t const n , std::pair<iType,iType> const & r )
+    { const size_t lo = size_t(r.first) , hi = size_t(r.second) ; return ( lo < hi && hi <= n ) ? hi - lo : 0 ; }
+
+  KOKKOS_INLINE_FUNCTION static size_t begin( std::pair<iType,iType> const & r ) { return size_t(r.first) ; }
+};
+
+template< typename iType >
+struct ViewOffsetRange< Kokkos::pair<iType,iType> > {
+  // The bounds of the range must be of an integral type.
+  enum { OK_integral_type = Impl::StaticAssert< Impl::is_integral<iType>::value >::value };
+  enum { is_range = true };
+
+  // Length of [first,second) within a source extent n; zero when the
+  // range is empty, reversed, or ends past n.
+  KOKKOS_INLINE_FUNCTION static
+  size_t dimension( size_t const n , Kokkos::pair<iType,iType> const & r )
+    { const size_t lo = size_t(r.first) , hi = size_t(r.second) ; return ( lo < hi && hi <= n ) ? hi - lo : 0 ; }
+
+  KOKKOS_INLINE_FUNCTION static size_t begin( Kokkos::pair<iType,iType> const & r ) { return size_t(r.first) ; }
+};
+
+}} // namespace Kokkos::Impl
+
+#endif //KOKKOS_VIEWOFFSET_HPP
+
diff --git a/lib/kokkos/core/src/impl/Kokkos_ViewSupport.hpp b/lib/kokkos/core/src/impl/Kokkos_ViewSupport.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..8b63039f57000e9d3b0ffa2aaad5a0c3c94d27c4
--- /dev/null
+++ b/lib/kokkos/core/src/impl/Kokkos_ViewSupport.hpp
@@ -0,0 +1,393 @@
+/*
+//@HEADER
+// ************************************************************************
+// 
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+// 
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+// 
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact  H. Carter Edwards (hcedwar@sandia.gov)
+// 
+// ************************************************************************
+//@HEADER
+*/
+
+#ifndef KOKKOS_VIEWSUPPORT_HPP
+#define KOKKOS_VIEWSUPPORT_HPP
+
+#include <algorithm>
+#include <Kokkos_ExecPolicy.hpp>
+#include <impl/Kokkos_Shape.hpp>
+
+//----------------------------------------------------------------------------
+//----------------------------------------------------------------------------
+
+namespace Kokkos {
+namespace Impl {
+
+/** \brief  Evaluate if LHS = RHS view assignment is allowed. */
+template< class ViewLHS , class ViewRHS >
+struct ViewAssignable
+{
+  // Same memory space.
+  // Same value type, or the LHS value type is the RHS const_value_type
+  // (i.e. a compatible 'const' qualifier).
+  // Cannot assign managed = unmanaged
+  enum { assignable_value =
+    ( is_same< typename ViewLHS::value_type ,
+               typename ViewRHS::value_type >::value
+      ||
+      is_same< typename ViewLHS::value_type ,
+               typename ViewRHS::const_value_type >::value )
+    &&
+    is_same< typename ViewLHS::memory_space ,
+             typename ViewRHS::memory_space >::value
+    &&
+    ( ! ( ViewLHS::is_managed && ! ViewRHS::is_managed ) )
+  };
+
+  enum { assignable_shape =
+    // Compatible shape and matching layout:
+    ( ShapeCompatible< typename ViewLHS::shape_type ,
+                       typename ViewRHS::shape_type >::value
+      &&
+      is_same< typename ViewLHS::array_layout ,
+               typename ViewRHS::array_layout >::value )
+    ||
+    // Matching layout, same rank, and LHS dynamic rank
+    ( is_same< typename ViewLHS::array_layout ,
+               typename ViewRHS::array_layout >::value
+      &&
+      int(ViewLHS::rank) == int(ViewRHS::rank)
+      &&
+      int(ViewLHS::rank) == int(ViewLHS::rank_dynamic) )
+    ||
+    // Both rank-0, any shape and layout
+    ( int(ViewLHS::rank) == 0 && int(ViewRHS::rank) == 0 )
+    ||
+    // Both rank-1 and LHS is dynamic rank-1, any shape and layout
+    ( int(ViewLHS::rank) == 1 && int(ViewRHS::rank) == 1 &&
+      int(ViewLHS::rank_dynamic) == 1 )
+    };
+
+  enum { value = assignable_value && assignable_shape }; // both the value and the shape test must pass
+};
+
+} // namespace Impl
+} // namespace Kokkos
+
+//----------------------------------------------------------------------------
+//----------------------------------------------------------------------------
+
+namespace Kokkos {
+namespace Impl {
+
+template< class ExecSpace , class Type , bool Initialize >
+struct ViewDefaultConstruct // primary template: the constructor ignores its arguments and does nothing
+{ ViewDefaultConstruct( Type * , size_t ) {} };
+
+} // namespace Impl
+} // namespace Kokkos
+
+//----------------------------------------------------------------------------
+//----------------------------------------------------------------------------
+
+namespace Kokkos {
+namespace Impl {
+
+template< class OutputView , class InputView  , unsigned Rank = OutputView::Rank >
+struct ViewRemap // functor copying 'input' into 'output' over their common index range
+{
+  typedef typename OutputView::size_type   size_type ;
+
+  const OutputView output ;
+  const InputView  input ;
+  const size_type n0 ; // n0..n7: per-dimension minimum of the output and input extents
+  const size_type n1 ;
+  const size_type n2 ;
+  const size_type n3 ;
+  const size_type n4 ;
+  const size_type n5 ;
+  const size_type n6 ;
+  const size_type n7 ;
+
+  ViewRemap( const OutputView & arg_out , const InputView & arg_in )
+    : output( arg_out ), input( arg_in )
+    , n0( std::min( (size_t)arg_out.dimension_0() , (size_t)arg_in.dimension_0() ) )
+    , n1( std::min( (size_t)arg_out.dimension_1() , (size_t)arg_in.dimension_1() ) )
+    , n2( std::min( (size_t)arg_out.dimension_2() , (size_t)arg_in.dimension_2() ) )
+    , n3( std::min( (size_t)arg_out.dimension_3() , (size_t)arg_in.dimension_3() ) )
+    , n4( std::min( (size_t)arg_out.dimension_4() , (size_t)arg_in.dimension_4() ) )
+    , n5( std::min( (size_t)arg_out.dimension_5() , (size_t)arg_in.dimension_5() ) )
+    , n6( std::min( (size_t)arg_out.dimension_6() , (size_t)arg_in.dimension_6() ) )
+    , n7( std::min( (size_t)arg_out.dimension_7() , (size_t)arg_in.dimension_7() ) )
+    { // the copy is launched from the constructor, on the output view's execution space
+      typedef typename OutputView::execution_space execution_space ;
+      Kokkos::RangePolicy< execution_space > range( 0 , n0 );
+      parallel_for( range , *this ); // NOTE(review): unlike ViewFill, no fence follows - confirm callers synchronize
+    }
+
+  KOKKOS_INLINE_FUNCTION
+  void operator()( const size_type i0 ) const // copies every (i1..i7) entry for the given i0
+  {
+    for ( size_type i1 = 0 ; i1 < n1 ; ++i1 ) {
+    for ( size_type i2 = 0 ; i2 < n2 ; ++i2 ) {
+    for ( size_type i3 = 0 ; i3 < n3 ; ++i3 ) {
+    for ( size_type i4 = 0 ; i4 < n4 ; ++i4 ) {
+    for ( size_type i5 = 0 ; i5 < n5 ; ++i5 ) {
+    for ( size_type i6 = 0 ; i6 < n6 ; ++i6 ) {
+    for ( size_type i7 = 0 ; i7 < n7 ; ++i7 ) {
+      output.at(i0,i1,i2,i3,i4,i5,i6,i7) = input.at(i0,i1,i2,i3,i4,i5,i6,i7);
+    }}}}}}}
+  }
+};
+
+template< class OutputView , class InputView  >
+struct ViewRemap< OutputView ,  InputView , 0 > // rank-0 specialization
+{
+  typedef typename OutputView::value_type   value_type ;
+  typedef typename OutputView::memory_space dst_space ;
+  typedef typename InputView ::memory_space src_space ;
+
+  ViewRemap( const OutputView & arg_out , const InputView & arg_in )
+  { // copies sizeof(value_type) bytes from the input pointer to the output pointer
+    DeepCopy< dst_space , src_space >( arg_out.ptr_on_device() ,
+                                       arg_in.ptr_on_device() ,
+                                       sizeof(value_type) );
+  }
+};
+
+//----------------------------------------------------------------------------
+
+template< class ExecSpace , class Type >
+struct ViewDefaultConstruct< ExecSpace , Type , true > // Initialize == true: fills the allocation
+{
+  Type * const m_ptr ;
+
+  KOKKOS_FORCEINLINE_FUNCTION
+  void operator()( const typename ExecSpace::size_type& i ) const
+    { m_ptr[i] = Type(); } // assigns a value-initialized Type to element i
+
+  ViewDefaultConstruct( Type * pointer , size_t capacity )
+    : m_ptr( pointer )
+    { // runs operator() over the range (0,capacity) on ExecSpace, then fences
+      Kokkos::RangePolicy< ExecSpace > range( 0 , capacity );
+      parallel_for( range , *this );
+      ExecSpace::fence();
+    }
+};
+
+template< class OutputView , unsigned Rank = OutputView::Rank ,
+          class Enabled = void >
+struct ViewFill // functor assigning one value to every entry of 'output'
+{
+  typedef typename OutputView::const_value_type  const_value_type ;
+  typedef typename OutputView::size_type         size_type ;
+
+  const OutputView output ;
+  const_value_type input ; // the value written to each entry
+
+  ViewFill( const OutputView & arg_out , const_value_type & arg_in )
+    : output( arg_out ), input( arg_in )
+    { // the fill is launched from the constructor and fenced before returning
+      typedef typename OutputView::execution_space execution_space ;
+      Kokkos::RangePolicy< execution_space > range( 0 , output.dimension_0() );
+      parallel_for( range , *this );
+      execution_space::fence();
+    }
+
+  KOKKOS_INLINE_FUNCTION
+  void operator()( const size_type i0 ) const // fills every (i1..i7) entry for the given i0
+  {
+    for ( size_type i1 = 0 ; i1 < output.dimension_1() ; ++i1 ) {
+    for ( size_type i2 = 0 ; i2 < output.dimension_2() ; ++i2 ) {
+    for ( size_type i3 = 0 ; i3 < output.dimension_3() ; ++i3 ) {
+    for ( size_type i4 = 0 ; i4 < output.dimension_4() ; ++i4 ) {
+    for ( size_type i5 = 0 ; i5 < output.dimension_5() ; ++i5 ) {
+    for ( size_type i6 = 0 ; i6 < output.dimension_6() ; ++i6 ) {
+    for ( size_type i7 = 0 ; i7 < output.dimension_7() ; ++i7 ) {
+      output.at(i0,i1,i2,i3,i4,i5,i6,i7) = input ;
+    }}}}}}}
+  }
+};
+
+template< class OutputView >
+struct ViewFill< OutputView , 0 > // rank-0 specialization
+{
+  typedef typename OutputView::const_value_type  const_value_type ;
+  typedef typename OutputView::memory_space      dst_space ;
+
+  ViewFill( const OutputView & arg_out , const_value_type & arg_in )
+  { // NOTE(review): the source is & arg_in but is copied as dst_space - confirm it is addressable there
+    DeepCopy< dst_space , dst_space >( arg_out.ptr_on_device() , & arg_in ,
+                                       sizeof(const_value_type) );
+  }
+};
+
+} // namespace Impl
+} // namespace Kokkos
+
+//----------------------------------------------------------------------------
+//----------------------------------------------------------------------------
+
+namespace Kokkos {
+
+struct ViewAllocateWithoutInitializing { // allocation tag; ViewAllocProp maps it to Initialize = false
+
+  const std::string label ; // empty when default constructed
+
+  ViewAllocateWithoutInitializing() : label() {}
+  explicit ViewAllocateWithoutInitializing( const std::string & arg_label ) : label( arg_label ) {}
+  explicit ViewAllocateWithoutInitializing( const char * const  arg_label ) : label( arg_label ) {}
+};
+
+struct ViewAllocate { // allocation tag; ViewAllocProp maps it to Initialize = true
+
+  const std::string  label ; // empty when default constructed
+
+  ViewAllocate() : label() {}
+  ViewAllocate( const std::string & arg_label ) : label( arg_label ) {} // implicit conversion from a string
+  ViewAllocate( const char * const  arg_label ) : label( arg_label ) {} // implicit conversion from a C string
+};
+
+}
+
+namespace Kokkos {
+namespace Impl {
+
+template< class Traits , class AllocationProperties , class Enable = void >
+struct ViewAllocProp : public Kokkos::Impl::false_type {}; // primary template: not an allocation property
+
+template< class Traits >
+struct ViewAllocProp< Traits , Kokkos::ViewAllocate
+  , typename Kokkos::Impl::enable_if<(
+      Traits::is_managed && ! Kokkos::Impl::is_const< typename Traits::value_type >::value
+    )>::type > // enabled only for managed traits with a non-const value_type
+  : public Kokkos::Impl::true_type
+{
+  typedef size_t               size_type ;
+  typedef const ViewAllocate & property_type ;
+
+  enum { Initialize = true };
+  enum { AllowPadding = false };
+
+  inline
+  static const std::string & label( property_type p ) { return p.label ; } // refers to the tag's own label
+};
+
+template< class Traits >
+struct ViewAllocProp< Traits , std::string
+  , typename Kokkos::Impl::enable_if<(
+      Traits::is_managed && ! Kokkos::Impl::is_const< typename Traits::value_type >::value
+    )>::type > // enabled only for managed traits with a non-const value_type
+  : public Kokkos::Impl::true_type
+{
+  typedef size_t              size_type ;
+  typedef const std::string & property_type ;
+
+  enum { Initialize = true };
+  enum { AllowPadding = false };
+
+  inline
+  static const std::string & label( property_type s ) { return s ; } // the string itself is the label
+};
+
+template< class Traits , unsigned N >
+struct ViewAllocProp< Traits , char[N]
+  , typename Kokkos::Impl::enable_if<(
+      Traits::is_managed && ! Kokkos::Impl::is_const< typename Traits::value_type >::value
+    )>::type > // enabled only for managed traits with a non-const value_type
+  : public Kokkos::Impl::true_type
+{
+private:
+  typedef char label_type[N] ; // array type of the label argument
+public:
+
+  typedef size_t             size_type ;
+  typedef const label_type & property_type ;
+
+  enum { Initialize = true };
+  enum { AllowPadding = false };
+
+  inline
+  static std::string label( property_type s ) { return std::string(s) ; } // returns a std::string copy
+};
+
+template< class Traits >
+struct ViewAllocProp< Traits , Kokkos::ViewAllocateWithoutInitializing
+  , typename Kokkos::Impl::enable_if<(
+      Traits::is_managed && ! Kokkos::Impl::is_const< typename Traits::value_type >::value
+    )>::type > // enabled only for managed traits with a non-const value_type
+  : public Kokkos::Impl::true_type
+{
+  typedef size_t size_type ;
+  typedef const Kokkos::ViewAllocateWithoutInitializing & property_type ;
+
+  enum { Initialize = false }; // the only specialization here with Initialize = false
+  enum { AllowPadding = false };
+
+  inline
+  static std::string label( property_type s ) { return s.label ; } // returns a copy of the tag's label
+};
+
+} // namespace Impl
+} // namespace Kokkos
+
+//----------------------------------------------------------------------------
+//----------------------------------------------------------------------------
+
+namespace Kokkos {
+namespace Impl {
+
+template< class Traits , class PointerProperties , class Enable = void >
+struct ViewRawPointerProp : public Kokkos::Impl::false_type {}; // primary template: not a raw-pointer property
+
+template< class Traits , typename T >
+struct ViewRawPointerProp< Traits , T ,
+  typename Kokkos::Impl::enable_if<(
+    Impl::is_same< T , typename Traits::value_type >::value ||
+    Impl::is_same< T , typename Traits::non_const_value_type >::value
+  )>::type > // enabled when T is the traits' value_type or non_const_value_type
+  : public Kokkos::Impl::true_type
+{
+  typedef size_t size_type ;
+};
+
+} // namespace Impl
+} // namespace Kokkos
+
+//----------------------------------------------------------------------------
+//----------------------------------------------------------------------------
+
+#endif /* #ifndef KOKKOS_VIEWSUPPORT_HPP */
+
+
diff --git a/lib/kokkos/core/src/impl/Kokkos_ViewTileLeft.hpp b/lib/kokkos/core/src/impl/Kokkos_ViewTileLeft.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..61d2e35702f998a83e0796e7d291dff7e3466dd4
--- /dev/null
+++ b/lib/kokkos/core/src/impl/Kokkos_ViewTileLeft.hpp
@@ -0,0 +1,56 @@
+/*
+//@HEADER
+// ************************************************************************
+// 
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+// 
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+// 
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact  H. Carter Edwards (hcedwar@sandia.gov)
+// 
+// ************************************************************************
+//@HEADER
+*/
+
+#ifndef KOKKOS_VIEWTILELEFT_HPP
+#define KOKKOS_VIEWTILELEFT_HPP
+
+#include <impl/KokkosExp_ViewTile.hpp>
+
+namespace Kokkos {
+
+using Kokkos::Experimental::tile_subview ; // makes Experimental::tile_subview visible as Kokkos::tile_subview
+
+}
+
+#endif /* #ifndef KOKKOS_VIEWTILELEFT_HPP */
+
diff --git a/lib/kokkos/core/src/impl/Kokkos_Volatile_Load.hpp b/lib/kokkos/core/src/impl/Kokkos_Volatile_Load.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..420ee63891e6ddb0995ad7bbbcfba2f0548c2bd9
--- /dev/null
+++ b/lib/kokkos/core/src/impl/Kokkos_Volatile_Load.hpp
@@ -0,0 +1,242 @@
+/*
+//@HEADER
+// ************************************************************************
+// 
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+// 
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+// 
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact  H. Carter Edwards (hcedwar@sandia.gov)
+// 
+// ************************************************************************
+//@HEADER
+*/
+
+#if defined( KOKKOS_ATOMIC_HPP ) && ! defined( KOKKOS_VOLATILE_LOAD )
+#define KOKKOS_VOLATILE_LOAD
+
+#if defined( __GNUC__ ) /* GNU C   */ || \
+    defined( __GNUG__ ) /* GNU C++ */ || \
+    defined( __clang__ )
+
+#define KOKKOS_MAY_ALIAS __attribute__((__may_alias__))
+
+#else
+
+#define KOKKOS_MAY_ALIAS
+
+#endif
+
+namespace Kokkos {
+
+//----------------------------------------------------------------------------
+
+template <typename T>
+KOKKOS_FORCEINLINE_FUNCTION
+T volatile_load(T const volatile * const src_ptr)
+{ // Copies *src_ptr into a local T through volatile reads, widest chunks first, and returns it.
+  typedef uint64_t KOKKOS_MAY_ALIAS T64;
+  typedef uint32_t KOKKOS_MAY_ALIAS T32;
+  typedef uint16_t KOKKOS_MAY_ALIAS T16;
+  typedef uint8_t  KOKKOS_MAY_ALIAS T8;
+
+  enum { // number of whole 8/16/32/64-bit units contained in sizeof(T)
+    NUM_8  = sizeof(T),
+    NUM_16 = NUM_8 / 2,
+    NUM_32 = NUM_8 / 4,
+    NUM_64 = NUM_8 / 8
+  };
+
+  union { // the same source address viewed at each chunk width
+    T   const volatile * const ptr;
+    T64 const volatile * const ptr64;
+    T32 const volatile * const ptr32;
+    T16 const volatile * const ptr16;
+    T8  const volatile * const ptr8;
+  } src = {src_ptr};
+
+  T result;
+
+  union { // the address of 'result' viewed at each chunk width
+    T   * const ptr;
+    T64 * const ptr64;
+    T32 * const ptr32;
+    T16 * const ptr16;
+    T8  * const ptr8;
+  } dst = {&result};
+
+  for (int i=0; i < NUM_64; ++i) { // NOTE(review): assumes src_ptr is aligned for 64-bit access - confirm
+    dst.ptr64[i] = src.ptr64[i];
+  }
+
+  if ( NUM_64*2 < NUM_32 ) { // one 32-bit unit remains after the 64-bit words
+    dst.ptr32[NUM_64*2] = src.ptr32[NUM_64*2];
+  }
+
+  if ( NUM_32*2 < NUM_16 ) { // one 16-bit unit remains after the 32-bit units
+    dst.ptr16[NUM_32*2] = src.ptr16[NUM_32*2];
+  }
+
+  if ( NUM_16*2 < NUM_8 ) { // one byte remains after the 16-bit units
+    dst.ptr8[NUM_16*2] = src.ptr8[NUM_16*2];
+  }
+
+  return result;
+}
+
+template <typename T>
+KOKKOS_FORCEINLINE_FUNCTION
+void volatile_store(T volatile * const dst_ptr, T const volatile * const src_ptr)
+{ // Copies *src_ptr to *dst_ptr, widest chunks first; both sides are accessed as volatile.
+  typedef uint64_t KOKKOS_MAY_ALIAS T64;
+  typedef uint32_t KOKKOS_MAY_ALIAS T32;
+  typedef uint16_t KOKKOS_MAY_ALIAS T16;
+  typedef uint8_t  KOKKOS_MAY_ALIAS T8;
+
+  enum { // number of whole 8/16/32/64-bit units contained in sizeof(T)
+    NUM_8  = sizeof(T),
+    NUM_16 = NUM_8 / 2,
+    NUM_32 = NUM_8 / 4,
+    NUM_64 = NUM_8 / 8
+  };
+
+  union { // the same source address viewed at each chunk width
+    T   const volatile * const ptr;
+    T64 const volatile * const ptr64;
+    T32 const volatile * const ptr32;
+    T16 const volatile * const ptr16;
+    T8  const volatile * const ptr8;
+  } src = {src_ptr};
+
+  union { // the same destination address viewed at each chunk width
+    T   volatile * const ptr;
+    T64 volatile * const ptr64;
+    T32 volatile * const ptr32;
+    T16 volatile * const ptr16;
+    T8  volatile * const ptr8;
+  } dst = {dst_ptr};
+
+  for (int i=0; i < NUM_64; ++i) { // NOTE(review): assumes both pointers are aligned for 64-bit access - confirm
+    dst.ptr64[i] = src.ptr64[i];
+  }
+
+  if ( NUM_64*2 < NUM_32 ) { // one 32-bit unit remains after the 64-bit words
+    dst.ptr32[NUM_64*2] = src.ptr32[NUM_64*2];
+  }
+
+  if ( NUM_32*2 < NUM_16 ) { // one 16-bit unit remains after the 32-bit units
+    dst.ptr16[NUM_32*2] = src.ptr16[NUM_32*2];
+  }
+
+  if ( NUM_16*2 < NUM_8 ) { // one byte remains after the 16-bit units
+    dst.ptr8[NUM_16*2] = src.ptr8[NUM_16*2];
+  }
+}
+
+template <typename T>
+KOKKOS_FORCEINLINE_FUNCTION
+void volatile_store(T volatile * const dst_ptr, T const * const src_ptr)
+{ // Copies *src_ptr to *dst_ptr, widest chunks first; only the destination is accessed as volatile.
+  typedef uint64_t KOKKOS_MAY_ALIAS T64;
+  typedef uint32_t KOKKOS_MAY_ALIAS T32;
+  typedef uint16_t KOKKOS_MAY_ALIAS T16;
+  typedef uint8_t  KOKKOS_MAY_ALIAS T8;
+
+  enum { // number of whole 8/16/32/64-bit units contained in sizeof(T)
+    NUM_8  = sizeof(T),
+    NUM_16 = NUM_8 / 2,
+    NUM_32 = NUM_8 / 4,
+    NUM_64 = NUM_8 / 8
+  };
+
+  union { // the same source address viewed at each chunk width (non-volatile)
+    T   const * const ptr;
+    T64 const * const ptr64;
+    T32 const * const ptr32;
+    T16 const * const ptr16;
+    T8  const * const ptr8;
+  } src = {src_ptr};
+
+  union { // the same destination address viewed at each chunk width
+    T   volatile * const ptr;
+    T64 volatile * const ptr64;
+    T32 volatile * const ptr32;
+    T16 volatile * const ptr16;
+    T8  volatile * const ptr8;
+  } dst = {dst_ptr};
+
+  for (int i=0; i < NUM_64; ++i) { // NOTE(review): assumes both pointers are aligned for 64-bit access - confirm
+    dst.ptr64[i] = src.ptr64[i];
+  }
+
+  if ( NUM_64*2 < NUM_32 ) { // one 32-bit unit remains after the 64-bit words
+    dst.ptr32[NUM_64*2] = src.ptr32[NUM_64*2];
+  }
+
+  if ( NUM_32*2 < NUM_16 ) { // one 16-bit unit remains after the 32-bit units
+    dst.ptr16[NUM_32*2] = src.ptr16[NUM_32*2];
+  }
+
+  if ( NUM_16*2 < NUM_8 ) { // one byte remains after the 16-bit units
+    dst.ptr8[NUM_16*2] = src.ptr8[NUM_16*2];
+  }
+}
+
+template <typename T>
+KOKKOS_FORCEINLINE_FUNCTION
+void volatile_store(T volatile * dst_ptr, T const volatile & src)
+{ volatile_store(dst_ptr, &src); } // reference overload: forwards the address of 'src'
+
+template <typename T>
+KOKKOS_FORCEINLINE_FUNCTION
+void volatile_store(T volatile * dst_ptr, T const & src)
+{ volatile_store(dst_ptr, &src); } // reference overload: forwards the address of 'src'
+
+template <typename T>
+KOKKOS_FORCEINLINE_FUNCTION
+T safe_load(T const * const ptr)
+{ // Returns a copy of *ptr; the access method is chosen at compile time.
+#if !defined( __MIC__ )
+  return *ptr; // plain dereference when __MIC__ is not defined
+#else
+  return volatile_load(ptr); // chunked volatile read when __MIC__ is defined
+#endif
+}
+
+} // namespace Kokkos
+
+#undef KOKKOS_MAY_ALIAS
+
+#endif
+
+
+
diff --git a/lib/kokkos/core/src/impl/Kokkos_hwloc.cpp b/lib/kokkos/core/src/impl/Kokkos_hwloc.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..cb561f711c3e3f86b07c8c9f24d96bb39bb3d765
--- /dev/null
+++ b/lib/kokkos/core/src/impl/Kokkos_hwloc.cpp
@@ -0,0 +1,726 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+//
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact  H. Carter Edwards (hcedwar@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#define DEBUG_PRINT 0
+
+#include <iostream>
+#include <sstream>
+#include <algorithm>
+
+#include <Kokkos_Macros.hpp>
+#include <Kokkos_hwloc.hpp>
+#include <impl/Kokkos_Error.hpp>
+
+/*--------------------------------------------------------------------------*/
+/*--------------------------------------------------------------------------*/
+
+namespace Kokkos {
+namespace hwloc {
+
+/* Map threads to (numa,core) coordinates. Return 0 if asynchronous, 1 if synchronous and include process. */
+unsigned thread_mapping( const char * const label ,                      // prefix of the error message
+                         const bool allow_async ,                        // allow keeping the process' core out of the pool
+                         unsigned & thread_count ,                       // in/out: 0 selects a default
+                         unsigned & use_numa_count ,                     // in/out: 0 selects a default
+                         unsigned & use_cores_per_numa ,                 // in/out: 0 selects a default
+                         std::pair<unsigned,unsigned> threads_coord[] )  // out: thread_count (numa,core) entries
+{
+  const bool     hwloc_avail            = Kokkos::hwloc::available();
+  const unsigned avail_numa_count       = hwloc_avail ? hwloc::get_available_numa_count() : 1 ;
+  const unsigned avail_cores_per_numa   = hwloc_avail ? hwloc::get_available_cores_per_numa() : thread_count ;
+  const unsigned avail_threads_per_core = hwloc_avail ? hwloc::get_available_threads_per_core() : 1 ;
+
+  // (numa,core) coordinate of the process:
+  const std::pair<unsigned,unsigned> proc_coord = Kokkos::hwloc::get_this_thread_coordinate();
+
+  //------------------------------------------------------------------------
+  // Defaults for unspecified inputs:
+
+  if ( ! use_numa_count ) {
+    // Default to use all NUMA regions
+    use_numa_count = ! thread_count ? avail_numa_count : (
+                       thread_count < avail_numa_count ? thread_count : avail_numa_count );
+  }
+
+  if ( ! use_cores_per_numa ) {
+    // Default to use all but one core if asynchronous, all cores if synchronous.
+    const unsigned threads_per_numa = use_numa_count ? thread_count / use_numa_count : 0 ; // never divide by zero
+
+    use_cores_per_numa = ! threads_per_numa ? avail_cores_per_numa - ( allow_async ? 1 : 0 ) : (
+                           threads_per_numa < avail_cores_per_numa ? threads_per_numa : avail_cores_per_numa );
+  }
+
+  if ( ! thread_count ) {
+    thread_count = use_numa_count * use_cores_per_numa * avail_threads_per_core ;
+  }
+
+  //------------------------------------------------------------------------
+  // Input verification (throws through Kokkos::Impl::throw_runtime_exception on failure):
+
+  const bool valid_numa      = use_numa_count <= avail_numa_count ;
+  const bool valid_cores     = use_cores_per_numa &&
+                               use_cores_per_numa <= avail_cores_per_numa ;
+  const bool valid_threads   = thread_count &&
+                               thread_count <= use_numa_count * use_cores_per_numa * avail_threads_per_core ;
+  const bool balanced_numa   = ! use_numa_count || ! ( thread_count % use_numa_count ); // a zero divisor fails valid_threads
+  const bool balanced_cores  = ! ( use_numa_count * use_cores_per_numa ) || ! ( thread_count % ( use_numa_count * use_cores_per_numa ) );
+
+  const bool valid_input = valid_numa && valid_cores && valid_threads && balanced_numa && balanced_cores ;
+
+  if ( ! valid_input ) {
+
+    std::ostringstream msg ;
+
+    msg << label << " HWLOC ERROR(s)" ;
+
+    if ( ! valid_threads ) {
+      msg << " : thread_count(" << thread_count
+          << ") exceeds capacity("
+          << use_numa_count * use_cores_per_numa * avail_threads_per_core
+          << ")" ;
+    }
+    if ( ! valid_numa ) {
+      msg << " : use_numa_count(" << use_numa_count
+          << ") exceeds capacity(" << avail_numa_count << ")" ;
+    }
+    if ( ! valid_cores ) {
+      msg << " : use_cores_per_numa(" << use_cores_per_numa
+          << ") exceeds capacity(" << avail_cores_per_numa << ")" ;
+    }
+    if ( ! balanced_numa ) {
+      msg << " : thread_count(" << thread_count
+          << ") imbalanced among numa(" << use_numa_count << ")" ;
+    }
+    if ( ! balanced_cores ) {
+      msg << " : thread_count(" << thread_count
+          << ") imbalanced among cores(" << use_numa_count * use_cores_per_numa << ")" ;
+    }
+
+    Kokkos::Impl::throw_runtime_exception( msg.str() );
+  }
+
+  const unsigned thread_spawn_synchronous =
+    ( allow_async &&
+      1 < thread_count &&
+      ( use_numa_count     < avail_numa_count ||
+        use_cores_per_numa < avail_cores_per_numa ) )
+     ? 0 /* asynchronous */
+     : 1 /* synchronous, threads_coord[0] is process core */ ;
+
+  // Determine binding coordinates for to-be-spawned threads so that
+  // threads may be bound to cores as they are spawned.
+
+  const unsigned threads_per_core = thread_count / ( use_numa_count * use_cores_per_numa ); // divisor is non-zero after validation
+
+  if ( thread_spawn_synchronous ) {
+    // Working synchronously and include process core as threads_coord[0].
+    // Swap the NUMA coordinate of the process core with 0
+    // Swap the CORE coordinate of the process core with 0
+    for ( unsigned i = 0 , inuma = avail_numa_count - use_numa_count ; inuma < avail_numa_count ; ++inuma ) {
+      const unsigned numa_coord = 0 == inuma ? proc_coord.first : ( proc_coord.first == inuma ? 0 : inuma );
+      for ( unsigned icore = avail_cores_per_numa - use_cores_per_numa ; icore < avail_cores_per_numa ; ++icore ) {
+        const unsigned core_coord = 0 == icore ? proc_coord.second : ( proc_coord.second == icore ? 0 : icore );
+        for ( unsigned ith = 0 ; ith < threads_per_core ; ++ith , ++i ) {
+          threads_coord[i].first  = numa_coord ;
+          threads_coord[i].second = core_coord ;
+        }
+      }
+    }
+  }
+  else if ( use_numa_count < avail_numa_count ) {
+    // Working asynchronously and omit the process' NUMA region from the pool.
+    // Swap the NUMA coordinate of the process core with ( ( avail_numa_count - use_numa_count ) - 1 )
+    const unsigned numa_coord_swap = ( avail_numa_count - use_numa_count ) - 1 ;
+    for ( unsigned i = 0 , inuma = avail_numa_count - use_numa_count ; inuma < avail_numa_count ; ++inuma ) {
+      const unsigned numa_coord = proc_coord.first == inuma ? numa_coord_swap : inuma ;
+      for ( unsigned icore = avail_cores_per_numa - use_cores_per_numa ; icore < avail_cores_per_numa ; ++icore ) {
+        const unsigned core_coord = icore ;
+        for ( unsigned ith = 0 ; ith < threads_per_core ; ++ith , ++i ) {
+          threads_coord[i].first  = numa_coord ;
+          threads_coord[i].second = core_coord ;
+        }
+      }
+    }
+  }
+  else if ( use_cores_per_numa < avail_cores_per_numa ) {
+    // Working asynchronously and omit the process' core from the pool.
+    // Swap the CORE coordinate of the process core with ( ( avail_cores_per_numa - use_cores_per_numa ) - 1 )
+    const unsigned core_coord_swap = ( avail_cores_per_numa - use_cores_per_numa ) - 1 ;
+    for ( unsigned i = 0 , inuma = avail_numa_count - use_numa_count ; inuma < avail_numa_count ; ++inuma ) {
+      const unsigned numa_coord = inuma ;
+      for ( unsigned icore = avail_cores_per_numa - use_cores_per_numa ; icore < avail_cores_per_numa ; ++icore ) {
+        const unsigned core_coord = proc_coord.second == icore ? core_coord_swap : icore ;
+        for ( unsigned ith = 0 ; ith < threads_per_core ; ++ith , ++i ) {
+          threads_coord[i].first  = numa_coord ;
+          threads_coord[i].second = core_coord ;
+        }
+      }
+    }
+  }
+
+  return thread_spawn_synchronous ;
+}
+
+} /* namespace hwloc */
+} /* namespace Kokkos */
+
+/*--------------------------------------------------------------------------*/
+/*--------------------------------------------------------------------------*/
+
+#if defined( KOKKOS_HAVE_HWLOC )
+
+#include <iostream>
+#include <sstream>
+#include <stdexcept>
+
+/*--------------------------------------------------------------------------*/
+/* Third Party Libraries */
+
+/* Hardware locality library: http://www.open-mpi.org/projects/hwloc/ */
+#include <hwloc.h>
+
+#define  REQUIRED_HWLOC_API_VERSION  0x000010300 /* minimum accepted API version; the #error below names it as 1.3 */
+/* Refuse to compile against an hwloc whose reported API version is older than the minimum. */
+#if HWLOC_API_VERSION < REQUIRED_HWLOC_API_VERSION
+#error "Requires  http://www.open-mpi.org/projects/hwloc/  Version 1.3 or greater"
+#endif
+
+/*--------------------------------------------------------------------------*/
+
+namespace Kokkos {
+namespace hwloc {
+namespace {
+
+#if DEBUG_PRINT
+
+// Debug helper: write the index of every set bit of 'bitmap' to 's' as "{ a b c }".
+inline void print_bitmap( std::ostream & s , const hwloc_const_bitmap_t bitmap )
+{
+  s << "{" ;
+  int bit = hwloc_bitmap_first( bitmap );
+  for ( ; bit != -1 ; bit = hwloc_bitmap_next( bitmap , bit ) ) {
+    s << " " << bit ;
+  }
+  s << " }" ;
+}
+
+#endif
+
+enum { MAX_CORE = 1024 }; // number of entries in the 's_core' lookup table
+// File-local hwloc state populated by Sentinel's constructor.
+std::pair<unsigned,unsigned> s_core_topology(0,0); // ( usable root/NUMA count , cores per root )
+unsigned                     s_core_capacity(0); // processing units per core
+hwloc_topology_t             s_hwloc_topology(0); // hwloc topology handle; 0 while not loaded
+hwloc_bitmap_t               s_hwloc_location(0); // pre-allocated scratch bitmap for location queries
+hwloc_bitmap_t               s_process_binding(0); // cpuset the process is bound to
+hwloc_bitmap_t               s_core[ MAX_CORE ]; // ( root , core ) coordinate -> cpuset of that core
+bool                         s_can_bind_threads(true); // cleared when the process binding cannot be detected
+
+struct Sentinel {
+  ~Sentinel(); // releases the hwloc handles and zeroes the cached state
+  Sentinel(); // loads the hwloc topology and fills the cached state
+};
+
+bool sentinel()
+{
+  // The first call constructs 'self', which loads the topology; its destructor nulls the handle again.
+  static Sentinel self ;
+  const bool topology_alive = ( 0 != s_hwloc_topology );
+  if ( ! topology_alive ) {
+    std::cerr << "Kokkos::hwloc ERROR : Called after return from main()" << std::endl ;
+    std::cerr.flush();
+  }
+  return topology_alive ;
+}
+
+Sentinel::~Sentinel()
+{
+  // Release the hwloc handles acquired by the constructor.
+  hwloc_topology_destroy( s_hwloc_topology );
+  hwloc_bitmap_free( s_process_binding );
+  hwloc_bitmap_free( s_hwloc_location );
+  // Zero the cached state; a null topology is what sentinel() reports as an error.
+  s_core_topology   = std::pair<unsigned,unsigned>(0,0);
+  s_core_capacity   = 0 ;
+  s_hwloc_topology  = 0 ;
+  s_hwloc_location  = 0 ;
+  s_process_binding = 0 ;
+}
+
+Sentinel::Sentinel()
+{
+#if defined(__MIC__)
+  static const bool remove_core_0 = true ;
+#else
+  static const bool remove_core_0 = false ;
+#endif
+  // Load the hwloc topology and cache the cores usable by this process in the file-local state.
+  s_core_topology   = std::pair<unsigned,unsigned>(0,0);
+  s_core_capacity   = 0 ;
+  s_hwloc_topology  = 0 ;
+  s_hwloc_location  = 0 ;
+  s_process_binding = 0 ;
+
+  for ( unsigned i = 0 ; i < MAX_CORE ; ++i ) s_core[i] = 0 ;
+
+  hwloc_topology_init( & s_hwloc_topology );
+  hwloc_topology_load( s_hwloc_topology );
+
+  s_hwloc_location  = hwloc_bitmap_alloc();
+  s_process_binding = hwloc_bitmap_alloc();
+
+  hwloc_get_cpubind( s_hwloc_topology , s_process_binding ,  HWLOC_CPUBIND_PROCESS );
+
+  if ( hwloc_bitmap_iszero( s_process_binding ) ) {
+    std::cerr << "WARNING: Cannot detect process binding -- ASSUMING ALL processing units" << std::endl;
+    const int pu_depth = hwloc_get_type_depth( s_hwloc_topology, HWLOC_OBJ_PU );
+    int num_pu = 1;
+    if ( pu_depth != HWLOC_TYPE_DEPTH_UNKNOWN ) {
+      num_pu = hwloc_get_nbobjs_by_depth( s_hwloc_topology, pu_depth );
+    }
+    else {
+      std::cerr << "WARNING: Cannot detect number of processing units -- ASSUMING 1 (serial)." << std::endl;
+      num_pu = 1;
+    }
+    hwloc_bitmap_set_range( s_process_binding, 0, num_pu-1);
+    s_can_bind_threads = false;
+  }
+
+  // Only when compiled with __MIC__ defined: try to move the process off of core #0.
+  if ( remove_core_0 ) {
+
+    const hwloc_obj_t core = hwloc_get_obj_by_type( s_hwloc_topology , HWLOC_OBJ_CORE , 0 );
+    // The lookup may yield a null object; skip the move instead of dereferencing it.
+    if ( core && hwloc_bitmap_intersects( s_process_binding , core->allowed_cpuset ) ) {
+
+      hwloc_bitmap_t s_process_no_core_zero = hwloc_bitmap_alloc();
+
+      hwloc_bitmap_andnot( s_process_no_core_zero , s_process_binding , core->allowed_cpuset );
+
+      bool ok = 0 == hwloc_set_cpubind( s_hwloc_topology ,
+                                        s_process_no_core_zero ,
+                                        HWLOC_CPUBIND_PROCESS | HWLOC_CPUBIND_STRICT );
+
+      if ( ok ) {
+        hwloc_get_cpubind( s_hwloc_topology , s_process_binding ,  HWLOC_CPUBIND_PROCESS );
+
+        ok = 0 != hwloc_bitmap_isequal( s_process_binding , s_process_no_core_zero );
+      }
+
+      hwloc_bitmap_free( s_process_no_core_zero );
+
+      if ( ! ok ) {
+        std::cerr << "WARNING: Kokkos::hwloc attempted and failed to move process off of core #0" << std::endl ;
+      }
+    }
+  }
+
+  // Choose a hwloc object type for the NUMA level, which may not exist.
+
+  hwloc_obj_type_t root_type = HWLOC_OBJ_TYPE_MAX ;
+
+  {
+    // Object types to search, in order.
+    static const hwloc_obj_type_t candidate_root_type[] =
+      { HWLOC_OBJ_NODE     /* NUMA region     */
+      , HWLOC_OBJ_SOCKET   /* hardware socket */
+      , HWLOC_OBJ_MACHINE  /* local machine   */
+      };
+
+    enum { CANDIDATE_ROOT_TYPE_COUNT =
+             sizeof(candidate_root_type) / sizeof(hwloc_obj_type_t) };
+
+    for ( int k = 0 ; k < CANDIDATE_ROOT_TYPE_COUNT && HWLOC_OBJ_TYPE_MAX == root_type ; ++k ) {
+      if ( 0 < hwloc_get_nbobjs_by_type( s_hwloc_topology , candidate_root_type[k] ) ) {
+        root_type = candidate_root_type[k] ;
+      }
+    }
+  }
+
+  // Determine which of these 'root' types are available to this process.
+  // The process may have been bound (e.g., by MPI) to a subset of these root types.
+  // Determine current location of the master (calling) process.
+
+  hwloc_bitmap_t proc_cpuset_location = hwloc_bitmap_alloc();
+
+  hwloc_get_last_cpu_location( s_hwloc_topology , proc_cpuset_location , HWLOC_CPUBIND_THREAD );
+
+  const unsigned max_root = hwloc_get_nbobjs_by_type( s_hwloc_topology , root_type );
+
+  unsigned root_base     = max_root ;
+  unsigned root_count    = 0 ;
+  unsigned core_per_root = 0 ;
+  unsigned pu_per_core   = 0 ;
+  bool     symmetric     = true ;
+
+  for ( unsigned i = 0 ; i < max_root ; ++i ) {
+
+    const hwloc_obj_t root = hwloc_get_obj_by_type( s_hwloc_topology , root_type , i );
+
+    if ( hwloc_bitmap_intersects( s_process_binding , root->allowed_cpuset ) ) {
+
+      ++root_count ;
+
+      // Remember which root (NUMA) object the master thread is running on.
+      // This will be logical NUMA rank #0 for this process.
+
+      if ( hwloc_bitmap_intersects( proc_cpuset_location, root->allowed_cpuset ) ) {
+        root_base = i ;
+      }
+
+      // Count available cores:
+
+      const unsigned max_core =
+        hwloc_get_nbobjs_inside_cpuset_by_type( s_hwloc_topology ,
+                                                root->allowed_cpuset ,
+                                                HWLOC_OBJ_CORE );
+
+      unsigned core_count = 0 ;
+
+      for ( unsigned j = 0 ; j < max_core ; ++j ) {
+
+        const hwloc_obj_t core =
+          hwloc_get_obj_inside_cpuset_by_type( s_hwloc_topology ,
+                                               root->allowed_cpuset ,
+                                               HWLOC_OBJ_CORE , j );
+
+        // If process' cpuset intersects core's cpuset then process can access this core.
+        // Must use intersection instead of inclusion because the Intel-Phi
+        // MPI may bind the process to only one of the core's hyperthreads.
+        //
+        // Assumption: if the process can access any hyperthread of the core
+        // then it has ownership of the entire core.
+        // This assumes that it would be performance-detrimental
+        // to spawn more than one MPI process per core and use nested threading.
+
+        if ( hwloc_bitmap_intersects( s_process_binding , core->allowed_cpuset ) ) {
+
+          ++core_count ;
+
+          const unsigned pu_count =
+            hwloc_get_nbobjs_inside_cpuset_by_type( s_hwloc_topology ,
+                                                    core->allowed_cpuset ,
+                                                    HWLOC_OBJ_PU );
+
+          if ( pu_per_core == 0 ) pu_per_core = pu_count ;
+
+          // Flag any mismatch BEFORE taking the minimum; testing afterwards
+          // would miss a core that has fewer PUs than those seen so far.
+          if ( pu_count != pu_per_core ) symmetric = false ;
+          // Enforce symmetry by taking the minimum:
+          pu_per_core = std::min( pu_per_core , pu_count );
+        }
+      }
+
+      if ( 0 == core_per_root ) core_per_root = core_count ;
+
+      // Compare before taking the minimum so a smaller later root is also reported.
+      if ( core_count != core_per_root ) symmetric = false ;
+
+      // Enforce symmetry by taking the minimum:
+      core_per_root = std::min( core_per_root , core_count );
+    }
+  }
+
+  s_core_topology.first  = root_count ;
+  s_core_topology.second = core_per_root ;
+  s_core_capacity        = pu_per_core ;
+
+  // Fill the 's_core' array for fast mapping from a core coordinate to the
+  // hwloc cpuset object required for thread location querying and binding.
+  // Rows are packed by 'root_slot' (usable roots seen so far), not by 'i', so skipped roots leave no empty row.
+  unsigned root_slot = 0 ;
+  for ( unsigned i = 0 ; i < max_root ; ++i ) {
+    const unsigned root_rank = ( i + root_base ) % max_root ;
+
+    const hwloc_obj_t root = hwloc_get_obj_by_type( s_hwloc_topology , root_type , root_rank );
+
+    if ( hwloc_bitmap_intersects( s_process_binding , root->allowed_cpuset ) ) {
+
+      const unsigned max_core =
+        hwloc_get_nbobjs_inside_cpuset_by_type( s_hwloc_topology ,
+                                                root->allowed_cpuset ,
+                                                HWLOC_OBJ_CORE );
+
+      unsigned core_count = 0 ;
+
+      for ( unsigned j = 0 ; j < max_core && core_count < core_per_root ; ++j ) {
+
+        const hwloc_obj_t core =
+          hwloc_get_obj_inside_cpuset_by_type( s_hwloc_topology ,
+                                               root->allowed_cpuset ,
+                                               HWLOC_OBJ_CORE , j );
+
+        if ( hwloc_bitmap_intersects( s_process_binding , core->allowed_cpuset ) ) {
+
+          s_core[ core_count + core_per_root * root_slot ] = core->allowed_cpuset ;
+          ++core_count ;
+        }
+      }
+      ++root_slot ; // advance only past roots that were actually recorded
+    }
+  }
+
+  hwloc_bitmap_free( proc_cpuset_location );
+
+  if ( ! symmetric ) {
+    std::cout << "Kokkos::hwloc WARNING: Using a symmetric subset of a non-symmetric core topology."
+              << std::endl ;
+  }
+}
+
+
+} // namespace
+
+//----------------------------------------------------------------------------
+//----------------------------------------------------------------------------
+
+// This translation unit was compiled with hwloc support.
+bool available() { return true ; }
+
+// Count of NUMA-level roots usable by this process; sentinel() runs the detection first.
+unsigned get_available_numa_count() { sentinel(); return s_core_topology.first ; }
+
+// Cores per root, the same (minimum) value for every usable root.
+unsigned get_available_cores_per_numa() { sentinel(); return s_core_topology.second ; }
+
+// Processing units per core, the same (minimum) value for every usable core.
+unsigned get_available_threads_per_core() { sentinel(); return s_core_capacity ; }
+
+// False when the process binding could not be detected during initialization.
+bool can_bind_threads() { sentinel(); return s_can_bind_threads; }
+
+//----------------------------------------------------------------------------
+//----------------------------------------------------------------------------
+
+unsigned bind_this_thread(
+  const unsigned               coordinate_count ,
+  std::pair<unsigned,unsigned> coordinate[] )
+{
+  // Bind the calling thread to one of the requests and return its index, or ~0u on
+  // failure.  The request used is overwritten with (~0u,~0u) to mark it as claimed.
+  const unsigned none = ~0u ;
+  unsigned pick = 0 ;
+  try {
+    const std::pair<unsigned,unsigned> here = get_this_thread_coordinate();
+
+    // First choice: a request equal to where this thread already is.
+    while ( pick < coordinate_count && here != coordinate[pick] ) ++pick ;
+
+    // Second choice: a request with the same first (typically NUMA) coordinate.
+    if ( coordinate_count == pick ) {
+      pick = 0 ;
+      while ( pick < coordinate_count && here.first != coordinate[pick].first ) ++pick ;
+    }
+
+    // Last choice: the first request that has not been claimed yet.
+    if ( coordinate_count == pick ) {
+      pick = 0 ;
+      while ( pick < coordinate_count && none == coordinate[pick].first ) ++pick ;
+    }
+
+    // Nothing left to claim, or the bind attempt itself failed.
+    if ( coordinate_count == pick || ! bind_this_thread( coordinate[pick] ) ) {
+      pick = none ;
+    }
+
+    if ( pick < coordinate_count ) {
+#if DEBUG_PRINT
+      if ( here != coordinate[pick] ) {
+        std::cout << "  bind_this_thread: rebinding from ("
+                  << here.first << ","
+                  << here.second
+                  << ") to ("
+                  << coordinate[pick].first << ","
+                  << coordinate[pick].second
+                  << ")" << std::endl ;
+      }
+#endif
+      coordinate[pick] = std::pair<unsigned,unsigned>( none , none );
+    }
+  }
+  catch( ... ) {
+    pick = none ;
+  }
+  return pick ;
+}
+
+
+bool bind_this_thread( const std::pair<unsigned,unsigned> coord )
+{
+  // Bind the calling thread to the core at 'coord' = ( root/NUMA rank , core rank ).
+  // Returns false if hwloc is torn down, 'coord' is out of range, the table
+  // holds no cpuset for it, or the strict hwloc binding fails.
+  if ( ! sentinel() ) return false ;
+
+  // Validate before any 's_core' access (the debug print below indexes it too).
+  if ( s_core_topology.first  <= coord.first ||
+       s_core_topology.second <= coord.second ) return false ;
+
+  // Fast-lookup by caching the coordinate -> hwloc cpuset mapping in 's_core'.
+  const hwloc_bitmap_t target = s_core[ coord.second + coord.first * s_core_topology.second ];
+
+  if ( 0 == target ) return false ; // never hand hwloc an unfilled (null) table entry
+
+#if DEBUG_PRINT
+  std::cout << "Kokkos::bind_this_thread() at " ;
+  hwloc_get_last_cpu_location( s_hwloc_topology ,
+                               s_hwloc_location , HWLOC_CPUBIND_THREAD );
+  print_bitmap( std::cout , s_hwloc_location );
+  std::cout << " to " ;
+  print_bitmap( std::cout , target );
+  std::cout << std::endl ;
+#endif
+
+  return 0 == hwloc_set_cpubind( s_hwloc_topology , target ,
+                                 HWLOC_CPUBIND_THREAD | HWLOC_CPUBIND_STRICT );
+}
+
+bool unbind_this_thread()
+{
+  // Undo a per-thread binding by re-binding the calling thread to the cpuset
+  // recorded for the whole process.  Returns true when hwloc accepts it.
+  if ( ! sentinel() ) return false ;
+
+#define HWLOC_DEBUG_PRINT 0
+
+#if HWLOC_DEBUG_PRINT
+
+  std::cout << "Kokkos::unbind_this_thread() from " ;
+  hwloc_get_cpubind( s_hwloc_topology , s_hwloc_location , HWLOC_CPUBIND_THREAD );
+  print_bitmap( std::cout , s_hwloc_location );
+
+#endif
+
+  bool restored = false ;
+
+  if ( s_hwloc_topology ) {
+    const int rc = hwloc_set_cpubind( s_hwloc_topology ,
+                                      s_process_binding ,
+                                      HWLOC_CPUBIND_THREAD | HWLOC_CPUBIND_STRICT );
+    restored = ( 0 == rc );
+  }
+
+#if HWLOC_DEBUG_PRINT
+
+  std::cout << " to " ;
+  hwloc_get_cpubind( s_hwloc_topology , s_hwloc_location , HWLOC_CPUBIND_THREAD );
+  print_bitmap( std::cout , s_hwloc_location );
+  std::cout << std::endl ;
+
+#endif
+
+  return restored ;
+
+#undef HWLOC_DEBUG_PRINT
+
+}
+
+//----------------------------------------------------------------------------
+
+std::pair<unsigned,unsigned> get_this_thread_coordinate()
+{
+  // Report the ( root/NUMA rank , core rank ) of the core this thread last ran on.
+  // Yields (0,0) after teardown or when no cached core matches the location.
+  std::pair<unsigned,unsigned> coord(0u,0u);
+  if ( ! sentinel() ) return coord ;
+
+  const unsigned n = s_core_topology.first * s_core_topology.second ;
+
+  // Using the pre-allocated 's_hwloc_location' to avoid memory
+  // allocation by this thread.  This call is NOT thread-safe.
+  hwloc_get_last_cpu_location( s_hwloc_topology ,
+                               s_hwloc_location , HWLOC_CPUBIND_THREAD );
+
+  // Skip unfilled (null) 's_core' entries instead of handing them to hwloc.
+  unsigned i = 0 ;
+  while ( i < n && ! ( s_core[ i ] && hwloc_bitmap_intersects( s_hwloc_location , s_core[ i ] ) ) ) ++i ;
+
+  if ( i < n ) {
+    coord.first  = i / s_core_topology.second ;
+    coord.second = i % s_core_topology.second ;
+  }
+  return coord ;
+}
+
+//----------------------------------------------------------------------------
+
+} /* namespace hwloc */
+} /* namespace Kokkos */
+
+//----------------------------------------------------------------------------
+//----------------------------------------------------------------------------
+
+#else /* ! defined( KOKKOS_HAVE_HWLOC ) */
+
+namespace Kokkos {
+namespace hwloc {
+// Fallbacks compiled when KOKKOS_HAVE_HWLOC is not defined: report a single
+// NUMA region / core / thread and never bind anything.
+bool available() { return false ; }
+bool can_bind_threads() { return false ; }
+unsigned get_available_numa_count() { return 1u ; }
+unsigned get_available_cores_per_numa() { return 1u ; }
+unsigned get_available_threads_per_core() { return 1u ; }
+
+// No request can be satisfied: always report failure (~0u).
+unsigned bind_this_thread( const unsigned , std::pair<unsigned,unsigned>[] )
+{ return ~0u ; }
+bool bind_this_thread( const std::pair<unsigned,unsigned> ) { return false ; }
+
+// Always reports success.
+bool unbind_this_thread() { return true ; }
+
+// The only coordinate of the 1 x 1 fallback topology.
+std::pair<unsigned,unsigned> get_this_thread_coordinate()
+{ return std::make_pair( 0u , 0u ); }
+
+} // namespace hwloc
+} // namespace Kokkos
+
+//----------------------------------------------------------------------------
+//----------------------------------------------------------------------------
+
+#endif
+
+
diff --git a/lib/kokkos/core/src/impl/Kokkos_spinwait.cpp b/lib/kokkos/core/src/impl/Kokkos_spinwait.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..aff7f29f89883d199ecf65feb86c89328530413b
--- /dev/null
+++ b/lib/kokkos/core/src/impl/Kokkos_spinwait.cpp
@@ -0,0 +1,89 @@
+/*
+//@HEADER
+// ************************************************************************
+// 
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+// 
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+// 
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact  H. Carter Edwards (hcedwar@sandia.gov)
+// 
+// ************************************************************************
+//@HEADER
+*/
+
+#include <Kokkos_Macros.hpp>
+#include <impl/Kokkos_spinwait.hpp>
+
+/*--------------------------------------------------------------------------*/
+
+#if ( KOKKOS_ENABLE_ASM ) /* YIELD: executed by spinwait() between polls; always used as 'YIELD ;' */
+  #if defined( __arm__ ) || defined( __aarch64__ )
+    /* No-operation instruction to idle the thread. */
+    #define YIELD   asm volatile("nop")
+  #else
+    /* Pause instruction to prevent excess processor bus usage */
+    #define YIELD   asm volatile("pause\n":::"memory")
+  #endif
+#elif defined ( KOKKOS_HAVE_WINTHREAD )
+  #include <process.h>
+  #define YIELD  Sleep(0)
+#elif defined ( _WIN32)  && defined (_MSC_VER)
+  /* Windows w/ Visual Studio */
+  #define NOMINMAX
+  #include <winsock2.h>
+  #include <windows.h>
+  #define YIELD YieldProcessor() /* no ';' here: the call site supplies it, as in every other branch */
+#elif defined ( _WIN32 )
+  /* Windows w/ Intel*/
+  #define YIELD __asm__ __volatile__("pause\n":::"memory")
+#else
+  #include <sched.h>
+  #define YIELD  sched_yield()
+#endif
+
+/*--------------------------------------------------------------------------*/
+
+namespace Kokkos {
+namespace Impl {
+#if defined( KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST )
+void spinwait( volatile int & flag , const int value )
+{
+  // Poll the volatile 'flag', yielding between reads, and return as soon as
+  // it no longer equals 'value'.
+  for ( ; flag == value ; ) { YIELD ; }
+}
+#endif
+
+} /* namespace Impl */
+} /* namespace Kokkos */
+
diff --git a/lib/kokkos/core/src/impl/Kokkos_spinwait.hpp b/lib/kokkos/core/src/impl/Kokkos_spinwait.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..cc87771faefcb8ad7716842890dbec4a9c1219a1
--- /dev/null
+++ b/lib/kokkos/core/src/impl/Kokkos_spinwait.hpp
@@ -0,0 +1,64 @@
+/*
+//@HEADER
+// ************************************************************************
+// 
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+// 
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+// 
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact  H. Carter Edwards (hcedwar@sandia.gov)
+// 
+// ************************************************************************
+//@HEADER
+*/
+
+
+#ifndef KOKKOS_SPINWAIT_HPP
+#define KOKKOS_SPINWAIT_HPP
+
+#include <Kokkos_Macros.hpp>
+
+namespace Kokkos {
+namespace Impl {
+
+#if defined( KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST )
+void spinwait( volatile int & flag , const int value ); // host: declaration only; defined in Kokkos_spinwait.cpp
+#else
+KOKKOS_INLINE_FUNCTION
+void spinwait( volatile int & , const int ) {} // otherwise: inline no-op with the same signature
+#endif
+
+} /* namespace Impl */
+} /* namespace Kokkos */
+
+#endif /* #ifndef KOKKOS_SPINWAIT_HPP */
+
diff --git a/lib/kokkos/core/unit_test/CMakeLists.txt b/lib/kokkos/core/unit_test/CMakeLists.txt
new file mode 100644
index 0000000000000000000000000000000000000000..5bb2b672e124f3b282d760562514afb1719fd957
--- /dev/null
+++ b/lib/kokkos/core/unit_test/CMakeLists.txt
@@ -0,0 +1,105 @@
+#
+# Add test-only library for gtest to be reused by all the subpackages
+#
+
+set(GTEST_SOURCE_DIR "${${PARENT_PACKAGE_NAME}_SOURCE_DIR}/tpls/gtest")
+# The bundled gtest is compiled once into a TESTONLY library that the tests link via TESTONLYLIBS.
+include_directories("${GTEST_SOURCE_DIR}")
+TRIBITS_ADD_LIBRARY(
+  kokkos_gtest
+  HEADERS "${GTEST_SOURCE_DIR}/gtest/gtest.h"
+  SOURCES "${GTEST_SOURCE_DIR}/gtest/gtest-all.cc"
+  TESTONLY
+)
+
+#
+# Define the tests
+#
+
+# Headers are looked up in the build directory first, then next to the test sources.
+include_directories("${CMAKE_CURRENT_BINARY_DIR}" "${CMAKE_CURRENT_SOURCE_DIR}")
+
+# Serial backend.
+if(Kokkos_ENABLE_Serial)
+  TRIBITS_ADD_EXECUTABLE_AND_TEST(
+    UnitTest_Serial
+    SOURCES UnitTestMain.cpp TestSerial.cpp
+    COMM serial mpi  NUM_MPI_PROCS 1
+    FAIL_REGULAR_EXPRESSION "  FAILED  "
+    TESTONLYLIBS kokkos_gtest
+  )
+endif()
+
+# Pthread backend (test executable is named UnitTest_Threads).
+if(Kokkos_ENABLE_Pthread)
+  TRIBITS_ADD_EXECUTABLE_AND_TEST(
+    UnitTest_Threads
+    SOURCES UnitTestMain.cpp TestThreads.cpp
+    COMM serial mpi  NUM_MPI_PROCS 1
+    FAIL_REGULAR_EXPRESSION "  FAILED  "
+    TESTONLYLIBS kokkos_gtest
+  )
+endif()
+
+# OpenMP backend; the test is split over several source files.
+if(Kokkos_ENABLE_OpenMP)
+  TRIBITS_ADD_EXECUTABLE_AND_TEST(
+    UnitTest_OpenMP
+    SOURCES UnitTestMain.cpp TestOpenMP.cpp TestOpenMP_a.cpp TestOpenMP_b.cpp TestOpenMP_c.cpp
+    COMM serial mpi  NUM_MPI_PROCS 1
+    FAIL_REGULAR_EXPRESSION "  FAILED  "
+    TESTONLYLIBS kokkos_gtest
+  )
+endif()
+
+# Qthread backend.
+if(Kokkos_ENABLE_QTHREAD)
+  TRIBITS_ADD_EXECUTABLE_AND_TEST(
+    UnitTest_Qthread
+    SOURCES UnitTestMain.cpp TestQthread.cpp
+    COMM serial mpi  NUM_MPI_PROCS 1
+    FAIL_REGULAR_EXPRESSION "  FAILED  "
+    TESTONLYLIBS kokkos_gtest
+  )
+endif()
+
+# CUDA backend; the test is split over several source files.
+if(Kokkos_ENABLE_Cuda)
+  TRIBITS_ADD_EXECUTABLE_AND_TEST(
+    UnitTest_Cuda
+    SOURCES UnitTestMain.cpp TestCuda.cpp TestCuda_a.cpp TestCuda_b.cpp TestCuda_c.cpp
+    COMM serial mpi  NUM_MPI_PROCS 1
+    FAIL_REGULAR_EXPRESSION "  FAILED  "
+    TESTONLYLIBS kokkos_gtest
+  )
+endif()
+
+# Default-device test: registered regardless of which backends are enabled.
+TRIBITS_ADD_EXECUTABLE_AND_TEST(
+  UnitTest_Default
+  SOURCES UnitTestMain.cpp TestDefaultDeviceType.cpp TestDefaultDeviceType_a.cpp
+  COMM serial mpi  NUM_MPI_PROCS 1
+  FAIL_REGULAR_EXPRESSION "  FAILED  "
+  TESTONLYLIBS kokkos_gtest
+)
+
+# One executable and test per TestDefaultDeviceTypeInit_<n>.cpp, n = 1..16.
+foreach(INITTESTS_NUM RANGE 1 16)
+  TRIBITS_ADD_EXECUTABLE_AND_TEST(
+    UnitTest_DefaultInit_${INITTESTS_NUM}
+    SOURCES UnitTestMain.cpp TestDefaultDeviceTypeInit_${INITTESTS_NUM}.cpp
+    COMM serial mpi  NUM_MPI_PROCS 1
+    FAIL_REGULAR_EXPRESSION "  FAILED  "
+    TESTONLYLIBS kokkos_gtest
+  )
+endforeach()
+
+# hwloc test: registered unconditionally.
+TRIBITS_ADD_EXECUTABLE_AND_TEST(
+  UnitTest_HWLOC
+  SOURCES UnitTestMain.cpp  TestHWLOC.cpp
+  COMM serial mpi  NUM_MPI_PROCS 1
+  FAIL_REGULAR_EXPRESSION "  FAILED  "
+  TESTONLYLIBS kokkos_gtest
+)
+
diff --git a/lib/kokkos/core/unit_test/Makefile b/lib/kokkos/core/unit_test/Makefile
new file mode 100644
index 0000000000000000000000000000000000000000..3d9d212c1ecdef658fdb9cf7d30fc542a6fb72d3
--- /dev/null
+++ b/lib/kokkos/core/unit_test/Makefile
@@ -0,0 +1,153 @@
+KOKKOS_PATH = ../..
+# Location of the bundled gtest sources, relative to this directory.
+GTEST_PATH = ../../tpls/gtest
+# Search the unit_test directory for .cpp sources; collect its headers in TEST_HEADERS.
+vpath %.cpp ${KOKKOS_PATH}/core/unit_test
+TEST_HEADERS = $(wildcard $(KOKKOS_PATH)/core/unit_test/*.hpp)
+# Declared before the include below, so it is the first rule make reads from this file.
+default: build_all
+	echo "End Build"
+
+include $(KOKKOS_PATH)/Makefile.kokkos
+# CUDA builds force CXX and LINK to $(NVCC_WRAPPER); otherwise each tool is set only if not already defined.
+ifeq ($(KOKKOS_INTERNAL_USE_CUDA), 1)
+	CXX = $(NVCC_WRAPPER)
+	CXXFLAGS ?= -O3
+	LINK = $(CXX)
+	LDFLAGS ?= -lpthread
+else
+	CXX ?= g++
+	CXXFLAGS ?= -O3
+	LINK ?= $(CXX)
+	LDFLAGS ?= -lpthread
+endif
+
+KOKKOS_CXXFLAGS += -I$(GTEST_PATH) -I${KOKKOS_PATH}/core/unit_test
+# Accumulated below: executables to build (TARGETS) and test goals to run (TEST_TARGETS).
+TEST_TARGETS =
+TARGETS =
+# Each section below adds its objects, executable and test goal only when its KOKKOS_INTERNAL_USE_* flag is 1.
+ifeq ($(KOKKOS_INTERNAL_USE_CUDA), 1)
+	OBJ_CUDA = TestCuda_c.o TestCuda_b.o TestCuda_a.o TestCuda.o UnitTestMain.o gtest-all.o
+	TARGETS += KokkosCore_UnitTest_Cuda
+	TEST_TARGETS += test-cuda
+endif
+
+ifeq ($(KOKKOS_INTERNAL_USE_PTHREADS), 1)
+	OBJ_THREADS = TestThreads.o UnitTestMain.o gtest-all.o
+	TARGETS += KokkosCore_UnitTest_Threads
+	TEST_TARGETS += test-threads
+endif
+
+ifeq ($(KOKKOS_INTERNAL_USE_OPENMP), 1)
+	OBJ_OPENMP = TestOpenMP_c.o TestOpenMP_b.o TestOpenMP_a.o TestOpenMP.o UnitTestMain.o gtest-all.o
+	TARGETS += KokkosCore_UnitTest_OpenMP
+	TEST_TARGETS += test-openmp
+endif
+
+ifeq ($(KOKKOS_INTERNAL_USE_SERIAL), 1)
+	OBJ_SERIAL = TestSerial.o UnitTestMain.o gtest-all.o
+	TARGETS += KokkosCore_UnitTest_Serial
+	TEST_TARGETS += test-serial
+endif
+
+ifeq ($(KOKKOS_INTERNAL_USE_QTHREAD), 1)
+	OBJ_QTHREAD = TestQthread.o UnitTestMain.o gtest-all.o
+	TARGETS += KokkosCore_UnitTest_Qthread
+	TEST_TARGETS += test-qthread
+endif
+# The remaining tests are added unconditionally.
+OBJ_HWLOC = TestHWLOC.o UnitTestMain.o gtest-all.o
+TARGETS += KokkosCore_UnitTest_HWLOC
+TEST_TARGETS += test-hwloc
+
+OBJ_DEFAULT = TestDefaultDeviceType.o TestDefaultDeviceType_a.o UnitTestMain.o gtest-all.o
+TARGETS += KokkosCore_UnitTest_Default
+TEST_TARGETS += test-default
+# One executable and one test goal per number produced by 'seq 1 $(NUM_INITTESTS)'.
+NUM_INITTESTS = 16
+INITTESTS_NUMBERS := $(shell seq 1 ${NUM_INITTESTS})
+INITTESTS_TARGETS := $(addprefix KokkosCore_UnitTest_DefaultDeviceTypeInit_,${INITTESTS_NUMBERS})
+TARGETS += ${INITTESTS_TARGETS}
+INITTESTS_TEST_TARGETS := $(addprefix test-default-init-,${INITTESTS_NUMBERS})
+TEST_TARGETS += ${INITTESTS_TEST_TARGETS}
+
+OBJ_SYNCHRONIC = TestSynchronic.o UnitTestMain.o gtest-all.o
+TARGETS += KokkosCore_UnitTest_Synchronic
+TEST_TARGETS += test-synchronic
+
+KokkosCore_UnitTest_Cuda: $(OBJ_CUDA) $(KOKKOS_LINK_DEPENDS)
+	$(LINK) $(KOKKOS_LDFLAGS) $(LDFLAGS) $(EXTRA_PATH) $(OBJ_CUDA) $(KOKKOS_LIBS) $(LIB) -o $@
+
+KokkosCore_UnitTest_Threads: $(OBJ_THREADS) $(KOKKOS_LINK_DEPENDS)
+	$(LINK) $(KOKKOS_LDFLAGS) $(LDFLAGS) $(EXTRA_PATH) $(OBJ_THREADS) $(KOKKOS_LIBS) $(LIB) -o $@
+
+KokkosCore_UnitTest_OpenMP: $(OBJ_OPENMP) $(KOKKOS_LINK_DEPENDS)
+	$(LINK) $(KOKKOS_LDFLAGS) $(LDFLAGS) $(EXTRA_PATH) $(OBJ_OPENMP) $(KOKKOS_LIBS) $(LIB) -o $@
+
+KokkosCore_UnitTest_Serial: $(OBJ_SERIAL) $(KOKKOS_LINK_DEPENDS)
+	$(LINK) $(KOKKOS_LDFLAGS) $(LDFLAGS) $(EXTRA_PATH) $(OBJ_SERIAL) $(KOKKOS_LIBS) $(LIB) -o $@
+
+KokkosCore_UnitTest_Qthread: $(OBJ_QTHREAD) $(KOKKOS_LINK_DEPENDS)
+	$(LINK) $(KOKKOS_LDFLAGS) $(LDFLAGS) $(EXTRA_PATH) $(OBJ_QTHREAD) $(KOKKOS_LIBS) $(LIB) -o $@
+
+KokkosCore_UnitTest_HWLOC: $(OBJ_HWLOC) $(KOKKOS_LINK_DEPENDS)
+	$(LINK) $(KOKKOS_LDFLAGS) $(LDFLAGS) $(EXTRA_PATH) $(OBJ_HWLOC) $(KOKKOS_LIBS) $(LIB) -o $@
+# NOTE(review): OBJ_ALLOCATIONTRACKER is not assigned in this Makefile and this target is not added to TARGETS.
+KokkosCore_UnitTest_AllocationTracker: $(OBJ_ALLOCATIONTRACKER) $(KOKKOS_LINK_DEPENDS)
+	$(LINK) $(KOKKOS_LDFLAGS) $(LDFLAGS) $(EXTRA_PATH) $(OBJ_ALLOCATIONTRACKER) $(KOKKOS_LIBS) $(LIB) -o $@
+
+KokkosCore_UnitTest_Default: $(OBJ_DEFAULT) $(KOKKOS_LINK_DEPENDS)
+	$(LINK) $(KOKKOS_LDFLAGS) $(LDFLAGS) $(EXTRA_PATH) $(OBJ_DEFAULT) $(KOKKOS_LIBS) $(LIB) -o $@
+# Static pattern rule: '$*' is the test number matched by '%'.
+$(INITTESTS_TARGETS): KokkosCore_UnitTest_DefaultDeviceTypeInit_%: TestDefaultDeviceTypeInit_%.o UnitTestMain.o gtest-all.o $(KOKKOS_LINK_DEPENDS)
+	$(LINK) $(KOKKOS_LDFLAGS) $(LDFLAGS) $(EXTRA_PATH) TestDefaultDeviceTypeInit_$*.o UnitTestMain.o gtest-all.o $(KOKKOS_LIBS) $(LIB) -o $@
+
+KokkosCore_UnitTest_Synchronic: $(OBJ_SYNCHRONIC) $(KOKKOS_LINK_DEPENDS)
+	$(LINK) $(KOKKOS_LDFLAGS) $(LDFLAGS) $(EXTRA_PATH) $(OBJ_SYNCHRONIC) $(KOKKOS_LIBS) $(LIB) -o $@
+
+test-cuda: KokkosCore_UnitTest_Cuda
+	./$<
+
+test-threads: KokkosCore_UnitTest_Threads
+	./$<
+
+test-openmp: KokkosCore_UnitTest_OpenMP
+	./$<
+
+test-serial: KokkosCore_UnitTest_Serial
+	./$<
+
+test-qthread: KokkosCore_UnitTest_Qthread
+	./$<
+
+test-hwloc: KokkosCore_UnitTest_HWLOC
+	./$<
+
+test-allocationtracker: KokkosCore_UnitTest_AllocationTracker
+	./$<
+
+test-default: KokkosCore_UnitTest_Default
+	./$<
+# Static pattern rule: each test-default-init-<n> runs the matching Init executable.
+$(INITTESTS_TEST_TARGETS): test-default-init-%: KokkosCore_UnitTest_DefaultDeviceTypeInit_%
+	./$<
+
+test-synchronic: KokkosCore_UnitTest_Synchronic
+	./$<
+
+# Command goals, not files: declared phony so a stray file named e.g. 'test' or 'clean' cannot mask them.
+.PHONY: default build_all test clean $(TEST_TARGETS)
+build_all: $(TARGETS)
+test: $(TEST_TARGETS)
+clean: kokkos-clean
+	rm -f *.o $(TARGETS)
+
+# Compilation rules
+# Every object also depends on $(KOKKOS_CPP_DEPENDS) and on all unit-test headers.
+%.o: %.cpp $(KOKKOS_CPP_DEPENDS) $(TEST_HEADERS)
+	$(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) $(EXTRA_INC) -c $<
+# gtest is compiled from gtest-all.cc under $(GTEST_PATH).
+gtest-all.o: $(GTEST_PATH)/gtest/gtest-all.cc
+	$(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) $(EXTRA_INC) -c $<
+
diff --git a/lib/kokkos/core/unit_test/TestAggregate.hpp b/lib/kokkos/core/unit_test/TestAggregate.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..5388a60787cb9217a4436798d826dcc53f55d3f2
--- /dev/null
+++ b/lib/kokkos/core/unit_test/TestAggregate.hpp
@@ -0,0 +1,109 @@
+/*
+//@HEADER
+// ************************************************************************
+// 
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+// 
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+// 
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact  H. Carter Edwards (hcedwar@sandia.gov)
+// 
+// ************************************************************************
+//@HEADER
+*/
+
+#ifndef TEST_AGGREGATE_HPP
+#define TEST_AGGREGATE_HPP
+
+#include <gtest/gtest.h>
+
+#include <stdexcept>
+#include <sstream>
+#include <iostream>
+
+/*--------------------------------------------------------------------------*/
+
+#include <impl/KokkosExp_ViewArray.hpp>
+
+namespace Test {
+
+// Checks how views treat an element type of Kokkos::Array<double,32>.
+// Compile-time part: the view traits must select the Kokkos::Array<>
+// specialization, and the scalar-array form must be a plain view with one
+// extra, static dimension of 32.  Run-time part: a 4x5 view and its
+// flattened alias are constructed and their extents are compared.
+template< class DeviceType >
+void TestViewAggregate()
+{
+  using value_type = Kokkos::Array<double,32> ;
+
+  // Data analysis of a rank-1 array of aggregates.
+  using rank1_analysis =
+    Kokkos::Experimental::Impl::
+      ViewDataAnalysis< value_type * , Kokkos::LayoutLeft , value_type > ;
+
+  static_assert( std::is_same< typename rank1_analysis::specialize , Kokkos::Array<> >::value , "" );
+
+  // Traits of the aggregate view and of its scalar-array counterpart.
+  using aggregate_traits = Kokkos::ViewTraits< value_type ** , DeviceType > ;
+  using scalar_traits    = Kokkos::ViewTraits< typename aggregate_traits::scalar_array_type , DeviceType > ;
+
+  static_assert( std::is_same< typename aggregate_traits::specialize , Kokkos::Array<> >::value , "" );
+  static_assert( std::is_same< typename aggregate_traits::value_type , value_type >::value , "" );
+  static_assert( aggregate_traits::rank == 2 , "" );
+  static_assert( aggregate_traits::rank_dynamic == 2 , "" );
+
+  static_assert( std::is_same< typename scalar_traits::specialize , void >::value , "" );
+  static_assert( scalar_traits::rank == 3 , "" );
+  static_assert( scalar_traits::rank_dynamic == 2 , "" );
+  static_assert( scalar_traits::dimension::N2 == 32 , "" );
+
+  // The view types themselves.
+  using aggregate_view = Kokkos::View< value_type ** , DeviceType > ;
+  using scalar_view    = typename aggregate_view::array_type ;
+
+  static_assert( std::is_same< typename aggregate_view::value_type , value_type >::value , "" );
+  static_assert( std::is_same< typename aggregate_view::pointer_type , double * >::value , "" );
+  static_assert( aggregate_view::Rank == 2 , "" );
+  static_assert( scalar_view::Rank == 3 , "" );
+
+  // The flattened view is constructed from the aggregate view.
+  aggregate_view aggregate("test",4,5);
+  scalar_view    flattened( aggregate );
+
+  ASSERT_EQ( aggregate.extent(0) , 4 );
+  ASSERT_EQ( aggregate.extent(1) , 5 );
+  ASSERT_EQ( flattened.extent(0) , 4 );
+  ASSERT_EQ( flattened.extent(1) , 5 );
+  ASSERT_EQ( flattened.extent(2) , 32 );
+}
+
+}
+
+/*--------------------------------------------------------------------------*/
+/*--------------------------------------------------------------------------*/
+
+#endif /* #ifndef TEST_AGGREGATE_HPP */
diff --git a/lib/kokkos/core/unit_test/TestAggregateReduction.hpp b/lib/kokkos/core/unit_test/TestAggregateReduction.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..bd05cd347b979e305becead88a898d27b0a7d4f8
--- /dev/null
+++ b/lib/kokkos/core/unit_test/TestAggregateReduction.hpp
@@ -0,0 +1,191 @@
+/*
+//@HEADER
+// ************************************************************************
+// 
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+// 
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+// 
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact  H. Carter Edwards (hcedwar@sandia.gov)
+// 
+// ************************************************************************
+//@HEADER
+*/
+
+#ifndef TEST_AGGREGATE_REDUCTION_HPP
+#define TEST_AGGREGATE_REDUCTION_HPP
+
+#include <gtest/gtest.h>
+
+#include <stdexcept>
+#include <sstream>
+#include <iostream>
+
+namespace Test {
+
+// Fixed-length array of N values of type T, used as a reduction value type.
+// The default constructor, copy constructor and copy assignment are all
+// defaulted; the static_assert after the struct checks that the type is
+// trivial.
+template< typename T , unsigned N >
+struct StaticArray {
+  T value[N] ;
+
+  KOKKOS_INLINE_FUNCTION
+  StaticArray() = default;
+
+  KOKKOS_INLINE_FUNCTION
+  StaticArray( const StaticArray & rhs ) = default;
+
+  // Converts to the first element.
+  KOKKOS_INLINE_FUNCTION
+  operator T () { return value[0]; }
+
+  // Broadcast assignment: every element is set to rhs.
+  KOKKOS_INLINE_FUNCTION
+  StaticArray & operator = ( const T & rhs )
+    {
+      for ( unsigned k = 0 ; k != N ; ++k ) { value[k] = rhs ; }
+      return *this ;
+    }
+
+  KOKKOS_INLINE_FUNCTION
+  StaticArray & operator = ( const StaticArray & rhs ) = default;
+
+  // Element-wise product.
+  KOKKOS_INLINE_FUNCTION
+  StaticArray operator * ( const StaticArray & rhs )
+    {
+      StaticArray product ;
+      for ( unsigned k = 0 ; k != N ; ++k ) {
+        product.value[k] = value[k] * rhs.value[k] ;
+      }
+      return product ;
+    }
+
+  // Element-wise sum.
+  KOKKOS_INLINE_FUNCTION
+  StaticArray operator + ( const StaticArray & rhs )
+    {
+      StaticArray sum ;
+      for ( unsigned k = 0 ; k != N ; ++k ) {
+        sum.value[k] = value[k] + rhs.value[k] ;
+      }
+      return sum ;
+    }
+
+  // Element-wise accumulation.
+  KOKKOS_INLINE_FUNCTION
+  StaticArray & operator += ( const StaticArray & rhs )
+    {
+      for ( unsigned k = 0 ; k != N ; ++k ) { value[k] += rhs.value[k] ; }
+      return *this ;
+    }
+
+  // Element-wise accumulation for volatile operands; returns nothing.
+  KOKKOS_INLINE_FUNCTION
+  void operator += ( const volatile StaticArray & rhs ) volatile
+    {
+      for ( unsigned k = 0 ; k != N ; ++k ) { value[k] += rhs.value[k] ; }
+    }
+};
+
+static_assert(std::is_trivial<StaticArray<int, 4>>::value, "Not trivial");
+
+// Reduction functor for a dot product: index i contributes a(i) * b(i) to
+// the reduction value.
+//
+//   T      value type of both views and of the reduction result
+//   Space  used as the view space argument and as execution_space
+template< typename T , class Space >
+struct DOT {
+  typedef T      value_type ;
+  typedef Space  execution_space ;
+
+  Kokkos::View< value_type * , Space > a ;
+  Kokkos::View< value_type * , Space > b ;
+
+  // The views are taken by const reference, as FILL does, so constructing
+  // the functor copies each view handle once instead of twice.
+  DOT( const Kokkos::View< value_type * , Space > & arg_a
+     , const Kokkos::View< value_type * , Space > & arg_b
+     )
+    : a( arg_a ), b( arg_b ) {}
+
+  // Adds the product of the i-th elements to the running total.
+  KOKKOS_INLINE_FUNCTION
+  void operator()( const int i , value_type & update ) const
+    {
+      update += a(i) * b(i);
+    }
+};
+
+// Initialization functor: for every index i exactly one of a(i), b(i) is set
+// to i + 1 and the other to 1, so a(i) * b(i) equals i + 1.
+template< typename T , class Space >
+struct FILL {
+  typedef T      value_type ;
+  typedef Space  execution_space ;
+
+  Kokkos::View< value_type * , Space > a ;
+  Kokkos::View< value_type * , Space > b ;
+
+  FILL( const Kokkos::View< value_type * , Space > & arg_a
+      , const Kokkos::View< value_type * , Space > & arg_b
+      )
+    : a( arg_a ), b( arg_b ) {}
+
+  KOKKOS_INLINE_FUNCTION
+  void operator()( const int i ) const
+    {
+      if ( i % 2 ) {
+        // Odd index: a carries i + 1.
+        a(i) = i + 1 ;
+        b(i) = 1 ;
+      }
+      else {
+        // Even index: b carries i + 1.
+        a(i) = 1 ;
+        b(i) = i + 1 ;
+      }
+    }
+};
+
+// Fills pairs of views with FILL, reduces them with DOT and checks that the
+// result is the same for plain long values and for StaticArray<long,4> and
+// StaticArray<long,10> aggregates (every element of the aggregate result is
+// compared).  The whole body is compiled out when KOKKOS_USING_EXP_VIEW is
+// non-zero.
+template< class Space >
+void TestViewAggregateReduction()
+{
+
+#if ! KOKKOS_USING_EXP_VIEW
+
+  typedef StaticArray<long,4>   array4_type ;
+  typedef StaticArray<long,10>  array10_type ;
+
+  const int count = 2 ;
+
+  // count * ( count + 1 ) / 2, halving whichever factor is even.
+  const long result = count % 2 ? ( count * ( ( count + 1 ) / 2 ) )
+                                : ( ( count / 2 ) * ( count + 1 ) );
+
+  Kokkos::View< long * , Space > a("a",count);
+  Kokkos::View< long * , Space > b("b",count);
+  Kokkos::View< array4_type * , Space > a4("a4",count);
+  Kokkos::View< array4_type * , Space > b4("b4",count);
+  Kokkos::View< array10_type * , Space > a10("a10",count);
+  Kokkos::View< array10_type * , Space > b10("b10",count);
+
+  // Populate the inputs.
+  Kokkos::parallel_for( count , FILL< long , Space >(a,b) );
+  Kokkos::parallel_for( count , FILL< array4_type , Space >(a4,b4) );
+  Kokkos::parallel_for( count , FILL< array10_type , Space >(a10,b10) );
+
+  long dot1 = 0;
+  array4_type dot4 ;
+  array10_type dot10 ;
+
+  // Reduce each pair of views into its result variable.
+  Kokkos::parallel_reduce( count , DOT< long , Space >(a,b) , dot1 );
+  Kokkos::parallel_reduce( count , DOT< array4_type , Space >(a4,b4) , dot4 );
+  Kokkos::parallel_reduce( count , DOT< array10_type , Space >(a10,b10) , dot10 );
+
+  ASSERT_EQ( result , dot1 );
+  for ( int k = 0 ; k < 10 ; ++k ) { ASSERT_EQ( result , dot10.value[k] ); }
+  for ( int k = 0 ; k < 4 ; ++k ) { ASSERT_EQ( result , dot4.value[k] ); }
+
+#endif
+
+}
+
+}
+
+#endif /* #ifndef TEST_AGGREGATE_REDUCTION_HPP */
+
diff --git a/lib/kokkos/core/unit_test/TestAtomic.hpp b/lib/kokkos/core/unit_test/TestAtomic.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..e948723574b48b2a64ee66c487062e34c0ccf29b
--- /dev/null
+++ b/lib/kokkos/core/unit_test/TestAtomic.hpp
@@ -0,0 +1,402 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+//
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact  H. Carter Edwards (hcedwar@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#include <Kokkos_Core.hpp>
+
+namespace TestAtomic {
+
+// Struct for testing arbitrary size atomics
+
+// Aggregate of N doubles used to exercise atomics on types of arbitrary
+// size.  Constructing from, or adding, a plain double scales it per element:
+// element i receives (i+1) * src.  Volatile overloads are provided for copy
+// construction and assignment.
+//
+// operator+, operator== and operator!= are const-qualified so they can be
+// applied to const operands; none of them modifies *this.
+template<int N>
+struct SuperScalar {
+  double val[N];
+
+  // Default construction zeroes every element.
+  KOKKOS_INLINE_FUNCTION
+  SuperScalar() {
+    for(int i=0; i<N; i++)
+      val[i] = 0.0;
+  }
+
+  KOKKOS_INLINE_FUNCTION
+  SuperScalar(const SuperScalar& src) {
+    for(int i=0; i<N; i++)
+      val[i] = src.val[i];
+  }
+
+  // Copy from a volatile source.
+  KOKKOS_INLINE_FUNCTION
+  SuperScalar(const volatile SuperScalar& src) {
+    for(int i=0; i<N; i++)
+      val[i] = src.val[i];
+  }
+
+  KOKKOS_INLINE_FUNCTION
+  SuperScalar& operator = (const SuperScalar& src) {
+    for(int i=0; i<N; i++)
+      val[i] = src.val[i];
+    return *this;
+  }
+
+  // Assignment from a volatile source.
+  KOKKOS_INLINE_FUNCTION
+  SuperScalar& operator = (const volatile SuperScalar& src) {
+    for(int i=0; i<N; i++)
+      val[i] = src.val[i];
+    return *this;
+  }
+
+  // Assignment to a volatile destination; returns nothing.
+  KOKKOS_INLINE_FUNCTION
+  void operator = (const SuperScalar& src) volatile  {
+    for(int i=0; i<N; i++)
+      val[i] = src.val[i];
+  }
+
+  // Element-wise sum; *this is left unchanged.
+  KOKKOS_INLINE_FUNCTION
+  SuperScalar operator + (const SuperScalar& src) const {
+    SuperScalar tmp = *this;
+    for(int i=0; i<N; i++)
+      tmp.val[i] += src.val[i];
+    return tmp;
+  }
+
+  // Adds (i+1) * src to element i.
+  KOKKOS_INLINE_FUNCTION
+  SuperScalar& operator += (const double& src) {
+    for(int i=0; i<N; i++)
+      val[i] += 1.0*(i+1)*src;
+    return *this;
+  }
+
+  // Element-wise accumulation.
+  KOKKOS_INLINE_FUNCTION
+  SuperScalar& operator += (const SuperScalar& src) {
+    for(int i=0; i<N; i++)
+      val[i] += src.val[i];
+    return *this;
+  }
+
+  // True when every element compares equal.
+  KOKKOS_INLINE_FUNCTION
+  bool operator == (const SuperScalar& src) const {
+    bool compare = true;
+    for(int i=0; i<N; i++)
+      compare = compare && ( val[i] == src.val[i]);
+    return compare;
+  }
+
+  // Negation of operator==.
+  KOKKOS_INLINE_FUNCTION
+  bool operator != (const SuperScalar& src) const {
+    return !( *this == src );
+  }
+
+
+
+  // Element i is initialized to (i+1) * src.
+  KOKKOS_INLINE_FUNCTION
+  SuperScalar(const double& src) {
+    for(int i=0; i<N; i++)
+      val[i] = 1.0 * (i+1) * src;
+  }
+
+};
+
+// Streams a SuperScalar as "{ v0, v1, ..., vN-1}".
+template<int N>
+std::ostream& operator<<(std::ostream& os, const SuperScalar<N>& dt)
+{
+    os << "{ ";
+    for(int k=0;k<N;k++) {
+       os << dt.val[k];
+       // Separator after every element except the last, which closes the brace.
+       if ( k < N-1 ) os << ", ";
+       else           os << "}";
+    }
+    return os;
+}
+
+// Functor that writes zero into a single-value view (accessed as data());
+// the index argument is ignored.  Also exposes the device view type and its
+// host mirror type for use by the test drivers.
+template<class T,class DEVICE_TYPE>
+struct ZeroFunctor {
+  typedef DEVICE_TYPE                      execution_space;
+  typedef Kokkos::View<T,execution_space>  type;
+  typedef typename type::HostMirror        h_type;
+
+  type data;
+
+  KOKKOS_INLINE_FUNCTION
+  void operator()(int) const { data() = 0; }
+};
+
+//---------------------------------------------------
+//--------------atomic_fetch_add---------------------
+//---------------------------------------------------
+
+// Functor that atomically adds T(1) to the single-value view on every call;
+// the index argument is ignored.
+template<class T,class DEVICE_TYPE>
+struct AddFunctor{
+  typedef DEVICE_TYPE                      execution_space;
+  typedef Kokkos::View<T,execution_space>  type;
+
+  type data;
+
+  KOKKOS_INLINE_FUNCTION
+  void operator()(int) const {
+    Kokkos::atomic_fetch_add( &data() , static_cast<T>(1) );
+  }
+};
+
+// Zeroes a single-value device view, runs AddFunctor over `loop` indices and
+// returns the final value copied back to the host.
+template<class T, class execution_space >
+T AddLoop(int loop) {
+  typedef ZeroFunctor<T,execution_space> zero_functor;
+
+  // Step 1: zero the device value.
+  zero_functor f_zero;
+  typename zero_functor::type   data("Data");
+  typename zero_functor::h_type h_data("HData");
+  f_zero.data = data;
+  Kokkos::parallel_for(1,f_zero);
+  execution_space::fence();
+
+  // Step 2: one atomic add per index.
+  AddFunctor<T,execution_space> f_add;
+  f_add.data = data;
+  Kokkos::parallel_for(loop,f_add);
+  execution_space::fence();
+
+  // Step 3: fetch the result.
+  Kokkos::deep_copy(h_data,data);
+  return h_data();
+}
+
+// Serial reference for AddLoop: starts from zero and adds T(1) `loop` times.
+//
+// The accumulator is an automatic variable rather than a one-element heap
+// array, so nothing can leak if an operation on T throws.  It is still
+// default-constructed and then assigned 0, which keeps the requirements on T
+// unchanged.
+//
+//   loop     number of additions; zero or a negative value returns T's zero
+//   returns  the accumulated value
+template<class T>
+T AddLoopSerial(int loop) {
+  T sum;
+  sum = 0;
+
+  for(int i=0;i<loop;i++)
+    sum+=(T)1;
+
+  return sum;
+}
+
+//------------------------------------------------------
+//--------------atomic_compare_exchange-----------------
+//------------------------------------------------------
+
+// Functor that increments the single-value view by T(1) using a
+// compare-exchange retry loop; the index argument is ignored.
+template<class T,class DEVICE_TYPE>
+struct CASFunctor{
+  typedef DEVICE_TYPE                      execution_space;
+  typedef Kokkos::View<T,execution_space>  type;
+
+  type data;
+
+  KOKKOS_INLINE_FUNCTION
+  void operator()(int) const {
+    T expected = data();
+    for (;;) {
+      T desired  = expected + (T)1;
+      T observed = Kokkos::atomic_compare_exchange(&data(), expected, desired);
+      // Done once the value seen by the exchange is the one we assumed.
+      if ( !( observed != expected ) ) break;
+      // Otherwise retry with the value that was actually there.
+      expected = observed;
+    }
+  }
+};
+
+// Zeroes a single-value device view, runs CASFunctor over `loop` indices and
+// returns the final value copied back to the host.
+template<class T, class execution_space >
+T CASLoop(int loop) {
+  typedef ZeroFunctor<T,execution_space> zero_functor;
+
+  // Step 1: zero the device value.
+  zero_functor f_zero;
+  typename zero_functor::type   data("Data");
+  typename zero_functor::h_type h_data("HData");
+  f_zero.data = data;
+  Kokkos::parallel_for(1,f_zero);
+  execution_space::fence();
+
+  // Step 2: one compare-exchange increment per index.
+  CASFunctor<T,execution_space> f_cas;
+  f_cas.data = data;
+  Kokkos::parallel_for(loop,f_cas);
+  execution_space::fence();
+
+  // Step 3: fetch the result.
+  Kokkos::deep_copy(h_data,data);
+  return h_data();
+}
+
+// Serial reference for CASLoop: starts from zero and performs `loop`
+// increments, each written in the shape of a compare-exchange retry loop.
+// Both `assumed` and `old` are read from the same value before it is
+// overwritten, so the retry condition is false after the first pass whenever
+// a value compares equal to itself.
+//
+// The value lives in an automatic variable rather than a one-element heap
+// array, so nothing can leak if an operation on T throws.  It is still
+// default-constructed and then assigned 0, which keeps the requirements on T
+// unchanged.
+//
+//   loop     number of increments; zero or a negative value returns T's zero
+//   returns  the final value
+template<class T>
+T CASLoopSerial(int loop) {
+  T value;
+  value = 0;
+
+  for(int i=0;i<loop;i++) {
+    T assumed;
+    T newval;
+    T old;
+    do {
+      assumed = value;
+      newval = assumed + (T)1;
+      old = value;       // what a compare-exchange would have returned
+      value = newval;
+    }
+    while(!(assumed==old));
+  }
+
+  return value;
+}
+
+//----------------------------------------------
+//--------------atomic_exchange-----------------
+//----------------------------------------------
+
+// Functor that atomically replaces `data` with T(i) and atomically adds the
+// value it displaced to `data2`.
+template<class T,class DEVICE_TYPE>
+struct ExchFunctor{
+  typedef DEVICE_TYPE                      execution_space;
+  typedef Kokkos::View<T,execution_space>  type;
+
+  type data;
+  type data2;
+
+  KOKKOS_INLINE_FUNCTION
+  void operator()(int i) const {
+    T displaced = Kokkos::atomic_exchange( &data() , static_cast<T>(i) );
+    Kokkos::atomic_fetch_add( &data2() , displaced );
+  }
+};
+
+// Zeroes two single-value device views, runs ExchFunctor over `loop`
+// indices and returns the sum of both final values copied back to the host.
+template<class T, class execution_space >
+T ExchLoop(int loop) {
+  typedef ZeroFunctor<T,execution_space>  zero_functor;
+  typedef typename zero_functor::type     view_type;
+  typedef typename zero_functor::h_type   host_view_type;
+
+  // Zero the exchanged value.
+  zero_functor f_zero;
+  view_type      data("Data");
+  host_view_type h_data("HData");
+  f_zero.data = data;
+  Kokkos::parallel_for(1,f_zero);
+  execution_space::fence();
+
+  // Zero the accumulator, reusing the same functor object.
+  view_type      data2("Data");
+  host_view_type h_data2("HData");
+  f_zero.data = data2;
+  Kokkos::parallel_for(1,f_zero);
+  execution_space::fence();
+
+  // One exchange-and-accumulate per index.
+  ExchFunctor<T,execution_space> f_exch;
+  f_exch.data = data;
+  f_exch.data2 = data2;
+  Kokkos::parallel_for(loop,f_exch);
+  execution_space::fence();
+
+  // Fetch both results and combine them.
+  Kokkos::deep_copy(h_data,data);
+  Kokkos::deep_copy(h_data2,data2);
+  return h_data() + h_data2();
+}
+
+// Serial reference for ExchLoop, selected for every T other than
+// Kokkos::complex<double>: for those types the parameter type below is int,
+// while for Kokkos::complex<double> it becomes void, which removes this
+// overload from consideration.
+//
+// For i = 0 .. loop-1 the current value is replaced by T(i) and the value it
+// displaced is added to an accumulator.  Returns accumulator + current.
+//
+// Both values are automatic variables rather than one-element heap arrays,
+// so nothing can leak if an operation on T throws.  They are still
+// default-constructed and then assigned 0, which keeps the requirements on T
+// unchanged.
+template<class T>
+T ExchLoopSerial(typename std::conditional<!std::is_same<T,Kokkos::complex<double> >::value,int,void>::type loop) {
+  T current;
+  T accumulated;
+  current = 0;
+  accumulated = 0;
+  for(int i=0;i<loop;i++) {
+    T old = current;
+    current=(T) i;
+    accumulated+=old;
+  }
+
+  T val = accumulated + current;
+  return val;
+}
+
+// Serial reference for ExchLoop, selected only for T ==
+// Kokkos::complex<double>: for that type the parameter type below is int,
+// while for every other T it becomes void, which removes this overload from
+// consideration.
+//
+// For i = 0 .. loop-1 the current value is replaced by (i, 0), written
+// through real() and imag(), and the value it displaced is added to an
+// accumulator.  Returns accumulator + current.
+//
+// Both values are automatic variables rather than one-element heap arrays,
+// so nothing can leak if an operation on T throws.  They are still
+// default-constructed and then assigned 0, which keeps the requirements on T
+// unchanged.
+template<class T>
+T ExchLoopSerial(typename std::conditional<std::is_same<T,Kokkos::complex<double> >::value,int,void>::type loop) {
+  T current;
+  T accumulated;
+  current = 0;
+  accumulated = 0;
+  for(int i=0;i<loop;i++) {
+    T old = current;
+    current.real() = (static_cast<double>(i));
+    current.imag() = 0;
+    accumulated+=old;
+  }
+
+  T val = accumulated + current;
+  return val;
+}
+
+// Dispatches to the device-side test selected by `test`:
+// 1 = AddLoop, 2 = CASLoop, 3 = ExchLoop.  Any other value returns 0.
+template<class T, class DeviceType >
+T LoopVariant(int loop, int test) {
+  if ( test == 1 ) return AddLoop<T,DeviceType>(loop);
+  if ( test == 2 ) return CASLoop<T,DeviceType>(loop);
+  if ( test == 3 ) return ExchLoop<T,DeviceType>(loop);
+  return 0;
+}
+
+// Dispatches to the serial reference selected by `test`:
+// 1 = AddLoopSerial, 2 = CASLoopSerial, 3 = ExchLoopSerial.  Any other value
+// returns 0.
+template<class T>
+T LoopVariantSerial(int loop, int test) {
+  if ( test == 1 ) return AddLoopSerial<T>(loop);
+  if ( test == 2 ) return CASLoopSerial<T>(loop);
+  if ( test == 3 ) return ExchLoopSerial<T>(loop);
+  return 0;
+}
+
+// Runs the device variant and the serial reference of test number `test`
+// with the same `loop` count and compares the results.  Returns true when
+// they agree; otherwise prints both values to std::cout and returns false.
+template<class T,class DeviceType>
+bool Loop(int loop, int test)
+{
+  T res       = LoopVariant<T,DeviceType>(loop,test);
+  T resSerial = LoopVariantSerial<T>(loop,test);
+
+  // Results agree: nothing to report.
+  if ( !( resSerial != res ) ) return true ;
+
+  std::cout << "Loop<"
+            << typeid(T).name()
+            << ">( test = "
+            << test << " FAILED : "
+            << resSerial << " != " << res
+            << std::endl ;
+
+  return false ;
+}
+
+}
+
diff --git a/lib/kokkos/core/unit_test/TestAtomicOperations.hpp b/lib/kokkos/core/unit_test/TestAtomicOperations.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..aee4bda06cea276e12fca664a48c81a428445bcd
--- /dev/null
+++ b/lib/kokkos/core/unit_test/TestAtomicOperations.hpp
@@ -0,0 +1,841 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+//
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact  H. Carter Edwards (hcedwar@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#include <Kokkos_Core.hpp>
+
+namespace TestAtomicOperations {
+
+//-----------------------------------------------
+//--------------zero_functor---------------------
+//-----------------------------------------------
+
+// Functor that writes zero into a single-value view (accessed as data());
+// the index argument is ignored.  Also exposes the device view type and its
+// host mirror type.
+template<class T,class DEVICE_TYPE>
+struct ZeroFunctor {
+  typedef DEVICE_TYPE                      execution_space;
+  typedef Kokkos::View<T,execution_space>  type;
+  typedef typename type::HostMirror        h_type;
+
+  type data;
+
+  KOKKOS_INLINE_FUNCTION
+  void operator()(int) const { data() = 0; }
+};
+
+//-----------------------------------------------
+//--------------init_functor---------------------
+//-----------------------------------------------
+
+// Functor that writes a caller-chosen initial value into a single-value view
+// (accessed as data()); the index argument is ignored.  Also exposes the
+// device view type and its host mirror type.
+template<class T,class DEVICE_TYPE>
+struct InitFunctor {
+  typedef DEVICE_TYPE                      execution_space;
+  typedef Kokkos::View<T,execution_space>  type;
+  typedef typename type::HostMirror        h_type;
+
+  type data;
+  T init_value ;
+
+  // Only the value is set here; `data` is assigned by the caller afterwards.
+  InitFunctor(T _init_value) : init_value(_init_value) {}
+
+  KOKKOS_INLINE_FUNCTION
+  void operator()(int) const { data() = init_value; }
+};
+
+
+//---------------------------------------------------
+//--------------atomic_fetch_max---------------------
+//---------------------------------------------------
+
+// Functor that applies Kokkos::atomic_fetch_max to the single-value view
+// with operand i1.  i0 is stored but not read by operator().
+template<class T,class DEVICE_TYPE>
+struct MaxFunctor{
+  typedef DEVICE_TYPE                      execution_space;
+  typedef Kokkos::View<T,execution_space>  type;
+
+  type data;
+  T i0;
+  T i1;
+
+  MaxFunctor( T _i0 , T _i1 ) : i0(_i0) , i1(_i1) {}
+
+  KOKKOS_INLINE_FUNCTION
+  void operator()(int) const {
+    Kokkos::atomic_fetch_max( &data() , static_cast<T>(i1) );
+  }
+};
+
+// Initializes a single-value device view to i0, applies MaxFunctor with
+// operand i1 once, and returns the resulting value copied back to the host.
+template<class T, class execution_space >
+T MaxAtomic(T i0 , T i1) {
+  typedef InitFunctor<T,execution_space> init_functor;
+
+  // Step 1: store i0.
+  init_functor f_init(i0);
+  typename init_functor::type   data("Data");
+  typename init_functor::h_type h_data("HData");
+  f_init.data = data;
+  Kokkos::parallel_for(1,f_init);
+  execution_space::fence();
+
+  // Step 2: a single atomic max with i1.
+  MaxFunctor<T,execution_space> f(i0,i1);
+  f.data = data;
+  Kokkos::parallel_for(1,f);
+  execution_space::fence();
+
+  // Step 3: fetch the result.
+  Kokkos::deep_copy(h_data,data);
+  return h_data();
+}
+
+// Host-side reference for MaxAtomic: returns the larger of i0 and i1
+// (i1 when the two compare equal).
+//
+// The value is returned directly; no temporary heap storage is needed.
+template<class T>
+T MaxAtomicCheck(T i0 , T i1) {
+  return (i0 > i1 ? i0 : i1) ;
+}
+
+// Compares the device result of MaxAtomic with the host reference
+// MaxAtomicCheck.  Returns true when they agree; otherwise prints both
+// values to std::cout and returns false.
+template<class T,class DeviceType>
+bool MaxAtomicTest(T i0, T i1)
+{
+  T res       = MaxAtomic<T,DeviceType>(i0,i1);
+  T resSerial = MaxAtomicCheck<T>(i0,i1);
+
+  // Results agree: nothing to report.
+  if ( !( resSerial != res ) ) return true ;
+
+  std::cout << "Loop<"
+            << typeid(T).name()
+            << ">( test = MaxAtomicTest"
+            << " FAILED : "
+            << resSerial << " != " << res
+            << std::endl ;
+
+  return false ;
+}
+
+//---------------------------------------------------
+//--------------atomic_fetch_min---------------------
+//---------------------------------------------------
+
+// Functor that applies Kokkos::atomic_fetch_min to the single-value view
+// with operand i1.  i0 is stored but not read by operator().
+template<class T,class DEVICE_TYPE>
+struct MinFunctor{
+  typedef DEVICE_TYPE                      execution_space;
+  typedef Kokkos::View<T,execution_space>  type;
+
+  type data;
+  T i0;
+  T i1;
+
+  MinFunctor( T _i0 , T _i1 ) : i0(_i0) , i1(_i1) {}
+
+  KOKKOS_INLINE_FUNCTION
+  void operator()(int) const {
+    Kokkos::atomic_fetch_min( &data() , static_cast<T>(i1) );
+  }
+};
+
+// Initializes a single-value device view to i0, applies MinFunctor with
+// operand i1 once, and returns the resulting value copied back to the host.
+template<class T, class execution_space >
+T MinAtomic(T i0 , T i1) {
+  typedef InitFunctor<T,execution_space> init_functor;
+
+  // Step 1: store i0.
+  init_functor f_init(i0);
+  typename init_functor::type   data("Data");
+  typename init_functor::h_type h_data("HData");
+  f_init.data = data;
+  Kokkos::parallel_for(1,f_init);
+  execution_space::fence();
+
+  // Step 2: a single atomic min with i1.
+  MinFunctor<T,execution_space> f(i0,i1);
+  f.data = data;
+  Kokkos::parallel_for(1,f);
+  execution_space::fence();
+
+  // Step 3: fetch the result.
+  Kokkos::deep_copy(h_data,data);
+  return h_data();
+}
+
+// Host-side reference for MinAtomic: returns the smaller of i0 and i1
+// (i1 when the two compare equal).
+//
+// The value is returned directly; no temporary heap storage is needed.
+template<class T>
+T MinAtomicCheck(T i0 , T i1) {
+  return (i0 < i1 ? i0 : i1) ;
+}
+
+// Compares the device result of MinAtomic with the host reference
+// MinAtomicCheck.  Returns true when they agree; otherwise prints both
+// values to std::cout and returns false.
+template<class T,class DeviceType>
+bool MinAtomicTest(T i0, T i1)
+{
+  T res       = MinAtomic<T,DeviceType>(i0,i1);
+  T resSerial = MinAtomicCheck<T>(i0,i1);
+
+  // Results agree: nothing to report.
+  if ( !( resSerial != res ) ) return true ;
+
+  std::cout << "Loop<"
+            << typeid(T).name()
+            << ">( test = MinAtomicTest"
+            << " FAILED : "
+            << resSerial << " != " << res
+            << std::endl ;
+
+  return false ;
+}
+
+//---------------------------------------------------
+//--------------atomic_fetch_mul---------------------
+//---------------------------------------------------
+
+// Functor that applies Kokkos::atomic_fetch_mul to the single-value view
+// with operand i1.  i0 is stored but not read by operator().
+template<class T,class DEVICE_TYPE>
+struct MulFunctor{
+  typedef DEVICE_TYPE                      execution_space;
+  typedef Kokkos::View<T,execution_space>  type;
+
+  type data;
+  T i0;
+  T i1;
+
+  MulFunctor( T _i0 , T _i1 ) : i0(_i0) , i1(_i1) {}
+
+  KOKKOS_INLINE_FUNCTION
+  void operator()(int) const {
+    Kokkos::atomic_fetch_mul( &data() , static_cast<T>(i1) );
+  }
+};
+
+// Initializes a single-value device view to i0, applies MulFunctor with
+// operand i1 once, and returns the resulting value copied back to the host.
+template<class T, class execution_space >
+T MulAtomic(T i0 , T i1) {
+  typedef InitFunctor<T,execution_space> init_functor;
+
+  // Step 1: store i0.
+  init_functor f_init(i0);
+  typename init_functor::type   data("Data");
+  typename init_functor::h_type h_data("HData");
+  f_init.data = data;
+  Kokkos::parallel_for(1,f_init);
+  execution_space::fence();
+
+  // Step 2: a single atomic multiply by i1.
+  MulFunctor<T,execution_space> f(i0,i1);
+  f.data = data;
+  Kokkos::parallel_for(1,f);
+  execution_space::fence();
+
+  // Step 3: fetch the result.
+  Kokkos::deep_copy(h_data,data);
+  return h_data();
+}
+
+// Host-side reference for MulAtomic: returns i0 * i1 converted to T.
+//
+// The value is returned directly; no temporary heap storage is needed.
+template<class T>
+T MulAtomicCheck(T i0 , T i1) {
+  return i0*i1 ;
+}
+
+// Compares the device result of MulAtomic with the host reference
+// MulAtomicCheck.  Returns true when they agree; otherwise prints both
+// values to std::cout and returns false.
+template<class T,class DeviceType>
+bool MulAtomicTest(T i0, T i1)
+{
+  T res       = MulAtomic<T,DeviceType>(i0,i1);
+  T resSerial = MulAtomicCheck<T>(i0,i1);
+
+  // Results agree: nothing to report.
+  if ( !( resSerial != res ) ) return true ;
+
+  std::cout << "Loop<"
+            << typeid(T).name()
+            << ">( test = MulAtomicTest"
+            << " FAILED : "
+            << resSerial << " != " << res
+            << std::endl ;
+
+  return false ;
+}
+
+//---------------------------------------------------
+//--------------atomic_fetch_div---------------------
+//---------------------------------------------------
+
+// Functor that applies Kokkos::atomic_fetch_div to the single-value view
+// with operand i1.  i0 is stored but not read by operator().
+template<class T,class DEVICE_TYPE>
+struct DivFunctor{
+  typedef DEVICE_TYPE                      execution_space;
+  typedef Kokkos::View<T,execution_space>  type;
+
+  type data;
+  T i0;
+  T i1;
+
+  DivFunctor( T _i0 , T _i1 ) : i0(_i0) , i1(_i1) {}
+
+  KOKKOS_INLINE_FUNCTION
+  void operator()(int) const {
+    Kokkos::atomic_fetch_div( &data() , static_cast<T>(i1) );
+  }
+};
+
+// Initializes a single-value device view to i0, applies DivFunctor with
+// operand i1 once, and returns the resulting value copied back to the host.
+template<class T, class execution_space >
+T DivAtomic(T i0 , T i1) {
+  typedef InitFunctor<T,execution_space> init_functor;
+
+  // Step 1: store i0.
+  init_functor f_init(i0);
+  typename init_functor::type   data("Data");
+  typename init_functor::h_type h_data("HData");
+  f_init.data = data;
+  Kokkos::parallel_for(1,f_init);
+  execution_space::fence();
+
+  // Step 2: a single atomic divide by i1.
+  DivFunctor<T,execution_space> f(i0,i1);
+  f.data = data;
+  Kokkos::parallel_for(1,f);
+  execution_space::fence();
+
+  // Step 3: fetch the result.
+  Kokkos::deep_copy(h_data,data);
+  return h_data();
+}
+
+// Host-side reference for DivAtomic: returns i0 / i1 converted to T.
+// No check is made on i1, so the caller must not pass a zero divisor for
+// integer types.
+//
+// The value is returned directly; no temporary heap storage is needed.
+template<class T>
+T DivAtomicCheck(T i0 , T i1) {
+  return i0/i1 ;
+}
+
+// Compares the device result of DivAtomic with the host reference
+// DivAtomicCheck.  Returns true when they agree; otherwise prints both
+// values to std::cout and returns false.
+template<class T,class DeviceType>
+bool DivAtomicTest(T i0, T i1)
+{
+  T res       = DivAtomic<T,DeviceType>(i0,i1);
+  T resSerial = DivAtomicCheck<T>(i0,i1);
+
+  // Results agree: nothing to report.
+  if ( !( resSerial != res ) ) return true ;
+
+  std::cout << "Loop<"
+            << typeid(T).name()
+            << ">( test = DivAtomicTest"
+            << " FAILED : "
+            << resSerial << " != " << res
+            << std::endl ;
+
+  return false ;
+}
+
+//---------------------------------------------------
+//--------------atomic_fetch_mod---------------------
+//---------------------------------------------------
+
+// Functor that applies a single Kokkos::atomic_fetch_mod by i1 to the value held in `data`.
+template<class T,class DEVICE_TYPE>
+struct ModFunctor{
+  typedef DEVICE_TYPE execution_space;
+  typedef Kokkos::View<T,execution_space> type;
+  type data;  // not set by the constructor; assign before launching
+  T i0;       // stored but not read by operator()
+  T i1;       // right-hand operand (the modulus)
+  ModFunctor( T first , T second ) : i0(first) , i1(second) {}
+  KOKKOS_INLINE_FUNCTION
+  void operator()(int) const {
+    Kokkos::atomic_fetch_mod(&data(),static_cast<T>(i1));
+  }
+};
+
+// Runs InitFunctor (built from i0) and then ModFunctor(i0,i1) on the same view,
+// one iteration each, and returns the view's value after a deep_copy to h_type.
+template<class T, class execution_space >
+T ModAtomic(T i0 , T i1) {
+  typedef InitFunctor<T,execution_space> init_functor;
+  init_functor setup(i0);
+  typename init_functor::type   values("Data");
+  typename init_functor::h_type host_values("HData");
+  setup.data = values;
+  Kokkos::parallel_for(1,setup);
+  execution_space::fence();
+  ModFunctor<T,execution_space> op(i0,i1);
+  op.data = values;
+  Kokkos::parallel_for(1,op);
+  execution_space::fence();
+  Kokkos::deep_copy(host_values,values);
+  return host_values();
+}
+
+// Host-side reference for ModAtomicTest: returns i0 % i1 evaluated with the
+// built-in operator and converted back to T.
+//
+// The result is held in a plain value; no heap allocation is involved, so
+// there is nothing to release on any path.
+// Precondition: i1 != 0, since the remainder of a division by zero is
+// undefined behaviour.
+template<class T>
+T ModAtomicCheck(T i0 , T i1) {
+  return static_cast<T>( i0%i1 );
+}
+
+// Runs the atomic modulo on DeviceType and compares the outcome with the
+// host-side reference from ModAtomicCheck.  On a mismatch a diagnostic is
+// written to std::cout and false is returned; otherwise returns true.
+template<class T,class DeviceType>
+bool ModAtomicTest(T i0, T i1)
+{
+  const T computed = ModAtomic<T,DeviceType>(i0,i1);
+  const T expected = ModAtomicCheck<T>(i0,i1);
+
+  if ( expected != computed ) {
+    // Printed as "<reference> != <device result>".
+    std::cout << "Loop<"
+              << typeid(T).name()
+              << ">( test = ModAtomicTest"
+              << " FAILED : "
+              << expected << " != " << computed << std::endl ;
+    return false ;
+  }
+
+  return true ;
+}
+
+//---------------------------------------------------
+//--------------atomic_fetch_and---------------------
+//---------------------------------------------------
+
+// Functor that applies a single Kokkos::atomic_fetch_and with i1 to the value held in `data`.
+template<class T,class DEVICE_TYPE>
+struct AndFunctor{
+  typedef DEVICE_TYPE execution_space;
+  typedef Kokkos::View<T,execution_space> type;
+  type data;  // not set by the constructor; assign before launching
+  T i0;       // stored but not read by operator()
+  T i1;       // right-hand operand of the bitwise AND
+  AndFunctor( T first , T second ) : i0(first) , i1(second) {}
+  KOKKOS_INLINE_FUNCTION
+  void operator()(int) const {
+    Kokkos::atomic_fetch_and(&data(),static_cast<T>(i1));
+  }
+};
+
+// Runs InitFunctor (built from i0) and then AndFunctor(i0,i1) on the same view,
+// one iteration each, and returns the view's value after a deep_copy to h_type.
+template<class T, class execution_space >
+T AndAtomic(T i0 , T i1) {
+  typedef InitFunctor<T,execution_space> init_functor;
+  init_functor setup(i0);
+  typename init_functor::type   values("Data");
+  typename init_functor::h_type host_values("HData");
+  setup.data = values;
+  Kokkos::parallel_for(1,setup);
+  execution_space::fence();
+  AndFunctor<T,execution_space> op(i0,i1);
+  op.data = values;
+  Kokkos::parallel_for(1,op);
+  execution_space::fence();
+  Kokkos::deep_copy(host_values,values);
+  return host_values();
+}
+
+// Host-side reference for AndAtomicTest: returns i0 & i1 evaluated with the
+// built-in operator and converted back to T.
+//
+// Operands narrower than int are promoted before the operation; the cast
+// narrows the result to T again, exactly as an assignment to a T would.
+// The result is held in a plain value; no heap allocation is involved, so
+// there is nothing to release on any path.
+template<class T>
+T AndAtomicCheck(T i0 , T i1) {
+  return static_cast<T>( i0&i1 );
+}
+
+// Runs the atomic AND on DeviceType and compares the outcome with the
+// host-side reference from AndAtomicCheck.  On a mismatch a diagnostic is
+// written to std::cout and false is returned; otherwise returns true.
+template<class T,class DeviceType>
+bool AndAtomicTest(T i0, T i1)
+{
+  const T computed = AndAtomic<T,DeviceType>(i0,i1);
+  const T expected = AndAtomicCheck<T>(i0,i1);
+
+  if ( expected != computed ) {
+    // Printed as "<reference> != <device result>".
+    std::cout << "Loop<"
+              << typeid(T).name()
+              << ">( test = AndAtomicTest"
+              << " FAILED : "
+              << expected << " != " << computed << std::endl ;
+    return false ;
+  }
+
+  return true ;
+}
+
+//---------------------------------------------------
+//--------------atomic_fetch_or----------------------
+//---------------------------------------------------
+
+// Functor that applies a single Kokkos::atomic_fetch_or with i1 to the value held in `data`.
+template<class T,class DEVICE_TYPE>
+struct OrFunctor{
+  typedef DEVICE_TYPE execution_space;
+  typedef Kokkos::View<T,execution_space> type;
+  type data;  // not set by the constructor; assign before launching
+  T i0;       // stored but not read by operator()
+  T i1;       // right-hand operand of the bitwise OR
+  OrFunctor( T first , T second ) : i0(first) , i1(second) {}
+  KOKKOS_INLINE_FUNCTION
+  void operator()(int) const {
+    Kokkos::atomic_fetch_or(&data(),static_cast<T>(i1));
+  }
+};
+
+// Runs InitFunctor (built from i0) and then OrFunctor(i0,i1) on the same view,
+// one iteration each, and returns the view's value after a deep_copy to h_type.
+template<class T, class execution_space >
+T OrAtomic(T i0 , T i1) {
+  typedef InitFunctor<T,execution_space> init_functor;
+  init_functor setup(i0);
+  typename init_functor::type   values("Data");
+  typename init_functor::h_type host_values("HData");
+  setup.data = values;
+  Kokkos::parallel_for(1,setup);
+  execution_space::fence();
+  OrFunctor<T,execution_space> op(i0,i1);
+  op.data = values;
+  Kokkos::parallel_for(1,op);
+  execution_space::fence();
+  Kokkos::deep_copy(host_values,values);
+  return host_values();
+}
+
+// Host-side reference for OrAtomicTest: returns i0 | i1 evaluated with the
+// built-in operator and converted back to T.
+//
+// Operands narrower than int are promoted before the operation; the cast
+// narrows the result to T again, exactly as an assignment to a T would.
+// The result is held in a plain value; no heap allocation is involved, so
+// there is nothing to release on any path.
+template<class T>
+T OrAtomicCheck(T i0 , T i1) {
+  return static_cast<T>( i0|i1 );
+}
+
+// Runs the atomic OR on DeviceType and compares the outcome with the
+// host-side reference from OrAtomicCheck.  On a mismatch a diagnostic is
+// written to std::cout and false is returned; otherwise returns true.
+template<class T,class DeviceType>
+bool OrAtomicTest(T i0, T i1)
+{
+  const T computed = OrAtomic<T,DeviceType>(i0,i1);
+  const T expected = OrAtomicCheck<T>(i0,i1);
+
+  if ( expected != computed ) {
+    // Printed as "<reference> != <device result>".
+    std::cout << "Loop<"
+              << typeid(T).name()
+              << ">( test = OrAtomicTest"
+              << " FAILED : "
+              << expected << " != " << computed << std::endl ;
+    return false ;
+  }
+
+  return true ;
+}
+
+//---------------------------------------------------
+//--------------atomic_fetch_xor---------------------
+//---------------------------------------------------
+
+// Functor that applies a single Kokkos::atomic_fetch_xor with i1 to the value held in `data`.
+template<class T,class DEVICE_TYPE>
+struct XorFunctor{
+  typedef DEVICE_TYPE execution_space;
+  typedef Kokkos::View<T,execution_space> type;
+  type data;  // not set by the constructor; assign before launching
+  T i0;       // stored but not read by operator()
+  T i1;       // right-hand operand of the bitwise XOR
+  XorFunctor( T first , T second ) : i0(first) , i1(second) {}
+  KOKKOS_INLINE_FUNCTION
+  void operator()(int) const {
+    Kokkos::atomic_fetch_xor(&data(),static_cast<T>(i1));
+  }
+};
+
+// Runs InitFunctor (built from i0) and then XorFunctor(i0,i1) on the same view,
+// one iteration each, and returns the view's value after a deep_copy to h_type.
+template<class T, class execution_space >
+T XorAtomic(T i0 , T i1) {
+  typedef InitFunctor<T,execution_space> init_functor;
+  init_functor setup(i0);
+  typename init_functor::type   values("Data");
+  typename init_functor::h_type host_values("HData");
+  setup.data = values;
+  Kokkos::parallel_for(1,setup);
+  execution_space::fence();
+  XorFunctor<T,execution_space> op(i0,i1);
+  op.data = values;
+  Kokkos::parallel_for(1,op);
+  execution_space::fence();
+  Kokkos::deep_copy(host_values,values);
+  return host_values();
+}
+
+// Host-side reference for XorAtomicTest: returns i0 ^ i1 evaluated with the
+// built-in operator and converted back to T.
+//
+// Operands narrower than int are promoted before the operation; the cast
+// narrows the result to T again, exactly as an assignment to a T would.
+// The result is held in a plain value; no heap allocation is involved, so
+// there is nothing to release on any path.
+template<class T>
+T XorAtomicCheck(T i0 , T i1) {
+  return static_cast<T>( i0^i1 );
+}
+
+// Runs the atomic XOR on DeviceType and compares the outcome with the
+// host-side reference from XorAtomicCheck.  On a mismatch a diagnostic is
+// written to std::cout and false is returned; otherwise returns true.
+template<class T,class DeviceType>
+bool XorAtomicTest(T i0, T i1)
+{
+  const T computed = XorAtomic<T,DeviceType>(i0,i1);
+  const T expected = XorAtomicCheck<T>(i0,i1);
+
+  if ( expected != computed ) {
+    // Printed as "<reference> != <device result>".
+    std::cout << "Loop<"
+              << typeid(T).name()
+              << ">( test = XorAtomicTest"
+              << " FAILED : "
+              << expected << " != " << computed << std::endl ;
+    return false ;
+  }
+
+  return true ;
+}
+
+//---------------------------------------------------
+//--------------atomic_fetch_lshift---------------------
+//---------------------------------------------------
+
+// Functor that applies a single Kokkos::atomic_fetch_lshift by i1 to the value held in `data`.
+template<class T,class DEVICE_TYPE>
+struct LShiftFunctor{
+  typedef DEVICE_TYPE execution_space;
+  typedef Kokkos::View<T,execution_space> type;
+  type data;  // not set by the constructor; assign before launching
+  T i0;       // stored but not read by operator()
+  T i1;       // right-hand operand (the shift count)
+  LShiftFunctor( T first , T second ) : i0(first) , i1(second) {}
+  KOKKOS_INLINE_FUNCTION
+  void operator()(int) const {
+    Kokkos::atomic_fetch_lshift(&data(),static_cast<T>(i1));
+  }
+};
+
+// Runs InitFunctor (built from i0) and then LShiftFunctor(i0,i1) on the same view,
+// one iteration each, and returns the view's value after a deep_copy to h_type.
+template<class T, class execution_space >
+T LShiftAtomic(T i0 , T i1) {
+  typedef InitFunctor<T,execution_space> init_functor;
+  init_functor setup(i0);
+  typename init_functor::type   values("Data");
+  typename init_functor::h_type host_values("HData");
+  setup.data = values;
+  Kokkos::parallel_for(1,setup);
+  execution_space::fence();
+  LShiftFunctor<T,execution_space> op(i0,i1);
+  op.data = values;
+  Kokkos::parallel_for(1,op);
+  execution_space::fence();
+  Kokkos::deep_copy(host_values,values);
+  return host_values();
+}
+
+// Host-side reference for LShiftAtomicTest: returns i0 << i1 evaluated with
+// the built-in operator and converted back to T.
+//
+// The result is held in a plain value; no heap allocation is involved, so
+// there is nothing to release on any path.
+// Precondition: i1 must be non-negative and smaller than the bit width of
+// the promoted left operand; otherwise the shift is undefined behaviour.
+template<class T>
+T LShiftAtomicCheck(T i0 , T i1) {
+  return static_cast<T>( i0<<i1 );
+}
+
+// Runs the atomic left shift on DeviceType and compares the outcome with the
+// host-side reference from LShiftAtomicCheck.  On a mismatch a diagnostic is
+// written to std::cout and false is returned; otherwise returns true.
+template<class T,class DeviceType>
+bool LShiftAtomicTest(T i0, T i1)
+{
+  const T computed = LShiftAtomic<T,DeviceType>(i0,i1);
+  const T expected = LShiftAtomicCheck<T>(i0,i1);
+
+  if ( expected != computed ) {
+    // Printed as "<reference> != <device result>".
+    std::cout << "Loop<"
+              << typeid(T).name()
+              << ">( test = LShiftAtomicTest"
+              << " FAILED : "
+              << expected << " != " << computed << std::endl ;
+    return false ;
+  }
+
+  return true ;
+}
+
+//---------------------------------------------------
+//--------------atomic_fetch_rshift---------------------
+//---------------------------------------------------
+
+// Functor that applies a single Kokkos::atomic_fetch_rshift by i1 to the value held in `data`.
+template<class T,class DEVICE_TYPE>
+struct RShiftFunctor{
+  typedef DEVICE_TYPE execution_space;
+  typedef Kokkos::View<T,execution_space> type;
+  type data;  // not set by the constructor; assign before launching
+  T i0;       // stored but not read by operator()
+  T i1;       // right-hand operand (the shift count)
+  RShiftFunctor( T first , T second ) : i0(first) , i1(second) {}
+  KOKKOS_INLINE_FUNCTION
+  void operator()(int) const {
+    Kokkos::atomic_fetch_rshift(&data(),static_cast<T>(i1));
+  }
+};
+
+// Runs InitFunctor (built from i0) and then RShiftFunctor(i0,i1) on the same view,
+// one iteration each, and returns the view's value after a deep_copy to h_type.
+template<class T, class execution_space >
+T RShiftAtomic(T i0 , T i1) {
+  typedef InitFunctor<T,execution_space> init_functor;
+  init_functor setup(i0);
+  typename init_functor::type   values("Data");
+  typename init_functor::h_type host_values("HData");
+  setup.data = values;
+  Kokkos::parallel_for(1,setup);
+  execution_space::fence();
+  RShiftFunctor<T,execution_space> op(i0,i1);
+  op.data = values;
+  Kokkos::parallel_for(1,op);
+  execution_space::fence();
+  Kokkos::deep_copy(host_values,values);
+  return host_values();
+}
+
+// Host-side reference for RShiftAtomicTest: returns i0 >> i1 evaluated with
+// the built-in operator and converted back to T.
+//
+// The result is held in a plain value; no heap allocation is involved, so
+// there is nothing to release on any path.
+// Precondition: i1 must be non-negative and smaller than the bit width of
+// the promoted left operand; otherwise the shift is undefined behaviour.
+template<class T>
+T RShiftAtomicCheck(T i0 , T i1) {
+  return static_cast<T>( i0>>i1 );
+}
+
+// Runs the atomic right shift on DeviceType and compares the outcome with the
+// host-side reference from RShiftAtomicCheck.  On a mismatch a diagnostic is
+// written to std::cout and false is returned; otherwise returns true.
+template<class T,class DeviceType>
+bool RShiftAtomicTest(T i0, T i1)
+{
+  const T computed = RShiftAtomic<T,DeviceType>(i0,i1);
+  const T expected = RShiftAtomicCheck<T>(i0,i1);
+
+  if ( expected != computed ) {
+    // Printed as "<reference> != <device result>".
+    std::cout << "Loop<"
+              << typeid(T).name()
+              << ">( test = RShiftAtomicTest"
+              << " FAILED : "
+              << expected << " != " << computed << std::endl ;
+    return false ;
+  }
+
+  return true ;
+}
+
+
+//---------------------------------------------------
+//--------------atomic_test_control------------------
+//---------------------------------------------------
+
+// Dispatches test ids 1..10 to the matching atomic test on T; any other id returns false.
+template<class T,class DeviceType>
+bool AtomicOperationsTestIntegralType( int i0 , int i1 , int test ) {
+  switch (test) {
+    case  1: return MaxAtomicTest<T,DeviceType>( static_cast<T>(i0) , static_cast<T>(i1) );
+    case  2: return MinAtomicTest<T,DeviceType>( static_cast<T>(i0) , static_cast<T>(i1) );
+    case  3: return MulAtomicTest<T,DeviceType>( static_cast<T>(i0) , static_cast<T>(i1) );
+    case  4: return DivAtomicTest<T,DeviceType>( static_cast<T>(i0) , static_cast<T>(i1) );
+    case  5: return ModAtomicTest<T,DeviceType>( static_cast<T>(i0) , static_cast<T>(i1) );
+    case  6: return AndAtomicTest<T,DeviceType>( static_cast<T>(i0) , static_cast<T>(i1) );
+    case  7: return OrAtomicTest<T,DeviceType>( static_cast<T>(i0) , static_cast<T>(i1) );
+    case  8: return XorAtomicTest<T,DeviceType>( static_cast<T>(i0) , static_cast<T>(i1) );
+    case  9: return LShiftAtomicTest<T,DeviceType>( static_cast<T>(i0) , static_cast<T>(i1) );
+    case 10: return RShiftAtomicTest<T,DeviceType>( static_cast<T>(i0) , static_cast<T>(i1) );
+    default: return false;
+  }
+}
+
+// Dispatches test ids 1..4 (max, min, mul, div) to the matching atomic test on T; any other id returns false.
+template<class T,class DeviceType>
+bool AtomicOperationsTestNonIntegralType( int i0 , int i1 , int test ) {
+  switch (test) {
+    case 1: return MaxAtomicTest<T,DeviceType>( static_cast<T>(i0) , static_cast<T>(i1) );
+    case 2: return MinAtomicTest<T,DeviceType>( static_cast<T>(i0) , static_cast<T>(i1) );
+    case 3: return MulAtomicTest<T,DeviceType>( static_cast<T>(i0) , static_cast<T>(i1) );
+    case 4: return DivAtomicTest<T,DeviceType>( static_cast<T>(i0) , static_cast<T>(i1) );
+    default: return false;
+  }
+}
+
+} // namespace
+
diff --git a/lib/kokkos/core/unit_test/TestCXX11.hpp b/lib/kokkos/core/unit_test/TestCXX11.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..2d6349776b33f3af85f6feb3fab91331d7a6de0e
--- /dev/null
+++ b/lib/kokkos/core/unit_test/TestCXX11.hpp
@@ -0,0 +1,334 @@
+/*
+//@HEADER
+// ************************************************************************
+// 
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+// 
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+// 
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact  H. Carter Edwards (hcedwar@sandia.gov)
+// 
+// ************************************************************************
+//@HEADER
+*/
+#include <Kokkos_Core.hpp>
+
+namespace TestCXX11 {
+
+// Element-wise test functor: for each row, writes five sums/differences of the
+// columns of a_ into b_.  Callable with a plain index or with a team member.
+template<class DeviceType>
+struct FunctorAddTest{
+  typedef DeviceType execution_space;
+  typedef Kokkos::View<double**,DeviceType> view_type;
+  typedef typename Kokkos::TeamPolicy< execution_space >::member_type  team_member ;
+  view_type a_, b_;
+  FunctorAddTest(view_type & a, view_type &b):a_(a),b_(b) {}
+
+  // Index entry point: fills row `row` of b_ from row `row` of a_.
+  KOKKOS_INLINE_FUNCTION
+  void operator() (const int& row) const {
+    b_(row,0) = a_(row,1) + a_(row,2);
+    b_(row,1) = a_(row,0) - a_(row,3);
+    b_(row,2) = a_(row,4) + a_(row,0);
+    b_(row,3) = a_(row,2) - a_(row,1);
+    b_(row,4) = a_(row,3) + a_(row,4);
+  }
+
+  // Team entry point: league L covers rows [4L, 4L+4), strided over the
+  // team's threads; each row is delegated to the index overload above.
+  KOKKOS_INLINE_FUNCTION
+  void operator() (const team_member & dev) const {
+    const int first = dev.league_rank() * 4 ;
+    for ( int row = first + dev.team_rank() ; row < first + 4 ; row += dev.team_size() )
+      (*this)( row );
+  }
+};
+
+// Fills a 100x5 view from the host, runs FunctorAddTest over it (index range
+// of 100 when PWRTest is false, team policy with 25 leagues otherwise) and
+// returns the sum of all entries of the output view.
+template<class DeviceType, bool PWRTest>
+double AddTestFunctor() {
+  typedef Kokkos::TeamPolicy<DeviceType>     policy_type ;
+  typedef Kokkos::View<double**,DeviceType>  view_type ;
+  typedef typename view_type::HostMirror     host_view_type ;
+
+  view_type a("A",100,5);
+  view_type b("B",100,5);
+  host_view_type h_a = Kokkos::create_mirror_view(a);
+  host_view_type h_b = Kokkos::create_mirror_view(b);
+
+  for(int row=0;row<100;++row)
+    for(int col=0;col<5;++col)
+      h_a(row,col) = 0.1*row/(1.1*col+1.0) + 0.5*col;
+  Kokkos::deep_copy(a,h_a);
+
+  if(PWRTest)
+    Kokkos::parallel_for(policy_type(25,Kokkos::AUTO),FunctorAddTest<DeviceType>(a,b));
+  else
+    Kokkos::parallel_for(100,FunctorAddTest<DeviceType>(a,b));
+  Kokkos::deep_copy(h_b,b);
+
+  double total = 0;
+  for(int row=0;row<100;++row)
+    for(int col=0;col<5;++col) total += h_b(row,col);
+  return total;
+}
+
+
+#if defined (KOKKOS_HAVE_CXX11_DISPATCH_LAMBDA)
+template<class DeviceType, bool PWRTest>
+double AddTestLambda() {
+  // Lambda-dispatch counterpart of AddTestFunctor: same 100x5 input and arithmetic, written as KOKKOS_LAMBDA bodies.
+  Kokkos::View<double**,DeviceType> a("A",100,5);
+  Kokkos::View<double**,DeviceType> b("B",100,5);
+  typename Kokkos::View<double**,DeviceType>::HostMirror h_a = Kokkos::create_mirror_view(a);
+  typename Kokkos::View<double**,DeviceType>::HostMirror h_b = Kokkos::create_mirror_view(b);
+  // Fill the host mirror of `a`, then copy it into `a`.
+  for(int i=0;i<100;i++) {
+    for(int j=0;j<5;j++)
+       h_a(i,j) = 0.1*i/(1.1*j+1.0) + 0.5*j;
+  }
+  Kokkos::deep_copy(a,h_a);
+  // PWRTest selects the dispatch: false = index range of 100, true = team policy.
+  if(PWRTest==false) {
+    Kokkos::parallel_for(100,KOKKOS_LAMBDA(const int& i)  {
+      b(i,0) = a(i,1) + a(i,2);
+      b(i,1) = a(i,0) - a(i,3);
+      b(i,2) = a(i,4) + a(i,0);
+      b(i,3) = a(i,2) - a(i,1);
+      b(i,4) = a(i,3) + a(i,4);
+    });
+  } else {
+    typedef Kokkos::TeamPolicy<DeviceType> policy_type ;
+    typedef typename policy_type::member_type team_member ;
+
+    policy_type policy(25,Kokkos::AUTO);
+    // 25 leagues x 4 rows per league cover the 100 rows.
+    Kokkos::parallel_for(policy,KOKKOS_LAMBDA(const team_member & dev)  {
+      const int begin = dev.league_rank() * 4 ;
+      const int end   = begin + 4 ;
+      for ( int i = begin + dev.team_rank() ; i < end ; i += dev.team_size() ) {
+        b(i,0) = a(i,1) + a(i,2);
+        b(i,1) = a(i,0) - a(i,3);
+        b(i,2) = a(i,4) + a(i,0);
+        b(i,3) = a(i,2) - a(i,1);
+        b(i,4) = a(i,3) + a(i,4);
+      }
+    });
+  }
+  Kokkos::deep_copy(h_b,b);
+  // Sum every entry of the output on the host (rows outer, columns inner).
+  double result = 0;
+  for(int i=0;i<100;i++) {
+      for(int j=0;j<5;j++)
+         result += h_b(i,j);
+    }
+
+  return result;
+}
+// When lambda dispatch is not available, the variant below reuses the functor implementation.
+#else
+template<class DeviceType, bool PWRTest>
+double AddTestLambda() {
+  return AddTestFunctor<DeviceType,PWRTest>();
+}
+#endif
+
+
+// Reduction test functor: adds five sums/differences of the columns of each
+// row of a_ into the reduction value.  Callable with an index or a team member.
+template<class DeviceType>
+struct FunctorReduceTest{
+  typedef DeviceType execution_space;
+  typedef double value_type;
+  typedef Kokkos::View<double**,DeviceType> view_type;
+  typedef typename Kokkos::TeamPolicy< execution_space >::member_type  team_member ;
+  view_type a_;
+  FunctorReduceTest(view_type & a):a_(a) {}
+
+  // Index entry point: adds the contribution of row `row` to `sum`.
+  KOKKOS_INLINE_FUNCTION
+  void operator() (const int& row, value_type& sum) const {
+    sum += a_(row,1) + a_(row,2);
+    sum += a_(row,0) - a_(row,3);
+    sum += a_(row,4) + a_(row,0);
+    sum += a_(row,2) - a_(row,1);
+    sum += a_(row,3) + a_(row,4);
+  }
+
+  // Team entry point: league L covers rows [4L, 4L+4), strided over the
+  // team's threads; each row is delegated to the index overload above.
+  KOKKOS_INLINE_FUNCTION
+  void operator() (const team_member & dev, value_type& sum) const {
+    const int first = dev.league_rank() * 4 ;
+    for ( int row = first + dev.team_rank() ; row < first + 4 ; row += dev.team_size() )
+      (*this)( row , sum );
+  }
+
+  // init sets a partial result to 0.0; join adds one partial result into another.
+  KOKKOS_INLINE_FUNCTION
+  void init(value_type& update) const {update = 0.0;}
+  KOKKOS_INLINE_FUNCTION
+  void join(volatile value_type& update, volatile value_type const& input) const {update += input;}
+};
+
+// Reduces FunctorReduceTest over a 100x5 view (index range of 100 when PWRTest
+// is false, team policy with 25 leagues otherwise) and returns the result.
+template<class DeviceType, bool PWRTest>
+double ReduceTestFunctor() {
+  typedef Kokkos::TeamPolicy<DeviceType> policy_type ;
+  typedef Kokkos::View<double**,DeviceType> view_type ;
+  typedef Kokkos::View<double,typename view_type::host_mirror_space,Kokkos::MemoryUnmanaged> unmanaged_result ;
+
+  view_type a("A",100,5);
+  typename view_type::HostMirror h_a = Kokkos::create_mirror_view(a);
+  for(int row=0;row<100;++row)
+    for(int col=0;col<5;++col)
+      h_a(row,col) = 0.1*row/(1.1*col+1.0) + 0.5*col;
+  Kokkos::deep_copy(a,h_a);
+
+  // The unmanaged view handed to parallel_reduce is built on the address of `total`.
+  double total = 0.0;
+  if(PWRTest)
+    Kokkos::parallel_reduce(policy_type(25,Kokkos::AUTO),FunctorReduceTest<DeviceType>(a), unmanaged_result( & total ));
+  else
+    Kokkos::parallel_reduce(100,FunctorReduceTest<DeviceType>(a), unmanaged_result( & total ));
+
+  return total;
+}
+
+#if defined (KOKKOS_HAVE_CXX11_DISPATCH_LAMBDA)
+template<class DeviceType, bool PWRTest>
+double ReduceTestLambda() {
+  // Lambda-dispatch counterpart of ReduceTestFunctor: same 100x5 input and per-row terms, written as KOKKOS_LAMBDA bodies.
+  typedef Kokkos::TeamPolicy<DeviceType> policy_type ;
+  typedef Kokkos::View<double**,DeviceType> view_type ;
+  typedef Kokkos::View<double,typename view_type::host_mirror_space,Kokkos::MemoryUnmanaged> unmanaged_result ;
+
+  view_type a("A",100,5);
+  typename view_type::HostMirror h_a = Kokkos::create_mirror_view(a);
+  // Fill the host mirror of `a`, then copy it into `a`.
+  for(int i=0;i<100;i++) {
+    for(int j=0;j<5;j++)
+       h_a(i,j) = 0.1*i/(1.1*j+1.0) + 0.5*j;
+  }
+  Kokkos::deep_copy(a,h_a);
+  // `result` is handed to parallel_reduce through an unmanaged view built on its address.
+  double result = 0.0;
+  // PWRTest selects the dispatch: false = index range of 100, true = team policy with 25 leagues of 4 rows.
+  if(PWRTest==false) {
+    Kokkos::parallel_reduce(100,KOKKOS_LAMBDA(const int& i, double& sum)  {
+      sum += a(i,1) + a(i,2);
+      sum += a(i,0) - a(i,3);
+      sum += a(i,4) + a(i,0);
+      sum += a(i,2) - a(i,1);
+      sum += a(i,3) + a(i,4);
+    }, unmanaged_result( & result ) );
+  } else {
+    typedef typename policy_type::member_type team_member ;
+    Kokkos::parallel_reduce(policy_type(25,Kokkos::AUTO),KOKKOS_LAMBDA(const team_member & dev, double& sum)  {
+      const int begin = dev.league_rank() * 4 ;
+      const int end   = begin + 4 ;
+      for ( int i = begin + dev.team_rank() ; i < end ; i += dev.team_size() ) {
+        sum += a(i,1) + a(i,2);
+        sum += a(i,0) - a(i,3);
+        sum += a(i,4) + a(i,0);
+        sum += a(i,2) - a(i,1);
+        sum += a(i,3) + a(i,4);
+      }
+    }, unmanaged_result( & result ) );
+  }
+
+  return result;
+}
+// When lambda dispatch is not available, the variant below reuses the functor implementation.
+#else
+template<class DeviceType, bool PWRTest>
+double ReduceTestLambda() {
+  return ReduceTestFunctor<DeviceType,PWRTest>();
+}
+#endif
+
+// Runs lambda-based variant `test` (1: add/range, 2: add/team, 3: reduce/range,
+// 4: reduce/team) and returns its result; any other id yields 0.
+template<class DeviceType>
+double TestVariantLambda(int test) {
+  if ( test == 1 ) return AddTestLambda<DeviceType,false>();
+  if ( test == 2 ) return AddTestLambda<DeviceType,true>();
+  if ( test == 3 ) return ReduceTestLambda<DeviceType,false>();
+  if ( test == 4 ) return ReduceTestLambda<DeviceType,true>();
+  return 0;
+}
+
+
+// Runs functor-based variant `test` (1: add/range, 2: add/team, 3: reduce/range,
+// 4: reduce/team) and returns its result; any other id yields 0.
+template<class DeviceType>
+double TestVariantFunctor(int test) {
+  if ( test == 1 ) return AddTestFunctor<DeviceType,false>();
+  if ( test == 2 ) return AddTestFunctor<DeviceType,true>();
+  if ( test == 3 ) return ReduceTestFunctor<DeviceType,false>();
+  if ( test == 4 ) return ReduceTestFunctor<DeviceType,true>();
+  return 0;
+}
+
+// Compares the functor-based and lambda-based implementations of variant
+// `test`; prints a diagnostic and returns false when the results differ.
+// Without lambda dispatch support there is nothing to compare: returns true.
+template<class DeviceType>
+bool Test(int test) {
+#ifdef KOKKOS_HAVE_CXX11_DISPATCH_LAMBDA
+  const double from_functor = TestVariantFunctor<DeviceType>(test);
+  const double from_lambda  = TestVariantLambda<DeviceType>(test);
+
+  if ( from_functor != from_lambda ) {
+    // Labels indexed by test id; slot 0 is a placeholder.
+    const char * const labels[5] = { " "
+                                   , "AddTest" , "AddTest TeamPolicy"
+                                   , "ReduceTest" , "ReduceTest TeamPolicy"
+                                   };
+    std::cout << "CXX11 ( test = '"
+              << labels[test] << "' FAILED : "
+              << from_functor << " != " << from_lambda
+              << std::endl ;
+    return false ;
+  }
+
+  return true ;
+#else
+  return true;
+#endif
+}
+
+}
diff --git a/lib/kokkos/core/unit_test/TestCXX11Deduction.hpp b/lib/kokkos/core/unit_test/TestCXX11Deduction.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..359e17a44f1642d630b97987f8d049fc3217a9fb
--- /dev/null
+++ b/lib/kokkos/core/unit_test/TestCXX11Deduction.hpp
@@ -0,0 +1,94 @@
+/*
+//@HEADER
+// ************************************************************************
+// 
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+// 
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+// 
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact  H. Carter Edwards (hcedwar@sandia.gov)
+// 
+// ************************************************************************
+//@HEADER
+*/
+#include <Kokkos_Core.hpp>
+
+#ifndef TESTCXX11DEDUCTION_HPP
+#define TESTCXX11DEDUCTION_HPP
+
+namespace TestCXX11 {
+
+struct TestReductionDeductionTagA {};
+struct TestReductionDeductionTagB {};
+
+template < class ExecSpace >
+struct TestReductionDeductionFunctor {
+  // Disabled untagged overload (would add i + 1 per index):
+  // KOKKOS_INLINE_FUNCTION
+  // void operator()( long i , long & value ) const
+  // { value += i + 1 ; }
+  // Tag A overload (tag taken by value): adds the two integers 2i+1 and 2i+2.
+  KOKKOS_INLINE_FUNCTION
+  void operator()( TestReductionDeductionTagA , long i , long & value ) const
+  { value += ( 2 * i + 1 ) + ( 2 * i + 2 ); }
+  // Tag B overload (tag by const reference, const index): adds 3i+1, 3i+2 and 3i+3.
+  KOKKOS_INLINE_FUNCTION
+  void operator()( const TestReductionDeductionTagB & , const long i , long & value ) const
+  { value += ( 3 * i + 1 ) + ( 3 * i + 2 ) + ( 3 * i + 3 ) ; }
+  // NOTE(review): no value_type typedef is declared; presumably the value type (long) is meant to be deduced from these signatures - confirm.
+};
+
+// Runs the tag-dispatched reductions of TestReductionDeductionFunctor over [0,N) and checks them against closed-form sums.
+template< class ExecSpace >
+void test_reduction_deduction()
+{
+  typedef TestReductionDeductionFunctor< ExecSpace > Functor ;
+  const long N = 50 ;
+  // Tag A sums 1..2N, tag B sums 1..3N; sum(1..M) = M*(M+1)/2 and M*(M+1) is
+  // always even, so dividing the product last is exact for every N.
+  // const long answer  = N % 2 ? ( N * ((N+1)/2 )) : ( (N/2) * (N+1) );
+  const long answerA = ( (2*N) * ((2*N)+1) ) / 2 ;
+  const long answerB = ( (3*N) * ((3*N)+1) ) / 2 ;
+  long result = 0 ;
+
+  // Kokkos::parallel_reduce( Kokkos::RangePolicy<ExecSpace>(0,N) , Functor() , result );
+  // ASSERT_EQ( answer , result );
+  Kokkos::parallel_reduce( Kokkos::RangePolicy<ExecSpace,TestReductionDeductionTagA>(0,N) , Functor() , result );
+  ASSERT_EQ( answerA , result );
+  Kokkos::parallel_reduce( Kokkos::RangePolicy<ExecSpace,TestReductionDeductionTagB>(0,N) , Functor() , result );
+  ASSERT_EQ( answerB , result );
+}
+
+}
+
+#endif
+
diff --git a/lib/kokkos/core/unit_test/TestCompilerMacros.hpp b/lib/kokkos/core/unit_test/TestCompilerMacros.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..dfa2250c04ae8cc785383b1f64a127ad40279f57
--- /dev/null
+++ b/lib/kokkos/core/unit_test/TestCompilerMacros.hpp
@@ -0,0 +1,93 @@
+/*
+//@HEADER
+// ************************************************************************
+// 
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+// 
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+// 
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact  H. Carter Edwards (hcedwar@sandia.gov)
+// 
+// ************************************************************************
+//@HEADER
+*/
+
+#include <Kokkos_Core.hpp>
+
+#define KOKKOS_PRAGMA_UNROLL(a)
+
+namespace TestCompilerMacros {
+
+template<class DEVICE_TYPE>
+struct AddFunctor {
+  typedef DEVICE_TYPE execution_space;
+  typedef typename Kokkos::View<int**,execution_space> type;
+  type a,b;    // a is updated in place; b is only read
+  int length;  // a.dimension_1(), captured at construction
+  // `length` is initialized from member `a`, which is declared (and therefore initialized) before it.
+  AddFunctor(type a_, type b_):a(a_),b(b_),length(a.dimension_1()) {}
+  // Adds row i of b into row i of a; the loop carries whichever KOKKOS_HAVE_PRAGMA_* hints are defined.
+  KOKKOS_INLINE_FUNCTION
+  void operator()(int i) const {
+#ifdef KOKKOS_HAVE_PRAGMA_UNROLL
+    #pragma unroll
+#endif
+#ifdef KOKKOS_HAVE_PRAGMA_IVDEP
+    #pragma ivdep
+#endif
+#ifdef KOKKOS_HAVE_PRAGMA_VECTOR
+    #pragma vector always
+#endif
+#ifdef KOKKOS_HAVE_PRAGMA_LOOPCOUNT
+    #pragma loop count(128)
+#endif
+#ifdef KOKKOS_HAVE_PRAGMA_SIMD
+    #pragma simd
+#endif
+    for(int j=0;j<length;j++)
+      a(i,j) += b(i,j);
+  }
+};
+
+// Smoke test: runs AddFunctor over two 1024x128 views, fences, and always returns true.
+template<class DeviceType>
+bool Test() {
+  typedef Kokkos::View<int**,DeviceType> view_type;
+  view_type lhs("A",1024,128);
+  view_type rhs("B",1024,128);
+  AddFunctor<DeviceType> add(lhs,rhs);
+  Kokkos::parallel_for(1024,add);
+  DeviceType::fence();
+  return true;
+}
+
+}
diff --git a/lib/kokkos/core/unit_test/TestCuda.cpp b/lib/kokkos/core/unit_test/TestCuda.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..e6155662525f08fd718e02a40243e942dd77104d
--- /dev/null
+++ b/lib/kokkos/core/unit_test/TestCuda.cpp
@@ -0,0 +1,290 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+//
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact  H. Carter Edwards (hcedwar@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#include <gtest/gtest.h>
+
+#include <iostream>
+
+#include <Kokkos_Core.hpp>
+
+//----------------------------------------------------------------------------
+
+#include <Cuda/Kokkos_Cuda_TaskPolicy.hpp>
+#include <impl/Kokkos_ViewTileLeft.hpp>
+#include <TestTile.hpp>
+
+//----------------------------------------------------------------------------
+
+#include <TestSharedAlloc.hpp>
+#include <TestViewMapping.hpp>
+
+#include <TestViewImpl.hpp>
+#include <TestAtomic.hpp>
+
+#include <TestViewAPI.hpp>
+#include <TestViewSubview.hpp>
+#include <TestViewOfClass.hpp>
+
+#include <TestReduce.hpp>
+#include <TestScan.hpp>
+#include <TestRange.hpp>
+#include <TestTeam.hpp>
+#include <TestAggregate.hpp>
+#include <TestAggregateReduction.hpp>
+#include <TestCompilerMacros.hpp>
+#include <TestMemorySpaceTracking.hpp>
+#include <TestMemoryPool.hpp>
+#include <TestTeamVector.hpp>
+#include <TestTemplateMetaFunctions.hpp>
+#include <TestCXX11Deduction.hpp>
+
+#include <TestTaskPolicy.hpp>
+#include <TestPolicyConstruction.hpp>
+
+#include <TestMDRange.hpp>
+
+//----------------------------------------------------------------------------
+
+class cuda : public ::testing::Test {  // gtest fixture used by every TEST_F( cuda , ... ) case below
+protected:
+  static void SetUpTestCase();     // defined out of line, directly after this class
+  static void TearDownTestCase();  // defined out of line, directly after this class
+};
+
+void cuda::SetUpTestCase()  // gtest hook: one-time setup for the whole 'cuda' fixture
+  {
+    Kokkos::Cuda::print_configuration( std::cout );  // printed before either initialize() call
+    Kokkos::HostSpace::execution_space::initialize();  // host execution space first
+    Kokkos::Cuda::initialize( Kokkos::Cuda::SelectDevice(0) );  // then Cuda; 0 is presumably the device id (confirm)
+  }
+
+void cuda::TearDownTestCase()  // gtest hook: one-time teardown for the whole 'cuda' fixture
+  {
+    Kokkos::Cuda::finalize();  // reverse of the SetUpTestCase order: Cuda first ...
+    Kokkos::HostSpace::execution_space::finalize();  // ... then the host execution space
+  }
+
+//----------------------------------------------------------------------------
+//----------------------------------------------------------------------------
+
+namespace Test {
+
+__global__
+void test_abort()  // kernel that only calls verify(); it is never launched anywhere in this file
+{
+  Kokkos::Impl::VerifyExecutionCanAccessMemorySpace<
+    Kokkos::CudaSpace ,
+    Kokkos::HostSpace >::verify();  // presumably "CudaSpace code touching HostSpace" - confirm argument order
+}
+
+__global__ void test_cuda_spaces_int_value( int * ptr )
+{
+  // Device-side probe: doubles the sentinel 42 in place; any other value is left untouched.
+  if ( *ptr != 42 ) { return ; }
+  *ptr = 2 * 42 ;
+}
+
+TEST_F( cuda , md_range ) {  // runs the MDRange helpers with Kokkos::Cuda
+  TestMDRange_2D< Kokkos::Cuda >::test_for2(100,100);  // 2D helper; arguments presumably per-dimension extents
+
+  TestMDRange_3D< Kokkos::Cuda >::test_for3(100,100,100);  // 3D helper
+}
+
+TEST_F( cuda , compiler_macros )
+{  // Asserts on the bool returned by TestCompilerMacros::Test instantiated for Kokkos::Cuda.
+  ASSERT_TRUE( ( TestCompilerMacros::Test< Kokkos::Cuda >() ) );
+}
+
+TEST_F( cuda , memory_space )
+{  // Delegates entirely to TestMemorySpace<Kokkos::Cuda>; any checks live in that helper.
+  TestMemorySpace< Kokkos::Cuda >();
+}
+
+TEST_F( cuda, uvm )
+{  // Host writes 42 through a CudaUVMSpace pointer, a kernel doubles it, host reads back 84.
+  if ( Kokkos::CudaUVMSpace::available() ) {  // body is skipped (test passes vacuously) when this is false
+
+    int * uvm_ptr = (int*) Kokkos::kokkos_malloc< Kokkos::CudaUVMSpace >("uvm_ptr",sizeof(int));
+
+    *uvm_ptr = 42 ;  // host-side write through the UVM pointer
+
+    Kokkos::Cuda::fence();
+    test_cuda_spaces_int_value<<<1,1>>>(uvm_ptr);  // <<<1,1>>> launch; kernel turns 42 into 2*42
+    Kokkos::Cuda::fence();  // fence again before the host dereferences uvm_ptr
+
+    EXPECT_EQ( *uvm_ptr, int(2*42) );
+
+    Kokkos::kokkos_free< Kokkos::CudaUVMSpace >(uvm_ptr );
+  }
+}
+
+//----------------------------------------------------------------------------
+
+TEST_F( cuda , impl_shared_alloc )
+{  // test_shared_alloc over the three Cuda memory spaces, each paired with the host execution space
+  test_shared_alloc< Kokkos::CudaSpace , Kokkos::HostSpace::execution_space >();
+  test_shared_alloc< Kokkos::CudaUVMSpace , Kokkos::HostSpace::execution_space >();
+  test_shared_alloc< Kokkos::CudaHostPinnedSpace , Kokkos::HostSpace::execution_space >();
+}
+
+TEST_F( cuda, policy_construction) {  // range- and team-policy construction helpers for Kokkos::Cuda
+  TestRangePolicyConstruction< Kokkos::Cuda >();
+  TestTeamPolicyConstruction< Kokkos::Cuda >();
+}
+
+TEST_F( cuda , impl_view_mapping )
+{  // Each view-mapping helper is run for Kokkos::Cuda and then Kokkos::CudaUVMSpace.
+  test_view_mapping< Kokkos::Cuda >();
+  test_view_mapping< Kokkos::CudaUVMSpace >();
+  test_view_mapping_subview< Kokkos::Cuda >();
+  test_view_mapping_subview< Kokkos::CudaUVMSpace >();
+  test_view_mapping_operator< Kokkos::Cuda >();
+  test_view_mapping_operator< Kokkos::CudaUVMSpace >();
+  TestViewMappingAtomic< Kokkos::Cuda >::run();  // atomic variant is run for Kokkos::Cuda only
+}
+
+TEST_F( cuda , view_of_class )
+{  // TestViewMappingClassValue::run() for CudaSpace, then CudaUVMSpace
+  TestViewMappingClassValue< Kokkos::CudaSpace >::run();
+  TestViewMappingClassValue< Kokkos::CudaUVMSpace >::run();
+}
+
+template< class MemSpace >
+struct TestViewCudaTexture {
+  // Fills m_base with i+1 on Cuda, then reads the same data back through the random-access const view.
+  enum { N = 1000 };  // number of elements in both views
+
+  using V = Kokkos::Experimental::View<double*,MemSpace> ;  // writable view
+  using T = Kokkos::Experimental::View<const double*, MemSpace, Kokkos::MemoryRandomAccess > ;  // const random-access view
+
+  V m_base ;
+  T m_tex ;  // constructed from m_base (see constructor)
+
+  struct TagInit {};  // tag selecting the fill operator
+  struct TagTest {};  // tag selecting the verification operator
+
+  KOKKOS_INLINE_FUNCTION
+  void operator()( const TagInit & , const int i ) const { m_base[i] = i + 1 ; }
+
+  KOKKOS_INLINE_FUNCTION
+  void operator()( const TagTest & , const int i , long & error_count ) const
+    { if ( m_tex[i] != i + 1 ) ++error_count ; }  // one count per element that differs from i+1
+
+  TestViewCudaTexture()
+    : m_base("base",N)
+    , m_tex( m_base )
+    {}
+
+  static void run()
+    {
+      EXPECT_TRUE( ( std::is_same< typename V::reference_type
+                                 , double &
+                                 >::value ) );
+
+      EXPECT_TRUE( ( std::is_same< typename T::reference_type
+                                 , const double
+                                 >::value ) );
+
+      EXPECT_TRUE(  V::reference_type_is_lvalue_reference ); // An ordinary view
+      EXPECT_FALSE( T::reference_type_is_lvalue_reference ); // Texture fetch returns by value
+
+      TestViewCudaTexture self ;
+      Kokkos::parallel_for( Kokkos::RangePolicy< Kokkos::Cuda , TagInit >(0,N) , self );
+      long error_count = -1 ;  // starts at -1; the test expects the reduction to leave 0 here
+      Kokkos::parallel_reduce( Kokkos::RangePolicy< Kokkos::Cuda , TagTest >(0,N) , self , error_count );
+      EXPECT_EQ( error_count , 0 );
+    }
+};
+
+TEST_F( cuda , impl_view_texture )
+{  // TestViewCudaTexture::run() for CudaSpace, then CudaUVMSpace
+  TestViewCudaTexture< Kokkos::CudaSpace >::run();
+  TestViewCudaTexture< Kokkos::CudaUVMSpace >::run();
+}
+
+template< class MemSpace , class ExecSpace >
+struct TestViewCudaAccessible {
+  // Fills a MemSpace view from MemSpace's own execution space, then verifies it from ExecSpace.
+  enum { N = 1000 };  // number of elements in the view
+
+  using V = Kokkos::Experimental::View<double*,MemSpace> ;
+
+  V m_base ;
+
+  struct TagInit {};  // tag selecting the fill operator
+  struct TagTest {};  // tag selecting the verification operator
+
+  KOKKOS_INLINE_FUNCTION
+  void operator()( const TagInit & , const int i ) const { m_base[i] = i + 1 ; }
+
+  KOKKOS_INLINE_FUNCTION
+  void operator()( const TagTest & , const int i , long & error_count ) const
+    { if ( m_base[i] != i + 1 ) ++error_count ; }  // one count per element that differs from i+1
+
+  TestViewCudaAccessible()
+    : m_base("base",N)
+    {}
+
+  static void run()
+    {
+      TestViewCudaAccessible self ;
+      Kokkos::parallel_for( Kokkos::RangePolicy< typename MemSpace::execution_space , TagInit >(0,N) , self );
+      MemSpace::execution_space::fence();
+      // Next access is a different execution space, must complete prior kernel.
+      long error_count = -1 ;  // starts at -1; the test expects the reduction to leave 0 here
+      Kokkos::parallel_reduce( Kokkos::RangePolicy< ExecSpace , TagTest >(0,N) , self , error_count );
+      EXPECT_EQ( error_count , 0 );
+    }
+};
+
+TEST_F( cuda , impl_view_accessible )
+{  // (memory space, verifying execution space) pairs; CudaSpace is paired with Cuda only
+  TestViewCudaAccessible< Kokkos::CudaSpace , Kokkos::Cuda >::run();
+
+  TestViewCudaAccessible< Kokkos::CudaUVMSpace , Kokkos::Cuda >::run();
+  TestViewCudaAccessible< Kokkos::CudaUVMSpace , Kokkos::HostSpace::execution_space >::run();
+
+  TestViewCudaAccessible< Kokkos::CudaHostPinnedSpace , Kokkos::Cuda >::run();
+  TestViewCudaAccessible< Kokkos::CudaHostPinnedSpace , Kokkos::HostSpace::execution_space >::run();
+}
+
+}
diff --git a/lib/kokkos/core/unit_test/TestCuda_a.cpp b/lib/kokkos/core/unit_test/TestCuda_a.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..4680c333867ff0e68f572121a654f8f23d09fcfb
--- /dev/null
+++ b/lib/kokkos/core/unit_test/TestCuda_a.cpp
@@ -0,0 +1,182 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+//
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact  H. Carter Edwards (hcedwar@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#include <gtest/gtest.h>
+
+#include <iostream>
+
+#include <Kokkos_Core.hpp>
+
+//----------------------------------------------------------------------------
+
+#include <Cuda/Kokkos_Cuda_TaskPolicy.hpp>
+#include <impl/Kokkos_ViewTileLeft.hpp>
+#include <TestTile.hpp>
+
+//----------------------------------------------------------------------------
+
+#include <TestSharedAlloc.hpp>
+#include <TestViewMapping.hpp>
+
+#include <TestViewImpl.hpp>
+#include <TestAtomic.hpp>
+
+#include <TestViewAPI.hpp>
+#include <TestViewSubview.hpp>
+#include <TestViewOfClass.hpp>
+
+#include <TestReduce.hpp>
+#include <TestScan.hpp>
+#include <TestRange.hpp>
+#include <TestTeam.hpp>
+#include <TestAggregate.hpp>
+#include <TestAggregateReduction.hpp>
+#include <TestCompilerMacros.hpp>
+#include <TestMemorySpaceTracking.hpp>
+#include <TestMemoryPool.hpp>
+#include <TestTeamVector.hpp>
+#include <TestTemplateMetaFunctions.hpp>
+#include <TestCXX11Deduction.hpp>
+
+#include <TestTaskPolicy.hpp>
+#include <TestPolicyConstruction.hpp>
+
+//----------------------------------------------------------------------------
+
+class cuda : public ::testing::Test {  // gtest fixture used by every TEST_F( cuda , ... ) case below
+protected:
+  static void SetUpTestCase();     // declared only; no definition in this file - presumably linked from another source (confirm)
+  static void TearDownTestCase();  // declared only; no definition in this file - presumably linked from another source (confirm)
+};
+
+//----------------------------------------------------------------------------
+
+namespace Test {
+
+TEST_F( cuda, view_impl )
+{
+  // test_abort<<<32,32>>>(); // Aborts the kernel with CUDA version 4.1 or greater
+  // NOTE(review): the launch above stays disabled; no test_abort declaration is visible in this file's own text.
+  test_view_impl< Kokkos::Cuda >();
+}
+
+TEST_F( cuda, view_api )
+{
+  typedef Kokkos::View< const int * , Kokkos::Cuda , Kokkos::MemoryTraits< Kokkos::RandomAccess > > view_texture_managed ;
+  typedef Kokkos::View< const int * , Kokkos::Cuda , Kokkos::MemoryTraits< Kokkos::RandomAccess | Kokkos::Unmanaged > > view_texture_unmanaged ;
+  // NOTE(review): neither typedef above is referenced again inside this test body.
+  TestViewAPI< double , Kokkos::Cuda >();
+  TestViewAPI< double , Kokkos::CudaUVMSpace >();
+  // The block below is excluded from compilation by '#if 0'; it only sketches rank-0 / rank-1 element access.
+#if 0
+  Kokkos::View<double, Kokkos::Cuda > x("x");
+  Kokkos::View<double[1], Kokkos::Cuda > y("y");
+  // *x = 10 ;
+  // x() = 10 ;
+  // y[0] = 10 ;
+  // y(0) = 10 ;
+#endif
+}
+
+TEST_F( cuda , view_nested_view )
+{  // '::Test::' qualifies the helper from the global namespace rather than relying on unqualified lookup
+  ::Test::view_nested_view< Kokkos::Cuda >();
+}
+
+TEST_F( cuda, view_subview_auto_1d_left ) {  // test_auto_1d with LayoutLeft on Kokkos::Cuda
+  TestViewSubview::test_auto_1d< Kokkos::LayoutLeft,Kokkos::Cuda >();
+}
+
+TEST_F( cuda, view_subview_auto_1d_right ) {  // test_auto_1d with LayoutRight on Kokkos::Cuda
+  TestViewSubview::test_auto_1d< Kokkos::LayoutRight,Kokkos::Cuda >();
+}
+
+TEST_F( cuda, view_subview_auto_1d_stride ) {  // test_auto_1d with LayoutStride on Kokkos::Cuda
+  TestViewSubview::test_auto_1d< Kokkos::LayoutStride,Kokkos::Cuda >();
+}
+
+TEST_F( cuda, view_subview_assign_strided ) {  // test_1d_strided_assignment on Kokkos::Cuda
+  TestViewSubview::test_1d_strided_assignment< Kokkos::Cuda >();
+}
+
+TEST_F( cuda, view_subview_left_0 ) {  // test_left_0, instantiated with CudaUVMSpace rather than Cuda
+  TestViewSubview::test_left_0< Kokkos::CudaUVMSpace >();
+}
+
+TEST_F( cuda, view_subview_left_1 ) {  // test_left_1, instantiated with CudaUVMSpace
+  TestViewSubview::test_left_1< Kokkos::CudaUVMSpace >();
+}
+
+TEST_F( cuda, view_subview_left_2 ) {  // test_left_2, instantiated with CudaUVMSpace
+  TestViewSubview::test_left_2< Kokkos::CudaUVMSpace >();
+}
+
+TEST_F( cuda, view_subview_left_3 ) {  // test_left_3, instantiated with CudaUVMSpace
+  TestViewSubview::test_left_3< Kokkos::CudaUVMSpace >();
+}
+
+TEST_F( cuda, view_subview_right_0 ) {  // test_right_0, instantiated with CudaUVMSpace
+  TestViewSubview::test_right_0< Kokkos::CudaUVMSpace >();
+}
+
+TEST_F( cuda, view_subview_right_1 ) {  // test_right_1, instantiated with CudaUVMSpace
+  TestViewSubview::test_right_1< Kokkos::CudaUVMSpace >();
+}
+
+TEST_F( cuda, view_subview_right_3 ) {  // test_right_3 on CudaUVMSpace; NOTE(review): this file has no right_2 case
+  TestViewSubview::test_right_3< Kokkos::CudaUVMSpace >();
+}
+
+TEST_F( cuda, view_subview_1d_assign ) {  // test_1d_assign, instantiated with CudaUVMSpace
+  TestViewSubview::test_1d_assign< Kokkos::CudaUVMSpace >();
+}
+
+TEST_F( cuda, view_subview_2d_from_3d ) {  // test_2d_subview_3d, instantiated with CudaUVMSpace
+  TestViewSubview::test_2d_subview_3d< Kokkos::CudaUVMSpace >();
+}
+
+TEST_F( cuda, view_subview_2d_from_5d ) {  // test_2d_subview_5d, instantiated with CudaUVMSpace
+  TestViewSubview::test_2d_subview_5d< Kokkos::CudaUVMSpace >();
+}
+
+}
diff --git a/lib/kokkos/core/unit_test/TestCuda_b.cpp b/lib/kokkos/core/unit_test/TestCuda_b.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..d4ca949e57cb02d15444ec7f3e48b123003b6a68
--- /dev/null
+++ b/lib/kokkos/core/unit_test/TestCuda_b.cpp
@@ -0,0 +1,191 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+//
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact  H. Carter Edwards (hcedwar@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#include <gtest/gtest.h>
+
+#include <iostream>
+
+#include <Kokkos_Core.hpp>
+
+//----------------------------------------------------------------------------
+
+#include <Cuda/Kokkos_Cuda_TaskPolicy.hpp>
+#include <impl/Kokkos_ViewTileLeft.hpp>
+#include <TestTile.hpp>
+
+//----------------------------------------------------------------------------
+
+#include <TestSharedAlloc.hpp>
+#include <TestViewMapping.hpp>
+
+#include <TestViewImpl.hpp>
+#include <TestAtomic.hpp>
+
+#include <TestViewAPI.hpp>
+#include <TestViewSubview.hpp>
+#include <TestViewOfClass.hpp>
+
+#include <TestReduce.hpp>
+#include <TestScan.hpp>
+#include <TestRange.hpp>
+#include <TestTeam.hpp>
+#include <TestAggregate.hpp>
+#include <TestAggregateReduction.hpp>
+#include <TestCompilerMacros.hpp>
+#include <TestMemorySpaceTracking.hpp>
+#include <TestMemoryPool.hpp>
+#include <TestTeamVector.hpp>
+#include <TestTemplateMetaFunctions.hpp>
+#include <TestCXX11Deduction.hpp>
+
+#include <TestTaskPolicy.hpp>
+#include <TestPolicyConstruction.hpp>
+
+//----------------------------------------------------------------------------
+
+class cuda : public ::testing::Test {  // gtest fixture used by every TEST_F( cuda , ... ) case below
+protected:
+  static void SetUpTestCase();     // declared only; no definition in this file - presumably linked from another source (confirm)
+  static void TearDownTestCase();  // declared only; no definition in this file - presumably linked from another source (confirm)
+};
+
+//----------------------------------------------------------------------------
+
+namespace Test {
+
+TEST_F( cuda, range_tag )
+{  // test_for / test_reduce / test_scan under Static and Dynamic schedules: argument 3, then 1000 (Static) and 1001 (Dynamic)
+  TestRange< Kokkos::Cuda , Kokkos::Schedule<Kokkos::Static> >::test_for(3);
+  TestRange< Kokkos::Cuda , Kokkos::Schedule<Kokkos::Static> >::test_reduce(3);
+  TestRange< Kokkos::Cuda , Kokkos::Schedule<Kokkos::Static> >::test_scan(3);
+  TestRange< Kokkos::Cuda , Kokkos::Schedule<Kokkos::Dynamic> >::test_for(3);
+  TestRange< Kokkos::Cuda , Kokkos::Schedule<Kokkos::Dynamic> >::test_reduce(3);
+  TestRange< Kokkos::Cuda , Kokkos::Schedule<Kokkos::Dynamic> >::test_scan(3);
+  TestRange< Kokkos::Cuda , Kokkos::Schedule<Kokkos::Static> >::test_for(1000);
+  TestRange< Kokkos::Cuda , Kokkos::Schedule<Kokkos::Static> >::test_reduce(1000);
+  TestRange< Kokkos::Cuda , Kokkos::Schedule<Kokkos::Static> >::test_scan(1000);
+  TestRange< Kokkos::Cuda , Kokkos::Schedule<Kokkos::Dynamic> >::test_for(1001);
+  TestRange< Kokkos::Cuda , Kokkos::Schedule<Kokkos::Dynamic> >::test_reduce(1001);
+  TestRange< Kokkos::Cuda , Kokkos::Schedule<Kokkos::Dynamic> >::test_scan(1001);
+  //TestRange< Kokkos::Cuda , Kokkos::Schedule<Kokkos::Dynamic> >::test_dynamic_policy(1000);
+}
+
+TEST_F( cuda, team_tag )
+{  // test_for / test_reduce under Static and Dynamic schedules, with argument 3 and then 1000
+  TestTeamPolicy< Kokkos::Cuda , Kokkos::Schedule<Kokkos::Static> >::test_for(3);
+  TestTeamPolicy< Kokkos::Cuda , Kokkos::Schedule<Kokkos::Static> >::test_reduce(3);
+  TestTeamPolicy< Kokkos::Cuda , Kokkos::Schedule<Kokkos::Dynamic> >::test_for(3);
+  TestTeamPolicy< Kokkos::Cuda , Kokkos::Schedule<Kokkos::Dynamic> >::test_reduce(3);
+  TestTeamPolicy< Kokkos::Cuda , Kokkos::Schedule<Kokkos::Static> >::test_for(1000);
+  TestTeamPolicy< Kokkos::Cuda , Kokkos::Schedule<Kokkos::Static> >::test_reduce(1000);
+  TestTeamPolicy< Kokkos::Cuda , Kokkos::Schedule<Kokkos::Dynamic> >::test_for(1000);
+  TestTeamPolicy< Kokkos::Cuda , Kokkos::Schedule<Kokkos::Dynamic> >::test_reduce(1000);
+}
+
+TEST_F( cuda, reduce )
+{  // TestReduce for long, double and int; the argument is presumably the problem size (confirm)
+  TestReduce< long ,   Kokkos::Cuda >( 10000000 );
+  TestReduce< double , Kokkos::Cuda >( 1000000 );
+  TestReduce< int , Kokkos::Cuda >( 0 );  // argument 0: the zero case
+}
+
+TEST_F( cuda , reducers )
+{  // One TestReducers entry point per value type: integer, float, or basic
+  TestReducers<int, Kokkos::Cuda>::execute_integer();
+  TestReducers<size_t, Kokkos::Cuda>::execute_integer();
+  TestReducers<double, Kokkos::Cuda>::execute_float();
+  TestReducers<Kokkos::complex<double>, Kokkos::Cuda>::execute_basic();  // complex<double> uses execute_basic
+}
+
+TEST_F( cuda, reduce_team )
+{  // TestReduceTeam for long and double, Static and Dynamic schedules, arguments 3 and 100000
+  TestReduceTeam< long ,   Kokkos::Cuda , Kokkos::Schedule<Kokkos::Static> >( 3 );
+  TestReduceTeam< long ,   Kokkos::Cuda , Kokkos::Schedule<Kokkos::Dynamic> >( 3 );
+  TestReduceTeam< long ,   Kokkos::Cuda , Kokkos::Schedule<Kokkos::Static> >( 100000 );
+  TestReduceTeam< long ,   Kokkos::Cuda , Kokkos::Schedule<Kokkos::Dynamic> >( 100000 );
+  TestReduceTeam< double ,   Kokkos::Cuda , Kokkos::Schedule<Kokkos::Static> >( 3 );
+  TestReduceTeam< double ,   Kokkos::Cuda , Kokkos::Schedule<Kokkos::Dynamic> >( 3 );
+  TestReduceTeam< double ,   Kokkos::Cuda , Kokkos::Schedule<Kokkos::Static> >( 100000 );
+  TestReduceTeam< double ,   Kokkos::Cuda , Kokkos::Schedule<Kokkos::Dynamic> >( 100000 );
+}
+
+TEST_F( cuda, shared_team )
+{  // TestSharedTeam on Kokkos::Cuda under Static, then Dynamic scheduling
+  TestSharedTeam< Kokkos::Cuda , Kokkos::Schedule<Kokkos::Static> >();
+  TestSharedTeam< Kokkos::Cuda , Kokkos::Schedule<Kokkos::Dynamic> >();
+}
+
+#if defined (KOKKOS_HAVE_CXX11_DISPATCH_LAMBDA)
+TEST_F( cuda, lambda_shared_team )  // only compiled when KOKKOS_HAVE_CXX11_DISPATCH_LAMBDA is defined
+{  // three Cuda memory spaces under Static, then the same three under Dynamic
+  TestLambdaSharedTeam< Kokkos::CudaSpace, Kokkos::Cuda, Kokkos::Schedule<Kokkos::Static> >();
+  TestLambdaSharedTeam< Kokkos::CudaUVMSpace, Kokkos::Cuda, Kokkos::Schedule<Kokkos::Static> >();
+  TestLambdaSharedTeam< Kokkos::CudaHostPinnedSpace, Kokkos::Cuda , Kokkos::Schedule<Kokkos::Static>  >();
+  TestLambdaSharedTeam< Kokkos::CudaSpace, Kokkos::Cuda, Kokkos::Schedule<Kokkos::Dynamic> >();
+  TestLambdaSharedTeam< Kokkos::CudaUVMSpace, Kokkos::Cuda, Kokkos::Schedule<Kokkos::Dynamic> >();
+  TestLambdaSharedTeam< Kokkos::CudaHostPinnedSpace, Kokkos::Cuda , Kokkos::Schedule<Kokkos::Dynamic>  >();
+}
+#endif
+
+TEST_F( cuda, shmem_size) {  // delegates to TestShmemSize<Kokkos::Cuda>
+  TestShmemSize< Kokkos::Cuda >();
+}
+
+TEST_F( cuda, multi_level_scratch) {  // TestMultiLevelScratchTeam under Static, then Dynamic scheduling
+  TestMultiLevelScratchTeam< Kokkos::Cuda , Kokkos::Schedule<Kokkos::Static> >();
+  TestMultiLevelScratchTeam< Kokkos::Cuda , Kokkos::Schedule<Kokkos::Dynamic> >();
+}
+
+TEST_F( cuda, reduce_dynamic )
+{  // TestReduceDynamic for long and double; the argument is presumably the problem size (confirm)
+  TestReduceDynamic< long ,   Kokkos::Cuda >( 10000000 );
+  TestReduceDynamic< double , Kokkos::Cuda >( 1000000 );
+}
+
+TEST_F( cuda, reduce_dynamic_view )
+{  // TestReduceDynamicView for long and double; the argument is presumably the problem size (confirm)
+  TestReduceDynamicView< long ,   Kokkos::Cuda >( 10000000 );
+  TestReduceDynamicView< double , Kokkos::Cuda >( 1000000 );
+}
+
+}
diff --git a/lib/kokkos/core/unit_test/TestCuda_c.cpp b/lib/kokkos/core/unit_test/TestCuda_c.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..70584cead1b5efb7b6b0b372aed95dd522c25169
--- /dev/null
+++ b/lib/kokkos/core/unit_test/TestCuda_c.cpp
@@ -0,0 +1,375 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+//
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact  H. Carter Edwards (hcedwar@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#include <gtest/gtest.h>
+
+#include <iostream>
+
+#include <Kokkos_Core.hpp>
+
+//----------------------------------------------------------------------------
+
+#include <Cuda/Kokkos_Cuda_TaskPolicy.hpp>
+#include <impl/Kokkos_ViewTileLeft.hpp>
+#include <TestTile.hpp>
+
+//----------------------------------------------------------------------------
+
+#include <TestSharedAlloc.hpp>
+#include <TestViewMapping.hpp>
+
+#include <TestViewImpl.hpp>
+#include <TestAtomic.hpp>
+#include <TestAtomicOperations.hpp>
+
+#include <TestViewAPI.hpp>
+#include <TestViewSubview.hpp>
+#include <TestViewOfClass.hpp>
+
+#include <TestReduce.hpp>
+#include <TestScan.hpp>
+#include <TestRange.hpp>
+#include <TestTeam.hpp>
+#include <TestAggregate.hpp>
+#include <TestAggregateReduction.hpp>
+#include <TestCompilerMacros.hpp>
+#include <TestMemorySpaceTracking.hpp>
+#include <TestMemoryPool.hpp>
+#include <TestTeamVector.hpp>
+#include <TestTemplateMetaFunctions.hpp>
+#include <TestCXX11Deduction.hpp>
+
+#include <TestTaskPolicy.hpp>
+#include <TestPolicyConstruction.hpp>
+
+//----------------------------------------------------------------------------
+
+class cuda : public ::testing::Test { // googletest fixture named by every TEST_F( cuda , ... ) in this file
+protected:
+  static void SetUpTestCase();    // declared only; no definition appears in this file (presumably in a sibling test source — confirm)
+  static void TearDownTestCase(); // declared only; no definition appears in this file (presumably in a sibling test source — confirm)
+};
+
+//----------------------------------------------------------------------------
+
+namespace Test {
+
+TEST_F( cuda, atomic )
+{ // Runs TestAtomic::Loop on Kokkos::Cuda for each scalar type below, with the last argument set to 1, 2 and 3.
+  const int loop_count = 1000 ; // integer literal: avoids the implicit double-to-int conversion of 1e3
+
+  ASSERT_TRUE( ( TestAtomic::Loop<int,Kokkos::Cuda>(loop_count,1) ) );
+  ASSERT_TRUE( ( TestAtomic::Loop<int,Kokkos::Cuda>(loop_count,2) ) );
+  ASSERT_TRUE( ( TestAtomic::Loop<int,Kokkos::Cuda>(loop_count,3) ) );
+
+  ASSERT_TRUE( ( TestAtomic::Loop<unsigned int,Kokkos::Cuda>(loop_count,1) ) );
+  ASSERT_TRUE( ( TestAtomic::Loop<unsigned int,Kokkos::Cuda>(loop_count,2) ) );
+  ASSERT_TRUE( ( TestAtomic::Loop<unsigned int,Kokkos::Cuda>(loop_count,3) ) );
+
+  ASSERT_TRUE( ( TestAtomic::Loop<long int,Kokkos::Cuda>(loop_count,1) ) );
+  ASSERT_TRUE( ( TestAtomic::Loop<long int,Kokkos::Cuda>(loop_count,2) ) );
+  ASSERT_TRUE( ( TestAtomic::Loop<long int,Kokkos::Cuda>(loop_count,3) ) );
+
+  ASSERT_TRUE( ( TestAtomic::Loop<unsigned long int,Kokkos::Cuda>(loop_count,1) ) );
+  ASSERT_TRUE( ( TestAtomic::Loop<unsigned long int,Kokkos::Cuda>(loop_count,2) ) );
+  ASSERT_TRUE( ( TestAtomic::Loop<unsigned long int,Kokkos::Cuda>(loop_count,3) ) );
+
+  ASSERT_TRUE( ( TestAtomic::Loop<long long int,Kokkos::Cuda>(loop_count,1) ) );
+  ASSERT_TRUE( ( TestAtomic::Loop<long long int,Kokkos::Cuda>(loop_count,2) ) );
+  ASSERT_TRUE( ( TestAtomic::Loop<long long int,Kokkos::Cuda>(loop_count,3) ) );
+
+  ASSERT_TRUE( ( TestAtomic::Loop<double,Kokkos::Cuda>(loop_count,1) ) );
+  ASSERT_TRUE( ( TestAtomic::Loop<double,Kokkos::Cuda>(loop_count,2) ) );
+  ASSERT_TRUE( ( TestAtomic::Loop<double,Kokkos::Cuda>(loop_count,3) ) );
+
+  ASSERT_TRUE( ( TestAtomic::Loop<float,Kokkos::Cuda>(100,1) ) ); // float, complex and SuperScalar use a fixed count of 100
+  ASSERT_TRUE( ( TestAtomic::Loop<float,Kokkos::Cuda>(100,2) ) );
+  ASSERT_TRUE( ( TestAtomic::Loop<float,Kokkos::Cuda>(100,3) ) );
+
+  ASSERT_TRUE( ( TestAtomic::Loop<Kokkos::complex<double> ,Kokkos::Cuda>(100,1) ) );
+  ASSERT_TRUE( ( TestAtomic::Loop<Kokkos::complex<double> ,Kokkos::Cuda>(100,2) ) );
+  ASSERT_TRUE( ( TestAtomic::Loop<Kokkos::complex<double> ,Kokkos::Cuda>(100,3) ) );
+
+  ASSERT_TRUE( ( TestAtomic::Loop<TestAtomic::SuperScalar<4> ,Kokkos::Cuda>(100,1) ) );
+  ASSERT_TRUE( ( TestAtomic::Loop<TestAtomic::SuperScalar<4> ,Kokkos::Cuda>(100,2) ) );
+  ASSERT_TRUE( ( TestAtomic::Loop<TestAtomic::SuperScalar<4> ,Kokkos::Cuda>(100,3) ) );
+
+}
+
+TEST_F( cuda , atomic_operations )
+{ // Runs the TestAtomicOperations helpers on Kokkos::Cuda: five integral types (last argument 1..9) and double/float (last argument 1..4).
+  const int start = 1; //Avoid zero for division
+  const int end = 11;
+  for (int i = start; i < end; ++i) // second argument end-i therefore runs from 10 down to 1
+  {
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<int,Kokkos::Cuda>(start, end-i, 1 ) ) ); // last argument: presumably an operation selector — confirm in TestAtomicOperations.hpp
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<int,Kokkos::Cuda>(start, end-i, 2 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<int,Kokkos::Cuda>(start, end-i, 3 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<int,Kokkos::Cuda>(start, end-i, 4 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<int,Kokkos::Cuda>(start, end-i, 5 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<int,Kokkos::Cuda>(start, end-i, 6 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<int,Kokkos::Cuda>(start, end-i, 7 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<int,Kokkos::Cuda>(start, end-i, 8 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<int,Kokkos::Cuda>(start, end-i, 9 ) ) );
+
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<unsigned int,Kokkos::Cuda>(start, end-i, 1 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<unsigned int,Kokkos::Cuda>(start, end-i, 2 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<unsigned int,Kokkos::Cuda>(start, end-i, 3 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<unsigned int,Kokkos::Cuda>(start, end-i, 4 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<unsigned int,Kokkos::Cuda>(start, end-i, 5 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<unsigned int,Kokkos::Cuda>(start, end-i, 6 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<unsigned int,Kokkos::Cuda>(start, end-i, 7 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<unsigned int,Kokkos::Cuda>(start, end-i, 8 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<unsigned int,Kokkos::Cuda>(start, end-i, 9 ) ) );
+
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<long int,Kokkos::Cuda>(start, end-i, 1 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<long int,Kokkos::Cuda>(start, end-i, 2 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<long int,Kokkos::Cuda>(start, end-i, 3 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<long int,Kokkos::Cuda>(start, end-i, 4 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<long int,Kokkos::Cuda>(start, end-i, 5 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<long int,Kokkos::Cuda>(start, end-i, 6 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<long int,Kokkos::Cuda>(start, end-i, 7 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<long int,Kokkos::Cuda>(start, end-i, 8 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<long int,Kokkos::Cuda>(start, end-i, 9 ) ) );
+
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<unsigned long int,Kokkos::Cuda>(start, end-i, 1 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<unsigned long int,Kokkos::Cuda>(start, end-i, 2 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<unsigned long int,Kokkos::Cuda>(start, end-i, 3 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<unsigned long int,Kokkos::Cuda>(start, end-i, 4 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<unsigned long int,Kokkos::Cuda>(start, end-i, 5 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<unsigned long int,Kokkos::Cuda>(start, end-i, 6 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<unsigned long int,Kokkos::Cuda>(start, end-i, 7 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<unsigned long int,Kokkos::Cuda>(start, end-i, 8 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<unsigned long int,Kokkos::Cuda>(start, end-i, 9 ) ) );
+
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<long long int,Kokkos::Cuda>(start, end-i, 1 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<long long int,Kokkos::Cuda>(start, end-i, 2 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<long long int,Kokkos::Cuda>(start, end-i, 3 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<long long int,Kokkos::Cuda>(start, end-i, 4 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<long long int,Kokkos::Cuda>(start, end-i, 5 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<long long int,Kokkos::Cuda>(start, end-i, 6 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<long long int,Kokkos::Cuda>(start, end-i, 7 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<long long int,Kokkos::Cuda>(start, end-i, 8 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<long long int,Kokkos::Cuda>(start, end-i, 9 ) ) );
+
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestNonIntegralType<double,Kokkos::Cuda>(start, end-i, 1 ) ) ); // floating-point types go through the NonIntegralType helper with last argument 1..4 only
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestNonIntegralType<double,Kokkos::Cuda>(start, end-i, 2 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestNonIntegralType<double,Kokkos::Cuda>(start, end-i, 3 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestNonIntegralType<double,Kokkos::Cuda>(start, end-i, 4 ) ) );
+
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestNonIntegralType<float,Kokkos::Cuda>(start, end-i, 1 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestNonIntegralType<float,Kokkos::Cuda>(start, end-i, 2 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestNonIntegralType<float,Kokkos::Cuda>(start, end-i, 3 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestNonIntegralType<float,Kokkos::Cuda>(start, end-i, 4 ) ) );
+  }
+
+}
+
+//----------------------------------------------------------------------------
+
+TEST_F( cuda, tile_layout)
+{ // Calls TestTile::test on Kokkos::Cuda; the two integer template arguments are presumably the tile dimensions — confirm in TestTile.hpp.
+  TestTile::test< Kokkos::Cuda , 1 , 1 >( 1 , 1 );
+  TestTile::test< Kokkos::Cuda , 1 , 1 >( 2 , 3 );
+  TestTile::test< Kokkos::Cuda , 1 , 1 >( 9 , 10 );
+
+  TestTile::test< Kokkos::Cuda , 2 , 2 >( 1 , 1 );
+  TestTile::test< Kokkos::Cuda , 2 , 2 >( 2 , 3 );
+  TestTile::test< Kokkos::Cuda , 2 , 2 >( 4 , 4 );
+  TestTile::test< Kokkos::Cuda , 2 , 2 >( 9 , 9 );
+
+  TestTile::test< Kokkos::Cuda , 2 , 4 >( 9 , 9 ); // the only call here with unequal template arguments
+  TestTile::test< Kokkos::Cuda , 4 , 4 >( 9 , 9 );
+
+  TestTile::test< Kokkos::Cuda , 4 , 4 >( 1 , 1 );
+  TestTile::test< Kokkos::Cuda , 4 , 4 >( 4 , 4 );
+  TestTile::test< Kokkos::Cuda , 4 , 4 >( 9 , 9 ); // same instantiation and arguments as the < 4 , 4 >( 9 , 9 ) call above
+  TestTile::test< Kokkos::Cuda , 4 , 4 >( 9 , 11 );
+
+  TestTile::test< Kokkos::Cuda , 8 , 8 >( 1 , 1 );
+  TestTile::test< Kokkos::Cuda , 8 , 8 >( 4 , 4 );
+  TestTile::test< Kokkos::Cuda , 8 , 8 >( 9 , 9 );
+  TestTile::test< Kokkos::Cuda , 8 , 8 >( 9 , 11 );
+}
+
+TEST_F( cuda , view_aggregate )
+{ // Runs the TestViewAggregate and TestViewAggregateReduction helpers for Kokkos::Cuda.
+  TestViewAggregate< Kokkos::Cuda >();
+  TestViewAggregateReduction< Kokkos::Cuda >();
+}
+
+TEST_F( cuda , scan )
+{ // Exercises TestScan on Kokkos::Cuda: a range sweep, two large sizes, then zero-sized inputs.
+  TestScan< Kokkos::Cuda >::test_range( 1 , 1000 );
+  TestScan< Kokkos::Cuda >( 1000000 );  // constructs a temporary TestScan with one argument
+  TestScan< Kokkos::Cuda >( 10000000 );
+
+  TestScan< Kokkos::Cuda >( 0 );        // zero passed as the only argument
+  TestScan< Kokkos::Cuda >( 0 , 0 );    // two-argument form, both zero
+
+  Kokkos::Cuda::fence();
+}
+
+TEST_F( cuda , team_scan )
+{ // Runs TestScanTeam on Kokkos::Cuda with arguments 10 and 10000, each under Static and Dynamic schedules.
+  TestScanTeam< Kokkos::Cuda , Kokkos::Schedule<Kokkos::Static> >( 10 );
+  TestScanTeam< Kokkos::Cuda , Kokkos::Schedule<Kokkos::Dynamic> >( 10 );
+  TestScanTeam< Kokkos::Cuda , Kokkos::Schedule<Kokkos::Static> >( 10000 );
+  TestScanTeam< Kokkos::Cuda , Kokkos::Schedule<Kokkos::Dynamic> >( 10000 );
+}
+
+TEST_F( cuda , memory_pool )
+{
+  // Type handed to every TestMemoryPool helper below, so the target is chosen in exactly one place.
+  typedef Kokkos::Cuda          device_type;
+
+  const bool pool_ok = TestMemoryPool::test_mempool< device_type >( 128, 128000000 );
+  ASSERT_TRUE( pool_ok );
+
+  Kokkos::Cuda::fence();
+
+  TestMemoryPool::test_mempool2< device_type >( 64, 4, 100000, 200000 );
+
+  Kokkos::Cuda::fence();
+
+  TestMemoryPool::test_memory_exhaustion< device_type >(); // was hard-coded to Kokkos::Cuda; same type, now via the typedef
+
+  Kokkos::Cuda::fence();
+}
+
+}
+
+//----------------------------------------------------------------------------
+
+TEST_F( cuda , template_meta_functions )
+{ // Runs TestTemplateMetaFunctions<int, Kokkos::Cuda>; unlike its neighbours this test sits outside namespace Test.
+  TestTemplateMetaFunctions<int, Kokkos::Cuda >();
+}
+
+//----------------------------------------------------------------------------
+
+namespace Test {
+
+TEST_F( cuda , reduction_deduction )
+{ // Runs TestCXX11::test_reduction_deduction instantiated for Kokkos::Cuda.
+  TestCXX11::test_reduction_deduction< Kokkos::Cuda >();
+}
+
+TEST_F( cuda , team_vector )
+{ // Asserts that TestTeamVector::Test<Kokkos::Cuda> returns true for every argument from 0 through 10.
+  ASSERT_TRUE( ( TestTeamVector::Test< Kokkos::Cuda >(0) ) ); // argument presumably selects a sub-test — confirm in TestTeamVector.hpp
+  ASSERT_TRUE( ( TestTeamVector::Test< Kokkos::Cuda >(1) ) );
+  ASSERT_TRUE( ( TestTeamVector::Test< Kokkos::Cuda >(2) ) );
+  ASSERT_TRUE( ( TestTeamVector::Test< Kokkos::Cuda >(3) ) );
+  ASSERT_TRUE( ( TestTeamVector::Test< Kokkos::Cuda >(4) ) );
+  ASSERT_TRUE( ( TestTeamVector::Test< Kokkos::Cuda >(5) ) );
+  ASSERT_TRUE( ( TestTeamVector::Test< Kokkos::Cuda >(6) ) );
+  ASSERT_TRUE( ( TestTeamVector::Test< Kokkos::Cuda >(7) ) );
+  ASSERT_TRUE( ( TestTeamVector::Test< Kokkos::Cuda >(8) ) );
+  ASSERT_TRUE( ( TestTeamVector::Test< Kokkos::Cuda >(9) ) );
+  ASSERT_TRUE( ( TestTeamVector::Test< Kokkos::Cuda >(10) ) );
+}
+
+TEST_F( cuda, triple_nested_parallelism )
+{ // Runs TestTripleNestedReduce<double, Kokkos::Cuda> three times; only the last two arguments vary (32/32, 32/16, 16/16).
+  TestTripleNestedReduce< double, Kokkos::Cuda >( 8192, 2048 , 32 , 32 );
+  TestTripleNestedReduce< double, Kokkos::Cuda >( 8192, 2048 , 32 , 16 );
+  TestTripleNestedReduce< double, Kokkos::Cuda >( 8192, 2048 , 16 , 16 );
+}
+
+}
+
+//----------------------------------------------------------------------------
+
+#if defined( KOKKOS_ENABLE_TASKPOLICY )
+
+TEST_F( cuda , task_fib )
+{ // Calls TestFib<Kokkos::Cuda>::run for i = 0..24; the second argument grows with i as (i+1)*1000000.
+  for ( int i = 0 ; i < 25 ; ++i ) {
+    TestTaskPolicy::TestFib< Kokkos::Cuda >::run(i, (i+1)*1000000 );
+  }
+}
+
+TEST_F( cuda , task_depend )
+{ // Calls TestTaskDependence<Kokkos::Cuda>::run once for each i in 0..24.
+  for ( int i = 0 ; i < 25 ; ++i ) {
+    TestTaskPolicy::TestTaskDependence< Kokkos::Cuda >::run(i);
+  }
+}
+
+TEST_F( cuda , task_team )
+{ // Runs TestTaskTeam with 104 and TestTaskTeamValue with 1000 on Kokkos::Cuda.
+  //TestTaskPolicy::TestTaskTeam< Kokkos::Cuda >::run(1000);
+  TestTaskPolicy::TestTaskTeam< Kokkos::Cuda >::run(104); // the run(1000) variant above is disabled; the reason is not recorded here
+  TestTaskPolicy::TestTaskTeamValue< Kokkos::Cuda >::run(1000);
+}
+
+//----------------------------------------------------------------------------
+
+TEST_F( cuda , old_task_policy )
+{ // Exercises test_task_dep, test_fib and test_fib2 from TestTaskPolicy on Kokkos::Cuda.
+  TestTaskPolicy::test_task_dep< Kokkos::Cuda >( 10 );
+
+  for ( long i = 0 ; i < 15 ; ++i ) {
+    // test_fib for i = 0..14; 4096 is passed unchanged as the second argument.
+    TestTaskPolicy::test_fib< Kokkos::Cuda >(i,4096);
+  }
+  for ( long i = 0 ; i < 35 ; ++i ) {
+    // test_fib2 for i = 0..34; 4096 is passed unchanged as the second argument.
+    TestTaskPolicy::test_fib2< Kokkos::Cuda >(i,4096);
+  }
+}
+
+TEST_F( cuda , old_task_team )
+{ // Runs TestTaskPolicy::test_task_team on Kokkos::Cuda with argument 1000.
+  TestTaskPolicy::test_task_team< Kokkos::Cuda >(1000);
+}
+
+TEST_F( cuda , old_task_latch )
+{ // Runs TestTaskPolicy::test_latch on Kokkos::Cuda with arguments 10 and 1000.
+  TestTaskPolicy::test_latch< Kokkos::Cuda >(10);
+  TestTaskPolicy::test_latch< Kokkos::Cuda >(1000);
+}
+
+#endif // #if defined( KOKKOS_ENABLE_TASKPOLICY )
+
diff --git a/lib/kokkos/core/unit_test/TestDefaultDeviceType.cpp b/lib/kokkos/core/unit_test/TestDefaultDeviceType.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..1b1e0e67365fa28778cb848cbd52d0a2399c97e6
--- /dev/null
+++ b/lib/kokkos/core/unit_test/TestDefaultDeviceType.cpp
@@ -0,0 +1,242 @@
+/*
+//@HEADER
+// ************************************************************************
+// 
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+// 
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+// 
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact  H. Carter Edwards (hcedwar@sandia.gov)
+// 
+// ************************************************************************
+//@HEADER
+*/
+
+#include <gtest/gtest.h>
+
+#include <Kokkos_Core.hpp>
+
+#if !defined(KOKKOS_HAVE_CUDA) || defined(__CUDACC__)
+//----------------------------------------------------------------------------
+
+#include <TestViewImpl.hpp>
+#include <TestAtomic.hpp>
+
+#include <TestViewAPI.hpp>
+
+#include <TestReduce.hpp>
+#include <TestScan.hpp>
+#include <TestTeam.hpp>
+#include <TestAggregate.hpp>
+#include <TestCompilerMacros.hpp>
+#include <TestCXX11.hpp>
+#include <TestTeamVector.hpp>
+
+namespace Test {
+
+class defaultdevicetype : public ::testing::Test { // googletest fixture for the TEST_F( defaultdevicetype , ... ) tests in this file
+protected:
+  static void SetUpTestCase() // fixture-level setup hook: initializes Kokkos with default arguments
+  {
+    Kokkos::initialize();
+  }
+
+  static void TearDownTestCase() // fixture-level teardown hook: finalizes Kokkos
+  {
+    Kokkos::finalize();
+  }
+};
+
+
+TEST_F( defaultdevicetype, view_impl) { // Runs test_view_impl for Kokkos::DefaultExecutionSpace.
+  test_view_impl< Kokkos::DefaultExecutionSpace >();
+}
+
+TEST_F( defaultdevicetype, view_api) { // Runs TestViewAPI with double on Kokkos::DefaultExecutionSpace.
+  TestViewAPI< double , Kokkos::DefaultExecutionSpace >();
+}
+
+TEST_F( defaultdevicetype, long_reduce) { // Runs TestReduce with long on the default execution space, argument 100000.
+  TestReduce< long ,   Kokkos::DefaultExecutionSpace >( 100000 );
+}
+
+TEST_F( defaultdevicetype, double_reduce) { // Runs TestReduce with double on the default execution space, argument 100000.
+  TestReduce< double ,   Kokkos::DefaultExecutionSpace >( 100000 );
+}
+
+TEST_F( defaultdevicetype, long_reduce_dynamic ) { // Runs TestReduceDynamic with long on the default execution space, argument 100000.
+  TestReduceDynamic< long ,   Kokkos::DefaultExecutionSpace >( 100000 );
+}
+
+TEST_F( defaultdevicetype, double_reduce_dynamic ) { // Runs TestReduceDynamic with double on the default execution space, argument 100000.
+  TestReduceDynamic< double ,   Kokkos::DefaultExecutionSpace >( 100000 );
+}
+
+TEST_F( defaultdevicetype, long_reduce_dynamic_view ) { // Runs TestReduceDynamicView with long only; this file has no double counterpart.
+  TestReduceDynamicView< long ,   Kokkos::DefaultExecutionSpace >( 100000 );
+}
+
+
+TEST_F( defaultdevicetype , atomics )
+{ // Runs TestAtomic::Loop on the default execution space for each scalar type below, with the last argument set to 1, 2 and 3.
+  const int loop_count = 10000 ; // integer literal: avoids the implicit double-to-int conversion of 1e4
+
+  ASSERT_TRUE( ( TestAtomic::Loop<int,Kokkos::DefaultExecutionSpace>(loop_count,1) ) );
+  ASSERT_TRUE( ( TestAtomic::Loop<int,Kokkos::DefaultExecutionSpace>(loop_count,2) ) );
+  ASSERT_TRUE( ( TestAtomic::Loop<int,Kokkos::DefaultExecutionSpace>(loop_count,3) ) );
+
+  ASSERT_TRUE( ( TestAtomic::Loop<unsigned int,Kokkos::DefaultExecutionSpace>(loop_count,1) ) );
+  ASSERT_TRUE( ( TestAtomic::Loop<unsigned int,Kokkos::DefaultExecutionSpace>(loop_count,2) ) );
+  ASSERT_TRUE( ( TestAtomic::Loop<unsigned int,Kokkos::DefaultExecutionSpace>(loop_count,3) ) );
+
+  ASSERT_TRUE( ( TestAtomic::Loop<long int,Kokkos::DefaultExecutionSpace>(loop_count,1) ) );
+  ASSERT_TRUE( ( TestAtomic::Loop<long int,Kokkos::DefaultExecutionSpace>(loop_count,2) ) );
+  ASSERT_TRUE( ( TestAtomic::Loop<long int,Kokkos::DefaultExecutionSpace>(loop_count,3) ) );
+
+  ASSERT_TRUE( ( TestAtomic::Loop<unsigned long int,Kokkos::DefaultExecutionSpace>(loop_count,1) ) );
+  ASSERT_TRUE( ( TestAtomic::Loop<unsigned long int,Kokkos::DefaultExecutionSpace>(loop_count,2) ) );
+  ASSERT_TRUE( ( TestAtomic::Loop<unsigned long int,Kokkos::DefaultExecutionSpace>(loop_count,3) ) );
+
+  ASSERT_TRUE( ( TestAtomic::Loop<long long int,Kokkos::DefaultExecutionSpace>(loop_count,1) ) );
+  ASSERT_TRUE( ( TestAtomic::Loop<long long int,Kokkos::DefaultExecutionSpace>(loop_count,2) ) );
+  ASSERT_TRUE( ( TestAtomic::Loop<long long int,Kokkos::DefaultExecutionSpace>(loop_count,3) ) );
+
+  ASSERT_TRUE( ( TestAtomic::Loop<double,Kokkos::DefaultExecutionSpace>(loop_count,1) ) );
+  ASSERT_TRUE( ( TestAtomic::Loop<double,Kokkos::DefaultExecutionSpace>(loop_count,2) ) );
+  ASSERT_TRUE( ( TestAtomic::Loop<double,Kokkos::DefaultExecutionSpace>(loop_count,3) ) );
+
+  ASSERT_TRUE( ( TestAtomic::Loop<float,Kokkos::DefaultExecutionSpace>(100,1) ) ); // float uses a fixed count of 100 instead of loop_count
+  ASSERT_TRUE( ( TestAtomic::Loop<float,Kokkos::DefaultExecutionSpace>(100,2) ) );
+  ASSERT_TRUE( ( TestAtomic::Loop<float,Kokkos::DefaultExecutionSpace>(100,3) ) );
+}
+
+/*TEST_F( defaultdevicetype , view_remap )
+{
+  enum { N0 = 3 , N1 = 2 , N2 = 8 , N3 = 9 };
+
+  typedef Kokkos::View< double*[N1][N2][N3] ,
+                             Kokkos::LayoutRight ,
+                             Kokkos::DefaultExecutionSpace > output_type ;
+
+  typedef Kokkos::View< int**[N2][N3] ,
+                             Kokkos::LayoutLeft ,
+                             Kokkos::DefaultExecutionSpace > input_type ;
+
+  typedef Kokkos::View< int*[N0][N2][N3] ,
+                             Kokkos::LayoutLeft ,
+                             Kokkos::DefaultExecutionSpace > diff_type ;
+
+  output_type output( "output" , N0 );
+  input_type  input ( "input" , N0 , N1 );
+  diff_type   diff  ( "diff" , N0 );
+
+  int value = 0 ;
+  for ( size_t i3 = 0 ; i3 < N3 ; ++i3 ) {
+  for ( size_t i2 = 0 ; i2 < N2 ; ++i2 ) {
+  for ( size_t i1 = 0 ; i1 < N1 ; ++i1 ) {
+  for ( size_t i0 = 0 ; i0 < N0 ; ++i0 ) {
+    input(i0,i1,i2,i3) = ++value ;
+  }}}}
+
+  // Kokkos::deep_copy( diff , input ); // throw with incompatible shape
+  Kokkos::deep_copy( output , input );
+
+  value = 0 ;
+  for ( size_t i3 = 0 ; i3 < N3 ; ++i3 ) {
+  for ( size_t i2 = 0 ; i2 < N2 ; ++i2 ) {
+  for ( size_t i1 = 0 ; i1 < N1 ; ++i1 ) {
+  for ( size_t i0 = 0 ; i0 < N0 ; ++i0 ) {
+    ++value ;
+    ASSERT_EQ( value , ((int) output(i0,i1,i2,i3) ) );
+  }}}}
+}*/
+
+//----------------------------------------------------------------------------
+
+
+TEST_F( defaultdevicetype , view_aggregate )
+{ // Runs the TestViewAggregate helper for Kokkos::DefaultExecutionSpace.
+  TestViewAggregate< Kokkos::DefaultExecutionSpace >();
+}
+
+//----------------------------------------------------------------------------
+
+TEST_F( defaultdevicetype , scan )
+{ // Exercises TestScan on the default execution space: a range sweep followed by two large sizes.
+  TestScan< Kokkos::DefaultExecutionSpace >::test_range( 1 , 1000 );
+  TestScan< Kokkos::DefaultExecutionSpace >( 1000000 );  // constructs a temporary TestScan with one argument
+  TestScan< Kokkos::DefaultExecutionSpace >( 10000000 );
+  Kokkos::DefaultExecutionSpace::fence();
+}
+
+
+//----------------------------------------------------------------------------
+
+TEST_F( defaultdevicetype , compiler_macros )
+{ // Asserts that TestCompilerMacros::Test returns true for the default execution space.
+  ASSERT_TRUE( ( TestCompilerMacros::Test< Kokkos::DefaultExecutionSpace >() ) );
+}
+
+
+//----------------------------------------------------------------------------
+TEST_F( defaultdevicetype , cxx11 )
+{ // Asserts that TestCXX11::Test returns true for arguments 1 through 4 on the default execution space.
+  ASSERT_TRUE( ( TestCXX11::Test< Kokkos::DefaultExecutionSpace >(1) ) ); // argument presumably selects a sub-test — confirm in TestCXX11.hpp
+  ASSERT_TRUE( ( TestCXX11::Test< Kokkos::DefaultExecutionSpace >(2) ) );
+  ASSERT_TRUE( ( TestCXX11::Test< Kokkos::DefaultExecutionSpace >(3) ) );
+  ASSERT_TRUE( ( TestCXX11::Test< Kokkos::DefaultExecutionSpace >(4) ) );
+}
+
+TEST_F( defaultdevicetype , team_vector )
+{ // Asserts that TestTeamVector::Test returns true for arguments 0 through 5 on the default execution space.
+  ASSERT_TRUE( ( TestTeamVector::Test< Kokkos::DefaultExecutionSpace >(0) ) ); // argument presumably selects a sub-test — confirm in TestTeamVector.hpp
+  ASSERT_TRUE( ( TestTeamVector::Test< Kokkos::DefaultExecutionSpace >(1) ) );
+  ASSERT_TRUE( ( TestTeamVector::Test< Kokkos::DefaultExecutionSpace >(2) ) );
+  ASSERT_TRUE( ( TestTeamVector::Test< Kokkos::DefaultExecutionSpace >(3) ) );
+  ASSERT_TRUE( ( TestTeamVector::Test< Kokkos::DefaultExecutionSpace >(4) ) );
+  ASSERT_TRUE( ( TestTeamVector::Test< Kokkos::DefaultExecutionSpace >(5) ) );
+}
+
+TEST_F( defaultdevicetype , malloc )
+{ // Exercises kokkos_malloc / kokkos_realloc / kokkos_free, including a zero-byte request.
+  int* data = (int*) Kokkos::kokkos_malloc(100*sizeof(int));
+  ASSERT_NO_THROW(data = (int*) Kokkos::kokkos_realloc(data,120*sizeof(int))); // grow the request from 100 to 120 ints; must not throw
+  Kokkos::kokkos_free(data);
+
+  int* data2 = (int*) Kokkos::kokkos_malloc(0);
+  ASSERT_TRUE(data2==NULL); // a zero-byte allocation is expected to yield a null pointer
+  Kokkos::kokkos_free(data2); // reached only when the assertion passed, so data2 is NULL here
+}
+
+} // namespace Test
+
+#endif
diff --git a/lib/kokkos/core/unit_test/TestDefaultDeviceTypeInit.hpp b/lib/kokkos/core/unit_test/TestDefaultDeviceTypeInit.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..a17ed97a9ff4130a2ca2ea087b400e9595c69dd9
--- /dev/null
+++ b/lib/kokkos/core/unit_test/TestDefaultDeviceTypeInit.hpp
@@ -0,0 +1,419 @@
+/*
+//@HEADER
+// ************************************************************************
+// 
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+// 
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+// 
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact  H. Carter Edwards (hcedwar@sandia.gov)
+// 
+// ************************************************************************
+//@HEADER
+*/
+
+#include <gtest/gtest.h>
+
+#include <Kokkos_Core.hpp>
+#ifdef KOKKOS_HAVE_OPENMP
+#include <omp.h>
+#endif
+
+#if !defined(KOKKOS_HAVE_CUDA) || defined(__CUDACC__)
+//----------------------------------------------------------------------------
+
+namespace Test {
+
+namespace Impl {
+  // Builds an argv-style array for Kokkos::initialize(int,char**) and mirrors
+  // the same settings into init_args. nargs is set to the number of entries.
+  // Every entry and the array itself are allocated with new[]; the caller
+  // must delete [] each entry and then the array.
+  char** init_kokkos_args(bool do_threads,bool do_numa,bool do_device,bool do_other, int& nargs, Kokkos::InitArguments& init_args) {
+    nargs = (do_threads?1:0) +
+            (do_numa?1:0) +
+            (do_device?1:0) +
+            (do_other?4:0);
+    char** args_kokkos = new char*[nargs];
+    for(int i = 0; i < nargs; i++)
+      args_kokkos[i] = new char[20];  // fixed 20-byte slot per argument string
+
+    // Slot order when do_other is set (bracketed entries only if requested):
+    //   dummyarg, [threads], dummy2arg, dummy3arg, [numa], [device], dummy4arg
+    int threads_idx = do_other?1:0;
+    int numa_idx = (do_other?3:0) + (do_threads?1:0);
+    int device_idx = (do_other?3:0) + (do_threads?1:0) + (do_numa?1:0);
+
+    if(do_threads) {
+      int nthreads = 3;
+#ifdef KOKKOS_HAVE_OPENMP
+      if(omp_get_max_threads() < 3)
+        nthreads = omp_get_max_threads();
+#endif
+      // With fewer than 3 hardware threads per core, use threads-per-core * NUMA count.
+      if(Kokkos::hwloc::available())  {
+        if(Kokkos::hwloc::get_available_threads_per_core()<3)
+            nthreads =   Kokkos::hwloc::get_available_threads_per_core()
+                       * Kokkos::hwloc::get_available_numa_count();
+      }
+#ifdef KOKKOS_HAVE_SERIAL
+      // Serial as default (host) execution space: request a single thread.
+      if(Kokkos::Impl::is_same<Kokkos::Serial,Kokkos::DefaultExecutionSpace>::value ||
+         Kokkos::Impl::is_same<Kokkos::Serial,Kokkos::DefaultHostExecutionSpace>::value ) {
+        nthreads = 1;
+      }
+#endif
+      init_args.num_threads = nthreads;
+      sprintf(args_kokkos[threads_idx],"--threads=%i",nthreads);
+    }
+
+    if(do_numa) {
+      int numa = 1;
+      if(Kokkos::hwloc::available())
+        numa = Kokkos::hwloc::get_available_numa_count();
+#ifdef KOKKOS_HAVE_SERIAL
+      if(Kokkos::Impl::is_same<Kokkos::Serial,Kokkos::DefaultExecutionSpace>::value ||
+         Kokkos::Impl::is_same<Kokkos::Serial,Kokkos::DefaultHostExecutionSpace>::value ) {
+        numa = 1;
+      }
+#endif
+      init_args.num_numa = numa;
+      sprintf(args_kokkos[numa_idx],"--numa=%i",numa);
+    }
+
+    if(do_device) {
+      init_args.device_id = 0;
+      sprintf(args_kokkos[device_idx],"--device=%i",0);
+    }
+
+    if(do_other) {  // arguments that are not Kokkos flags, interleaved with the real ones
+      sprintf(args_kokkos[0],"--dummyarg=1");
+      sprintf(args_kokkos[threads_idx+(do_threads?1:0)],"--dummy2arg");
+      sprintf(args_kokkos[threads_idx+(do_threads?1:0)+1],"dummy3arg");
+      sprintf(args_kokkos[device_idx+(do_device?1:0)],"dummy4arg=1");
+    }
+
+    return args_kokkos;
+  }
+
+  Kokkos::InitArguments init_initstruct(bool do_threads, bool do_numa, bool do_device) {
+    // Only the requested fields are filled in; the rest keep whatever the
+    // InitArguments default constructor assigns.
+    Kokkos::InitArguments result;
+
+    if(do_threads) {
+      int thread_count = 3;
+#ifdef KOKKOS_HAVE_OPENMP
+      const int omp_limit = omp_get_max_threads();
+      if(omp_limit < 3) thread_count = omp_limit;
+#endif
+      // Few hardware threads per core: size the request from the hwloc topology.
+      if(Kokkos::hwloc::available() &&
+         Kokkos::hwloc::get_available_threads_per_core() < 3) {
+        thread_count =   Kokkos::hwloc::get_available_threads_per_core()
+                       * Kokkos::hwloc::get_available_numa_count();
+      }
+#ifdef KOKKOS_HAVE_SERIAL
+      // Serial as default (host) execution space: request a single thread.
+      if(Kokkos::Impl::is_same<Kokkos::Serial,Kokkos::DefaultExecutionSpace>::value ||
+         Kokkos::Impl::is_same<Kokkos::Serial,Kokkos::DefaultHostExecutionSpace>::value ) {
+        thread_count = 1;
+      }
+#endif
+      result.num_threads = thread_count;
+    }
+
+    if(do_numa) {
+      int numa_count = 1;
+      if(Kokkos::hwloc::available())
+        numa_count = Kokkos::hwloc::get_available_numa_count();
+#ifdef KOKKOS_HAVE_SERIAL
+      if(Kokkos::Impl::is_same<Kokkos::Serial,Kokkos::DefaultExecutionSpace>::value ||
+         Kokkos::Impl::is_same<Kokkos::Serial,Kokkos::DefaultHostExecutionSpace>::value ) {
+        numa_count = 1;
+      }
+#endif
+      result.num_numa = numa_count;
+    }
+
+    // The device request, when present, is always device 0.
+    if(do_device) result.device_id = 0;
+
+    return result;
+  }
+
+  // Asserts that Kokkos is initialized the way `argstruct` requested.
+  //
+  // Checked: both the default and the host execution space report being
+  // initialized, the host thread pool has the expected size and, when Cuda is
+  // the default execution space, the active CUDA device is the expected one.
+  // num_threads < 1 and device_id < 0 are treated as "not specified"; the
+  // expectation is then derived below. argstruct.num_numa is not verified
+  // here (no query of the NUMA count in use is made).
+  void check_correct_initialization(const Kokkos::InitArguments& argstruct) {
+    ASSERT_EQ( Kokkos::DefaultExecutionSpace::is_initialized(), 1);
+    ASSERT_EQ( Kokkos::HostSpace::execution_space::is_initialized(), 1);
+
+    //Figure out the number of threads the HostSpace ExecutionSpace should have initialized to
+    int expected_nthreads = argstruct.num_threads;
+    if(expected_nthreads<1) {
+      // Nothing requested: with hwloc expect every available hardware thread.
+      if(Kokkos::hwloc::available()) {
+        expected_nthreads = Kokkos::hwloc::get_available_numa_count()
+                          * Kokkos::hwloc::get_available_cores_per_numa()
+                          * Kokkos::hwloc::get_available_threads_per_core();
+      } else {
+        #ifdef KOKKOS_HAVE_OPENMP
+        if(Kokkos::Impl::is_same<Kokkos::HostSpace::execution_space,Kokkos::OpenMP>::value) {
+          expected_nthreads = omp_get_max_threads();
+        } else
+        #endif
+          expected_nthreads = 1;
+
+      }
+      #ifdef KOKKOS_HAVE_SERIAL
+      // Serial as default (host) execution space: expect exactly one thread.
+      if(Kokkos::Impl::is_same<Kokkos::DefaultExecutionSpace,Kokkos::Serial>::value ||
+         Kokkos::Impl::is_same<Kokkos::DefaultHostExecutionSpace,Kokkos::Serial>::value )
+        expected_nthreads = 1;
+      #endif
+    }
+
+    ASSERT_EQ(Kokkos::HostSpace::execution_space::thread_pool_size(),expected_nthreads);
+
+#ifdef KOKKOS_HAVE_CUDA
+    if(Kokkos::Impl::is_same<Kokkos::DefaultExecutionSpace,Kokkos::Cuda>::value) {
+      // Start from an invalid id and fail if the runtime query itself fails,
+      // instead of comparing against an uninitialized value.
+      int device = -1;
+      ASSERT_EQ( cudaSuccess, cudaGetDevice( &device ) );
+      int expected_device = argstruct.device_id;
+      // No device requested: device 0 is expected.
+      if(argstruct.device_id<0) {
+        expected_device = 0;
+      }
+      ASSERT_EQ(expected_device,device);
+    }
+#endif
+  }
+
+  //ToDo: Add check whether correct number of threads are actually started
+  void test_no_arguments() {  // initialize() without arguments, then verify and shut down
+    Kokkos::initialize();
+    check_correct_initialization(Kokkos::InitArguments());  // default-constructed: nothing requested
+    Kokkos::finalize();
+  }
+
+  void test_commandline_args(int nargs, char** args, const Kokkos::InitArguments& argstruct) {  // argv-style initialize
+    Kokkos::initialize(nargs,args);
+    check_correct_initialization(argstruct);  // argstruct holds the settings the caller encoded in args
+    Kokkos::finalize();
+  }
+
+  void test_initstruct_args(const Kokkos::InitArguments& args) {  // struct-based initialize
+    Kokkos::initialize(args);
+    check_correct_initialization(args);  // the same struct doubles as the expectation
+    Kokkos::finalize();
+  }
+}
+
+// Fixture with empty per-test-case hooks: each test in this file brings
+// Kokkos up and down itself, because initialization is the thing under test.
+class defaultdevicetypeinit : public ::testing::Test {
+protected:
+  // Intentionally empty: Kokkos is not initialized here.
+  static void SetUpTestCase() {}
+
+  // Intentionally empty: Kokkos is not finalized here.
+  static void TearDownTestCase() {}
+};
+
+#ifdef KOKKOS_DEFAULTDEVICETYPE_INIT_TEST_01
+TEST_F( defaultdevicetypeinit, no_args) {
+  Impl::test_no_arguments();  // Kokkos::initialize() with no arguments at all
+}
+#endif
+
+#ifdef KOKKOS_DEFAULTDEVICETYPE_INIT_TEST_02
+TEST_F( defaultdevicetypeinit, commandline_args_empty) {
+  // Empty command line: no Kokkos flags and no other arguments.
+  int argc = 0;
+  Kokkos::InitArguments expected;
+  char** argv = Impl::init_kokkos_args(false,false,false,false,argc, expected);
+  Impl::test_commandline_args(argc,argv,expected);
+  for(int k = 0; k < argc; ++k) delete [] argv[k];
+  delete [] argv;
+}
+#endif
+
+#ifdef KOKKOS_DEFAULTDEVICETYPE_INIT_TEST_03
+TEST_F( defaultdevicetypeinit, commandline_args_other) {
+  // Only the dummy (non-Kokkos) arguments are passed.
+  int argc = 0;
+  Kokkos::InitArguments expected;
+  char** argv = Impl::init_kokkos_args(false,false,false,true,argc, expected);
+  Impl::test_commandline_args(argc,argv,expected);
+  for(int k = 0; k < argc; ++k) delete [] argv[k];
+  delete [] argv;
+}
+#endif
+
+#ifdef KOKKOS_DEFAULTDEVICETYPE_INIT_TEST_04
+TEST_F( defaultdevicetypeinit, commandline_args_nthreads) {
+  // Command line carries --threads only.
+  int argc = 0;
+  Kokkos::InitArguments expected;
+  char** argv = Impl::init_kokkos_args(true,false,false,false,argc, expected);
+  Impl::test_commandline_args(argc,argv,expected);
+  for(int k = 0; k < argc; ++k) delete [] argv[k];
+  delete [] argv;
+}
+#endif
+
+#ifdef KOKKOS_DEFAULTDEVICETYPE_INIT_TEST_05
+TEST_F( defaultdevicetypeinit, commandline_args_nthreads_numa) {
+  // Command line carries --threads and --numa.
+  int argc = 0;
+  Kokkos::InitArguments expected;
+  char** argv = Impl::init_kokkos_args(true,true,false,false,argc, expected);
+  Impl::test_commandline_args(argc,argv,expected);
+  for(int k = 0; k < argc; ++k) delete [] argv[k];
+  delete [] argv;
+}
+#endif
+
+#ifdef KOKKOS_DEFAULTDEVICETYPE_INIT_TEST_06
+TEST_F( defaultdevicetypeinit, commandline_args_nthreads_numa_device) {
+  // Command line carries --threads, --numa and --device.
+  int argc = 0;
+  Kokkos::InitArguments expected;
+  char** argv = Impl::init_kokkos_args(true,true,true,false,argc, expected);
+  Impl::test_commandline_args(argc,argv,expected);
+  for(int k = 0; k < argc; ++k) delete [] argv[k];
+  delete [] argv;
+}
+#endif
+
+#ifdef KOKKOS_DEFAULTDEVICETYPE_INIT_TEST_07
+TEST_F( defaultdevicetypeinit, commandline_args_nthreads_device) {
+  // Command line carries --threads and --device.
+  int argc = 0;
+  Kokkos::InitArguments expected;
+  char** argv = Impl::init_kokkos_args(true,false,true,false,argc, expected);
+  Impl::test_commandline_args(argc,argv,expected);
+  for(int k = 0; k < argc; ++k) delete [] argv[k];
+  delete [] argv;
+}
+#endif
+
+#ifdef KOKKOS_DEFAULTDEVICETYPE_INIT_TEST_08
+TEST_F( defaultdevicetypeinit, commandline_args_numa_device) {
+  // Command line carries --numa and --device.
+  int argc = 0;
+  Kokkos::InitArguments expected;
+  char** argv = Impl::init_kokkos_args(false,true,true,false,argc, expected);
+  Impl::test_commandline_args(argc,argv,expected);
+  for(int k = 0; k < argc; ++k) delete [] argv[k];
+  delete [] argv;
+}
+#endif
+
+#ifdef KOKKOS_DEFAULTDEVICETYPE_INIT_TEST_09
+TEST_F( defaultdevicetypeinit, commandline_args_device) {
+  // Command line carries --device only.
+  int argc = 0;
+  Kokkos::InitArguments expected;
+  char** argv = Impl::init_kokkos_args(false,false,true,false,argc, expected);
+  Impl::test_commandline_args(argc,argv,expected);
+  for(int k = 0; k < argc; ++k) delete [] argv[k];
+  delete [] argv;
+}
+#endif
+
+#ifdef KOKKOS_DEFAULTDEVICETYPE_INIT_TEST_10
+TEST_F( defaultdevicetypeinit, commandline_args_nthreads_numa_device_other) {
+  // All three Kokkos flags, interleaved with the dummy arguments.
+  int argc = 0;
+  Kokkos::InitArguments expected;
+  char** argv = Impl::init_kokkos_args(true,true,true,true,argc, expected);
+  Impl::test_commandline_args(argc,argv,expected);
+  for(int k = 0; k < argc; ++k) delete [] argv[k];
+  delete [] argv;
+}
+#endif
+
+#ifdef KOKKOS_DEFAULTDEVICETYPE_INIT_TEST_11
+TEST_F( defaultdevicetypeinit, initstruct_default) {
+  // Default-constructed InitArguments: nothing requested explicitly.
+  Impl::test_initstruct_args( Kokkos::InitArguments() );
+}
+#endif
+
+#ifdef KOKKOS_DEFAULTDEVICETYPE_INIT_TEST_12
+TEST_F( defaultdevicetypeinit, initstruct_nthreads) {
+  // Only num_threads is set.
+  Impl::test_initstruct_args( Impl::init_initstruct(true,false,false) );
+}
+#endif
+
+#ifdef KOKKOS_DEFAULTDEVICETYPE_INIT_TEST_13
+TEST_F( defaultdevicetypeinit, initstruct_nthreads_numa) {
+  // num_threads and num_numa are set.
+  Impl::test_initstruct_args( Impl::init_initstruct(true,true,false) );
+}
+#endif
+
+#ifdef KOKKOS_DEFAULTDEVICETYPE_INIT_TEST_14
+TEST_F( defaultdevicetypeinit, initstruct_device) {
+  // Only device_id is set.
+  Impl::test_initstruct_args( Impl::init_initstruct(false,false,true) );
+}
+#endif
+
+#ifdef KOKKOS_DEFAULTDEVICETYPE_INIT_TEST_15
+TEST_F( defaultdevicetypeinit, initstruct_nthreads_device) {
+  // num_threads and device_id are set.
+  Impl::test_initstruct_args( Impl::init_initstruct(true,false,true) );
+}
+#endif
+
+#ifdef KOKKOS_DEFAULTDEVICETYPE_INIT_TEST_16
+TEST_F( defaultdevicetypeinit, initstruct_nthreads_numa_device) {
+  // num_threads, num_numa and device_id are all set.
+  Impl::test_initstruct_args( Impl::init_initstruct(true,true,true) );
+}
+#endif
+
+
+} // namespace Test
+
+#endif
diff --git a/lib/kokkos/core/unit_test/TestDefaultDeviceTypeInit_1.cpp b/lib/kokkos/core/unit_test/TestDefaultDeviceTypeInit_1.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..40a773b3b8fd18fb0a4cce396b4cc19400b9ad41
--- /dev/null
+++ b/lib/kokkos/core/unit_test/TestDefaultDeviceTypeInit_1.cpp
@@ -0,0 +1,2 @@
+#define KOKKOS_DEFAULTDEVICETYPE_INIT_TEST_01
+#include<TestDefaultDeviceTypeInit.hpp>
diff --git a/lib/kokkos/core/unit_test/TestDefaultDeviceTypeInit_10.cpp b/lib/kokkos/core/unit_test/TestDefaultDeviceTypeInit_10.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..f12c4f62b25acbb44e1f7d58876884035c250d9f
--- /dev/null
+++ b/lib/kokkos/core/unit_test/TestDefaultDeviceTypeInit_10.cpp
@@ -0,0 +1,2 @@
+#define KOKKOS_DEFAULTDEVICETYPE_INIT_TEST_10
+#include<TestDefaultDeviceTypeInit.hpp>
diff --git a/lib/kokkos/core/unit_test/TestDefaultDeviceTypeInit_11.cpp b/lib/kokkos/core/unit_test/TestDefaultDeviceTypeInit_11.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..c7ffd7b94e5675b28d519e5dc785ccfb55549b31
--- /dev/null
+++ b/lib/kokkos/core/unit_test/TestDefaultDeviceTypeInit_11.cpp
@@ -0,0 +1,2 @@
+#define KOKKOS_DEFAULTDEVICETYPE_INIT_TEST_11
+#include<TestDefaultDeviceTypeInit.hpp>
diff --git a/lib/kokkos/core/unit_test/TestDefaultDeviceTypeInit_12.cpp b/lib/kokkos/core/unit_test/TestDefaultDeviceTypeInit_12.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..24e2b152014a8308e1ef3eccaa44ad76d884f9d2
--- /dev/null
+++ b/lib/kokkos/core/unit_test/TestDefaultDeviceTypeInit_12.cpp
@@ -0,0 +1,2 @@
+#define KOKKOS_DEFAULTDEVICETYPE_INIT_TEST_12
+#include<TestDefaultDeviceTypeInit.hpp>
diff --git a/lib/kokkos/core/unit_test/TestDefaultDeviceTypeInit_13.cpp b/lib/kokkos/core/unit_test/TestDefaultDeviceTypeInit_13.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..7968c13b661cad0b54697e86626d166fe0949602
--- /dev/null
+++ b/lib/kokkos/core/unit_test/TestDefaultDeviceTypeInit_13.cpp
@@ -0,0 +1,2 @@
+#define KOKKOS_DEFAULTDEVICETYPE_INIT_TEST_13
+#include<TestDefaultDeviceTypeInit.hpp>
diff --git a/lib/kokkos/core/unit_test/TestDefaultDeviceTypeInit_14.cpp b/lib/kokkos/core/unit_test/TestDefaultDeviceTypeInit_14.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..ab0563c6dc03d45fc696ea538cb75d6288f1e576
--- /dev/null
+++ b/lib/kokkos/core/unit_test/TestDefaultDeviceTypeInit_14.cpp
@@ -0,0 +1,2 @@
+#define KOKKOS_DEFAULTDEVICETYPE_INIT_TEST_14
+#include<TestDefaultDeviceTypeInit.hpp>
diff --git a/lib/kokkos/core/unit_test/TestDefaultDeviceTypeInit_15.cpp b/lib/kokkos/core/unit_test/TestDefaultDeviceTypeInit_15.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..70a8ca1727515910f5bae07703421e9e95e6ab42
--- /dev/null
+++ b/lib/kokkos/core/unit_test/TestDefaultDeviceTypeInit_15.cpp
@@ -0,0 +1,2 @@
+#define KOKKOS_DEFAULTDEVICETYPE_INIT_TEST_15
+#include<TestDefaultDeviceTypeInit.hpp>
diff --git a/lib/kokkos/core/unit_test/TestDefaultDeviceTypeInit_16.cpp b/lib/kokkos/core/unit_test/TestDefaultDeviceTypeInit_16.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..727c7a95eb9f949f6ecb0e910dc8ff009d6b8225
--- /dev/null
+++ b/lib/kokkos/core/unit_test/TestDefaultDeviceTypeInit_16.cpp
@@ -0,0 +1,2 @@
+#define KOKKOS_DEFAULTDEVICETYPE_INIT_TEST_16
+#include<TestDefaultDeviceTypeInit.hpp>
diff --git a/lib/kokkos/core/unit_test/TestDefaultDeviceTypeInit_2.cpp b/lib/kokkos/core/unit_test/TestDefaultDeviceTypeInit_2.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..88fba34c50e93c2ddb8e730d50e08d853b44dba5
--- /dev/null
+++ b/lib/kokkos/core/unit_test/TestDefaultDeviceTypeInit_2.cpp
@@ -0,0 +1,2 @@
+#define KOKKOS_DEFAULTDEVICETYPE_INIT_TEST_02
+#include<TestDefaultDeviceTypeInit.hpp>
diff --git a/lib/kokkos/core/unit_test/TestDefaultDeviceTypeInit_3.cpp b/lib/kokkos/core/unit_test/TestDefaultDeviceTypeInit_3.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..b3562cc53d6b4cf2a4162b916d84f94e1ab482a6
--- /dev/null
+++ b/lib/kokkos/core/unit_test/TestDefaultDeviceTypeInit_3.cpp
@@ -0,0 +1,2 @@
+#define KOKKOS_DEFAULTDEVICETYPE_INIT_TEST_03
+#include<TestDefaultDeviceTypeInit.hpp>
diff --git a/lib/kokkos/core/unit_test/TestDefaultDeviceTypeInit_4.cpp b/lib/kokkos/core/unit_test/TestDefaultDeviceTypeInit_4.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..0d4983319cb565f2ba4283b910bd16cabc48253a
--- /dev/null
+++ b/lib/kokkos/core/unit_test/TestDefaultDeviceTypeInit_4.cpp
@@ -0,0 +1,2 @@
+#define KOKKOS_DEFAULTDEVICETYPE_INIT_TEST_04
+#include<TestDefaultDeviceTypeInit.hpp>
diff --git a/lib/kokkos/core/unit_test/TestDefaultDeviceTypeInit_5.cpp b/lib/kokkos/core/unit_test/TestDefaultDeviceTypeInit_5.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..026fb01f8870af1a2d24f59da17a5d419721ba71
--- /dev/null
+++ b/lib/kokkos/core/unit_test/TestDefaultDeviceTypeInit_5.cpp
@@ -0,0 +1,2 @@
+#define KOKKOS_DEFAULTDEVICETYPE_INIT_TEST_05
+#include<TestDefaultDeviceTypeInit.hpp>
diff --git a/lib/kokkos/core/unit_test/TestDefaultDeviceTypeInit_6.cpp b/lib/kokkos/core/unit_test/TestDefaultDeviceTypeInit_6.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..937a13160e40f6ec4666a7f4cb7eb7dc62d8a8fc
--- /dev/null
+++ b/lib/kokkos/core/unit_test/TestDefaultDeviceTypeInit_6.cpp
@@ -0,0 +1,2 @@
+#define KOKKOS_DEFAULTDEVICETYPE_INIT_TEST_06
+#include<TestDefaultDeviceTypeInit.hpp>
diff --git a/lib/kokkos/core/unit_test/TestDefaultDeviceTypeInit_7.cpp b/lib/kokkos/core/unit_test/TestDefaultDeviceTypeInit_7.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..992c854c1a10224a09d897a917e309f654cd4763
--- /dev/null
+++ b/lib/kokkos/core/unit_test/TestDefaultDeviceTypeInit_7.cpp
@@ -0,0 +1,2 @@
+#define KOKKOS_DEFAULTDEVICETYPE_INIT_TEST_07
+#include<TestDefaultDeviceTypeInit.hpp>
diff --git a/lib/kokkos/core/unit_test/TestDefaultDeviceTypeInit_8.cpp b/lib/kokkos/core/unit_test/TestDefaultDeviceTypeInit_8.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..07a8b1cb7c2ea3fc515f2f403ad6401353d7f7a1
--- /dev/null
+++ b/lib/kokkos/core/unit_test/TestDefaultDeviceTypeInit_8.cpp
@@ -0,0 +1,2 @@
+#define KOKKOS_DEFAULTDEVICETYPE_INIT_TEST_08
+#include<TestDefaultDeviceTypeInit.hpp>
diff --git a/lib/kokkos/core/unit_test/TestDefaultDeviceTypeInit_9.cpp b/lib/kokkos/core/unit_test/TestDefaultDeviceTypeInit_9.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..4d8c05be2d7f486487d7f39357982361117b4b76
--- /dev/null
+++ b/lib/kokkos/core/unit_test/TestDefaultDeviceTypeInit_9.cpp
@@ -0,0 +1,2 @@
+#define KOKKOS_DEFAULTDEVICETYPE_INIT_TEST_09
+#include<TestDefaultDeviceTypeInit.hpp>
diff --git a/lib/kokkos/core/unit_test/TestDefaultDeviceType_a.cpp b/lib/kokkos/core/unit_test/TestDefaultDeviceType_a.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..c15f81223329eaa749d84fbef28340638fd3c835
--- /dev/null
+++ b/lib/kokkos/core/unit_test/TestDefaultDeviceType_a.cpp
@@ -0,0 +1,76 @@
+/*
+//@HEADER
+// ************************************************************************
+// 
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+// 
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+// 
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact  H. Carter Edwards (hcedwar@sandia.gov)
+// 
+// ************************************************************************
+//@HEADER
+*/
+
+#include <gtest/gtest.h>
+
+#include <Kokkos_Core.hpp>
+
+#if !defined(KOKKOS_HAVE_CUDA) || defined(__CUDACC__)
+//----------------------------------------------------------------------------
+
+#include <TestReduce.hpp>
+
+
+namespace Test {
+
+// Fixture shared by the tests in this file. gtest calls SetUpTestCase once
+// before the first test of the test case and TearDownTestCase once after the
+// last, so Kokkos stays initialized across all of them.
+class defaultdevicetype : public ::testing::Test {
+protected:
+  // Initialize Kokkos with default arguments.
+  static void SetUpTestCase() { Kokkos::initialize(); }
+
+  // Finalize Kokkos; paired with the initialize() call in
+  // SetUpTestCase().
+  static void TearDownTestCase() { Kokkos::finalize(); }
+};
+
+
+TEST_F( defaultdevicetype, reduce_instantiation) {
+  TestReduceCombinatoricalInstantiation<>::execute();  // presumably declared in TestReduce.hpp (included above) -- TODO confirm
+}
+
+} // namespace Test
+
+#endif
diff --git a/lib/kokkos/core/unit_test/TestHWLOC.cpp b/lib/kokkos/core/unit_test/TestHWLOC.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..1637dec5de4ff762cfbd259ee47932b5e85eb4d0
--- /dev/null
+++ b/lib/kokkos/core/unit_test/TestHWLOC.cpp
@@ -0,0 +1,69 @@
+/*
+//@HEADER
+// ************************************************************************
+// 
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+// 
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+// 
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact  H. Carter Edwards (hcedwar@sandia.gov)
+// 
+// ************************************************************************
+//@HEADER
+*/
+
+#include <gtest/gtest.h>
+
+#include <iostream>
+#include <Kokkos_hwloc.hpp>
+
+namespace Test {
+
+// Fixture for the hwloc query test; it holds no shared state, so both
+// per-test-case hooks are empty.
+class hwloc : public ::testing::Test {
+protected:
+  static void SetUpTestCase() {}
+
+  static void TearDownTestCase() {}
+};
+
+TEST_F( hwloc, query)
+{ // Smoke test: prints the NUMA / core / thread-per-core counts reported by Kokkos::hwloc; asserts nothing.
+  std::cout << " NUMA[" << Kokkos::hwloc::get_available_numa_count() << "]"
+            << " CORE[" << Kokkos::hwloc::get_available_cores_per_numa() << "]"
+            << " PU[" << Kokkos::hwloc::get_available_threads_per_core()  << "]"
+            << std::endl ;
+}
+
+}
+
diff --git a/lib/kokkos/core/unit_test/TestMDRange.hpp b/lib/kokkos/core/unit_test/TestMDRange.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..9894d1ce697c1f109163f7711e62f12cfceef703
--- /dev/null
+++ b/lib/kokkos/core/unit_test/TestMDRange.hpp
@@ -0,0 +1,555 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+//
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact  H. Carter Edwards (hcedwar@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#include <stdio.h>
+
+#include <gtest/gtest.h>
+
+#include <Kokkos_Core.hpp>
+
+/*--------------------------------------------------------------------------*/
+
+namespace Test {
+namespace {
+
+template <typename ExecSpace >
+struct TestMDRange_2D {
+  // Functor plus static driver exercising md_parallel_for over rank-2 MDRangePolicy variants.
+  using DataType     = int ;
+  using ViewType     = typename Kokkos::View< DataType** ,  ExecSpace > ;
+  using HostViewType = typename ViewType::HostMirror ;
+
+  ViewType input_view ;  // written by operator(); copied to a host mirror by test_for2
+  // Allocates the view with extents N0 x N1.  NOTE(review): extents are typed DataType (int) although test_for2 passes int64_t.
+  TestMDRange_2D( const DataType N0, const DataType N1 ) : input_view("input_view", N0, N1) {}
+  // Marks entry (i,j) as visited by storing 1.
+  KOKKOS_INLINE_FUNCTION
+  void operator()( const int i , const int j ) const
+  {
+    input_view(i,j) = 1;
+  }
+  // Runs md_parallel_for over {0,0}..{N0,N1} for nine policy variants.  After each launch the view is
+  // copied to the host and every entry must equal 1; mismatches are printed and fail ASSERT_EQ.
+  static void test_for2( const int64_t N0, const int64_t N1 )
+  {
+
+    using namespace Kokkos::Experimental;
+    // Variant 1: Rank<2> with no explicit Iterate arguments and no third constructor argument.
+    {
+      using range_type = MDRangePolicy< ExecSpace, Rank<2>, Kokkos::IndexType<int> >;
+      range_type range( {0,0}, {N0,N1} );
+      TestMDRange_2D functor(N0,N1);
+
+      md_parallel_for( range, functor );
+
+      HostViewType h_view = Kokkos::create_mirror_view( functor.input_view );
+      Kokkos::deep_copy( h_view , functor.input_view );
+      // Count the entries the kernel did not set to 1.
+      int counter = 0;
+      for ( int i=0; i<N0; ++i ) {
+        for ( int j=0; j<N1; ++j ) {
+          if ( h_view(i,j) != 1 ) {
+            ++counter;
+          }
+        }}
+      if ( counter != 0 )
+        printf(" Errors in test_for2; mismatches = %d\n\n",counter);
+      ASSERT_EQ( counter , 0 );
+    }
+
+    {
+      using range_type = MDRangePolicy< ExecSpace, Rank<2, Iterate::Default, Iterate::Default >, Kokkos::IndexType<int> >;
+      // Variant 2: Iterate::Default for both Rank arguments.
+      range_type range( {0,0}, {N0,N1} );
+      TestMDRange_2D functor(N0,N1);
+
+      md_parallel_for( range, functor );
+
+      HostViewType h_view = Kokkos::create_mirror_view( functor.input_view );
+      Kokkos::deep_copy( h_view , functor.input_view );
+
+      int counter = 0;
+      for ( int i=0; i<N0; ++i ) {
+        for ( int j=0; j<N1; ++j ) {
+          if ( h_view(i,j) != 1 ) {
+            ++counter;
+          }
+        }}
+      if ( counter != 0 )
+        printf(" Errors in test_for2; mismatches = %d\n\n",counter);
+      ASSERT_EQ( counter , 0 );
+    }
+
+    {
+      using range_type = MDRangePolicy< ExecSpace, Rank<2, Iterate::Default, Iterate::Flat >, Kokkos::IndexType<int> >;
+      // Variant 3: Iterate::Default, Iterate::Flat.
+      range_type range( {0,0}, {N0,N1} );
+      TestMDRange_2D functor(N0,N1);
+
+      md_parallel_for( range, functor );
+
+      HostViewType h_view = Kokkos::create_mirror_view( functor.input_view );
+      Kokkos::deep_copy( h_view , functor.input_view );
+
+      int counter = 0;
+      for ( int i=0; i<N0; ++i ) {
+        for ( int j=0; j<N1; ++j ) {
+          if ( h_view(i,j) != 1 ) {
+            ++counter;
+          }
+        }}
+      if ( counter != 0 )
+        printf(" Errors in test_for2; mismatches = %d\n\n",counter);
+      ASSERT_EQ( counter , 0 );
+    }
+
+    {
+      using range_type = MDRangePolicy< ExecSpace, Rank<2, Iterate::Right, Iterate::Flat >, Kokkos::IndexType<int> >;
+      // Variant 4: Iterate::Right, Iterate::Flat.
+      range_type range( {0,0}, {N0,N1} );
+      TestMDRange_2D functor(N0,N1);
+
+      md_parallel_for( range, functor );
+
+      HostViewType h_view = Kokkos::create_mirror_view( functor.input_view );
+      Kokkos::deep_copy( h_view , functor.input_view );
+
+      int counter = 0;
+      for ( int i=0; i<N0; ++i ) {
+        for ( int j=0; j<N1; ++j ) {
+          if ( h_view(i,j) != 1 ) {
+            ++counter;
+          }
+        }}
+      if ( counter != 0 )
+        printf(" Errors in test_for2; mismatches = %d\n\n",counter);
+      ASSERT_EQ( counter , 0 );
+    }
+
+    {
+      using range_type = MDRangePolicy< ExecSpace, Rank<2, Iterate::Left, Iterate::Flat >, Kokkos::IndexType<int> >;
+      // Variant 5: Iterate::Left, Iterate::Flat.
+      range_type range( {0,0}, {N0,N1} );
+      TestMDRange_2D functor(N0,N1);
+
+      md_parallel_for( range, functor );
+
+      HostViewType h_view = Kokkos::create_mirror_view( functor.input_view );
+      Kokkos::deep_copy( h_view , functor.input_view );
+
+      int counter = 0;
+      for ( int i=0; i<N0; ++i ) {
+        for ( int j=0; j<N1; ++j ) {
+          if ( h_view(i,j) != 1 ) {
+            ++counter;
+          }
+        }}
+      if ( counter != 0 )
+        printf(" Errors in test_for2; mismatches = %d\n\n",counter);
+      ASSERT_EQ( counter , 0 );
+    }
+
+    {
+      using range_type = MDRangePolicy< ExecSpace, Rank<2, Iterate::Left , Iterate::Left >, Kokkos::IndexType<int> >;
+      // Variant 6: Left, Left with {3,3} as third constructor argument (presumably tile extents; confirm against MDRangePolicy).
+      range_type range( {0,0}, {N0,N1}, {3,3} );
+      TestMDRange_2D functor(N0,N1);
+
+      md_parallel_for( range, functor );
+
+      HostViewType h_view = Kokkos::create_mirror_view( functor.input_view );
+      Kokkos::deep_copy( h_view , functor.input_view );
+
+      int counter = 0;
+      for ( int i=0; i<N0; ++i ) {
+        for ( int j=0; j<N1; ++j ) {
+          if ( h_view(i,j) != 1 ) {
+            ++counter;
+          }
+        }}
+      if ( counter != 0 )
+        printf(" Errors in test_for2; mismatches = %d\n\n",counter);
+      ASSERT_EQ( counter , 0 );
+    }
+
+    {
+      using range_type = MDRangePolicy< ExecSpace, Rank<2, Iterate::Left , Iterate::Right >, Kokkos::IndexType<int> >;
+      // Variant 7: Iterate::Left, Iterate::Right with {7,7} as third constructor argument.
+      range_type range( {0,0}, {N0,N1}, {7,7} );
+      TestMDRange_2D functor(N0,N1);
+
+      md_parallel_for( range, functor );
+
+      HostViewType h_view = Kokkos::create_mirror_view( functor.input_view );
+      Kokkos::deep_copy( h_view , functor.input_view );
+
+      int counter = 0;
+      for ( int i=0; i<N0; ++i ) {
+        for ( int j=0; j<N1; ++j ) {
+          if ( h_view(i,j) != 1 ) {
+            ++counter;
+          }
+        }}
+      if ( counter != 0 )
+        printf(" Errors in test_for2; mismatches = %d\n\n",counter);
+      ASSERT_EQ( counter , 0 );
+    }
+
+    {
+      using range_type = MDRangePolicy< ExecSpace, Rank<2, Iterate::Right, Iterate::Left >, Kokkos::IndexType<int> >;
+      // Variant 8: Iterate::Right, Iterate::Left with {16,16} as third constructor argument.
+      range_type range( {0,0}, {N0,N1}, {16,16} );
+      TestMDRange_2D functor(N0,N1);
+
+      md_parallel_for( range, functor );
+
+      HostViewType h_view = Kokkos::create_mirror_view( functor.input_view );
+      Kokkos::deep_copy( h_view , functor.input_view );
+
+      int counter = 0;
+      for ( int i=0; i<N0; ++i ) {
+        for ( int j=0; j<N1; ++j ) {
+          if ( h_view(i,j) != 1 ) {
+            ++counter;
+          }
+        }}
+      if ( counter != 0 )
+        printf(" Errors in test_for2; mismatches = %d\n\n",counter);
+      ASSERT_EQ( counter , 0 );
+    }
+
+    {
+      using range_type = MDRangePolicy< ExecSpace, Rank<2, Iterate::Right, Iterate::Right >, Kokkos::IndexType<int> >;
+      // Variant 9: Iterate::Right, Iterate::Right with {5,16} as third constructor argument.
+      range_type range( {0,0}, {N0,N1}, {5,16} );
+      TestMDRange_2D functor(N0,N1);
+
+      md_parallel_for( range, functor );
+
+      HostViewType h_view = Kokkos::create_mirror_view( functor.input_view );
+      Kokkos::deep_copy( h_view , functor.input_view );
+
+      int counter = 0;
+      for ( int i=0; i<N0; ++i ) {
+        for ( int j=0; j<N1; ++j ) {
+          if ( h_view(i,j) != 1 ) {
+            ++counter;
+          }
+        }}
+      if ( counter != 0 )
+        printf(" Errors in test_for2; mismatches = %d\n\n",counter);
+      ASSERT_EQ( counter , 0 );
+    }
+
+  } //end test_for2
+}; //MDRange_2D
+
+template <typename ExecSpace >
+struct TestMDRange_3D {
+  // Functor plus static driver exercising md_parallel_for over rank-3 MDRangePolicy variants.
+  using DataType = int ;
+  using ViewType     = typename Kokkos::View< DataType*** ,  ExecSpace > ;
+  using HostViewType = typename ViewType::HostMirror ;
+
+  ViewType input_view ;  // written by operator(); copied to a host mirror by test_for3
+  // Allocates the view with extents N0 x N1 x N2.  NOTE(review): extents are typed DataType (int) although test_for3 passes int64_t.
+  TestMDRange_3D( const DataType N0, const DataType N1, const DataType N2 ) : input_view("input_view", N0, N1, N2) {}
+  // Marks entry (i,j,k) as visited by storing 1.
+  KOKKOS_INLINE_FUNCTION
+  void operator()( const int i , const int j , const int k ) const
+  {
+    input_view(i,j,k) = 1;
+  }
+  // Runs md_parallel_for over {0,0,0}..{N0,N1,N2} for ten policy variants; after each launch every host-copied entry must equal 1.
+  static void test_for3( const int64_t N0, const int64_t N1, const int64_t N2 )
+  {
+    using namespace Kokkos::Experimental;
+
+    {
+      using range_type = MDRangePolicy< ExecSpace, Rank<3>, Kokkos::IndexType<int> >;
+      // Variant 1: Rank<3> with no explicit Iterate arguments and no third constructor argument.
+      range_type range( {0,0,0}, {N0,N1,N2} );
+      TestMDRange_3D functor(N0,N1,N2);
+
+      md_parallel_for( range, functor );
+
+      HostViewType h_view = Kokkos::create_mirror_view( functor.input_view );
+      Kokkos::deep_copy( h_view , functor.input_view );
+      // Count the entries the kernel did not set to 1.
+      int counter = 0;
+      for ( int i=0; i<N0; ++i ) {
+        for ( int j=0; j<N1; ++j ) {
+          for ( int k=0; k<N2; ++k ) {
+          if ( h_view(i,j,k) != 1 ) {
+            ++counter;
+          }
+        }}}
+      if ( counter != 0 )
+        printf(" Errors in test_for3; mismatches = %d\n\n",counter);
+      ASSERT_EQ( counter , 0 );
+    }
+
+    {
+      using range_type = MDRangePolicy< ExecSpace, Rank<3, Iterate::Default, Iterate::Default >, Kokkos::IndexType<int> >;
+      // Variant 2: Iterate::Default for both Iterate arguments.
+      range_type range( {0,0,0}, {N0,N1,N2} );
+      TestMDRange_3D functor(N0,N1,N2);
+
+      md_parallel_for( range, functor );
+
+      HostViewType h_view = Kokkos::create_mirror_view( functor.input_view );
+      Kokkos::deep_copy( h_view , functor.input_view );
+
+      int counter = 0;
+      for ( int i=0; i<N0; ++i ) {
+        for ( int j=0; j<N1; ++j ) {
+          for ( int k=0; k<N2; ++k ) {
+          if ( h_view(i,j,k) != 1 ) {
+            ++counter;
+          }
+        }}}
+      if ( counter != 0 )
+        printf(" Errors in test_for3; mismatches = %d\n\n",counter);
+      ASSERT_EQ( counter , 0 );
+    }
+
+    {
+      using range_type = MDRangePolicy< ExecSpace, Rank<3, Iterate::Flat, Iterate::Default>, Kokkos::IndexType<int> >;
+      // Variant 3: Iterate::Flat, Iterate::Default.
+      range_type range( {0,0,0}, {N0,N1,N2} );
+      TestMDRange_3D functor(N0,N1,N2);
+
+      md_parallel_for( range, functor );
+
+      HostViewType h_view = Kokkos::create_mirror_view( functor.input_view );
+      Kokkos::deep_copy( h_view , functor.input_view );
+
+      int counter = 0;
+      for ( int i=0; i<N0; ++i ) {
+        for ( int j=0; j<N1; ++j ) {
+          for ( int k=0; k<N2; ++k ) {
+          if ( h_view(i,j,k) != 1 ) {
+            ++counter;
+          }
+        }}}
+      if ( counter != 0 )
+        printf(" Errors in test_for3; mismatches = %d\n\n",counter);
+      ASSERT_EQ( counter , 0 );
+    }
+
+    {
+      using range_type = MDRangePolicy< ExecSpace, Rank<3, Iterate::Flat, Iterate::Flat >, Kokkos::IndexType<int> >;
+      // Variant 4: Iterate::Flat, Iterate::Flat.
+      range_type range( {0,0,0}, {N0,N1,N2} );
+      TestMDRange_3D functor(N0,N1,N2);
+
+      md_parallel_for( range, functor );
+
+      HostViewType h_view = Kokkos::create_mirror_view( functor.input_view );
+      Kokkos::deep_copy( h_view , functor.input_view );
+
+      int counter = 0;
+      for ( int i=0; i<N0; ++i ) {
+        for ( int j=0; j<N1; ++j ) {
+          for ( int k=0; k<N2; ++k ) {
+          if ( h_view(i,j,k) != 1 ) {
+            ++counter;
+          }
+        }}}
+      if ( counter != 0 )
+        printf(" Errors in test_for3; mismatches = %d\n\n",counter);
+      ASSERT_EQ( counter , 0 );
+    }
+
+    {
+      using range_type = MDRangePolicy< ExecSpace, Rank<3, Iterate::Left, Iterate::Flat >, Kokkos::IndexType<int> >;
+      // Variant 5: Iterate::Left, Iterate::Flat.
+      range_type range( {0,0,0}, {N0,N1,N2} );
+      TestMDRange_3D functor(N0,N1,N2);
+
+      md_parallel_for( range, functor );
+
+      HostViewType h_view = Kokkos::create_mirror_view( functor.input_view );
+      Kokkos::deep_copy( h_view , functor.input_view );
+
+      int counter = 0;
+      for ( int i=0; i<N0; ++i ) {
+        for ( int j=0; j<N1; ++j ) {
+          for ( int k=0; k<N2; ++k ) {
+          if ( h_view(i,j,k) != 1 ) {
+            ++counter;
+          }
+        }}}
+      if ( counter != 0 )
+        printf(" Errors in test_for3; mismatches = %d\n\n",counter);
+      ASSERT_EQ( counter , 0 );
+    }
+
+    {
+      using range_type = MDRangePolicy< ExecSpace, Rank<3, Iterate::Right, Iterate::Flat >, Kokkos::IndexType<int> >;
+      // Variant 6: Iterate::Right, Iterate::Flat.
+      range_type range( {0,0,0}, {N0,N1,N2} );
+      TestMDRange_3D functor(N0,N1,N2);
+
+      md_parallel_for( range, functor );
+
+      HostViewType h_view = Kokkos::create_mirror_view( functor.input_view );
+      Kokkos::deep_copy( h_view , functor.input_view );
+
+      int counter = 0;
+      for ( int i=0; i<N0; ++i ) {
+        for ( int j=0; j<N1; ++j ) {
+          for ( int k=0; k<N2; ++k ) {
+          if ( h_view(i,j,k) != 1 ) {
+            ++counter;
+          }
+        }}}
+      if ( counter != 0 )
+        printf(" Errors in test_for3; mismatches = %d\n\n",counter);
+      ASSERT_EQ( counter , 0 );
+    }
+
+    {
+      using range_type = MDRangePolicy< ExecSpace, Rank<3, Iterate::Left, Iterate::Left >, Kokkos::IndexType<int> >;
+      // Variant 7: Left, Left with {2,4,2} as third constructor argument (presumably tile extents; confirm against MDRangePolicy).
+      range_type range( {0,0,0}, {N0,N1,N2}, {2,4,2} );
+      TestMDRange_3D functor(N0,N1,N2);
+
+      md_parallel_for( range, functor );
+
+      HostViewType h_view = Kokkos::create_mirror_view( functor.input_view );
+      Kokkos::deep_copy( h_view , functor.input_view );
+
+      int counter = 0;
+      for ( int i=0; i<N0; ++i ) {
+        for ( int j=0; j<N1; ++j ) {
+          for ( int k=0; k<N2; ++k ) {
+          if ( h_view(i,j,k) != 1 ) {
+            ++counter;
+          }
+        }}}
+      if ( counter != 0 )
+        printf(" Errors in test_for3; mismatches = %d\n\n",counter);
+      ASSERT_EQ( counter , 0 );
+    }
+
+    {
+      using range_type = MDRangePolicy< ExecSpace, Rank<3, Iterate::Left, Iterate::Right >, Kokkos::IndexType<int> >;
+      // Variant 8: Iterate::Left, Iterate::Right with {3,5,7} as third constructor argument.
+      range_type range( {0,0,0}, {N0,N1,N2}, {3,5,7} );
+      TestMDRange_3D functor(N0,N1,N2);
+
+      md_parallel_for( range, functor );
+
+      HostViewType h_view = Kokkos::create_mirror_view( functor.input_view );
+      Kokkos::deep_copy( h_view , functor.input_view );
+
+      int counter = 0;
+      for ( int i=0; i<N0; ++i ) {
+        for ( int j=0; j<N1; ++j ) {
+          for ( int k=0; k<N2; ++k ) {
+          if ( h_view(i,j,k) != 1 ) {
+            ++counter;
+          }
+        }}}
+      if ( counter != 0 )
+        printf(" Errors in test_for3; mismatches = %d\n\n",counter);
+      ASSERT_EQ( counter , 0 );
+    }
+
+    {
+      using range_type = MDRangePolicy< ExecSpace, Rank<3, Iterate::Right, Iterate::Left >, Kokkos::IndexType<int> >;
+      // Variant 9: Iterate::Right, Iterate::Left with {8,8,8} as third constructor argument.
+      range_type range( {0,0,0}, {N0,N1,N2}, {8,8,8} );
+      TestMDRange_3D functor(N0,N1,N2);
+
+      md_parallel_for( range, functor );
+
+      HostViewType h_view = Kokkos::create_mirror_view( functor.input_view );
+      Kokkos::deep_copy( h_view , functor.input_view );
+
+      int counter = 0;
+      for ( int i=0; i<N0; ++i ) {
+        for ( int j=0; j<N1; ++j ) {
+          for ( int k=0; k<N2; ++k ) {
+          if ( h_view(i,j,k) != 1 ) {
+            ++counter;
+          }
+        }}}
+      if ( counter != 0 )
+        printf(" Errors in test_for3; mismatches = %d\n\n",counter);
+      ASSERT_EQ( counter , 0 );
+    }
+
+    {
+      using range_type = MDRangePolicy< ExecSpace, Rank<3, Iterate::Right, Iterate::Right >, Kokkos::IndexType<int> >;
+      // Variant 10: Iterate::Right, Iterate::Right with {2,4,2} as third constructor argument.
+      range_type range( {0,0,0}, {N0,N1,N2}, {2,4,2} );
+      TestMDRange_3D functor(N0,N1,N2);
+
+      md_parallel_for( range, functor );
+
+      HostViewType h_view = Kokkos::create_mirror_view( functor.input_view );
+      Kokkos::deep_copy( h_view , functor.input_view );
+
+      int counter = 0;
+      for ( int i=0; i<N0; ++i ) {
+        for ( int j=0; j<N1; ++j ) {
+          for ( int k=0; k<N2; ++k ) {
+          if ( h_view(i,j,k) != 1 ) {
+            ++counter;
+          }
+        }}}
+      if ( counter != 0 )
+        printf(" Errors in test_for3; mismatches = %d\n\n",counter);
+      ASSERT_EQ( counter , 0 );
+    }
+
+  } //end test_for3
+};
+
+} /* namespace */
+} /* namespace Test */
+
+/*--------------------------------------------------------------------------*/
+
diff --git a/lib/kokkos/core/unit_test/TestMemoryPool.hpp b/lib/kokkos/core/unit_test/TestMemoryPool.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..cf650b0bc8baa1949643a57ffff808c83f406286
--- /dev/null
+++ b/lib/kokkos/core/unit_test/TestMemoryPool.hpp
@@ -0,0 +1,820 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+//
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact  H. Carter Edwards (hcedwar@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+
+#ifndef KOKKOS_UNITTEST_MEMPOOL_HPP
+#define KOKKOS_UNITTEST_MEMPOOL_HPP
+
+#include <stdio.h>
+#include <iostream>
+#include <iomanip>
+#include <cmath>
+#include <algorithm>
+#include <impl/Kokkos_Timer.hpp>
+
+//#define TESTMEMORYPOOL_PRINT
+//#define TESTMEMORYPOOL_PRINT_STATUS
+
+#ifdef KOKKOS_HAVE_CUDA
+#define STRIDE 32  // functors in this file launch STRIDE work items per element; only i % STRIDE == 0 does work
+#else
+#define STRIDE 1   // one work item per element
+#endif
+
+namespace TestMemoryPool {
+
+struct pointer_obj {  // element type of the pointer view used by test_mempool
+  uint64_t *  ptr;  // chunk address stored by allocate_memory; dereferenced by fill_memory and sum_memory
+};
+
+struct pointer_obj2 {  // pointer-plus-size record used by allocate_deallocate_memory
+  void *  ptr;   // address returned by the pool's allocate()
+  size_t  size;  // size that was requested for ptr; handed back to deallocate()
+};
+
+// Kernel functor that fills a pointer view with chunks taken from a memory pool.
+template < typename PointerView, typename Allocator >
+struct allocate_memory {
+  typedef typename PointerView::execution_space  execution_space;
+  typedef typename execution_space::size_type    size_type;
+
+  PointerView  m_pointers;    // slots that receive the allocated addresses
+  size_t       m_chunk_size;  // size passed to every allocate() call
+  Allocator    m_mempool;     // pool the chunks are taken from
+
+  // Copies the arguments and immediately launches num_ptrs * STRIDE work items.
+  allocate_memory( PointerView & ptrs, size_t num_ptrs,
+                   size_t cs, Allocator & m )
+    : m_pointers( ptrs ), m_chunk_size( cs ), m_mempool( m )
+  { Kokkos::parallel_for( num_ptrs * STRIDE, *this ); }
+
+  // Every STRIDE-th work item allocates one chunk for slot i / STRIDE.
+  KOKKOS_INLINE_FUNCTION
+  void operator()( size_type i ) const
+  {
+    if ( i % STRIDE != 0 ) return;
+    uint64_t * const chunk =
+      static_cast< uint64_t * >( m_mempool.allocate( m_chunk_size ) );
+    m_pointers[i / STRIDE].ptr = chunk;
+  }
+};
+
+// Reduction functor that counts the slots of a pointer view whose pointer is null.
+template < typename PointerView >
+struct count_invalid_memory {
+  typedef typename PointerView::execution_space  execution_space;
+  typedef typename execution_space::size_type    size_type;
+  typedef uint64_t                               value_type;
+
+  PointerView  m_pointers;  // slots to inspect
+  uint64_t &   m_result;    // caller-owned scalar handed to parallel_reduce
+
+  // Binds the view and the result, then launches the reduction right away.
+  count_invalid_memory( PointerView & ptrs, size_t num_ptrs, uint64_t & res )
+    : m_pointers( ptrs ), m_result( res )
+  { Kokkos::parallel_reduce( num_ptrs * STRIDE, *this, m_result ); }
+
+  // The count starts at zero.
+  KOKKOS_INLINE_FUNCTION
+  void init( value_type & v ) const { v = 0; }
+
+  // Partial counts are merged by addition.
+  KOKKOS_INLINE_FUNCTION
+  void join( volatile value_type & dst, volatile value_type const & src ) const
+  { dst += src; }
+
+  // Every STRIDE-th work item adds one if slot i / STRIDE holds a null pointer.
+  KOKKOS_INLINE_FUNCTION
+  void operator()( size_type i, value_type & r ) const
+  {
+    if ( i % STRIDE != 0 ) return;
+    if ( m_pointers[i / STRIDE].ptr == 0 ) ++r;
+  }
+};
+
+// Kernel functor that stores each slot's index into the memory the slot points at.
+template < typename PointerView >
+struct fill_memory {
+  typedef typename PointerView::execution_space  execution_space;
+  typedef typename execution_space::size_type    size_type;
+
+  PointerView m_pointers;  // slots whose targets are written
+
+  // Binds the view and immediately launches num_ptrs * STRIDE work items.
+  fill_memory( PointerView & ptrs, size_t num_ptrs ) : m_pointers( ptrs )
+  { Kokkos::parallel_for( num_ptrs * STRIDE, *this ); }
+
+  // Every STRIDE-th work item writes its slot index through that slot's pointer.
+  KOKKOS_INLINE_FUNCTION
+  void operator()( size_type i ) const
+  {
+    if ( i % STRIDE != 0 ) return;
+    const size_type slot = i / STRIDE;
+    *m_pointers[slot].ptr = slot ;
+  }
+};
+
+// Reduction functor that sums the uint64_t value each slot of a pointer view points at.
+template < typename PointerView >
+struct sum_memory {
+  typedef typename PointerView::execution_space  execution_space;
+  typedef typename execution_space::size_type    size_type;
+  typedef uint64_t                               value_type;
+
+  PointerView  m_pointers;  // slots whose targets are read
+  uint64_t &   m_result;    // caller-owned scalar handed to parallel_reduce
+
+  // Binds the view and the result, then launches the reduction right away.
+  sum_memory( PointerView & ptrs, size_t num_ptrs, uint64_t & res )
+    : m_pointers( ptrs ), m_result( res )
+  { Kokkos::parallel_reduce( num_ptrs * STRIDE, *this, m_result ); }
+
+  // The sum starts at zero.
+  KOKKOS_INLINE_FUNCTION
+  void init( value_type & v ) const { v = 0; }
+
+  // Partial sums are merged by addition.
+  KOKKOS_INLINE_FUNCTION
+  void join( volatile value_type & dst, volatile value_type const & src ) const
+  { dst += src; }
+
+  // Every STRIDE-th work item adds the value stored behind slot i / STRIDE.
+  KOKKOS_INLINE_FUNCTION
+  void operator()( size_type i, value_type & r ) const
+  {
+    if ( i % STRIDE != 0 ) return;
+    r += *m_pointers[i / STRIDE].ptr;
+  }
+};
+
+// Kernel functor that hands every chunk recorded in a pointer view back to
+// the memory pool, using one fixed chunk size for all of them.
+template < typename PointerView, typename Allocator >
+struct deallocate_memory {
+  typedef typename PointerView::execution_space  execution_space;
+  typedef typename execution_space::size_type    size_type;
+
+  PointerView  m_pointers;    // slots holding the addresses to release
+  size_t       m_chunk_size;  // size passed to every deallocate() call
+  Allocator    m_mempool;     // pool the chunks are returned to
+
+  // Copies the arguments and immediately launches num_ptrs * STRIDE work items.
+  deallocate_memory( PointerView & ptrs, size_t num_ptrs,
+                     size_t cs, Allocator & m )
+    : m_pointers( ptrs ), m_chunk_size( cs ), m_mempool( m )
+  { Kokkos::parallel_for( num_ptrs * STRIDE, *this ); }
+
+  // Every STRIDE-th work item releases the chunk held by slot i / STRIDE.
+  KOKKOS_INLINE_FUNCTION
+  void operator()( size_type i ) const
+  {
+    if ( i % STRIDE != 0 ) return;
+    m_mempool.deallocate( m_pointers[i / STRIDE].ptr, m_chunk_size );
+  }
+};
+
+// Kernel functor that replays a list of allocate / deallocate requests against a memory pool.
+template < typename WorkView, typename PointerView, typename ScalarView,
+           typename Allocator >
+struct allocate_deallocate_memory {
+  typedef typename WorkView::execution_space   execution_space;
+  typedef typename execution_space::size_type  size_type;
+
+  WorkView     m_work;        // one encoded request per entry
+  PointerView  m_pointers;    // pointer + size records of the allocations
+  ScalarView   m_ptrs_front;  // counter advanced by each deallocation
+  ScalarView   m_ptrs_back;   // counter advanced by each allocation
+  Allocator    m_mempool;     // pool that serves the requests
+
+  // Copies the arguments and immediately launches work_size * STRIDE work items.
+  allocate_deallocate_memory( WorkView & w, size_t work_size, PointerView & p,
+                              ScalarView pf, ScalarView pb, Allocator & m )
+    : m_work( w ), m_pointers( p ), m_ptrs_front( pf ), m_ptrs_back( pb ),
+      m_mempool( m )
+  { Kokkos::parallel_for( work_size * STRIDE, *this ); }
+
+  // Every STRIDE-th work item performs the request stored at m_work[i / STRIDE].
+  KOKKOS_INLINE_FUNCTION
+  void operator()( size_type i ) const
+  {
+    if ( i % STRIDE != 0 ) return;
+    // Odd values request a deallocation; even values an allocation of (value >> 1).
+    const unsigned my_work = m_work[i / STRIDE];
+
+    if ( ( my_work & 1 ) != 0 ) {
+      // Take the record at the front counter and return its memory to the pool.
+      const size_t pos = Kokkos::atomic_fetch_add( &m_ptrs_front(), 1 );
+      m_mempool.deallocate( m_pointers[pos].ptr, m_pointers[pos].size );
+      return;
+    }
+
+    // Claim the record at the back counter and store the new pointer and its size.
+    const size_t pos = Kokkos::atomic_fetch_add( &m_ptrs_back(), 1 );
+    const size_t alloc_size = my_work >> 1;
+    m_pointers[pos].ptr = m_mempool.allocate( alloc_size );
+    m_pointers[pos].size = alloc_size;
+  }
+};
+
+#define PRECISION 6  // digits passed to std::setprecision in print_results
+#define SHIFTW 24    // std::setw width used for the text labels
+#define SHIFTW2 12   // std::setw width used for the elapsed-time column
+
+// Timing-report helpers: print `text`, the elapsed time and (4-argument forms) a `result` column of `width`.
+template < typename F >
+void print_results( const std::string & text, F elapsed_time )
+{
+  std::cout << std::setw( SHIFTW ) << text << std::setw( SHIFTW2 ) << std::fixed
+            << std::setprecision( PRECISION ) << elapsed_time << std::endl;
+}
+
+template < typename F, typename T >
+void print_results( const std::string & text, unsigned long long width,
+                    F elapsed_time, T result )
+{
+  std::cout << std::setw( SHIFTW ) << text << std::setw( SHIFTW2 )
+            << std::fixed << std::setprecision( PRECISION ) << elapsed_time  // std::setw takes int: cast below
+            << "     " << std::setw( static_cast< int >( width ) ) << result << std::endl;
+}
+
+template < typename F >
+void print_results( const std::string & text, unsigned long long width,
+                    F elapsed_time, const std::string & result )
+{
+  std::cout << std::setw( SHIFTW ) << text << std::setw( SHIFTW2 )
+            << std::fixed << std::setprecision( PRECISION ) << elapsed_time  // std::setw takes int: cast below
+            << "     " << std::setw( static_cast< int >( width ) ) << result << std::endl;
+}
+
+// This test slams allocation and deallocation in a worse than real-world usage
+// scenario to see how bad the thread-safety really is by having a loop where
+// all threads allocate and a subsequent loop where all threads deallocate.
+// All of the allocation requests are for equal-sized chunks that are the base
+// chunk size of the memory pool.  It also tests initialization of the memory
+// pool and breaking large chunks into smaller chunks to fulfill allocation
+// requests.  It verifies that MemoryPool(), allocate(), and deallocate() work
+// correctly.
+template < class Device >
+bool test_mempool( size_t chunk_size, size_t total_size )
+{
+  typedef typename Device::execution_space                 execution_space;
+  typedef typename Device::memory_space                    memory_space;
+  typedef Device                                           device_type;
+  typedef Kokkos::View< pointer_obj *, device_type >       pointer_view;
+  typedef Kokkos::Experimental::MemoryPool< device_type >  pool_memory_space;
+  // result receives each reduction's value; num_chunks is how many chunk_size pieces fit in total_size.
+  uint64_t result;
+  size_t num_chunks = total_size / chunk_size;
+  bool return_val = true;
+
+  pointer_view pointers( "pointers", num_chunks );
+
+#ifdef TESTMEMORYPOOL_PRINT
+  std::cout << "*** test_mempool() ***" << std::endl
+            << std::setw( SHIFTW ) << "chunk_size: " << std::setw( 12 )
+            << chunk_size << std::endl
+            << std::setw( SHIFTW ) << "total_size: " << std::setw( 12 )
+            << total_size << std::endl
+            << std::setw( SHIFTW ) << "num_chunks: " << std::setw( 12 )
+            << num_chunks << std::endl;
+
+  double elapsed_time = 0;
+  Kokkos::Timer timer;
+#endif
+  // Pool is built from total_size * 1.2 (a double expression).  NOTE(review): the meaning of the third argument (20) is not visible here.
+  pool_memory_space mempool( memory_space(), total_size * 1.2, 20 );
+
+#ifdef TESTMEMORYPOOL_PRINT
+  execution_space::fence();
+  elapsed_time = timer.seconds();
+  print_results( "initialize mempool: ", elapsed_time );
+#ifdef TESTMEMORYPOOL_PRINT_STATUS
+  mempool.print_status();
+#endif
+  timer.reset();
+#endif
+  // Pass 1: allocate one chunk per slot (each functor launches its kernel from its constructor).
+  {
+    allocate_memory< pointer_view, pool_memory_space >
+      am( pointers, num_chunks, chunk_size, mempool );
+  }
+
+#ifdef TESTMEMORYPOOL_PRINT
+  execution_space::fence();
+  elapsed_time = timer.seconds();
+  print_results( "allocate chunks: ", elapsed_time );
+#ifdef TESTMEMORYPOOL_PRINT_STATUS
+  mempool.print_status();
+#endif
+  timer.reset();
+#endif
+  // Count null pointers.  NOTE(review): the count is only printed, never checked, before fill_memory dereferences every pointer.
+  {
+    count_invalid_memory< pointer_view > sm( pointers, num_chunks, result );
+  }
+
+#ifdef TESTMEMORYPOOL_PRINT
+  execution_space::fence();
+  elapsed_time = timer.seconds();
+  print_results( "invalid chunks: ", 16, elapsed_time, result );
+  timer.reset();
+#endif
+  // Write each slot's index into the uint64_t at the start of its chunk.
+  {
+    fill_memory< pointer_view > fm( pointers, num_chunks );
+  }
+
+#ifdef TESTMEMORYPOOL_PRINT
+  execution_space::fence();
+  elapsed_time = timer.seconds();
+  print_results( "fill chunks: ", elapsed_time );
+  timer.reset();
+#endif
+  // Sum the stored indices; the expected total is 0 + 1 + ... + (num_chunks - 1).
+  {
+    sum_memory< pointer_view > sm( pointers, num_chunks, result );
+  }
+
+  execution_space::fence();
+
+#ifdef TESTMEMORYPOOL_PRINT
+  elapsed_time = timer.seconds();
+  print_results( "sum chunks: ", 16, elapsed_time, result );
+#endif
+
+  if ( result != ( num_chunks * ( num_chunks - 1 ) ) / 2 ) {
+    std::cerr << "Invalid sum value in memory." << std::endl;
+    return_val = false;
+  }
+
+#ifdef TESTMEMORYPOOL_PRINT
+  timer.reset();
+#endif
+  // Return every chunk to the pool.
+  {
+    deallocate_memory< pointer_view, pool_memory_space >
+      dm( pointers, num_chunks, chunk_size, mempool );
+  }
+
+#ifdef TESTMEMORYPOOL_PRINT
+  execution_space::fence();
+  elapsed_time = timer.seconds();
+  print_results( "deallocate chunks: ", elapsed_time );
+#ifdef TESTMEMORYPOOL_PRINT_STATUS
+  mempool.print_status();
+#endif
+  timer.reset();
+#endif
+  // Pass 2: repeat allocate / count / fill / sum / deallocate on the pool after everything was returned to it.
+  {
+    allocate_memory< pointer_view, pool_memory_space >
+      am( pointers, num_chunks, chunk_size, mempool );
+  }
+
+#ifdef TESTMEMORYPOOL_PRINT
+  execution_space::fence();
+  elapsed_time = timer.seconds();
+  print_results( "allocate chunks: ", elapsed_time );
+#ifdef TESTMEMORYPOOL_PRINT_STATUS
+  mempool.print_status();
+#endif
+  timer.reset();
+#endif
+
+  {
+    count_invalid_memory< pointer_view > sm( pointers, num_chunks, result );
+  }
+
+#ifdef TESTMEMORYPOOL_PRINT
+  execution_space::fence();
+  elapsed_time = timer.seconds();
+  print_results( "invalid chunks: ", 16, elapsed_time, result );
+  timer.reset();
+#endif
+
+  {
+    fill_memory< pointer_view > fm( pointers, num_chunks );
+  }
+
+#ifdef TESTMEMORYPOOL_PRINT
+  execution_space::fence();
+  elapsed_time = timer.seconds();
+  print_results( "fill chunks: ", elapsed_time );
+  timer.reset();
+#endif
+
+  {
+    sum_memory< pointer_view > sm( pointers, num_chunks, result );
+  }
+
+  execution_space::fence();
+
+#ifdef TESTMEMORYPOOL_PRINT
+  elapsed_time = timer.seconds();
+  print_results( "sum chunks: ", 16, elapsed_time, result );
+#endif
+
+  if ( result != ( num_chunks * ( num_chunks - 1 ) ) / 2 ) {
+    std::cerr << "Invalid sum value in memory." << std::endl;
+    return_val = false;
+  }
+
+#ifdef TESTMEMORYPOOL_PRINT
+  timer.reset();
+#endif
+
+  {
+    deallocate_memory< pointer_view, pool_memory_space >
+      dm( pointers, num_chunks, chunk_size, mempool );
+  }
+
+#ifdef TESTMEMORYPOOL_PRINT
+  execution_space::fence();
+  elapsed_time = timer.seconds();
+  print_results( "deallocate chunks: ", elapsed_time );
+#ifdef TESTMEMORYPOOL_PRINT_STATUS
+  mempool.print_status();
+#endif
+#endif
+  // True only if both sum checks matched the expected total.
+  return return_val;
+}
+
+template < typename T >
+T smallest_power2_ge( T val )
+{
+  // Returns the smallest power of 2 that is >= val.
+  // Start from the position of the most significant nonzero bit of val ...
+  const int top_bit = Kokkos::Impl::bit_scan_reverse( val );
+  // ... and go one position higher unless val already is an integral power
+  // of 2, in which case that bit position alone is ceil( log2(val) ).
+  const bool exact_power = Kokkos::Impl::is_integral_power_of_two( val );
+  const int lg2_result = exact_power ? top_bit : top_bit + 1;
+
+  return T(1) << T(lg2_result);
+}
+
+// This test makes allocation requests for multiple sizes and interleaves
+// allocation and deallocation, in 3 phases that each cover every chunk size.
+// Phase 1 only allocates, to build up a working state for the allocator.
+// Phase 2 interleaves allocations and deallocations; thanks to the state
+// built in phase 1 they can happen in any order.  Phase 3 only deallocates,
+// undoing the allocations from phase 1.
+//   base_chunk_size - smallest request size in bytes; each next size doubles.
+//   num_chunk_sizes - number of distinct sizes; used as a divisor, so nonzero.
+//   phase1_size, phase2_size - work item counts, rounded up as described below.
+template < class Device >
+void test_mempool2( unsigned base_chunk_size, size_t num_chunk_sizes,
+                    size_t phase1_size, size_t phase2_size )
+{
+#ifdef TESTMEMORYPOOL_PRINT
+  typedef typename Device::execution_space                 execution_space;
+#endif
+  typedef typename Device::memory_space                    memory_space;
+  typedef Device                                           device_type;
+  typedef Kokkos::View< unsigned *, device_type >          work_view;
+  typedef Kokkos::View< size_t, device_type >              scalar_view;
+  typedef Kokkos::View< pointer_obj2 *, device_type >      pointer_view;
+  typedef Kokkos::Experimental::MemoryPool< device_type >  pool_memory_space;
+
+  enum {
+    MIN_CHUNK_SIZE      = 64,
+    MIN_BASE_CHUNK_SIZE = MIN_CHUNK_SIZE / 2 + 1
+  };
+
+  // Make sure the base chunk size is at least MIN_BASE_CHUNK_SIZE bytes, so
+  // all the different chunk sizes translate to different block sizes for the
+  // allocator.
+  if ( base_chunk_size < MIN_BASE_CHUNK_SIZE ) {
+    base_chunk_size = MIN_BASE_CHUNK_SIZE;
+  }
+
+  // Get the smallest power of 2 >= the base chunk size.  The size must be
+  // >= MIN_CHUNK_SIZE, though.
+  unsigned ceil_base_chunk_size = smallest_power2_ge( base_chunk_size );
+  if ( ceil_base_chunk_size < MIN_CHUNK_SIZE ) {
+    ceil_base_chunk_size = MIN_CHUNK_SIZE;
+  }
+
+  // Make sure the phase 1 size is multiples of num_chunk_sizes.
+  phase1_size = ( ( phase1_size + num_chunk_sizes - 1 ) / num_chunk_sizes ) *
+                num_chunk_sizes;
+
+  // Make sure the phase 2 size is multiples of (2 * num_chunk_sizes).
+  phase2_size =
+    ( ( phase2_size + 2 * num_chunk_sizes - 1 ) / ( 2 * num_chunk_sizes ) ) *
+    2 * num_chunk_sizes;
+
+  // The phase2 size must be <= twice the phase1 size so that deallocations
+  // can't happen before allocations.
+  if ( phase2_size > 2 * phase1_size ) phase2_size = 2 * phase1_size;
+
+  size_t phase3_size = phase1_size;
+  size_t half_phase2_size = phase2_size / 2;
+
+  // Each entry in the work views has the following format.  The least
+  // significant bit indicates allocation (0) vs. deallocation (1).  For
+  // allocation, the other bits indicate the desired allocation size.
+
+  // Initialize the phase 1 work view with an equal number of allocations for
+  // each chunk size.
+  work_view phase1_work( "Phase 1 Work", phase1_size );
+  typename work_view::HostMirror host_phase1_work =
+    create_mirror_view(phase1_work);
+
+  size_t inner_size = phase1_size / num_chunk_sizes;
+  unsigned chunk_size = base_chunk_size;
+
+  for ( size_t i = 0; i < num_chunk_sizes; ++i ) {
+    for ( size_t j = 0; j < inner_size; ++j ) {
+      host_phase1_work[i * inner_size + j] = chunk_size << 1;
+    }
+
+    chunk_size *= 2;
+  }
+
+  std::random_shuffle( host_phase1_work.ptr_on_device(),
+                       host_phase1_work.ptr_on_device() + phase1_size );
+
+  deep_copy( phase1_work, host_phase1_work );
+
+  // Initialize the phase 2 work view with half allocations and half
+  // deallocations with an equal number of allocations for each chunk size.
+  work_view phase2_work( "Phase 2 Work", phase2_size );
+  typename work_view::HostMirror host_phase2_work =
+    create_mirror_view(phase2_work);
+
+  inner_size = half_phase2_size / num_chunk_sizes;
+  chunk_size = base_chunk_size;
+
+  for ( size_t i = 0; i < num_chunk_sizes; ++i ) {
+    for ( size_t j = 0; j < inner_size; ++j ) {
+      host_phase2_work[i * inner_size + j] = chunk_size << 1;
+    }
+
+    chunk_size *= 2;
+  }
+  // The second half of the view holds the deallocation requests.
+  for ( size_t i = half_phase2_size; i < phase2_size; ++i ) {
+    host_phase2_work[i] = 1;
+  }
+
+  std::random_shuffle( host_phase2_work.ptr_on_device(),
+                       host_phase2_work.ptr_on_device() + phase2_size );
+
+  deep_copy( phase2_work, host_phase2_work );
+
+  // Initialize the phase 3 work view with all deallocations.  Every entry is
+  // the same, so no per-chunk-size layout and no shuffle are needed.
+  work_view phase3_work( "Phase 3 Work", phase3_size );
+  typename work_view::HostMirror host_phase3_work =
+    create_mirror_view(phase3_work);
+
+  // An entry of 1 has only the deallocation bit set.
+  for ( size_t i = 0; i < phase3_size; ++i ) host_phase3_work[i] = 1;
+
+  deep_copy( phase3_work, host_phase3_work );
+
+  // Calculate the amount of memory needed for the allocator.  We need to know
+  // the number of superblocks required for each chunk size and use that to
+  // calculate the amount of memory for each chunk size.
+  size_t lg_sb_size = 18;
+  size_t sb_size = size_t(1) << lg_sb_size;  // shift in size_t, not in int
+  size_t total_size = 0;
+  size_t allocs_per_size = phase1_size / num_chunk_sizes +
+                           half_phase2_size / num_chunk_sizes;
+
+  chunk_size = ceil_base_chunk_size;
+  for ( size_t i = 0; i < num_chunk_sizes; ++i ) {
+    size_t my_size = allocs_per_size * chunk_size;
+    total_size += ( my_size + sb_size - 1 ) / sb_size * sb_size;
+    chunk_size *= 2;
+  }
+
+  // Declare the queue to hold the records for allocated memory.  An allocation
+  // adds a record to the back of the queue, and a deallocation removes a
+  // record from the front of the queue.
+  size_t num_allocations = phase1_size + half_phase2_size;
+  scalar_view ptrs_front( "Pointers front" );
+  scalar_view ptrs_back( "Pointers back" );
+
+  pointer_view pointers( "pointers", num_allocations );
+
+#ifdef TESTMEMORYPOOL_PRINT
+  printf( "\n*** test_mempool2() ***\n" );
+  printf( "       num_chunk_sizes: %12zu\n", num_chunk_sizes );
+  printf( "       base_chunk_size: %12u\n", base_chunk_size );
+  printf( "  ceil_base_chunk_size: %12u\n", ceil_base_chunk_size );
+  printf( "           phase1_size: %12zu\n", phase1_size );
+  printf( "           phase2_size: %12zu\n", phase2_size );
+  printf( "           phase3_size: %12zu\n", phase3_size );
+  printf( "       allocs_per_size: %12zu\n", allocs_per_size );
+  printf( "       num_allocations: %12zu\n", num_allocations );
+  printf( "            total_size: %12zu\n", total_size );
+  fflush( stdout );
+
+  double elapsed_time = 0;
+  Kokkos::Timer timer;
+#endif
+  // Request 20% more than the computed total as headroom.
+  pool_memory_space mempool( memory_space(), total_size * 1.2, lg_sb_size );
+
+#ifdef TESTMEMORYPOOL_PRINT
+  execution_space::fence();
+  elapsed_time = timer.seconds();
+  print_results( "initialize mempool: ", elapsed_time );
+
+#ifdef TESTMEMORYPOOL_PRINT_STATUS
+  mempool.print_status();
+#endif
+
+  timer.reset();
+#endif
+
+  {
+    allocate_deallocate_memory< work_view, pointer_view, scalar_view,
+                                pool_memory_space >
+      adm( phase1_work, phase1_size, pointers, ptrs_front, ptrs_back, mempool );
+  }
+
+#ifdef TESTMEMORYPOOL_PRINT
+  execution_space::fence();
+  elapsed_time = timer.seconds();
+  print_results( "phase1: ", elapsed_time );
+
+#ifdef TESTMEMORYPOOL_PRINT_STATUS
+  mempool.print_status();
+#endif
+
+  timer.reset();
+#endif
+
+  {
+    allocate_deallocate_memory< work_view, pointer_view, scalar_view,
+                                pool_memory_space >
+      adm( phase2_work, phase2_size, pointers, ptrs_front, ptrs_back, mempool );
+  }
+
+#ifdef TESTMEMORYPOOL_PRINT
+  execution_space::fence();
+  elapsed_time = timer.seconds();
+  print_results( "phase2: ", elapsed_time );
+
+#ifdef TESTMEMORYPOOL_PRINT_STATUS
+  mempool.print_status();
+#endif
+
+  timer.reset();
+#endif
+
+  {
+    allocate_deallocate_memory< work_view, pointer_view, scalar_view,
+                                pool_memory_space >
+      adm( phase3_work, phase3_size, pointers, ptrs_front, ptrs_back, mempool );
+  }
+
+#ifdef TESTMEMORYPOOL_PRINT
+  execution_space::fence();
+  elapsed_time = timer.seconds();
+  print_results( "phase3: ", elapsed_time );
+#ifdef TESTMEMORYPOOL_PRINT_STATUS
+  mempool.print_status();
+#endif
+#endif
+}
+
+// Verifies how the memory pool behaves once it has run out of memory.
+template < class Device >
+void test_memory_exhaustion()
+{
+#ifdef TESTMEMORYPOOL_PRINT
+  typedef typename Device::execution_space                 exec_space_t;
+#endif
+  typedef typename Device::memory_space                    mem_space_t;
+  typedef Kokkos::View< pointer_obj *, Device >            ptr_view_t;
+  typedef Kokkos::Experimental::MemoryPool< Device >       pool_t;
+
+  // Setup: the pool consists of one superblock, and every request asks for
+  // the same chunk size.  The superblock is sized for only half of the
+  // requests, so the allocation pass asks for twice as many chunks as the
+  // pool can supply.  The deallocation pass then hands back only the chunks
+  // that were really obtained.
+
+  size_t request_size  = 128;
+  size_t request_count = 128;
+  // Room for half of the requests: 128 * 64 = 8192.
+  size_t pool_size     = request_size * ( request_count / 2 );
+  // Presumably log2( pool_size ); passed to the pool as its superblock size.
+  size_t lg_pool_size  = Kokkos::Impl::integral_power_of_two( pool_size );
+
+  ptr_view_t chunk_ptrs( "pointers", request_count );
+
+#ifdef TESTMEMORYPOOL_PRINT
+  std::cout << "\n*** test_memory_exhaustion() ***" << std::endl;
+
+  double seconds = 0;
+  Kokkos::Timer stopwatch;
+#endif
+
+  // Total pool size and superblock size coincide: exactly one superblock.
+  pool_t pool( mem_space_t(), pool_size, lg_pool_size );
+
+#ifdef TESTMEMORYPOOL_PRINT
+  exec_space_t::fence();
+  seconds = stopwatch.seconds();
+  print_results( "initialize mempool: ", seconds );
+#ifdef TESTMEMORYPOOL_PRINT_STATUS
+  pool.print_status();
+#endif
+  stopwatch.reset();
+#endif
+
+  {
+    allocate_memory< ptr_view_t, pool_t >
+      allocator( chunk_ptrs, request_count, request_size, pool );
+  }
+
+#ifdef TESTMEMORYPOOL_PRINT
+  exec_space_t::fence();
+  seconds = stopwatch.seconds();
+  print_results( "allocate chunks: ", seconds );
+#ifdef TESTMEMORYPOOL_PRINT_STATUS
+  pool.print_status();
+#endif
+  stopwatch.reset();
+#endif
+
+  {
+    // Because the allocations ran in parallel, the successful ones are not
+    // stored contiguously in the View.  Walking the entire View is still
+    // fine: deallocate simply does nothing when it is handed a NULL
+    // pointer.
+    deallocate_memory< ptr_view_t, pool_t >
+      releaser( chunk_ptrs, request_count, request_size, pool );
+  }
+
+#ifdef TESTMEMORYPOOL_PRINT
+  exec_space_t::fence();
+  seconds = stopwatch.seconds();
+  print_results( "deallocate chunks: ", seconds );
+#ifdef TESTMEMORYPOOL_PRINT_STATUS
+  pool.print_status();
+#endif
+#endif
+}
+
+}
+
+#ifdef TESTMEMORYPOOL_PRINT
+#undef TESTMEMORYPOOL_PRINT
+#endif
+
+#ifdef TESTMEMORYPOOL_PRINT_STATUS
+#undef TESTMEMORYPOOL_PRINT_STATUS
+#endif
+
+#ifdef STRIDE
+#undef STRIDE
+#endif
+
+#endif
diff --git a/lib/kokkos/core/unit_test/TestMemorySpaceTracking.hpp b/lib/kokkos/core/unit_test/TestMemorySpaceTracking.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..575f2f2c254ecae81132c8e5f714e4fe6e71c14f
--- /dev/null
+++ b/lib/kokkos/core/unit_test/TestMemorySpaceTracking.hpp
@@ -0,0 +1,100 @@
+/*
+//@HEADER
+// ************************************************************************
+// 
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+// 
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+// 
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact  H. Carter Edwards (hcedwar@sandia.gov)
+// 
+// ************************************************************************
+//@HEADER
+*/
+
+#include <gtest/gtest.h>
+
+#include <iostream>
+#include <Kokkos_Core.hpp>
+
+/*--------------------------------------------------------------------------*/
+
+namespace {
+
+template<class Arg1>
+class TestMemorySpace {
+public:
+  // Memory space of the template argument (not used by the code visible here).
+  typedef typename Arg1::memory_space MemorySpace;
+  TestMemorySpace() { run_test(); }  // constructing the object runs the test
+  // Checks View tracker reference counts across construction, copy and scope exit.
+  void run_test()
+  {
+    // The checks are compiled out when KOKKOS_USING_EXP_VIEW is nonzero.
+#if ! KOKKOS_USING_EXP_VIEW
+    // A default-constructed View is expected to report a count of zero.
+    Kokkos::View<int* ,Arg1> invalid;
+    ASSERT_EQ(0u, invalid.tracker().ref_count() );
+
+    {
+      Kokkos::View<int* ,Arg1> a("A",10);
+      // A freshly allocated View is expected to report a count of one.
+      ASSERT_EQ(1u, a.tracker().ref_count() );
+
+      {
+        Kokkos::View<int* ,Arg1> b = a;  // copy of a: count expected to be 2
+        ASSERT_EQ(2u, b.tracker().ref_count() );
+        // A second, separately allocated View has its own count of one.
+        Kokkos::View<int* ,Arg1> D("D",10);
+        ASSERT_EQ(1u, D.tracker().ref_count() );
+        // A View that only lives inside this inner scope ...
+        {
+          Kokkos::View<int* ,Arg1> E("E",10);
+          ASSERT_EQ(1u, E.tracker().ref_count() );
+        }
+        // ... must leave the count seen through b unchanged.
+        ASSERT_EQ(2u, b.tracker().ref_count() );
+      }
+      ASSERT_EQ(1u, a.tracker().ref_count() );  // b went out of scope
+    }
+
+#endif
+
+  }
+};
+
+}
+
+/*--------------------------------------------------------------------------*/
+
+
+
diff --git a/lib/kokkos/core/unit_test/TestOpenMP.cpp b/lib/kokkos/core/unit_test/TestOpenMP.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..6e8fc4517917bfcaaeecba6fbc2ac59f6090350d
--- /dev/null
+++ b/lib/kokkos/core/unit_test/TestOpenMP.cpp
@@ -0,0 +1,262 @@
+/*
+//@HEADER
+// ************************************************************************
+// 
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+// 
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+// 
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact  H. Carter Edwards (hcedwar@sandia.gov)
+// 
+// ************************************************************************
+//@HEADER
+*/
+
+#include <gtest/gtest.h>
+
+#include <Kokkos_Macros.hpp>
+#ifdef KOKKOS_LAMBDA
+#undef KOKKOS_LAMBDA
+#endif
+#define KOKKOS_LAMBDA [=]
+
+#include <Kokkos_Core.hpp>
+
+//----------------------------------------------------------------------------
+
+#include <TestViewImpl.hpp>
+#include <TestAtomic.hpp>
+#include <TestAtomicOperations.hpp>
+
+#include <TestViewAPI.hpp>
+#include <TestViewSubview.hpp>
+#include <TestViewOfClass.hpp>
+
+#include <TestSharedAlloc.hpp>
+#include <TestViewMapping.hpp>
+
+#include <TestRange.hpp>
+#include <TestTeam.hpp>
+#include <TestReduce.hpp>
+#include <TestScan.hpp>
+#include <TestAggregate.hpp>
+#include <TestAggregateReduction.hpp>
+#include <TestCompilerMacros.hpp>
+#include <TestMemoryPool.hpp>
+
+
+#include <TestCXX11.hpp>
+#include <TestCXX11Deduction.hpp>
+#include <TestTeamVector.hpp>
+#include <TestMemorySpaceTracking.hpp>
+#include <TestTemplateMetaFunctions.hpp>
+
+#include <TestPolicyConstruction.hpp>
+
+#include <TestMDRange.hpp>
+
+namespace Test {
+
+class openmp : public ::testing::Test {
+protected:
+  // Test-case setup: bring up the OpenMP backend and seed rand().
+  static void SetUpTestCase()
+  {
+    const unsigned numa_regions = Kokkos::hwloc::get_available_numa_count();
+    const unsigned numa_cores   = Kokkos::hwloc::get_available_cores_per_numa();
+    const unsigned core_threads = Kokkos::hwloc::get_available_threads_per_core();
+    // Per NUMA region (at least one): half of its hardware threads, at least two.
+    const unsigned per_numa = std::max( 2u , ( numa_cores * core_threads ) / 2 );
+    const unsigned threads_count = std::max( 1u , numa_regions ) * per_numa;
+
+    Kokkos::OpenMP::initialize( threads_count );
+    Kokkos::OpenMP::print_configuration( std::cout , true );
+    srand(10231);
+  }
+
+  // Test-case teardown: shut the backend down and drop OpenMP to one thread.
+  static void TearDownTestCase()
+  {
+    Kokkos::OpenMP::finalize();
+    omp_set_num_threads(1);
+    ASSERT_EQ( 1 , omp_get_max_threads() );
+  }
+};
+
+
+TEST_F( openmp , md_range ) {
+  TestMDRange_2D< Kokkos::OpenMP >::test_for2(100,100);
+  // 2D case above, 3D case below; the arguments are presumably the range extents.
+  TestMDRange_3D< Kokkos::OpenMP >::test_for3(100,100,100);
+}
+// Shared-allocation test, instantiated for HostSpace together with OpenMP.
+TEST_F( openmp , impl_shared_alloc ) {
+  test_shared_alloc< Kokkos::HostSpace , Kokkos::OpenMP >();
+}
+// Construction tests for both the range policy and the team policy.
+TEST_F( openmp, policy_construction) {
+  TestRangePolicyConstruction< Kokkos::OpenMP >();
+  TestTeamPolicyConstruction< Kokkos::OpenMP >();
+}
+// View-mapping tests: plain mapping, subview, operator, and the atomic variant.
+TEST_F( openmp , impl_view_mapping ) {
+  test_view_mapping< Kokkos::OpenMP >();
+  test_view_mapping_subview< Kokkos::OpenMP >();
+  test_view_mapping_operator< Kokkos::OpenMP >();
+  TestViewMappingAtomic< Kokkos::OpenMP >::run();
+}
+// View implementation test for the OpenMP device.
+TEST_F( openmp, view_impl) {
+  test_view_impl< Kokkos::OpenMP >();
+}
+// View API test; double is presumably the element type used by the test.
+TEST_F( openmp, view_api) {
+  TestViewAPI< double , Kokkos::OpenMP >();
+}
+// Nested-View test; the helper is named with its fully qualified ::Test:: prefix.
+TEST_F( openmp , view_nested_view )
+{
+  ::Test::view_nested_view< Kokkos::OpenMP >();
+}
+
+TEST_F( openmp , atomics )
+{
+  const int loop_count = 1e4 ; // 10000; the double literal is converted to int
+  // Each type runs with last argument 1, 2 and 3 (presumably a test selector; confirm in TestAtomic.hpp).
+  ASSERT_TRUE( ( TestAtomic::Loop<int,Kokkos::OpenMP>(loop_count,1) ) );
+  ASSERT_TRUE( ( TestAtomic::Loop<int,Kokkos::OpenMP>(loop_count,2) ) );
+  ASSERT_TRUE( ( TestAtomic::Loop<int,Kokkos::OpenMP>(loop_count,3) ) );
+
+  ASSERT_TRUE( ( TestAtomic::Loop<unsigned int,Kokkos::OpenMP>(loop_count,1) ) );
+  ASSERT_TRUE( ( TestAtomic::Loop<unsigned int,Kokkos::OpenMP>(loop_count,2) ) );
+  ASSERT_TRUE( ( TestAtomic::Loop<unsigned int,Kokkos::OpenMP>(loop_count,3) ) );
+
+  ASSERT_TRUE( ( TestAtomic::Loop<long int,Kokkos::OpenMP>(loop_count,1) ) );
+  ASSERT_TRUE( ( TestAtomic::Loop<long int,Kokkos::OpenMP>(loop_count,2) ) );
+  ASSERT_TRUE( ( TestAtomic::Loop<long int,Kokkos::OpenMP>(loop_count,3) ) );
+
+  ASSERT_TRUE( ( TestAtomic::Loop<unsigned long int,Kokkos::OpenMP>(loop_count,1) ) );
+  ASSERT_TRUE( ( TestAtomic::Loop<unsigned long int,Kokkos::OpenMP>(loop_count,2) ) );
+  ASSERT_TRUE( ( TestAtomic::Loop<unsigned long int,Kokkos::OpenMP>(loop_count,3) ) );
+
+  ASSERT_TRUE( ( TestAtomic::Loop<long long int,Kokkos::OpenMP>(loop_count,1) ) );
+  ASSERT_TRUE( ( TestAtomic::Loop<long long int,Kokkos::OpenMP>(loop_count,2) ) );
+  ASSERT_TRUE( ( TestAtomic::Loop<long long int,Kokkos::OpenMP>(loop_count,3) ) );
+
+  ASSERT_TRUE( ( TestAtomic::Loop<double,Kokkos::OpenMP>(loop_count,1) ) );
+  ASSERT_TRUE( ( TestAtomic::Loop<double,Kokkos::OpenMP>(loop_count,2) ) );
+  ASSERT_TRUE( ( TestAtomic::Loop<double,Kokkos::OpenMP>(loop_count,3) ) );
+  // From here on a fixed count of 100 is passed instead of loop_count.
+  ASSERT_TRUE( ( TestAtomic::Loop<float,Kokkos::OpenMP>(100,1) ) );
+  ASSERT_TRUE( ( TestAtomic::Loop<float,Kokkos::OpenMP>(100,2) ) );
+  ASSERT_TRUE( ( TestAtomic::Loop<float,Kokkos::OpenMP>(100,3) ) );
+
+  ASSERT_TRUE( ( TestAtomic::Loop<Kokkos::complex<double> ,Kokkos::OpenMP>(100,1) ) );
+  ASSERT_TRUE( ( TestAtomic::Loop<Kokkos::complex<double> ,Kokkos::OpenMP>(100,2) ) );
+  ASSERT_TRUE( ( TestAtomic::Loop<Kokkos::complex<double> ,Kokkos::OpenMP>(100,3) ) );
+
+  ASSERT_TRUE( ( TestAtomic::Loop<TestAtomic::SuperScalar<4> ,Kokkos::OpenMP>(100,1) ) );
+  ASSERT_TRUE( ( TestAtomic::Loop<TestAtomic::SuperScalar<4> ,Kokkos::OpenMP>(100,2) ) );
+  ASSERT_TRUE( ( TestAtomic::Loop<TestAtomic::SuperScalar<4> ,Kokkos::OpenMP>(100,3) ) );
+}
+
+TEST_F( openmp , atomic_operations )
+{
+  const int start = 1; //Avoid zero for division
+  const int end = 11;
+  for (int i = start; i < end; ++i) // second argument end-i runs from 10 down to 1
+  {
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<int,Kokkos::OpenMP>(start, end-i, 1 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<int,Kokkos::OpenMP>(start, end-i, 2 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<int,Kokkos::OpenMP>(start, end-i, 3 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<int,Kokkos::OpenMP>(start, end-i, 4 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<int,Kokkos::OpenMP>(start, end-i, 5 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<int,Kokkos::OpenMP>(start, end-i, 6 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<int,Kokkos::OpenMP>(start, end-i, 7 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<int,Kokkos::OpenMP>(start, end-i, 8 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<int,Kokkos::OpenMP>(start, end-i, 9 ) ) );
+    // The same nine calls (last argument 1..9) follow for each further integral type.
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<unsigned int,Kokkos::OpenMP>(start, end-i, 1 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<unsigned int,Kokkos::OpenMP>(start, end-i, 2 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<unsigned int,Kokkos::OpenMP>(start, end-i, 3 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<unsigned int,Kokkos::OpenMP>(start, end-i, 4 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<unsigned int,Kokkos::OpenMP>(start, end-i, 5 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<unsigned int,Kokkos::OpenMP>(start, end-i, 6 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<unsigned int,Kokkos::OpenMP>(start, end-i, 7 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<unsigned int,Kokkos::OpenMP>(start, end-i, 8 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<unsigned int,Kokkos::OpenMP>(start, end-i, 9 ) ) );
+
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<long int,Kokkos::OpenMP>(start, end-i, 1 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<long int,Kokkos::OpenMP>(start, end-i, 2 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<long int,Kokkos::OpenMP>(start, end-i, 3 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<long int,Kokkos::OpenMP>(start, end-i, 4 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<long int,Kokkos::OpenMP>(start, end-i, 5 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<long int,Kokkos::OpenMP>(start, end-i, 6 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<long int,Kokkos::OpenMP>(start, end-i, 7 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<long int,Kokkos::OpenMP>(start, end-i, 8 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<long int,Kokkos::OpenMP>(start, end-i, 9 ) ) );
+
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<unsigned long int,Kokkos::OpenMP>(start, end-i, 1 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<unsigned long int,Kokkos::OpenMP>(start, end-i, 2 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<unsigned long int,Kokkos::OpenMP>(start, end-i, 3 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<unsigned long int,Kokkos::OpenMP>(start, end-i, 4 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<unsigned long int,Kokkos::OpenMP>(start, end-i, 5 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<unsigned long int,Kokkos::OpenMP>(start, end-i, 6 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<unsigned long int,Kokkos::OpenMP>(start, end-i, 7 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<unsigned long int,Kokkos::OpenMP>(start, end-i, 8 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<unsigned long int,Kokkos::OpenMP>(start, end-i, 9 ) ) );
+
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<long long int,Kokkos::OpenMP>(start, end-i, 1 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<long long int,Kokkos::OpenMP>(start, end-i, 2 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<long long int,Kokkos::OpenMP>(start, end-i, 3 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<long long int,Kokkos::OpenMP>(start, end-i, 4 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<long long int,Kokkos::OpenMP>(start, end-i, 5 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<long long int,Kokkos::OpenMP>(start, end-i, 6 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<long long int,Kokkos::OpenMP>(start, end-i, 7 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<long long int,Kokkos::OpenMP>(start, end-i, 8 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<long long int,Kokkos::OpenMP>(start, end-i, 9 ) ) );
+    // Non-integral types go through a separate helper and only use last arguments 1..4.
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestNonIntegralType<double,Kokkos::OpenMP>(start, end-i, 1 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestNonIntegralType<double,Kokkos::OpenMP>(start, end-i, 2 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestNonIntegralType<double,Kokkos::OpenMP>(start, end-i, 3 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestNonIntegralType<double,Kokkos::OpenMP>(start, end-i, 4 ) ) );
+
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestNonIntegralType<float,Kokkos::OpenMP>(start, end-i, 1 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestNonIntegralType<float,Kokkos::OpenMP>(start, end-i, 2 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestNonIntegralType<float,Kokkos::OpenMP>(start, end-i, 3 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestNonIntegralType<float,Kokkos::OpenMP>(start, end-i, 4 ) ) );
+  }
+
+}
+
+} // namespace Test
+
diff --git a/lib/kokkos/core/unit_test/TestOpenMP_a.cpp b/lib/kokkos/core/unit_test/TestOpenMP_a.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..64eac66804b1ef6a053930d6db47abb566ccda66
--- /dev/null
+++ b/lib/kokkos/core/unit_test/TestOpenMP_a.cpp
@@ -0,0 +1,150 @@
+/*
+//@HEADER
+// ************************************************************************
+// 
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+// 
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+// 
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact  H. Carter Edwards (hcedwar@sandia.gov)
+// 
+// ************************************************************************
+//@HEADER
+*/
+
+#include <gtest/gtest.h>
+
+#include <Kokkos_Macros.hpp>
+#ifdef KOKKOS_LAMBDA
+#undef KOKKOS_LAMBDA
+#endif
+#define KOKKOS_LAMBDA [=]
+
+#include <Kokkos_Core.hpp>
+
+//----------------------------------------------------------------------------
+
+#include <TestViewImpl.hpp>
+#include <TestAtomic.hpp>
+
+#include <TestViewAPI.hpp>
+#include <TestViewSubview.hpp>
+#include <TestViewOfClass.hpp>
+
+#include <TestSharedAlloc.hpp>
+#include <TestViewMapping.hpp>
+
+#include <TestRange.hpp>
+#include <TestTeam.hpp>
+#include <TestReduce.hpp>
+#include <TestScan.hpp>
+#include <TestAggregate.hpp>
+#include <TestAggregateReduction.hpp>
+#include <TestCompilerMacros.hpp>
+#include <TestMemoryPool.hpp>
+
+
+#include <TestCXX11.hpp>
+#include <TestCXX11Deduction.hpp>
+#include <TestTeamVector.hpp>
+#include <TestMemorySpaceTracking.hpp>
+#include <TestTemplateMetaFunctions.hpp>
+
+#include <TestPolicyConstruction.hpp>
+
+
+namespace Test {
+
+class openmp : public ::testing::Test { // gtest fixture shared by all TEST_F cases in this file
+protected:
+  static void SetUpTestCase();    // declared only; no definition appears in this file
+  static void TearDownTestCase(); // declared only; no definition appears in this file
+};
+// Each case below instantiates one TestViewSubview helper for Kokkos::OpenMP.
+TEST_F( openmp, view_subview_auto_1d_left ) { // test_auto_1d with LayoutLeft
+  TestViewSubview::test_auto_1d< Kokkos::LayoutLeft,Kokkos::OpenMP >();
+}
+
+TEST_F( openmp, view_subview_auto_1d_right ) { // test_auto_1d with LayoutRight
+  TestViewSubview::test_auto_1d< Kokkos::LayoutRight,Kokkos::OpenMP >();
+}
+
+TEST_F( openmp, view_subview_auto_1d_stride ) { // test_auto_1d with LayoutStride
+  TestViewSubview::test_auto_1d< Kokkos::LayoutStride,Kokkos::OpenMP >();
+}
+
+TEST_F( openmp, view_subview_assign_strided ) {
+  TestViewSubview::test_1d_strided_assignment< Kokkos::OpenMP >();
+}
+
+TEST_F( openmp, view_subview_left_0 ) {
+  TestViewSubview::test_left_0< Kokkos::OpenMP >();
+}
+
+TEST_F( openmp, view_subview_left_1 ) {
+  TestViewSubview::test_left_1< Kokkos::OpenMP >();
+}
+
+TEST_F( openmp, view_subview_left_2 ) {
+  TestViewSubview::test_left_2< Kokkos::OpenMP >();
+}
+
+TEST_F( openmp, view_subview_left_3 ) {
+  TestViewSubview::test_left_3< Kokkos::OpenMP >();
+}
+
+TEST_F( openmp, view_subview_right_0 ) {
+  TestViewSubview::test_right_0< Kokkos::OpenMP >();
+}
+
+TEST_F( openmp, view_subview_right_1 ) {
+  TestViewSubview::test_right_1< Kokkos::OpenMP >();
+}
+
+TEST_F( openmp, view_subview_right_3 ) { // NOTE(review): no test_right_2 case is registered in this file
+  TestViewSubview::test_right_3< Kokkos::OpenMP >();
+}
+
+TEST_F( openmp, view_subview_1d_assign ) {
+  TestViewSubview::test_1d_assign< Kokkos::OpenMP >();
+}
+
+TEST_F( openmp, view_subview_2d_from_3d ) {
+  TestViewSubview::test_2d_subview_3d< Kokkos::OpenMP >();
+}
+
+TEST_F( openmp, view_subview_2d_from_5d ) {
+  TestViewSubview::test_2d_subview_5d< Kokkos::OpenMP >();
+}
+
+} // namespace Test
+
diff --git a/lib/kokkos/core/unit_test/TestOpenMP_b.cpp b/lib/kokkos/core/unit_test/TestOpenMP_b.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..6cc2476014c8d8e07ef6bc6a60b38c3660d3d7c4
--- /dev/null
+++ b/lib/kokkos/core/unit_test/TestOpenMP_b.cpp
@@ -0,0 +1,185 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+//
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact  H. Carter Edwards (hcedwar@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#include <gtest/gtest.h>
+
+#include <Kokkos_Macros.hpp>
+#ifdef KOKKOS_LAMBDA
+#undef KOKKOS_LAMBDA
+#endif
+#define KOKKOS_LAMBDA [=]
+
+#include <Kokkos_Core.hpp>
+
+//----------------------------------------------------------------------------
+
+#include <TestViewImpl.hpp>
+#include <TestAtomic.hpp>
+
+#include <TestViewAPI.hpp>
+#include <TestViewSubview.hpp>
+#include <TestViewOfClass.hpp>
+
+#include <TestSharedAlloc.hpp>
+#include <TestViewMapping.hpp>
+
+#include <TestRange.hpp>
+#include <TestTeam.hpp>
+#include <TestReduce.hpp>
+#include <TestScan.hpp>
+#include <TestAggregate.hpp>
+#include <TestAggregateReduction.hpp>
+#include <TestCompilerMacros.hpp>
+#include <TestMemoryPool.hpp>
+
+
+#include <TestCXX11.hpp>
+#include <TestCXX11Deduction.hpp>
+#include <TestTeamVector.hpp>
+#include <TestMemorySpaceTracking.hpp>
+#include <TestTemplateMetaFunctions.hpp>
+
+#include <TestPolicyConstruction.hpp>
+
+
+namespace Test {
+
+class openmp : public ::testing::Test { // gtest fixture shared by all TEST_F cases in this file
+protected:
+  static void SetUpTestCase();    // declared only; no definition appears in this file
+  static void TearDownTestCase(); // declared only; no definition appears in this file
+};
+
+TEST_F( openmp , range_tag ) // TestRange: Static schedule at length 1000, Dynamic at 1001
+{
+  TestRange< Kokkos::OpenMP , Kokkos::Schedule<Kokkos::Static> >::test_for(1000);
+  TestRange< Kokkos::OpenMP , Kokkos::Schedule<Kokkos::Static> >::test_reduce(1000);
+  TestRange< Kokkos::OpenMP , Kokkos::Schedule<Kokkos::Static> >::test_scan(1000);
+  TestRange< Kokkos::OpenMP , Kokkos::Schedule<Kokkos::Dynamic> >::test_for(1001);
+  TestRange< Kokkos::OpenMP , Kokkos::Schedule<Kokkos::Dynamic> >::test_reduce(1001);
+  TestRange< Kokkos::OpenMP , Kokkos::Schedule<Kokkos::Dynamic> >::test_scan(1001);
+  TestRange< Kokkos::OpenMP , Kokkos::Schedule<Kokkos::Dynamic> >::test_dynamic_policy(1000);
+}
+
+TEST_F( openmp , team_tag ) // TestTeamPolicy for/reduce: both schedules, arguments 2 and 1000
+{
+  TestTeamPolicy< Kokkos::OpenMP , Kokkos::Schedule<Kokkos::Static> >::test_for(2);
+  TestTeamPolicy< Kokkos::OpenMP , Kokkos::Schedule<Kokkos::Static> >::test_reduce(2);
+  TestTeamPolicy< Kokkos::OpenMP , Kokkos::Schedule<Kokkos::Dynamic> >::test_for(2);
+  TestTeamPolicy< Kokkos::OpenMP , Kokkos::Schedule<Kokkos::Dynamic> >::test_reduce(2);
+  TestTeamPolicy< Kokkos::OpenMP , Kokkos::Schedule<Kokkos::Static> >::test_for(1000);
+  TestTeamPolicy< Kokkos::OpenMP , Kokkos::Schedule<Kokkos::Static> >::test_reduce(1000);
+  TestTeamPolicy< Kokkos::OpenMP , Kokkos::Schedule<Kokkos::Dynamic> >::test_for(1000);
+  TestTeamPolicy< Kokkos::OpenMP , Kokkos::Schedule<Kokkos::Dynamic> >::test_reduce(1000);
+}
+// The five reduce cases below all pass 1000000 to the helper's constructor.
+TEST_F( openmp, long_reduce) {
+  TestReduce< long ,   Kokkos::OpenMP >( 1000000 );
+}
+
+TEST_F( openmp, double_reduce) {
+  TestReduce< double ,   Kokkos::OpenMP >( 1000000 );
+}
+
+TEST_F( openmp, long_reduce_dynamic ) {
+  TestReduceDynamic< long ,   Kokkos::OpenMP >( 1000000 );
+}
+
+TEST_F( openmp, double_reduce_dynamic ) {
+  TestReduceDynamic< double ,   Kokkos::OpenMP >( 1000000 );
+}
+
+TEST_F( openmp, long_reduce_dynamic_view ) {
+  TestReduceDynamicView< long ,   Kokkos::OpenMP >( 1000000 );
+}
+
+TEST_F( openmp , reducers ) // TestReducers for int, size_t, double and Kokkos::complex<double>
+{
+  TestReducers<int, Kokkos::OpenMP>::execute_integer();
+  TestReducers<size_t, Kokkos::OpenMP>::execute_integer();
+  TestReducers<double, Kokkos::OpenMP>::execute_float();
+  TestReducers<Kokkos::complex<double>, Kokkos::OpenMP>::execute_basic();
+}
+
+TEST_F( openmp, team_long_reduce) { // both schedules, arguments 3 and 100000
+  TestReduceTeam< long ,   Kokkos::OpenMP , Kokkos::Schedule<Kokkos::Static> >( 3 );
+  TestReduceTeam< long ,   Kokkos::OpenMP , Kokkos::Schedule<Kokkos::Dynamic> >( 3 );
+  TestReduceTeam< long ,   Kokkos::OpenMP , Kokkos::Schedule<Kokkos::Static> >( 100000 );
+  TestReduceTeam< long ,   Kokkos::OpenMP , Kokkos::Schedule<Kokkos::Dynamic> >( 100000 );
+}
+
+TEST_F( openmp, team_double_reduce) { // same four combinations as team_long_reduce, with double
+  TestReduceTeam< double ,   Kokkos::OpenMP , Kokkos::Schedule<Kokkos::Static> >( 3 );
+  TestReduceTeam< double ,   Kokkos::OpenMP , Kokkos::Schedule<Kokkos::Dynamic> >( 3 );
+  TestReduceTeam< double ,   Kokkos::OpenMP , Kokkos::Schedule<Kokkos::Static> >( 100000 );
+  TestReduceTeam< double ,   Kokkos::OpenMP , Kokkos::Schedule<Kokkos::Dynamic> >( 100000 );
+}
+
+TEST_F( openmp, team_shared_request) {
+  TestSharedTeam< Kokkos::OpenMP , Kokkos::Schedule<Kokkos::Static> >();
+  TestSharedTeam< Kokkos::OpenMP , Kokkos::Schedule<Kokkos::Dynamic> >();
+}
+
+TEST_F( openmp, team_scratch_request) {
+  TestScratchTeam< Kokkos::OpenMP , Kokkos::Schedule<Kokkos::Static> >();
+  TestScratchTeam< Kokkos::OpenMP , Kokkos::Schedule<Kokkos::Dynamic> >();
+}
+// The lambda-based case is compiled only when KOKKOS_HAVE_CXX11_DISPATCH_LAMBDA is defined.
+#if defined(KOKKOS_HAVE_CXX11_DISPATCH_LAMBDA)
+TEST_F( openmp, team_lambda_shared_request) {
+  TestLambdaSharedTeam< Kokkos::HostSpace, Kokkos::OpenMP , Kokkos::Schedule<Kokkos::Static> >();
+  TestLambdaSharedTeam< Kokkos::HostSpace, Kokkos::OpenMP , Kokkos::Schedule<Kokkos::Dynamic> >();
+}
+#endif
+
+TEST_F( openmp, shmem_size) {
+  TestShmemSize< Kokkos::OpenMP >();
+}
+
+TEST_F( openmp, multi_level_scratch) {
+  TestMultiLevelScratchTeam< Kokkos::OpenMP , Kokkos::Schedule<Kokkos::Static> >();
+  TestMultiLevelScratchTeam< Kokkos::OpenMP , Kokkos::Schedule<Kokkos::Dynamic> >();
+}
+
+} // namespace Test
+
diff --git a/lib/kokkos/core/unit_test/TestOpenMP_c.cpp b/lib/kokkos/core/unit_test/TestOpenMP_c.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..f0cdabe913b8a4125fc5a1541823328d749759bf
--- /dev/null
+++ b/lib/kokkos/core/unit_test/TestOpenMP_c.cpp
@@ -0,0 +1,262 @@
+/*
+//@HEADER
+// ************************************************************************
+// 
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+// 
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+// 
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact  H. Carter Edwards (hcedwar@sandia.gov)
+// 
+// ************************************************************************
+//@HEADER
+*/
+
+#include <gtest/gtest.h>
+
+#include <Kokkos_Macros.hpp>
+#ifdef KOKKOS_LAMBDA
+#undef KOKKOS_LAMBDA
+#endif
+#define KOKKOS_LAMBDA [=]
+
+#include <Kokkos_Core.hpp>
+
+//----------------------------------------------------------------------------
+
+#include <TestViewImpl.hpp>
+#include <TestAtomic.hpp>
+
+#include <TestViewAPI.hpp>
+#include <TestViewSubview.hpp>
+#include <TestViewOfClass.hpp>
+
+#include <TestSharedAlloc.hpp>
+#include <TestViewMapping.hpp>
+
+#include <TestRange.hpp>
+#include <TestTeam.hpp>
+#include <TestReduce.hpp>
+#include <TestScan.hpp>
+#include <TestAggregate.hpp>
+#include <TestAggregateReduction.hpp>
+#include <TestCompilerMacros.hpp>
+#include <TestMemoryPool.hpp>
+#include <TestTaskPolicy.hpp>
+
+
+#include <TestCXX11.hpp>
+#include <TestCXX11Deduction.hpp>
+#include <TestTeamVector.hpp>
+#include <TestMemorySpaceTracking.hpp>
+#include <TestTemplateMetaFunctions.hpp>
+
+#include <TestPolicyConstruction.hpp>
+
+
+namespace Test {
+
+class openmp : public ::testing::Test { // gtest fixture shared by all TEST_F cases in this file
+protected:
+  static void SetUpTestCase();    // declared only; no definition appears in this file
+  static void TearDownTestCase(); // declared only; no definition appears in this file
+};
+
+TEST_F( openmp , view_remap ) // deep_copy from an int/LayoutLeft view into a double/LayoutRight view
+{
+  enum { N0 = 3 , N1 = 2 , N2 = 8 , N3 = 9 }; // extents of the four view dimensions
+
+  typedef Kokkos::View< double*[N1][N2][N3] , // destination of the deep_copy
+                             Kokkos::LayoutRight ,
+                             Kokkos::OpenMP > output_type ;
+
+  typedef Kokkos::View< int**[N2][N3] , // source of the deep_copy; first two extents are runtime
+                             Kokkos::LayoutLeft ,
+                             Kokkos::OpenMP > input_type ;
+
+  typedef Kokkos::View< int*[N0][N2][N3] , // second extent is N0 rather than N1
+                             Kokkos::LayoutLeft ,
+                             Kokkos::OpenMP > diff_type ;
+
+  output_type output( "output" , N0 );
+  input_type  input ( "input" , N0 , N1 );
+  diff_type   diff  ( "diff" , N0 ); // only referenced by the commented-out deep_copy below
+
+  int value = 0 ; // fill input with 1,2,3,... ; i0 is the innermost (fastest) loop index
+  for ( size_t i3 = 0 ; i3 < N3 ; ++i3 ) {
+  for ( size_t i2 = 0 ; i2 < N2 ; ++i2 ) {
+  for ( size_t i1 = 0 ; i1 < N1 ; ++i1 ) {
+  for ( size_t i0 = 0 ; i0 < N0 ; ++i0 ) {
+    input(i0,i1,i2,i3) = ++value ;
+  }}}}
+
+  // Kokkos::deep_copy( diff , input ); // throw with incompatible shape
+  Kokkos::deep_copy( output , input );
+
+  value = 0 ; // walk the indices in the same order; output must hold the same sequence
+  for ( size_t i3 = 0 ; i3 < N3 ; ++i3 ) {
+  for ( size_t i2 = 0 ; i2 < N2 ; ++i2 ) {
+  for ( size_t i1 = 0 ; i1 < N1 ; ++i1 ) {
+  for ( size_t i0 = 0 ; i0 < N0 ; ++i0 ) {
+    ++value ;
+    ASSERT_EQ( value , ((int) output(i0,i1,i2,i3) ) ); // double element cast back to int for the comparison
+  }}}}
+}
+
+//----------------------------------------------------------------------------
+
+
+TEST_F( openmp , view_aggregate )
+{
+  TestViewAggregate< Kokkos::OpenMP >();
+  TestViewAggregateReduction< Kokkos::OpenMP >();
+}
+
+//----------------------------------------------------------------------------
+
+TEST_F( openmp , scan ) // TestScan: test_range(1,1000), then arguments 1000000 and 10000000
+{
+  TestScan< Kokkos::OpenMP >::test_range( 1 , 1000 );
+  TestScan< Kokkos::OpenMP >( 1000000 );
+  TestScan< Kokkos::OpenMP >( 10000000 );
+  Kokkos::OpenMP::fence();
+}
+
+
+TEST_F( openmp , team_scan ) // TestScanTeam: both schedules, arguments 10 and 10000
+{
+  TestScanTeam< Kokkos::OpenMP , Kokkos::Schedule<Kokkos::Static> >( 10 );
+  TestScanTeam< Kokkos::OpenMP , Kokkos::Schedule<Kokkos::Dynamic> >( 10 );
+  TestScanTeam< Kokkos::OpenMP , Kokkos::Schedule<Kokkos::Static> >( 10000 );
+  TestScanTeam< Kokkos::OpenMP , Kokkos::Schedule<Kokkos::Dynamic> >( 10000 );
+}
+
+//----------------------------------------------------------------------------
+
+TEST_F( openmp , compiler_macros )
+{
+  ASSERT_TRUE( ( TestCompilerMacros::Test< Kokkos::OpenMP >() ) );
+}
+
+//----------------------------------------------------------------------------
+
+TEST_F( openmp , memory_space )
+{
+  TestMemorySpace< Kokkos::OpenMP >();
+}
+
+TEST_F( openmp , memory_pool )
+{
+  bool val = TestMemoryPool::test_mempool< Kokkos::OpenMP >( 128, 128000000 );
+  ASSERT_TRUE( val ); // only test_mempool's bool result is asserted here
+
+  TestMemoryPool::test_mempool2< Kokkos::OpenMP >( 64, 4, 1000000, 2000000 );
+
+  TestMemoryPool::test_memory_exhaustion< Kokkos::OpenMP >();
+}
+
+//----------------------------------------------------------------------------
+
+TEST_F( openmp , template_meta_functions )
+{
+  TestTemplateMetaFunctions<int, Kokkos::OpenMP >();
+}
+
+//----------------------------------------------------------------------------
+// The cxx11 case is compiled only when KOKKOS_HAVE_DEFAULT_DEVICE_TYPE_OPENMP is defined.
+#if defined( KOKKOS_HAVE_DEFAULT_DEVICE_TYPE_OPENMP )
+TEST_F( openmp , cxx11 )
+{
+  if ( Kokkos::Impl::is_same< Kokkos::DefaultExecutionSpace , Kokkos::OpenMP >::value ) { // additionally skipped unless OpenMP is the default execution space
+    ASSERT_TRUE( ( TestCXX11::Test< Kokkos::OpenMP >(1) ) );
+    ASSERT_TRUE( ( TestCXX11::Test< Kokkos::OpenMP >(2) ) );
+    ASSERT_TRUE( ( TestCXX11::Test< Kokkos::OpenMP >(3) ) );
+    ASSERT_TRUE( ( TestCXX11::Test< Kokkos::OpenMP >(4) ) );
+  }
+}
+#endif
+
+TEST_F( openmp , reduction_deduction )
+{
+  TestCXX11::test_reduction_deduction< Kokkos::OpenMP >();
+}
+
+TEST_F( openmp , team_vector ) // TestTeamVector::Test for selector values 0 through 10
+{
+  ASSERT_TRUE( ( TestTeamVector::Test< Kokkos::OpenMP >(0) ) );
+  ASSERT_TRUE( ( TestTeamVector::Test< Kokkos::OpenMP >(1) ) );
+  ASSERT_TRUE( ( TestTeamVector::Test< Kokkos::OpenMP >(2) ) );
+  ASSERT_TRUE( ( TestTeamVector::Test< Kokkos::OpenMP >(3) ) );
+  ASSERT_TRUE( ( TestTeamVector::Test< Kokkos::OpenMP >(4) ) );
+  ASSERT_TRUE( ( TestTeamVector::Test< Kokkos::OpenMP >(5) ) );
+  ASSERT_TRUE( ( TestTeamVector::Test< Kokkos::OpenMP >(6) ) );
+  ASSERT_TRUE( ( TestTeamVector::Test< Kokkos::OpenMP >(7) ) );
+  ASSERT_TRUE( ( TestTeamVector::Test< Kokkos::OpenMP >(8) ) );
+  ASSERT_TRUE( ( TestTeamVector::Test< Kokkos::OpenMP >(9) ) );
+  ASSERT_TRUE( ( TestTeamVector::Test< Kokkos::OpenMP >(10) ) );
+}
+
+//----------------------------------------------------------------------------
+// The task cases are compiled only when KOKKOS_ENABLE_TASKPOLICY is defined.
+#if defined( KOKKOS_ENABLE_TASKPOLICY )
+
+TEST_F( openmp , task_fib )
+{
+  for ( int i = 0 ; i < 25 ; ++i ) { // second argument grows with i: (i+1)*1000000
+    TestTaskPolicy::TestFib< Kokkos::OpenMP >::run(i, (i+1)*1000000 );
+  }
+}
+
+TEST_F( openmp , task_depend )
+{
+  for ( int i = 0 ; i < 25 ; ++i ) {
+    TestTaskPolicy::TestTaskDependence< Kokkos::OpenMP >::run(i);
+  }
+}
+
+TEST_F( openmp , task_team )
+{
+  TestTaskPolicy::TestTaskTeam< Kokkos::OpenMP >::run(1000);
+  //TestTaskPolicy::TestTaskTeamValue< Kokkos::OpenMP >::run(1000); //TODO put back after testing
+}
+
+
+#endif /* #if defined( KOKKOS_ENABLE_TASKPOLICY ) */
+
+
+} // namespace Test
+
+
+
+
+
+
diff --git a/lib/kokkos/core/unit_test/TestPolicyConstruction.hpp b/lib/kokkos/core/unit_test/TestPolicyConstruction.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..049138eb07cd402140f1d509a3590eb8e3eb6104
--- /dev/null
+++ b/lib/kokkos/core/unit_test/TestPolicyConstruction.hpp
@@ -0,0 +1,493 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+//
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact  H. Carter Edwards (hcedwar@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#include <gtest/gtest.h>
+
+#include <Kokkos_Core.hpp>
+#include <stdexcept>
+#include <sstream>
+#include <iostream>
+
+struct SomeTag{}; // empty type passed as a RangePolicy template argument and expected back as work_tag
+
+template< class ExecutionSpace > // ExecutionSpace: the space passed explicitly to RangePolicy in the cases that name one
+class TestRangePolicyConstruction { // checks the member typedefs RangePolicy derives from its template arguments
+public:
+  TestRangePolicyConstruction() { // constructing the object runs every check
+    test_compile_time_parameters();
+  }
+private:
+  void test_compile_time_parameters() { // one scope per argument combination; each asserts the four member typedefs
+    { // no arguments: everything defaulted
+      typedef Kokkos::RangePolicy<> policy_t;
+      typedef typename policy_t::execution_space execution_space;
+      typedef typename policy_t::index_type      index_type;
+      typedef typename policy_t::schedule_type   schedule_type;
+      typedef typename policy_t::work_tag        work_tag;
+
+      ASSERT_TRUE((std::is_same<execution_space ,Kokkos::DefaultExecutionSpace       >::value));
+      ASSERT_TRUE((std::is_same<index_type      ,typename execution_space::size_type >::value));
+      ASSERT_TRUE((std::is_same<schedule_type   ,Kokkos::Schedule<Kokkos::Static>    >::value));
+      ASSERT_TRUE((std::is_same<work_tag        ,void                                >::value));
+    }
+    { // execution space only
+      typedef Kokkos::RangePolicy<ExecutionSpace> policy_t;
+      typedef typename policy_t::execution_space execution_space;
+      typedef typename policy_t::index_type      index_type;
+      typedef typename policy_t::schedule_type   schedule_type;
+      typedef typename policy_t::work_tag        work_tag;
+
+      ASSERT_TRUE((std::is_same<execution_space ,ExecutionSpace                      >::value));
+      ASSERT_TRUE((std::is_same<index_type      ,typename execution_space::size_type >::value));
+      ASSERT_TRUE((std::is_same<schedule_type   ,Kokkos::Schedule<Kokkos::Static>    >::value));
+      ASSERT_TRUE((std::is_same<work_tag        ,void                                >::value));
+    }
+    { // space, schedule
+      typedef Kokkos::RangePolicy<ExecutionSpace,Kokkos::Schedule<Kokkos::Dynamic> > policy_t;
+      typedef typename policy_t::execution_space execution_space;
+      typedef typename policy_t::index_type      index_type;
+      typedef typename policy_t::schedule_type   schedule_type;
+      typedef typename policy_t::work_tag        work_tag;
+
+      ASSERT_TRUE((std::is_same<execution_space ,ExecutionSpace                      >::value));
+      ASSERT_TRUE((std::is_same<index_type      ,typename execution_space::size_type >::value));
+      ASSERT_TRUE((std::is_same<schedule_type   ,Kokkos::Schedule<Kokkos::Dynamic>   >::value));
+      ASSERT_TRUE((std::is_same<work_tag        ,void                                >::value));
+    }
+    { // space, schedule, index type
+      typedef Kokkos::RangePolicy<ExecutionSpace,Kokkos::Schedule<Kokkos::Dynamic>,Kokkos::IndexType<long> > policy_t;
+      typedef typename policy_t::execution_space execution_space;
+      typedef typename policy_t::index_type      index_type;
+      typedef typename policy_t::schedule_type   schedule_type;
+      typedef typename policy_t::work_tag        work_tag;
+
+      ASSERT_TRUE((std::is_same<execution_space ,ExecutionSpace                      >::value));
+      ASSERT_TRUE((std::is_same<index_type      ,long                                >::value));
+      ASSERT_TRUE((std::is_same<schedule_type   ,Kokkos::Schedule<Kokkos::Dynamic>   >::value));
+      ASSERT_TRUE((std::is_same<work_tag        ,void                                >::value));
+    }
+    { // index type, space, schedule (index type listed first)
+      typedef Kokkos::RangePolicy<Kokkos::IndexType<long>, ExecutionSpace,Kokkos::Schedule<Kokkos::Dynamic> > policy_t;
+      typedef typename policy_t::execution_space execution_space;
+      typedef typename policy_t::index_type      index_type;
+      typedef typename policy_t::schedule_type   schedule_type;
+      typedef typename policy_t::work_tag        work_tag;
+
+      ASSERT_TRUE((std::is_same<execution_space ,ExecutionSpace                      >::value));
+      ASSERT_TRUE((std::is_same<index_type      ,long                                >::value));
+      ASSERT_TRUE((std::is_same<schedule_type   ,Kokkos::Schedule<Kokkos::Dynamic>   >::value));
+      ASSERT_TRUE((std::is_same<work_tag        ,void                                >::value));
+    }
+    { // space, schedule, index type, tag
+      typedef Kokkos::RangePolicy<ExecutionSpace,Kokkos::Schedule<Kokkos::Dynamic>,Kokkos::IndexType<long>,SomeTag > policy_t;
+      typedef typename policy_t::execution_space execution_space;
+      typedef typename policy_t::index_type      index_type;
+      typedef typename policy_t::schedule_type   schedule_type;
+      typedef typename policy_t::work_tag        work_tag;
+
+      ASSERT_TRUE((std::is_same<execution_space ,ExecutionSpace                      >::value));
+      ASSERT_TRUE((std::is_same<index_type      ,long                                >::value));
+      ASSERT_TRUE((std::is_same<schedule_type   ,Kokkos::Schedule<Kokkos::Dynamic>   >::value));
+      ASSERT_TRUE((std::is_same<work_tag        ,SomeTag                             >::value));
+    }
+    { // schedule, space, index type, tag (schedule listed first)
+      typedef Kokkos::RangePolicy<Kokkos::Schedule<Kokkos::Dynamic>,ExecutionSpace,Kokkos::IndexType<long>,SomeTag > policy_t;
+      typedef typename policy_t::execution_space execution_space;
+      typedef typename policy_t::index_type      index_type;
+      typedef typename policy_t::schedule_type   schedule_type;
+      typedef typename policy_t::work_tag        work_tag;
+
+      ASSERT_TRUE((std::is_same<execution_space ,ExecutionSpace                      >::value));
+      ASSERT_TRUE((std::is_same<index_type      ,long                                >::value));
+      ASSERT_TRUE((std::is_same<schedule_type   ,Kokkos::Schedule<Kokkos::Dynamic>   >::value));
+      ASSERT_TRUE((std::is_same<work_tag        ,SomeTag                             >::value));
+    }
+    { // tag, schedule, index type, space (tag first, space last)
+      typedef Kokkos::RangePolicy<SomeTag,Kokkos::Schedule<Kokkos::Dynamic>,Kokkos::IndexType<long>,ExecutionSpace > policy_t;
+      typedef typename policy_t::execution_space execution_space;
+      typedef typename policy_t::index_type      index_type;
+      typedef typename policy_t::schedule_type   schedule_type;
+      typedef typename policy_t::work_tag        work_tag;
+
+      ASSERT_TRUE((std::is_same<execution_space ,ExecutionSpace                      >::value));
+      ASSERT_TRUE((std::is_same<index_type      ,long                                >::value));
+      ASSERT_TRUE((std::is_same<schedule_type   ,Kokkos::Schedule<Kokkos::Dynamic>   >::value));
+      ASSERT_TRUE((std::is_same<work_tag        ,SomeTag                             >::value));
+    }
+    { // schedule only; execution space is expected to be the default one
+      typedef Kokkos::RangePolicy<Kokkos::Schedule<Kokkos::Dynamic> > policy_t;
+      typedef typename policy_t::execution_space execution_space;
+      typedef typename policy_t::index_type      index_type;
+      typedef typename policy_t::schedule_type   schedule_type;
+      typedef typename policy_t::work_tag        work_tag;
+
+      ASSERT_TRUE((std::is_same<execution_space ,Kokkos::DefaultExecutionSpace       >::value));
+      ASSERT_TRUE((std::is_same<index_type      ,typename execution_space::size_type >::value));
+      ASSERT_TRUE((std::is_same<schedule_type   ,Kokkos::Schedule<Kokkos::Dynamic>   >::value));
+      ASSERT_TRUE((std::is_same<work_tag        ,void                                >::value));
+    }
+    { // schedule, index type
+      typedef Kokkos::RangePolicy<Kokkos::Schedule<Kokkos::Dynamic>,Kokkos::IndexType<long> > policy_t;
+      typedef typename policy_t::execution_space execution_space;
+      typedef typename policy_t::index_type      index_type;
+      typedef typename policy_t::schedule_type   schedule_type;
+      typedef typename policy_t::work_tag        work_tag;
+
+      ASSERT_TRUE((std::is_same<execution_space ,Kokkos::DefaultExecutionSpace       >::value));
+      ASSERT_TRUE((std::is_same<index_type      ,long                                >::value));
+      ASSERT_TRUE((std::is_same<schedule_type   ,Kokkos::Schedule<Kokkos::Dynamic>   >::value));
+      ASSERT_TRUE((std::is_same<work_tag        ,void                                >::value));
+    }
+    { // index type, schedule (reverse of the previous case)
+      typedef Kokkos::RangePolicy<Kokkos::IndexType<long>, Kokkos::Schedule<Kokkos::Dynamic> > policy_t;
+      typedef typename policy_t::execution_space execution_space;
+      typedef typename policy_t::index_type      index_type;
+      typedef typename policy_t::schedule_type   schedule_type;
+      typedef typename policy_t::work_tag        work_tag;
+
+      ASSERT_TRUE((std::is_same<execution_space ,Kokkos::DefaultExecutionSpace       >::value));
+      ASSERT_TRUE((std::is_same<index_type      ,long                                >::value));
+      ASSERT_TRUE((std::is_same<schedule_type   ,Kokkos::Schedule<Kokkos::Dynamic>   >::value));
+      ASSERT_TRUE((std::is_same<work_tag        ,void                                >::value));
+    }
+    { // schedule, index type, tag
+      typedef Kokkos::RangePolicy<Kokkos::Schedule<Kokkos::Dynamic>,Kokkos::IndexType<long>,SomeTag > policy_t;
+      typedef typename policy_t::execution_space execution_space;
+      typedef typename policy_t::index_type      index_type;
+      typedef typename policy_t::schedule_type   schedule_type;
+      typedef typename policy_t::work_tag        work_tag;
+
+      ASSERT_TRUE((std::is_same<execution_space ,Kokkos::DefaultExecutionSpace       >::value));
+      ASSERT_TRUE((std::is_same<index_type      ,long                                >::value));
+      ASSERT_TRUE((std::is_same<schedule_type   ,Kokkos::Schedule<Kokkos::Dynamic>   >::value));
+      ASSERT_TRUE((std::is_same<work_tag        ,SomeTag                             >::value));
+    }
+    { // index type, schedule, tag (index type listed first; distinct from the previous case)
+      typedef Kokkos::RangePolicy<Kokkos::IndexType<long>,Kokkos::Schedule<Kokkos::Dynamic>,SomeTag > policy_t;
+      typedef typename policy_t::execution_space execution_space;
+      typedef typename policy_t::index_type      index_type;
+      typedef typename policy_t::schedule_type   schedule_type;
+      typedef typename policy_t::work_tag        work_tag;
+
+      ASSERT_TRUE((std::is_same<execution_space ,Kokkos::DefaultExecutionSpace       >::value));
+      ASSERT_TRUE((std::is_same<index_type      ,long                                >::value));
+      ASSERT_TRUE((std::is_same<schedule_type   ,Kokkos::Schedule<Kokkos::Dynamic>   >::value));
+      ASSERT_TRUE((std::is_same<work_tag        ,SomeTag                             >::value));
+    }
+    { // tag, schedule, index type (tag listed first)
+      typedef Kokkos::RangePolicy<SomeTag,Kokkos::Schedule<Kokkos::Dynamic>,Kokkos::IndexType<long> > policy_t;
+      typedef typename policy_t::execution_space execution_space;
+      typedef typename policy_t::index_type      index_type;
+      typedef typename policy_t::schedule_type   schedule_type;
+      typedef typename policy_t::work_tag        work_tag;
+
+      ASSERT_TRUE((std::is_same<execution_space ,Kokkos::DefaultExecutionSpace       >::value));
+      ASSERT_TRUE((std::is_same<index_type      ,long                                >::value));
+      ASSERT_TRUE((std::is_same<schedule_type   ,Kokkos::Schedule<Kokkos::Dynamic>   >::value));
+      ASSERT_TRUE((std::is_same<work_tag        ,SomeTag                             >::value));
+    }
+  }
+};
+
+template< class ExecutionSpace >
+class TestTeamPolicyConstruction {
+public:
+  // Constructing an instance runs the compile-time checks, then the run-time checks.
+  TestTeamPolicyConstruction() {
+    this->test_compile_time_parameters(); this->test_run_time_parameters();
+  }
+private:
+  void test_compile_time_parameters() { // each case pins the member types TeamPolicy derives from its template arguments
+    { // no arguments: every property takes its default
+      typedef Kokkos::TeamPolicy<> policy_t;
+      typedef typename policy_t::execution_space execution_space;
+      typedef typename policy_t::index_type      index_type;
+      typedef typename policy_t::schedule_type   schedule_type;
+      typedef typename policy_t::work_tag        work_tag;
+
+      ASSERT_TRUE((std::is_same<execution_space ,Kokkos::DefaultExecutionSpace       >::value));
+      ASSERT_TRUE((std::is_same<index_type      ,typename execution_space::size_type >::value));
+      ASSERT_TRUE((std::is_same<schedule_type   ,Kokkos::Schedule<Kokkos::Static>    >::value));
+      ASSERT_TRUE((std::is_same<work_tag        ,void                                >::value));
+    }
+    { // execution space only
+      typedef Kokkos::TeamPolicy<ExecutionSpace> policy_t;
+      typedef typename policy_t::execution_space execution_space;
+      typedef typename policy_t::index_type      index_type;
+      typedef typename policy_t::schedule_type   schedule_type;
+      typedef typename policy_t::work_tag        work_tag;
+
+      ASSERT_TRUE((std::is_same<execution_space ,ExecutionSpace                      >::value));
+      ASSERT_TRUE((std::is_same<index_type      ,typename execution_space::size_type >::value));
+      ASSERT_TRUE((std::is_same<schedule_type   ,Kokkos::Schedule<Kokkos::Static>    >::value));
+      ASSERT_TRUE((std::is_same<work_tag        ,void                                >::value));
+    }
+    { // execution space, schedule
+      typedef Kokkos::TeamPolicy<ExecutionSpace,Kokkos::Schedule<Kokkos::Dynamic> > policy_t;
+      typedef typename policy_t::execution_space execution_space;
+      typedef typename policy_t::index_type      index_type;
+      typedef typename policy_t::schedule_type   schedule_type;
+      typedef typename policy_t::work_tag        work_tag;
+
+      ASSERT_TRUE((std::is_same<execution_space ,ExecutionSpace                      >::value));
+      ASSERT_TRUE((std::is_same<index_type      ,typename execution_space::size_type >::value));
+      ASSERT_TRUE((std::is_same<schedule_type   ,Kokkos::Schedule<Kokkos::Dynamic>   >::value));
+      ASSERT_TRUE((std::is_same<work_tag        ,void                                >::value));
+    }
+    { // execution space, schedule, index type
+      typedef Kokkos::TeamPolicy<ExecutionSpace,Kokkos::Schedule<Kokkos::Dynamic>,Kokkos::IndexType<long> > policy_t;
+      typedef typename policy_t::execution_space execution_space;
+      typedef typename policy_t::index_type      index_type;
+      typedef typename policy_t::schedule_type   schedule_type;
+      typedef typename policy_t::work_tag        work_tag;
+
+      ASSERT_TRUE((std::is_same<execution_space ,ExecutionSpace                      >::value));
+      ASSERT_TRUE((std::is_same<index_type      ,long                                >::value));
+      ASSERT_TRUE((std::is_same<schedule_type   ,Kokkos::Schedule<Kokkos::Dynamic>   >::value));
+      ASSERT_TRUE((std::is_same<work_tag        ,void                                >::value));
+    }
+    { // index type, execution space, schedule
+      typedef Kokkos::TeamPolicy<Kokkos::IndexType<long>, ExecutionSpace,Kokkos::Schedule<Kokkos::Dynamic> > policy_t;
+      typedef typename policy_t::execution_space execution_space;
+      typedef typename policy_t::index_type      index_type;
+      typedef typename policy_t::schedule_type   schedule_type;
+      typedef typename policy_t::work_tag        work_tag;
+
+      ASSERT_TRUE((std::is_same<execution_space ,ExecutionSpace                      >::value));
+      ASSERT_TRUE((std::is_same<index_type      ,long                                >::value));
+      ASSERT_TRUE((std::is_same<schedule_type   ,Kokkos::Schedule<Kokkos::Dynamic>   >::value));
+      ASSERT_TRUE((std::is_same<work_tag        ,void                                >::value));
+    }
+    { // execution space, schedule, index type, work tag
+      typedef Kokkos::TeamPolicy<ExecutionSpace,Kokkos::Schedule<Kokkos::Dynamic>,Kokkos::IndexType<long>,SomeTag > policy_t;
+      typedef typename policy_t::execution_space execution_space;
+      typedef typename policy_t::index_type      index_type;
+      typedef typename policy_t::schedule_type   schedule_type;
+      typedef typename policy_t::work_tag        work_tag;
+
+      ASSERT_TRUE((std::is_same<execution_space ,ExecutionSpace                      >::value));
+      ASSERT_TRUE((std::is_same<index_type      ,long                                >::value));
+      ASSERT_TRUE((std::is_same<schedule_type   ,Kokkos::Schedule<Kokkos::Dynamic>   >::value));
+      ASSERT_TRUE((std::is_same<work_tag        ,SomeTag                             >::value));
+    }
+    { // schedule, execution space, index type, work tag
+      typedef Kokkos::TeamPolicy<Kokkos::Schedule<Kokkos::Dynamic>,ExecutionSpace,Kokkos::IndexType<long>,SomeTag > policy_t;
+      typedef typename policy_t::execution_space execution_space;
+      typedef typename policy_t::index_type      index_type;
+      typedef typename policy_t::schedule_type   schedule_type;
+      typedef typename policy_t::work_tag        work_tag;
+
+      ASSERT_TRUE((std::is_same<execution_space ,ExecutionSpace                      >::value));
+      ASSERT_TRUE((std::is_same<index_type      ,long                                >::value));
+      ASSERT_TRUE((std::is_same<schedule_type   ,Kokkos::Schedule<Kokkos::Dynamic>   >::value));
+      ASSERT_TRUE((std::is_same<work_tag        ,SomeTag                             >::value));
+    }
+    { // work tag, schedule, index type, execution space
+      typedef Kokkos::TeamPolicy<SomeTag,Kokkos::Schedule<Kokkos::Dynamic>,Kokkos::IndexType<long>,ExecutionSpace > policy_t;
+      typedef typename policy_t::execution_space execution_space;
+      typedef typename policy_t::index_type      index_type;
+      typedef typename policy_t::schedule_type   schedule_type;
+      typedef typename policy_t::work_tag        work_tag;
+
+      ASSERT_TRUE((std::is_same<execution_space ,ExecutionSpace                      >::value));
+      ASSERT_TRUE((std::is_same<index_type      ,long                                >::value));
+      ASSERT_TRUE((std::is_same<schedule_type   ,Kokkos::Schedule<Kokkos::Dynamic>   >::value));
+      ASSERT_TRUE((std::is_same<work_tag        ,SomeTag                             >::value));
+    }
+    { // schedule only: the execution space falls back to the default
+      typedef Kokkos::TeamPolicy<Kokkos::Schedule<Kokkos::Dynamic> > policy_t;
+      typedef typename policy_t::execution_space execution_space;
+      typedef typename policy_t::index_type      index_type;
+      typedef typename policy_t::schedule_type   schedule_type;
+      typedef typename policy_t::work_tag        work_tag;
+
+      ASSERT_TRUE((std::is_same<execution_space ,Kokkos::DefaultExecutionSpace       >::value));
+      ASSERT_TRUE((std::is_same<index_type      ,typename execution_space::size_type >::value));
+      ASSERT_TRUE((std::is_same<schedule_type   ,Kokkos::Schedule<Kokkos::Dynamic>   >::value));
+      ASSERT_TRUE((std::is_same<work_tag        ,void                                >::value));
+    }
+    { // schedule, index type (default execution space)
+      typedef Kokkos::TeamPolicy<Kokkos::Schedule<Kokkos::Dynamic>,Kokkos::IndexType<long> > policy_t;
+      typedef typename policy_t::execution_space execution_space;
+      typedef typename policy_t::index_type      index_type;
+      typedef typename policy_t::schedule_type   schedule_type;
+      typedef typename policy_t::work_tag        work_tag;
+
+      ASSERT_TRUE((std::is_same<execution_space ,Kokkos::DefaultExecutionSpace       >::value));
+      ASSERT_TRUE((std::is_same<index_type      ,long                                >::value));
+      ASSERT_TRUE((std::is_same<schedule_type   ,Kokkos::Schedule<Kokkos::Dynamic>   >::value));
+      ASSERT_TRUE((std::is_same<work_tag        ,void                                >::value));
+    }
+    { // index type, schedule (default execution space)
+      typedef Kokkos::TeamPolicy<Kokkos::IndexType<long>, Kokkos::Schedule<Kokkos::Dynamic> > policy_t;
+      typedef typename policy_t::execution_space execution_space;
+      typedef typename policy_t::index_type      index_type;
+      typedef typename policy_t::schedule_type   schedule_type;
+      typedef typename policy_t::work_tag        work_tag;
+
+      ASSERT_TRUE((std::is_same<execution_space ,Kokkos::DefaultExecutionSpace       >::value));
+      ASSERT_TRUE((std::is_same<index_type      ,long                                >::value));
+      ASSERT_TRUE((std::is_same<schedule_type   ,Kokkos::Schedule<Kokkos::Dynamic>   >::value));
+      ASSERT_TRUE((std::is_same<work_tag        ,void                                >::value));
+    }
+    { // schedule, index type, work tag (default execution space)
+      typedef Kokkos::TeamPolicy<Kokkos::Schedule<Kokkos::Dynamic>,Kokkos::IndexType<long>,SomeTag > policy_t;
+      typedef typename policy_t::execution_space execution_space;
+      typedef typename policy_t::index_type      index_type;
+      typedef typename policy_t::schedule_type   schedule_type;
+      typedef typename policy_t::work_tag        work_tag;
+
+      ASSERT_TRUE((std::is_same<execution_space ,Kokkos::DefaultExecutionSpace       >::value));
+      ASSERT_TRUE((std::is_same<index_type      ,long                                >::value));
+      ASSERT_TRUE((std::is_same<schedule_type   ,Kokkos::Schedule<Kokkos::Dynamic>   >::value));
+      ASSERT_TRUE((std::is_same<work_tag        ,SomeTag                             >::value));
+    }
+    { // index type, schedule, work tag (default execution space); ordering not covered by any other case
+      typedef Kokkos::TeamPolicy<Kokkos::IndexType<long>,Kokkos::Schedule<Kokkos::Dynamic>,SomeTag > policy_t;
+      typedef typename policy_t::execution_space execution_space;
+      typedef typename policy_t::index_type      index_type;
+      typedef typename policy_t::schedule_type   schedule_type;
+      typedef typename policy_t::work_tag        work_tag;
+
+      ASSERT_TRUE((std::is_same<execution_space ,Kokkos::DefaultExecutionSpace       >::value));
+      ASSERT_TRUE((std::is_same<index_type      ,long                                >::value));
+      ASSERT_TRUE((std::is_same<schedule_type   ,Kokkos::Schedule<Kokkos::Dynamic>   >::value));
+      ASSERT_TRUE((std::is_same<work_tag        ,SomeTag                             >::value));
+    }
+    { // work tag, schedule, index type (default execution space)
+      typedef Kokkos::TeamPolicy<SomeTag,Kokkos::Schedule<Kokkos::Dynamic>,Kokkos::IndexType<long> > policy_t;
+      typedef typename policy_t::execution_space execution_space;
+      typedef typename policy_t::index_type      index_type;
+      typedef typename policy_t::schedule_type   schedule_type;
+      typedef typename policy_t::work_tag        work_tag;
+
+      ASSERT_TRUE((std::is_same<execution_space ,Kokkos::DefaultExecutionSpace       >::value));
+      ASSERT_TRUE((std::is_same<index_type      ,long                                >::value));
+      ASSERT_TRUE((std::is_same<schedule_type   ,Kokkos::Schedule<Kokkos::Dynamic>   >::value));
+      ASSERT_TRUE((std::is_same<work_tag        ,SomeTag                             >::value));
+    }
+  }
+
+
+  // Verifies the league/team/chunk/scratch accessors of policy_t and that every set_* call
+  // returns an updated policy while the policy it was invoked on keeps its previous values.
+  template<class policy_t> void test_run_time_parameters_type() {
+    const int leagues        = 131;
+    const int team           = policy_t::execution_space::concurrency()>4 ? 4 : policy_t::execution_space::concurrency();
+    const int chunk          = 4;
+    const int team_scratch   = 1024, thread_scratch = 16;
+    const int total_scratch  = team_scratch + thread_scratch*team;
+    policy_t plain(leagues,team);
+    ASSERT_EQ  (plain.league_size() , leagues);
+    ASSERT_EQ  (plain.team_size()   , team);
+    ASSERT_TRUE(plain.chunk_size()  > 0);
+    ASSERT_EQ  (plain.scratch_size(0), 0);
+    // Explicit chunk size: only the returned policy reports it.
+    policy_t chunked = plain.set_chunk_size(chunk);
+    ASSERT_EQ  (plain.league_size() , leagues);
+    ASSERT_EQ  (plain.team_size()   , team);
+    ASSERT_TRUE(plain.chunk_size()  > 0);
+    ASSERT_EQ  (plain.scratch_size(0), 0);
+
+    ASSERT_EQ  (chunked.league_size() , leagues);
+    ASSERT_EQ  (chunked.team_size()   , team);
+    ASSERT_EQ  (chunked.chunk_size()  , chunk);
+    ASSERT_EQ  (chunked.scratch_size(0), 0);
+    // Per-team scratch request only.
+    policy_t per_team = chunked.set_scratch_size(0,Kokkos::PerTeam(team_scratch));
+    ASSERT_EQ  (chunked.league_size() , leagues);
+    ASSERT_EQ  (chunked.team_size()   , team);
+    ASSERT_EQ  (chunked.chunk_size()  , chunk);
+    ASSERT_EQ  (chunked.scratch_size(0), 0);
+    ASSERT_EQ  (per_team.league_size() , leagues);
+    ASSERT_EQ  (per_team.team_size()   , team);
+    ASSERT_EQ  (per_team.chunk_size()  , chunk);
+    ASSERT_EQ  (per_team.scratch_size(0), team_scratch);
+    // Per-thread scratch request only: the reported size is the per-thread amount times the team size.
+    policy_t per_thread = chunked.set_scratch_size(0,Kokkos::PerThread(thread_scratch));
+    ASSERT_EQ  (chunked.league_size() , leagues);
+    ASSERT_EQ  (chunked.team_size()   , team);
+    ASSERT_EQ  (chunked.chunk_size()  , chunk);
+    ASSERT_EQ  (chunked.scratch_size(0), 0);
+    ASSERT_EQ  (per_thread.league_size() , leagues);
+    ASSERT_EQ  (per_thread.team_size()   , team);
+    ASSERT_EQ  (per_thread.chunk_size()  , chunk);
+    ASSERT_EQ  (per_thread.scratch_size(0), thread_scratch*team);
+    // Both requests in one call, per-thread argument first.
+    policy_t thread_then_team = chunked.set_scratch_size(0,Kokkos::PerThread(thread_scratch),Kokkos::PerTeam(team_scratch));
+    ASSERT_EQ  (chunked.league_size() , leagues);
+    ASSERT_EQ  (chunked.team_size()   , team);
+    ASSERT_EQ  (chunked.chunk_size()  , chunk);
+    ASSERT_EQ  (chunked.scratch_size(0), 0);
+    ASSERT_EQ  (thread_then_team.league_size() , leagues);
+    ASSERT_EQ  (thread_then_team.team_size()   , team);
+    ASSERT_EQ  (thread_then_team.chunk_size()  , chunk);
+    ASSERT_EQ  (thread_then_team.scratch_size(0), total_scratch);
+    // Both requests in one call, per-team argument first.
+    policy_t team_then_thread = chunked.set_scratch_size(0,Kokkos::PerTeam(team_scratch),Kokkos::PerThread(thread_scratch));
+    ASSERT_EQ  (chunked.league_size() , leagues);
+    ASSERT_EQ  (chunked.team_size()   , team);
+    ASSERT_EQ  (chunked.chunk_size()  , chunk);
+    ASSERT_EQ  (chunked.scratch_size(0), 0);
+    ASSERT_EQ  (team_then_thread.league_size() , leagues);
+    ASSERT_EQ  (team_then_thread.team_size()   , team);
+    ASSERT_EQ  (team_then_thread.chunk_size()  , chunk);
+    ASSERT_EQ  (team_then_thread.scratch_size(0), total_scratch);
+    // Starting from a policy that already has per-team scratch: the result reports the new request, not a sum with the old one.
+    policy_t replaced = per_team.set_scratch_size(0,Kokkos::PerTeam(team_scratch),Kokkos::PerThread(thread_scratch));
+    ASSERT_EQ  (per_team.league_size() , leagues);
+    ASSERT_EQ  (per_team.team_size()   , team);
+    ASSERT_EQ  (per_team.chunk_size()  , chunk);
+    ASSERT_EQ  (per_team.scratch_size(0), team_scratch);
+    ASSERT_EQ  (replaced.league_size() , leagues);
+    ASSERT_EQ  (replaced.team_size()   , team);
+    ASSERT_EQ  (replaced.chunk_size()  , chunk);
+    ASSERT_EQ  (replaced.scratch_size(0), total_scratch);
+  }
+  void test_run_time_parameters() { // repeats the run-time checks for several orderings of the policy's template arguments
+    this->template test_run_time_parameters_type< Kokkos::TeamPolicy< ExecutionSpace > >();
+    this->template test_run_time_parameters_type< Kokkos::TeamPolicy< ExecutionSpace , Kokkos::Schedule<Kokkos::Dynamic> , Kokkos::IndexType<long> > >();
+    this->template test_run_time_parameters_type< Kokkos::TeamPolicy< Kokkos::IndexType<long> , ExecutionSpace , Kokkos::Schedule<Kokkos::Dynamic> > >();
+    this->template test_run_time_parameters_type< Kokkos::TeamPolicy< Kokkos::Schedule<Kokkos::Dynamic> , Kokkos::IndexType<long> , ExecutionSpace , SomeTag > >();
+  }
+};
diff --git a/lib/kokkos/core/unit_test/TestQthread.cpp b/lib/kokkos/core/unit_test/TestQthread.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..431b844c9f4e60030f546fba320088f5eecf89c5
--- /dev/null
+++ b/lib/kokkos/core/unit_test/TestQthread.cpp
@@ -0,0 +1,290 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+//
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact  H. Carter Edwards (hcedwar@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#include <gtest/gtest.h>
+
+#include <Kokkos_Core.hpp>
+#include <Kokkos_Qthread.hpp>
+
+#include <Qthread/Kokkos_Qthread_TaskPolicy.hpp>
+
+//----------------------------------------------------------------------------
+
+#include <TestViewImpl.hpp>
+#include <TestAtomic.hpp>
+
+#include <TestViewAPI.hpp>
+#include <TestViewOfClass.hpp>
+
+#include <TestTeam.hpp>
+#include <TestRange.hpp>
+#include <TestReduce.hpp>
+#include <TestScan.hpp>
+#include <TestAggregate.hpp>
+#include <TestCompilerMacros.hpp>
+#include <TestTaskPolicy.hpp>
+// #include <TestTeamVector.hpp>
+
+namespace Test {
+
+// Fixture shared by all Qthread tests: initializes the Qthread execution space in
+// SetUpTestCase() and finalizes it in TearDownTestCase().
+class qthread : public ::testing::Test {
+protected:
+  static void SetUpTestCase()
+  {
+    const unsigned numa  = Kokkos::hwloc::get_available_numa_count();
+    const unsigned cores = Kokkos::hwloc::get_available_cores_per_numa();
+    const unsigned smt   = Kokkos::hwloc::get_available_threads_per_core();
+
+    // Per NUMA region use half of (cores * threads-per-core), but at least two;
+    // a reported NUMA count of zero is treated as one region.
+    const unsigned per_numa = std::max( 2u , ( cores * smt ) / 2 );
+    int threads_count = std::max( 1u , numa ) * per_numa ;
+    Kokkos::Qthread::initialize( threads_count );
+    Kokkos::Qthread::print_configuration( std::cout , true );
+  }
+  static void TearDownTestCase() { Kokkos::Qthread::finalize(); }
+};
+
+// Passes when TestCompilerMacros::Test, instantiated for Kokkos::Qthread, returns true.
+TEST_F( qthread , compiler_macros ) {
+  ASSERT_TRUE( ( TestCompilerMacros::Test< Kokkos::Qthread >() ) );
+}
+
+// Runs test_view_impl with Kokkos::Qthread as its template argument.
+TEST_F( qthread , view_impl )
+{ test_view_impl< Kokkos::Qthread >(); }
+
+// Invokes TestViewAPI for double values with Kokkos::Qthread.
+TEST_F( qthread , view_api )
+{ TestViewAPI< double , Kokkos::Qthread >(); }
+
+// Runs ::Test::view_nested_view with Kokkos::Qthread as its template argument.
+TEST_F( qthread , view_nested_view ) {
+  ::Test::view_nested_view< Kokkos::Qthread >();
+}
+
+TEST_F( qthread , range_tag ) { // TestRange for/reduce/scan with a static schedule, argument 1000 each
+  typedef TestRange< Kokkos::Qthread , Kokkos::Schedule<Kokkos::Static> > range_test ;
+  range_test::test_for(1000);
+  range_test::test_reduce(1000);
+  range_test::test_scan(1000);
+}
+
+TEST_F( qthread , team_tag ) { // TestTeamPolicy for/reduce with a static schedule, argument 1000 each
+  typedef TestTeamPolicy< Kokkos::Qthread , Kokkos::Schedule<Kokkos::Static> > team_test ;
+  team_test::test_for( 1000 );
+  team_test::test_reduce( 1000 );
+}
+
+// Invokes TestReduce for long on Qthread with argument 1000000 (presumably the reduction length; confirm in TestReduce.hpp).
+TEST_F( qthread , long_reduce )
+{ TestReduce< long , Kokkos::Qthread >( 1000000 ); }
+
+// Invokes TestReduce for double on Qthread with argument 1000000 (presumably the reduction length; confirm in TestReduce.hpp).
+TEST_F( qthread , double_reduce )
+{ TestReduce< double , Kokkos::Qthread >( 1000000 ); }
+
+// Invokes TestReduceDynamic for long on Qthread with argument 1000000.
+TEST_F( qthread , long_reduce_dynamic )
+{ TestReduceDynamic< long , Kokkos::Qthread >( 1000000 ); }
+
+// Invokes TestReduceDynamic for double on Qthread with argument 1000000.
+TEST_F( qthread , double_reduce_dynamic )
+{ TestReduceDynamic< double , Kokkos::Qthread >( 1000000 ); }
+
+// Invokes TestReduceDynamicView for long on Qthread with argument 1000000.
+TEST_F( qthread , long_reduce_dynamic_view )
+{ TestReduceDynamicView< long , Kokkos::Qthread >( 1000000 ); }
+
+// Invokes TestReduceTeam for long on Qthread with a static schedule and argument 1000000.
+TEST_F( qthread , team_long_reduce )
+{ TestReduceTeam< long , Kokkos::Qthread , Kokkos::Schedule<Kokkos::Static> >( 1000000 ); }
+
+// Invokes TestReduceTeam for double on Qthread with a static schedule and argument 1000000.
+TEST_F( qthread , team_double_reduce )
+{ TestReduceTeam< double , Kokkos::Qthread , Kokkos::Schedule<Kokkos::Static> >( 1000000 ); }
+
+
+TEST_F( qthread , atomics )
+{ // Every check expects TestAtomic::Loop< T , Kokkos::Qthread >( count , k ) to return true for k = 1 , 2 , 3.
+  const int loop_count = 1e4 ; // 1e4 is a double literal; it converts to 10000
+  // NOTE(review): the last argument (1..3) presumably selects which atomic operation is exercised; confirm in TestAtomic.hpp.
+  ASSERT_TRUE( ( TestAtomic::Loop<int,Kokkos::Qthread>(loop_count,1) ) );
+  ASSERT_TRUE( ( TestAtomic::Loop<int,Kokkos::Qthread>(loop_count,2) ) );
+  ASSERT_TRUE( ( TestAtomic::Loop<int,Kokkos::Qthread>(loop_count,3) ) );
+
+  ASSERT_TRUE( ( TestAtomic::Loop<unsigned int,Kokkos::Qthread>(loop_count,1) ) );
+  ASSERT_TRUE( ( TestAtomic::Loop<unsigned int,Kokkos::Qthread>(loop_count,2) ) );
+  ASSERT_TRUE( ( TestAtomic::Loop<unsigned int,Kokkos::Qthread>(loop_count,3) ) );
+
+  ASSERT_TRUE( ( TestAtomic::Loop<long int,Kokkos::Qthread>(loop_count,1) ) );
+  ASSERT_TRUE( ( TestAtomic::Loop<long int,Kokkos::Qthread>(loop_count,2) ) );
+  ASSERT_TRUE( ( TestAtomic::Loop<long int,Kokkos::Qthread>(loop_count,3) ) );
+
+  ASSERT_TRUE( ( TestAtomic::Loop<unsigned long int,Kokkos::Qthread>(loop_count,1) ) );
+  ASSERT_TRUE( ( TestAtomic::Loop<unsigned long int,Kokkos::Qthread>(loop_count,2) ) );
+  ASSERT_TRUE( ( TestAtomic::Loop<unsigned long int,Kokkos::Qthread>(loop_count,3) ) );
+
+  ASSERT_TRUE( ( TestAtomic::Loop<long long int,Kokkos::Qthread>(loop_count,1) ) );
+  ASSERT_TRUE( ( TestAtomic::Loop<long long int,Kokkos::Qthread>(loop_count,2) ) );
+  ASSERT_TRUE( ( TestAtomic::Loop<long long int,Kokkos::Qthread>(loop_count,3) ) );
+
+  ASSERT_TRUE( ( TestAtomic::Loop<double,Kokkos::Qthread>(loop_count,1) ) );
+  ASSERT_TRUE( ( TestAtomic::Loop<double,Kokkos::Qthread>(loop_count,2) ) );
+  ASSERT_TRUE( ( TestAtomic::Loop<double,Kokkos::Qthread>(loop_count,3) ) );
+  // float and complex<double> below use a fixed count of 100 instead of loop_count.
+  ASSERT_TRUE( ( TestAtomic::Loop<float,Kokkos::Qthread>(100,1) ) );
+  ASSERT_TRUE( ( TestAtomic::Loop<float,Kokkos::Qthread>(100,2) ) );
+  ASSERT_TRUE( ( TestAtomic::Loop<float,Kokkos::Qthread>(100,3) ) );
+
+#if defined( KOKKOS_ENABLE_ASM ) // complex<double> is only checked when KOKKOS_ENABLE_ASM is defined
+  ASSERT_TRUE( ( TestAtomic::Loop<Kokkos::complex<double> ,Kokkos::Qthread>(100,1) ) );
+  ASSERT_TRUE( ( TestAtomic::Loop<Kokkos::complex<double> ,Kokkos::Qthread>(100,2) ) );
+  ASSERT_TRUE( ( TestAtomic::Loop<Kokkos::complex<double> ,Kokkos::Qthread>(100,3) ) );
+#endif
+
+}
+
+// Fills a LayoutLeft int view, deep_copies it into a LayoutRight double view of the same
+// extents, and checks that every element is found at the same multi-index afterwards.
+TEST_F( qthread , view_remap ) {
+  enum { N0 = 3 , N1 = 2 , N2 = 8 , N3 = 9 };
+  typedef Kokkos::View< double*[N1][N2][N3] , Kokkos::LayoutRight , Kokkos::Qthread > dst_view ;
+  typedef Kokkos::View< int**[N2][N3]       , Kokkos::LayoutLeft  , Kokkos::Qthread > src_view ;
+  typedef Kokkos::View< int*[N0][N2][N3]    , Kokkos::LayoutLeft  , Kokkos::Qthread > mismatch_view ;
+
+  dst_view      output( "output" , N0 );
+  src_view      input ( "input" , N0 , N1 );
+  mismatch_view diff  ( "diff" , N0 );  // only referenced by the disabled deep_copy below
+
+  // Number the source elements 1, 2, 3, ... with i0 varying fastest.
+  int value = 0 ;
+  for ( size_t i3 = 0 ; i3 < N3 ; ++i3 ) {
+    for ( size_t i2 = 0 ; i2 < N2 ; ++i2 ) {
+      for ( size_t i1 = 0 ; i1 < N1 ; ++i1 ) {
+        for ( size_t i0 = 0 ; i0 < N0 ; ++i0 ) {
+          input(i0,i1,i2,i3) = ++value ;
+        }
+      }
+    }
+  }
+
+  // Kokkos::deep_copy( diff , input ); // throw with incompatible shape
+  Kokkos::deep_copy( output , input );
+
+  // Walk the destination in the same index order and expect the same numbering.
+  value = 0 ;
+  for ( size_t i3 = 0 ; i3 < N3 ; ++i3 ) {
+    for ( size_t i2 = 0 ; i2 < N2 ; ++i2 ) {
+      for ( size_t i1 = 0 ; i1 < N1 ; ++i1 ) {
+        for ( size_t i0 = 0 ; i0 < N0 ; ++i0 ) {
+          ++value ;
+          ASSERT_EQ( value , ((int) output(i0,i1,i2,i3) ) );
+        }
+      }
+    }
+  }
+}
+
+//----------------------------------------------------------------------------
+
+// Invokes TestViewAggregate with Kokkos::Qthread as its template argument.
+TEST_F( qthread , view_aggregate ) {
+  TestViewAggregate< Kokkos::Qthread >();
+}
+
+//----------------------------------------------------------------------------
+
+TEST_F( qthread , scan ) { // TestScan range test, two runs constructed with 1000000 and 10000000, then a fence
+  typedef TestScan< Kokkos::Qthread > scan_test ;
+  scan_test::test_range( 1 , 1000 );
+  scan_test( 1000000 );
+  scan_test( 10000000 );
+  Kokkos::Qthread::fence();
+}
+
+// Invokes TestSharedTeam on Qthread with a static schedule.
+TEST_F( qthread , team_shared )
+{ TestSharedTeam< Kokkos::Qthread , Kokkos::Schedule<Kokkos::Static> >(); }
+
+// Invokes TestShmemSize with Kokkos::Qthread as its template argument.
+TEST_F( qthread , shmem_size )
+{ TestShmemSize< Kokkos::Qthread >(); }
+
+// Invokes TestScanTeam on Qthread with a static schedule, first with 10 and then with 10000.
+TEST_F( qthread , team_scan ) {
+  TestScanTeam< Kokkos::Qthread , Kokkos::Schedule<Kokkos::Static> >( 10 );
+  TestScanTeam< Kokkos::Qthread , Kokkos::Schedule<Kokkos::Static> >( 10000 );
+}
+
+#if 0 /* disabled; the matching #include <TestTeamVector.hpp> near the top of this file is commented out as well */
+TEST_F( qthread , team_vector )
+{
+  ASSERT_TRUE( ( TestTeamVector::Test< Kokkos::Qthread >(0) ) );
+  ASSERT_TRUE( ( TestTeamVector::Test< Kokkos::Qthread >(1) ) );
+  ASSERT_TRUE( ( TestTeamVector::Test< Kokkos::Qthread >(2) ) );
+  ASSERT_TRUE( ( TestTeamVector::Test< Kokkos::Qthread >(3) ) );
+  ASSERT_TRUE( ( TestTeamVector::Test< Kokkos::Qthread >(4) ) );
+}
+#endif
+
+//----------------------------------------------------------------------------
+
+// Runs test_task_dep once with 10, then test_fib for inputs 0..24 and test_fib2 for inputs 0..34.
+TEST_F( qthread , task_policy ) {
+  TestTaskPolicy::test_task_dep< Kokkos::Qthread >( 10 );
+  for ( long n = 0 ; n < 25 ; ++n ) { TestTaskPolicy::test_fib< Kokkos::Qthread >(n); }
+  for ( long n = 0 ; n < 35 ; ++n ) { TestTaskPolicy::test_fib2< Kokkos::Qthread >(n); }
+}
+
+// Runs TestTaskPolicy::test_task_team on Qthread with argument 1000.
+TEST_F( qthread , task_team ) {
+  TestTaskPolicy::test_task_team< Kokkos::Qthread >(1000);
+}
+
+//----------------------------------------------------------------------------
+
+} // namespace Test
+
diff --git a/lib/kokkos/core/unit_test/TestRange.hpp b/lib/kokkos/core/unit_test/TestRange.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..be8b4f90a32d96ad12ff4bf3baafd4ab8dec11ca
--- /dev/null
+++ b/lib/kokkos/core/unit_test/TestRange.hpp
@@ -0,0 +1,242 @@
+/*
+//@HEADER
+// ************************************************************************
+// 
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+// 
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+// 
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact  H. Carter Edwards (hcedwar@sandia.gov)
+// 
+// ************************************************************************
+//@HEADER
+*/
+
+#include <stdio.h>
+
+#include <Kokkos_Core.hpp>
+
+/*--------------------------------------------------------------------------*/
+
+namespace Test {
+namespace {
+// Functor plus static test drivers for Kokkos range policies: parallel_for, parallel_reduce, parallel_scan, Dynamic schedule.
+template< class ExecSpace, class ScheduleType >
+struct TestRange {
+
+  typedef int value_type ; ///< typedef required for the parallel_reduce
+
+  typedef Kokkos::View<int*,ExecSpace> view_type ;
+
+  view_type m_flags ; // one int per index; read and written by the operators below
+
+  struct VerifyInitTag {};
+  struct ResetTag {};
+  struct VerifyResetTag {};
+
+  TestRange( const size_t N ) // allocates N flags via ViewAllocateWithoutInitializing
+    : m_flags( Kokkos::ViewAllocateWithoutInitializing("flags"), N )
+    {}
+  // Writes flags(i)=i, verifies on device and on host, doubles each flag, then verifies flags(i)==2*i.
+  static void test_for( const size_t N )
+    {
+      TestRange functor(N);
+
+      typename view_type::HostMirror host_flags = Kokkos::create_mirror_view( functor.m_flags );
+
+      Kokkos::parallel_for( Kokkos::RangePolicy<ExecSpace,ScheduleType>(0,N) , functor );
+      Kokkos::parallel_for( Kokkos::RangePolicy<ExecSpace,ScheduleType,VerifyInitTag>(0,N) , functor );
+
+      Kokkos::deep_copy( host_flags , functor.m_flags );
+
+      size_t error_count = 0 ;
+      for ( size_t i = 0 ; i < N ; ++i ) {
+        if ( int(i) != host_flags(i) ) ++error_count ;
+      }
+      ASSERT_EQ( error_count , size_t(0) );
+
+      Kokkos::parallel_for( Kokkos::RangePolicy<ExecSpace,ScheduleType,ResetTag>(0,N) , functor );
+      Kokkos::parallel_for( std::string("TestKernelFor") , Kokkos::RangePolicy<ExecSpace,ScheduleType,VerifyResetTag>(0,N) , functor );
+
+      Kokkos::deep_copy( host_flags , functor.m_flags );
+
+      error_count = 0 ;
+      for ( size_t i = 0 ; i < N ; ++i ) {
+        if ( int(2*i) != host_flags(i) ) ++error_count ;
+      }
+      ASSERT_EQ( error_count , size_t(0) );
+    }
+  // Untagged parallel_for body: stores the index into its own slot.
+  KOKKOS_INLINE_FUNCTION
+  void operator()( const int i ) const
+    { m_flags(i) = i ; }
+  // VerifyInitTag body: prints a diagnostic when slot i does not hold i.
+  KOKKOS_INLINE_FUNCTION
+  void operator()( const VerifyInitTag & , const int i ) const
+    { if ( i != m_flags(i) ) { printf("TestRange::test_for error at %d != %d\n",i,m_flags(i)); } }
+  // ResetTag body: doubles the value stored in slot i.
+  KOKKOS_INLINE_FUNCTION
+  void operator()( const ResetTag & , const int i ) const
+    { m_flags(i) = 2 * m_flags(i); }
+  // VerifyResetTag body: on mismatch prints the expected value (2*i) followed by the stored value.
+  KOKKOS_INLINE_FUNCTION
+  void operator()( const VerifyResetTag & , const int i ) const
+    { if ( 2 * i != m_flags(i) ) { printf("TestRange::test_for error at %d != %d\n",2*i,m_flags(i)); } }
+
+  //----------------------------------------
+
+  struct OffsetTag {};
+  // Fills flags(i)=i, then checks sum(0..N-1) with the untagged reducer and sum(1..N) with the OffsetTag reducer.
+  static void test_reduce( const size_t N )
+    {
+      TestRange functor(N);
+      int total = 0 ;
+
+      Kokkos::parallel_for(    Kokkos::RangePolicy<ExecSpace,ScheduleType>(0,N) , functor );
+
+      Kokkos::parallel_reduce( "TestKernelReduce" , Kokkos::RangePolicy<ExecSpace,ScheduleType>(0,N) , functor , total );
+      // sum( 0 .. N-1 )
+      ASSERT_EQ( size_t((N-1)*(N)/2) , size_t(total) );
+
+      Kokkos::parallel_reduce( Kokkos::RangePolicy<ExecSpace,ScheduleType,OffsetTag>(0,N) , functor , total );
+      // sum( 1 .. N )
+      ASSERT_EQ( size_t((N)*(N+1)/2) , size_t(total) );
+    }
+  // Untagged parallel_reduce body: accumulates the flag value.
+  KOKKOS_INLINE_FUNCTION
+  void operator()( const int i , value_type & update ) const
+    { update += m_flags(i); }
+  // OffsetTag parallel_reduce body: accumulates the flag value plus one.
+  KOKKOS_INLINE_FUNCTION
+  void operator()( const OffsetTag & , const int i , value_type & update ) const
+    { update += 1 + m_flags(i); }
+
+  //----------------------------------------
+  // Fills flags(i)=i and runs the OffsetTag scan body; a mismatch is only reported through printf.
+  static void test_scan( const size_t N )
+    {
+      TestRange functor(N);
+
+      Kokkos::parallel_for( Kokkos::RangePolicy<ExecSpace,ScheduleType>(0,N) , functor );
+
+      Kokkos::parallel_scan( "TestKernelScan" , Kokkos::RangePolicy<ExecSpace,ScheduleType,OffsetTag>(0,N) , functor );
+    }
+  // Scan body: after adding flags(i), the running total must equal i*(i+1)/2 when 'final' is set.
+  KOKKOS_INLINE_FUNCTION
+  void operator()( const OffsetTag & , const int i , value_type & update , bool final ) const
+    {
+      update += m_flags(i);
+
+      if ( final ) {
+        if ( update != (i*(i+1))/2 ) {
+          printf("TestRange::test_scan error %d : %d != %d\n",i,(i*(i+1))/2,update); // report the compared value
+        }
+      }
+    }
+  // Dynamic-schedule test: indices below N/2 do 1 increment each, the others do 10000.
+  static void test_dynamic_policy( const size_t N ) {
+
+    // The Dynamic schedule is fixed here; the struct's ScheduleType parameter is not used by this test.
+    typedef Kokkos::RangePolicy<ExecSpace,Kokkos::Schedule<Kokkos::Dynamic> > policy_t;
+
+    { // Variant 1: the imbalanced work runs in a parallel_for.
+      Kokkos::View<size_t*,ExecSpace,Kokkos::MemoryTraits<Kokkos::Atomic> > count("Count",ExecSpace::concurrency());
+      Kokkos::View<int*,ExecSpace> a("A",N);
+
+      Kokkos::parallel_for( policy_t(0,N),
+          KOKKOS_LAMBDA (const typename policy_t::member_type& i) {
+        for(int k=0; k<(i<N/2?1:10000); k++ )
+          a(i)++;
+        count(ExecSpace::hardware_thread_id())++;
+      });
+
+      int error = 0;
+      Kokkos::parallel_reduce(Kokkos::RangePolicy<ExecSpace>(0,N), KOKKOS_LAMBDA(const typename policy_t::member_type& i, int& lsum) {
+        lsum += ( a(i)!= (i<N/2?1:10000) );
+      },error);
+      ASSERT_EQ(error,0);
+
+      if( ( ExecSpace::concurrency()>(int)1) && (N>static_cast<const size_t>(4*ExecSpace::concurrency())) ) {
+        size_t min = N;
+        size_t max = 0;
+        for(int t=0; t<ExecSpace::concurrency(); t++) {
+          if(count(t)<min) min = count(t);
+          if(count(t)>max) max = count(t);
+        }
+        ASSERT_TRUE(min<max); // per-thread iteration counts are expected to differ
+        // A stricter check (2*min<max when concurrency()>2) was left
+        // disabled by the original author.
+      }
+
+    }
+
+    { // Variant 2: the same work runs in a parallel_reduce that also counts the visited indices.
+      Kokkos::View<size_t*,ExecSpace,Kokkos::MemoryTraits<Kokkos::Atomic> > count("Count",ExecSpace::concurrency());
+      Kokkos::View<int*,ExecSpace> a("A",N);
+
+      int sum = 0;
+      Kokkos::parallel_reduce( policy_t(0,N),
+          KOKKOS_LAMBDA (const typename policy_t::member_type& i, int& lsum) {
+        for(int k=0; k<(i<N/2?1:10000); k++ )
+          a(i)++;
+        count(ExecSpace::hardware_thread_id())++;
+        lsum++;
+      },sum);
+      ASSERT_EQ(size_t(sum),N); // compare as size_t: avoids a signed/unsigned comparison
+
+      int error = 0;
+      Kokkos::parallel_reduce(Kokkos::RangePolicy<ExecSpace>(0,N), KOKKOS_LAMBDA(const typename policy_t::member_type& i, int& lsum) {
+        lsum += ( a(i)!= (i<N/2?1:10000) );
+      },error);
+      ASSERT_EQ(error,0);
+
+      if( ( ExecSpace::concurrency()>(int)1) && (N>static_cast<const size_t>(4*ExecSpace::concurrency())) ) {
+        size_t min = N;
+        size_t max = 0;
+        for(int t=0; t<ExecSpace::concurrency(); t++) {
+          if(count(t)<min) min = count(t);
+          if(count(t)>max) max = count(t);
+        }
+        ASSERT_TRUE(min<max); // per-thread iteration counts are expected to differ
+        // A stricter check (2*min<max when concurrency()>2) was left
+        // disabled by the original author.
+      }
+    }
+
+  }
+};
+
+} /* namespace */
+} /* namespace Test */
+
+/*--------------------------------------------------------------------------*/
+
diff --git a/lib/kokkos/core/unit_test/TestReduce.hpp b/lib/kokkos/core/unit_test/TestReduce.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..53fc393bcc29e6133e4d71ffab87815b935ec9f9
--- /dev/null
+++ b/lib/kokkos/core/unit_test/TestReduce.hpp
@@ -0,0 +1,1872 @@
+/*
+//@HEADER
+// ************************************************************************
+// 
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+// 
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+// 
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact  H. Carter Edwards (hcedwar@sandia.gov)
+// 
+// ************************************************************************
+//@HEADER
+*/
+
+#include <stdexcept>
+#include <sstream>
+#include <iostream>
+#include <limits>
+
+#include <Kokkos_Core.hpp>
+
+/*--------------------------------------------------------------------------*/
+
+namespace Test {
+// Reduction functor whose value_type holds three ScalarType accumulators; see operator() for what each one sums.
+template< typename ScalarType , class DeviceType >
+class ReduceFunctor
+{
+public:
+  typedef DeviceType  execution_space ;
+  typedef typename execution_space::size_type size_type ;
+
+  struct value_type {
+    ScalarType value[3] ;
+  };
+
+  const size_type nwork ; // work count supplied at construction; read by operator()
+
+  ReduceFunctor( const size_type & arg_nwork ) : nwork( arg_nwork ) {}
+
+  ReduceFunctor( const ReduceFunctor & rhs )
+    : nwork( rhs.nwork ) {}
+
+/* init() is commented out, so this functor declares no init member:
+  KOKKOS_INLINE_FUNCTION
+  void init( value_type & dst ) const
+  {
+    dst.value[0] = 0 ;
+    dst.value[1] = 0 ;
+    dst.value[2] = 0 ;
+  }
+*/
+  // Adds the three entries of src into the matching entries of dst.
+  KOKKOS_INLINE_FUNCTION
+  void join( volatile value_type & dst ,
+             const volatile value_type & src ) const
+  {
+    dst.value[0] += src.value[0] ;
+    dst.value[1] += src.value[1] ;
+    dst.value[2] += src.value[2] ;
+  }
+  // Per-index contribution: value[0] += 1, value[1] += iwork+1, value[2] += nwork-iwork.
+  KOKKOS_INLINE_FUNCTION
+  void operator()( size_type iwork , value_type & dst ) const
+  {
+    dst.value[0] += 1 ;
+    dst.value[1] += iwork + 1 ;
+    dst.value[2] += nwork - iwork ;
+  }
+};
+// Extends ReduceFunctor< long , DeviceType > with a final() member that negates the three reduced values.
+template< class DeviceType >
+class ReduceFunctorFinal : public ReduceFunctor< long , DeviceType > {
+public:
+  // Reduction value type, taken from the base functor.
+  typedef typename ReduceFunctor< long , DeviceType >::value_type value_type ;
+
+  // Passes the work count n through to the base functor.
+  ReduceFunctorFinal( const size_t n ) : ReduceFunctor< long , DeviceType >( n ) {}
+
+  // Flips the sign of each of the three entries of dst in place.
+  KOKKOS_INLINE_FUNCTION
+  void final( value_type & dst ) const
+  {
+    for ( int k = 0 ; k < 3 ; ++k ) {
+      dst.value[k] = - dst.value[k] ;
+    }
+  }
+};
+// Reduction functor with an array value type (ScalarType[]) whose length, value_count, is chosen at construction.
+template< typename ScalarType , class DeviceType >
+class RuntimeReduceFunctor
+{
+public:
+  // Required for functor:
+  typedef DeviceType  execution_space ;
+  typedef ScalarType  value_type[] ;
+  const unsigned      value_count ;
+
+
+  // Unit test details:
+
+  typedef typename execution_space::size_type  size_type ;
+
+  const size_type     nwork ;
+  // Stores the work count and the number of reduction entries.
+  RuntimeReduceFunctor( const size_type arg_nwork ,
+                        const size_type arg_count )
+    : value_count( arg_count )
+    , nwork( arg_nwork ) {}
+  // Zeroes all value_count entries of dst.
+  KOKKOS_INLINE_FUNCTION
+  void init( ScalarType dst[] ) const
+  {
+    for ( unsigned i = 0 ; i < value_count ; ++i ) dst[i] = 0 ;
+  }
+  // Adds src into dst entry by entry.
+  KOKKOS_INLINE_FUNCTION
+  void join( volatile ScalarType dst[] ,
+             const volatile ScalarType src[] ) const
+  {
+    for ( unsigned i = 0 ; i < value_count ; ++i ) dst[i] += src[i] ;
+  }
+  // Entry i accumulates tmp[i % 3], i.e. 1, iwork+1, or nwork-iwork.
+  KOKKOS_INLINE_FUNCTION
+  void operator()( size_type iwork , ScalarType dst[] ) const
+  {
+    const size_type tmp[3] = { 1 , iwork + 1 , nwork - iwork };
+
+    for ( size_type i = 0 ; i < value_count ; ++i ) {
+      dst[i] += tmp[ i % 3 ];
+    }
+  }
+};
+// Array-valued reduction functor: even entries track a maximum, odd entries track a minimum.
+template< typename ScalarType , class DeviceType >
+class RuntimeReduceMinMax
+{
+public:
+  // Required for functor:
+  typedef DeviceType  execution_space ;
+  typedef ScalarType  value_type[] ;
+  const unsigned      value_count ;
+
+  // Unit test details:
+
+  typedef typename execution_space::size_type  size_type ;
+
+  const size_type     nwork ;
+  const ScalarType    amin ; // start value for the max-tracking (even) entries
+  const ScalarType    amax ; // start value for the min-tracking (odd) entries
+  // Stores the work count and entry count and fixes the two start values.
+  RuntimeReduceMinMax( const size_type arg_nwork ,
+                       const size_type arg_count )
+    : value_count( arg_count )
+    , nwork( arg_nwork )
+    , amin( std::numeric_limits<ScalarType>::lowest() ) // not min(): for floating point min() is the smallest positive value
+    , amax( std::numeric_limits<ScalarType>::max() )
+    {}
+  // Even entries start at amin, odd entries start at amax.
+  KOKKOS_INLINE_FUNCTION
+  void init( ScalarType dst[] ) const
+  {
+    for ( unsigned i = 0 ; i < value_count ; ++i ) {
+      dst[i] = i % 2 ? amax : amin ;
+    }
+  }
+  // Combines two partial results: minimum for odd entries, maximum for even entries.
+  KOKKOS_INLINE_FUNCTION
+  void join( volatile ScalarType dst[] ,
+             const volatile ScalarType src[] ) const
+  {
+    for ( unsigned i = 0 ; i < value_count ; ++i ) {
+      dst[i] = i % 2 ? ( dst[i] < src[i] ? dst[i] : src[i] )  // min
+                     : ( dst[i] > src[i] ? dst[i] : src[i] ); // max
+    }
+  }
+  // Even entries take max(dst, iwork+1); odd entries take min(dst, nwork-iwork).
+  KOKKOS_INLINE_FUNCTION
+  void operator()( size_type iwork , ScalarType dst[] ) const
+  {
+    const ScalarType tmp[2] = { ScalarType(iwork + 1)
+                              , ScalarType(nwork - iwork) };
+
+    for ( size_type i = 0 ; i < value_count ; ++i ) {
+      dst[i] = i % 2 ? ( dst[i] < tmp[i%2] ? dst[i] : tmp[i%2] )  // min
+                     : ( dst[i] > tmp[i%2] ? dst[i] : tmp[i%2] ); // max
+    }
+  }
+};
+// Extends RuntimeReduceFunctor< long , DeviceType > with a final() member that negates every reduced entry.
+template< class DeviceType >
+class RuntimeReduceFunctorFinal : public RuntimeReduceFunctor< long , DeviceType > {
+public:
+
+  typedef RuntimeReduceFunctor< long , DeviceType > base_type ;
+  typedef typename base_type::value_type value_type ;
+  typedef long scalar_type ;
+  // Forwards the work count and the entry count to the base functor.
+  RuntimeReduceFunctorFinal( const size_t theNwork , const size_t count ) : base_type(theNwork,count) {}
+  // Negates all value_count entries of dst in place (value_type is the array type long[]).
+  KOKKOS_INLINE_FUNCTION
+  void final( value_type dst ) const
+  {
+    for ( unsigned i = 0 ; i < base_type::value_count ; ++i ) {
+      dst[i] = - dst[i] ;
+    }
+  }
+};
+} // namespace Test
+
+namespace {
+// Test driver: repeats parallel_reduce with the three-value struct functors and checks every result.
+template< typename ScalarType , class DeviceType >
+class TestReduce
+{
+public:
+  typedef DeviceType    execution_space ;
+  typedef typename execution_space::size_type size_type ;
+
+  //------------------------------------
+  // Constructing the object runs both checks.
+  TestReduce( const size_type & nwork )
+  {
+    run_test(nwork);
+    run_test_final(nwork);
+  }
+  // 100 reductions with ReduceFunctor; expects value[0]==nw and value[1]==value[2]==nw*(nw+1)/2.
+  void run_test( const size_type & nwork )
+  {
+    typedef Test::ReduceFunctor< ScalarType , execution_space > functor_type ;
+    typedef typename functor_type::value_type value_type ;
+
+    enum { Count = 3 };
+    enum { Repeat = 100 };
+
+    value_type result[ Repeat ];
+
+    const unsigned long nw   = nwork ;
+    const unsigned long nsum = nw % 2 ? nw * (( nw + 1 )/2 ) // nw*(nw+1)/2, halving whichever factor is even
+                                      : (nw/2) * ( nw + 1 );
+
+    for ( unsigned i = 0 ; i < Repeat ; ++i ) {
+      Kokkos::parallel_reduce( nwork , functor_type(nwork) , result[i] );
+    }
+
+    for ( unsigned i = 0 ; i < Repeat ; ++i ) {
+      for ( unsigned j = 0 ; j < Count ; ++j ) {
+        const unsigned long correct = 0 == j % 3 ? nw : nsum ;
+        ASSERT_EQ( (ScalarType) correct , result[i].value[j] );
+      }
+    }
+  }
+  // Same check with ReduceFunctorFinal, whose final() negates the sums; the comparison negates them back.
+  void run_test_final( const size_type & nwork )
+  {
+    typedef Test::ReduceFunctorFinal< execution_space > functor_type ;
+    typedef typename functor_type::value_type value_type ;
+
+    enum { Count = 3 };
+    enum { Repeat = 100 };
+
+    value_type result[ Repeat ];
+
+    const unsigned long nw   = nwork ;
+    const unsigned long nsum = nw % 2 ? nw * (( nw + 1 )/2 ) // nw*(nw+1)/2, halving whichever factor is even
+                                      : (nw/2) * ( nw + 1 );
+
+    for ( unsigned i = 0 ; i < Repeat ; ++i ) {
+      if(i%2==0) // alternate between the unlabeled and the labeled ("Reduce") call forms
+        Kokkos::parallel_reduce( nwork , functor_type(nwork) , result[i] );
+      else
+        Kokkos::parallel_reduce( "Reduce", nwork , functor_type(nwork) , result[i] );
+    }
+
+    for ( unsigned i = 0 ; i < Repeat ; ++i ) {
+      for ( unsigned j = 0 ; j < Count ; ++j ) {
+        const unsigned long correct = 0 == j % 3 ? nw : nsum ;
+        ASSERT_EQ( (ScalarType) correct , - result[i].value[j] );
+      }
+    }
+  }
+};
+// Test driver: repeats parallel_reduce with the array-valued (runtime-length) functors, reducing into host arrays.
+template< typename ScalarType , class DeviceType >
+class TestReduceDynamic
+{
+public:
+  typedef DeviceType    execution_space ;
+  typedef typename execution_space::size_type size_type ;
+
+  //------------------------------------
+  // Constructing the object runs all three checks.
+  TestReduceDynamic( const size_type nwork )
+  {
+    run_test_dynamic(nwork);
+    run_test_dynamic_minmax(nwork);
+    run_test_dynamic_final(nwork);
+  }
+  // 100 reductions with RuntimeReduceFunctor (3 entries); expects entry 0 == nw and entries 1,2 == nw*(nw+1)/2.
+  void run_test_dynamic( const size_type nwork )
+  {
+    typedef Test::RuntimeReduceFunctor< ScalarType , execution_space > functor_type ;
+
+    enum { Count = 3 };
+    enum { Repeat = 100 };
+
+    ScalarType result[ Repeat ][ Count ] ;
+
+    const unsigned long nw   = nwork ;
+    const unsigned long nsum = nw % 2 ? nw * (( nw + 1 )/2 ) // nw*(nw+1)/2, halving whichever factor is even
+                                      : (nw/2) * ( nw + 1 );
+
+    for ( unsigned i = 0 ; i < Repeat ; ++i ) {
+      if(i%2==0) // alternate between the unlabeled and the labeled call forms
+        Kokkos::parallel_reduce( nwork , functor_type(nwork,Count) , result[i] );
+      else
+        Kokkos::parallel_reduce( "Reduce", nwork , functor_type(nwork,Count) , result[i] );
+    }
+
+    for ( unsigned i = 0 ; i < Repeat ; ++i ) {
+      for ( unsigned j = 0 ; j < Count ; ++j ) {
+        const unsigned long correct = 0 == j % 3 ? nw : nsum ;
+        ASSERT_EQ( (ScalarType) correct , result[i][j] );
+      }
+    }
+  }
+  // 100 reductions with RuntimeReduceMinMax (2 entries); expects entry 0 == nwork (max) and entry 1 == 1 (min).
+  void run_test_dynamic_minmax( const size_type nwork )
+  {
+    typedef Test::RuntimeReduceMinMax< ScalarType , execution_space > functor_type ;
+
+    enum { Count = 2 };
+    enum { Repeat = 100 };
+
+    ScalarType result[ Repeat ][ Count ] ;
+
+    for ( unsigned i = 0 ; i < Repeat ; ++i ) {
+      if(i%2==0) // alternate between the unlabeled and the labeled call forms
+        Kokkos::parallel_reduce( nwork , functor_type(nwork,Count) , result[i] );
+      else
+        Kokkos::parallel_reduce( "Reduce", nwork , functor_type(nwork,Count) , result[i] );
+    }
+
+    for ( unsigned i = 0 ; i < Repeat ; ++i ) {
+      for ( unsigned j = 0 ; j < Count ; ++j ) {
+        const unsigned long correct = j % 2 ? 1 : nwork ;
+        ASSERT_EQ( (ScalarType) correct , result[i][j] );
+      }
+    }
+  }
+  // 100 reductions with RuntimeReduceFunctorFinal, whose final() negates the sums; the comparison negates them back.
+  void run_test_dynamic_final( const size_type nwork )
+  {
+    typedef Test::RuntimeReduceFunctorFinal< execution_space > functor_type ;
+
+    enum { Count = 3 };
+    enum { Repeat = 100 };
+
+    typename functor_type::scalar_type result[ Repeat ][ Count ] ;
+
+    const unsigned long nw   = nwork ;
+    const unsigned long nsum = nw % 2 ? nw * (( nw + 1 )/2 ) // nw*(nw+1)/2, halving whichever factor is even
+                                      : (nw/2) * ( nw + 1 );
+
+    for ( unsigned i = 0 ; i < Repeat ; ++i ) {
+      if(i%2==0) // alternate between the unlabeled and the labeled call forms
+        Kokkos::parallel_reduce( nwork , functor_type(nwork,Count) , result[i] );
+      else
+        Kokkos::parallel_reduce( "TestKernelReduce" , nwork , functor_type(nwork,Count) , result[i] );
+
+    }
+
+    for ( unsigned i = 0 ; i < Repeat ; ++i ) {
+      for ( unsigned j = 0 ; j < Count ; ++j ) {
+        const unsigned long correct = 0 == j % 3 ? nw : nsum ;
+        ASSERT_EQ( (ScalarType) correct , - result[i][j] );
+      }
+    }
+  }
+};
+// Test driver: runs RuntimeReduceFunctor for entry counts 0..22, reducing into the raw pointer of a host mirror view.
+template< typename ScalarType , class DeviceType >
+class TestReduceDynamicView
+{
+public:
+  typedef DeviceType    execution_space ;
+  typedef typename execution_space::size_type size_type ;
+
+  //------------------------------------
+  // Constructing the object runs the check.
+  TestReduceDynamicView( const size_type nwork )
+  {
+    run_test_dynamic_view(nwork);
+  }
+  // For each count, expects entry j == nw when j%3==0 and nw*(nw+1)/2 otherwise.
+  void run_test_dynamic_view( const size_type nwork )
+  {
+    typedef Test::RuntimeReduceFunctor< ScalarType , execution_space > functor_type ;
+
+    typedef Kokkos::View< ScalarType* , DeviceType > result_type ;
+    typedef typename result_type::HostMirror result_host_type ;
+
+    const unsigned CountLimit = 23 ;
+
+    const unsigned long nw   = nwork ;
+    const unsigned long nsum = nw % 2 ? nw * (( nw + 1 )/2 ) // nw*(nw+1)/2, halving whichever factor is even
+                                      : (nw/2) * ( nw + 1 );
+
+    for ( unsigned count = 0 ; count < CountLimit ; ++count ) {
+
+      result_type result("result",count); // only used as the source of the host mirror below
+      result_host_type host_result = Kokkos::create_mirror( result );
+
+      // Test result to host pointer:
+
+      std::string str("TestKernelReduce");
+      if(count%2==0) // alternate between the unlabeled and the labeled call forms
+        Kokkos::parallel_reduce( nw , functor_type(nw,count) , host_result.ptr_on_device() );
+      else
+        Kokkos::parallel_reduce( str , nw , functor_type(nw,count) , host_result.ptr_on_device() );
+
+      for ( unsigned j = 0 ; j < count ; ++j ) {
+        const unsigned long correct = 0 == j % 3 ? nw : nsum ;
+        ASSERT_EQ( host_result(j), (ScalarType) correct );
+        host_result(j) = 0 ;
+      }
+    }
+  }
+};
+}
+
+// Computes y^T*A*x
+// (modified from kokkos-tutorials/GTC2016/Exercises/ThreeLevelPar )
+
+#if ( ! defined( KOKKOS_HAVE_CUDA ) ) || defined( KOKKOS_CUDA_USE_LAMBDA )
+// Three-level (team / thread / vector) reduction test: computes y^T*A*x with every entry set to 1.
+template< typename ScalarType , class DeviceType >
+class TestTripleNestedReduce
+{
+public:
+  typedef DeviceType execution_space ;
+  typedef typename execution_space::size_type size_type ;
+
+  //------------------------------------
+  // Constructing the object runs the test with the given matrix shape, team size and vector length.
+  TestTripleNestedReduce( const size_type & nrows , const size_type & ncols
+                        , const size_type & team_size , const size_type & vector_length )
+  {
+    run_test( nrows , ncols , team_size, vector_length );
+  }
+  // Expects the reduction to equal nrows*ncols. Rows are handled in chunks of 128 (see NOTE below).
+  void run_test( const size_type & nrows , const size_type & ncols
+               , const size_type & team_size, const size_type & vector_length )
+  {
+    // The matrix A uses Kokkos::LayoutRight.
+    typedef Kokkos::LayoutRight Layout;
+
+    typedef Kokkos::View<ScalarType* , DeviceType>            ViewVector;
+    typedef Kokkos::View<ScalarType** , Layout , DeviceType>   ViewMatrix;
+    ViewVector y( "y" , nrows );
+    ViewVector x( "x" , ncols );
+    ViewMatrix A( "A" , nrows , ncols );
+
+    typedef Kokkos::RangePolicy<DeviceType> range_policy;
+
+    // Initialize y vector
+    Kokkos::parallel_for( range_policy( 0 , nrows ) , KOKKOS_LAMBDA( const int i ) { y( i ) = 1; } );
+
+    // Initialize x vector
+    Kokkos::parallel_for( range_policy( 0 , ncols ) , KOKKOS_LAMBDA( const int i ) { x( i ) = 1; } );
+
+    typedef Kokkos::TeamPolicy<DeviceType>                        team_policy;
+    typedef typename Kokkos::TeamPolicy<DeviceType>::member_type  member_type;
+
+    // Initialize A matrix, note 2D indexing computation
+    Kokkos::parallel_for( team_policy( nrows , Kokkos::AUTO ) , KOKKOS_LAMBDA( const member_type& teamMember ) {
+      const int j = teamMember.league_rank();
+      Kokkos::parallel_for( Kokkos::TeamThreadRange( teamMember , ncols ) , [&] ( const int i ) {
+        A( j , i ) = 1;
+      } );
+    } );
+
+    // Three level parallelism kernel to force caching of vector x
+    ScalarType result = 0.0;
+    int chunk_size = 128; // NOTE: league size is nrows/chunk_size, so rows past the last full chunk are not summed
+    Kokkos::parallel_reduce( team_policy( nrows/chunk_size , team_size , vector_length ) , KOKKOS_LAMBDA ( const member_type& teamMember , ScalarType &update ) {
+      const int row_start = teamMember.league_rank() * chunk_size;
+      const int row_end   = row_start + chunk_size;
+      Kokkos::parallel_for( Kokkos::TeamThreadRange( teamMember , row_start , row_end ) , [&] ( const int i ) {
+        ScalarType sum_i = 0.0;
+        Kokkos::parallel_reduce( Kokkos::ThreadVectorRange( teamMember , ncols ) , [&] ( const int j , ScalarType &innerUpdate ) {
+          innerUpdate += A( i , j ) * x( j );
+        } , sum_i );
+        Kokkos::single( Kokkos::PerThread( teamMember ) , [&] () {
+          update += y( i ) * sum_i;
+        } );
+      } );
+    } , result );
+    // The expected value presumes nrows is a multiple of chunk_size (128); otherwise the check below fails.
+    const ScalarType solution= ( ScalarType ) nrows * ( ScalarType ) ncols;
+    ASSERT_EQ( solution , result );
+  }
+};
+
+#else /* #if ( ! defined( KOKKOS_HAVE_CUDA ) ) || defined( KOKKOS_CUDA_USE_LAMBDA ) */
+// Stand-in compiled in the #else branch of the guard above: accepts the same four arguments and does nothing.
+template< typename ScalarType , class DeviceType >
+class TestTripleNestedReduce
+{
+public:
+  typedef DeviceType execution_space ;
+  typedef typename execution_space::size_type size_type ;
+
+  TestTripleNestedReduce( const size_type & /* nrows */ , const size_type & /* ncols */
+                        , const size_type & /* team_size */ , const size_type & /* vector_length */ )
+  { }
+};
+
+#endif
+
+//--------------------------------------------------------------------------
+
+namespace Test {
+namespace ReduceCombinatorical {
+// Reducer type (it names itself as reducer_type) whose join adds the source value plus one.
+template<class Scalar,class Space = Kokkos::HostSpace>
+struct AddPlus {
+
+  // Typedefs the original author marked as required.
+  typedef AddPlus reducer_type;
+  typedef Scalar value_type;
+  typedef Kokkos::View<value_type, Space, Kokkos::MemoryTraits<Kokkos::Unmanaged> > result_view_type;
+
+  // Builds the unmanaged result view from the address of the caller's scalar.
+  AddPlus(value_type& result_)
+    : result(&result_)
+  {}
+
+  // Returns the view that wraps the caller's result scalar.
+  result_view_type result_view() const { return result; }
+
+  // Required: combine step, dest becomes dest + (src + 1).
+  KOKKOS_INLINE_FUNCTION
+  void join(value_type& dest, const value_type& src) const
+  {
+    dest += src + 1;
+  }
+
+  // The same combine step for volatile-qualified operands.
+  KOKKOS_INLINE_FUNCTION
+  void join(volatile value_type& dest, const volatile value_type& src) const
+  {
+    dest += src + 1;
+  }
+
+  // Optional: sets val to a value-initialized Scalar.
+  KOKKOS_INLINE_FUNCTION
+  void init(value_type& val) const { val = value_type(); }
+
+private:
+  result_view_type result;
+};
+// FunctorScalar<ISTEAM>: reduction functor with only operator(); <0> takes an int index, <1> takes a team handle.
+template<int ISTEAM>
+struct FunctorScalar;
+
+template<>
+struct FunctorScalar<0>{
+  FunctorScalar(Kokkos::View<double> r):result(r) {}
+  Kokkos::View<double> result; // stored, but not referenced by any member of this struct
+
+  KOKKOS_INLINE_FUNCTION
+  void operator() (const int& i,double& update) const {
+    update+=i;
+  }
+};
+
+template<>
+struct FunctorScalar<1>{
+  FunctorScalar(Kokkos::View<double> r):result(r) {}
+  Kokkos::View<double> result; // stored, but not referenced by any member of this struct
+
+  typedef Kokkos::TeamPolicy<>::member_type team_type;
+  KOKKOS_INLINE_FUNCTION
+  void operator() (const team_type& team,double& update) const {
+    update+=1.0/team.team_size()*team.league_rank(); // adds league_rank()/team_size() per call
+  }
+};
+// FunctorScalarInit<ISTEAM>: like the plain scalar functor, plus an init() that zeroes the accumulator.
+template<int ISTEAM>
+struct FunctorScalarInit;
+
+template<>
+struct FunctorScalarInit<0> {
+  FunctorScalarInit(Kokkos::View<double> r):result(r) {}
+
+  Kokkos::View<double> result; // stored, but not referenced by any member of this struct
+
+  KOKKOS_INLINE_FUNCTION
+  void operator() (const int& i, double& update)  const {
+    update += i;
+  }
+
+  KOKKOS_INLINE_FUNCTION
+  void init(double& update) const {
+    update = 0.0;
+  }
+};
+
+template<>
+struct FunctorScalarInit<1> {
+  FunctorScalarInit(Kokkos::View<double> r):result(r) {}
+
+  Kokkos::View<double> result; // stored, but not referenced by any member of this struct
+
+  typedef Kokkos::TeamPolicy<>::member_type team_type;
+  KOKKOS_INLINE_FUNCTION
+  void operator() (const team_type& team,double& update) const {
+    update+=1.0/team.team_size()*team.league_rank(); // adds league_rank()/team_size() per call
+  }
+
+  KOKKOS_INLINE_FUNCTION
+  void init(double& update) const {
+    update = 0.0;
+  }
+};
+// FunctorScalarFinal<ISTEAM>: scalar functor plus a final() that writes the reduced value into the result view.
+template<int ISTEAM>
+struct FunctorScalarFinal;
+
+
+template<>
+struct FunctorScalarFinal<0> {
+  FunctorScalarFinal(Kokkos::View<double> r):result(r) {}
+
+  Kokkos::View<double> result; // written by final()
+  KOKKOS_INLINE_FUNCTION
+  void operator() (const int& i, double& update)  const {
+    update += i;
+  }
+
+  KOKKOS_INLINE_FUNCTION
+  void final(double& update) const {
+    result() = update;
+  }
+};
+
+template<>
+struct FunctorScalarFinal<1> {
+  FunctorScalarFinal(Kokkos::View<double> r):result(r) {}
+
+  Kokkos::View<double> result; // written by final()
+
+  typedef Kokkos::TeamPolicy<>::member_type team_type;
+
+  KOKKOS_INLINE_FUNCTION
+  void operator() (const team_type& team, double& update) const {
+    update+=1.0/team.team_size()*team.league_rank(); // adds league_rank()/team_size() per call
+  }
+  KOKKOS_INLINE_FUNCTION
+  void final(double& update) const {
+    result() = update;
+  }
+};
+// FunctorScalarJoin<ISTEAM>: scalar functor plus a join() that adds two partial sums.
+template<int ISTEAM>
+struct FunctorScalarJoin;
+
+template<>
+struct FunctorScalarJoin<0> {
+  FunctorScalarJoin(Kokkos::View<double> r):result(r) {}
+
+  Kokkos::View<double> result; // stored, but not referenced by any member of this struct
+  KOKKOS_INLINE_FUNCTION
+  void operator() (const int& i, double& update)  const {
+    update += i;
+  }
+
+  KOKKOS_INLINE_FUNCTION
+  void join(volatile double& dst, const volatile double& update) const {
+    dst += update;
+  }
+};
+
+template<>
+struct FunctorScalarJoin<1> {
+  FunctorScalarJoin(Kokkos::View<double> r):result(r) {}
+
+  Kokkos::View<double> result; // stored, but not referenced by any member of this struct
+
+  typedef Kokkos::TeamPolicy<>::member_type team_type;
+  KOKKOS_INLINE_FUNCTION
+  void operator() (const team_type& team,double& update) const {
+    update+=1.0/team.team_size()*team.league_rank(); // adds league_rank()/team_size() per call
+  }
+
+  KOKKOS_INLINE_FUNCTION
+  void join(volatile double& dst, const volatile double& update) const {
+    dst += update;
+  }
+};
+// FunctorScalarJoinFinal<ISTEAM>: scalar functor with join() (adds partial sums) and final() (stores into result).
+template<int ISTEAM>
+struct FunctorScalarJoinFinal;
+
+template<>
+struct FunctorScalarJoinFinal<0> {
+  FunctorScalarJoinFinal(Kokkos::View<double> r):result(r) {}
+
+  Kokkos::View<double> result; // written by final()
+  KOKKOS_INLINE_FUNCTION
+  void operator() (const int& i, double& update)  const {
+    update += i;
+  }
+
+  KOKKOS_INLINE_FUNCTION
+  void join(volatile double& dst, const volatile double& update) const {
+    dst += update;
+  }
+
+  KOKKOS_INLINE_FUNCTION
+  void final(double& update) const {
+    result() = update;
+  }
+};
+
+template<>
+struct FunctorScalarJoinFinal<1> {
+  FunctorScalarJoinFinal(Kokkos::View<double> r):result(r) {}
+
+  Kokkos::View<double> result; // written by final()
+
+  typedef Kokkos::TeamPolicy<>::member_type team_type;
+  KOKKOS_INLINE_FUNCTION
+  void operator() (const team_type& team,double& update) const {
+    update+=1.0/team.team_size()*team.league_rank(); // adds league_rank()/team_size() per call
+  }
+
+  KOKKOS_INLINE_FUNCTION
+  void join(volatile double& dst, const volatile double& update) const {
+    dst += update;
+  }
+
+  KOKKOS_INLINE_FUNCTION
+  void final(double& update) const {
+    result() = update;
+  }
+};
+// FunctorScalarJoinInit<ISTEAM>: scalar functor with join() (adds partial sums) and init() (zeroes the accumulator).
+template<int ISTEAM>
+struct FunctorScalarJoinInit;
+
+template<>
+struct FunctorScalarJoinInit<0> {
+  FunctorScalarJoinInit(Kokkos::View<double> r):result(r) {}
+
+  Kokkos::View<double> result; // stored, but not referenced by any member of this struct
+  KOKKOS_INLINE_FUNCTION
+  void operator() (const int& i, double& update)  const {
+    update += i;
+  }
+
+  KOKKOS_INLINE_FUNCTION
+  void join(volatile double& dst, const volatile double& update) const {
+    dst += update;
+  }
+
+  KOKKOS_INLINE_FUNCTION
+  void init(double& update) const {
+    update = 0.0;
+  }
+};
+
+template<>
+struct FunctorScalarJoinInit<1> {
+  FunctorScalarJoinInit(Kokkos::View<double> r):result(r) {}
+
+  Kokkos::View<double> result; // stored, but not referenced by any member of this struct
+
+  typedef Kokkos::TeamPolicy<>::member_type team_type;
+  KOKKOS_INLINE_FUNCTION
+  void operator() (const team_type& team,double& update) const {
+    update+=1.0/team.team_size()*team.league_rank(); // adds league_rank()/team_size() per call
+  }
+
+  KOKKOS_INLINE_FUNCTION
+  void join(volatile double& dst, const volatile double& update) const {
+    dst += update;
+  }
+
+  KOKKOS_INLINE_FUNCTION
+  void init(double& update) const {
+    update = 0.0;
+  }
+};
+
+// Scalar reduction functor that supplies operator(), join(), final() and
+// init(). Only the specializations are defined: ISTEAM == 0 is invoked with
+// an integer index, ISTEAM == 1 with a team member handle.
+template<int ISTEAM>
+struct FunctorScalarJoinFinalInit;
+
+// Integer-index variant: adds each index to the running value.
+template<>
+struct FunctorScalarJoinFinalInit<0> {
+  // Rank-0 view that final() writes into.
+  Kokkos::View<double> result;
+
+  FunctorScalarJoinFinalInit(Kokkos::View<double> r) : result(r) {}
+
+  // Contribution of one work item: its own index.
+  KOKKOS_INLINE_FUNCTION
+  void operator() (const int& idx, double& accum) const {
+    accum += idx;
+  }
+
+  // Adds the partial value src onto dst.
+  KOKKOS_INLINE_FUNCTION
+  void join(volatile double& dst, const volatile double& src) const {
+    dst += src;
+  }
+
+  // Stores the value handed in into the result view.
+  KOKKOS_INLINE_FUNCTION
+  void final(double& total) const {
+    result() = total;
+  }
+
+  // Resets a reduction value to zero.
+  KOKKOS_INLINE_FUNCTION
+  void init(double& accum) const {
+    accum = 0.0;
+  }
+};
+
+// Team variant of the join/final/init functor: invoked with a TeamPolicy
+// member handle instead of an integer index.
+template<>
+struct FunctorScalarJoinFinalInit<1> {
+  using team_type = Kokkos::TeamPolicy<>::member_type;
+
+  // Rank-0 view that final() writes into.
+  Kokkos::View<double> result;
+
+  FunctorScalarJoinFinalInit(Kokkos::View<double> r) : result(r) {}
+
+  // Each invocation adds league_rank scaled by 1/team_size.
+  KOKKOS_INLINE_FUNCTION
+  void operator() (const team_type& member, double& accum) const {
+    accum += 1.0/member.team_size()*member.league_rank();
+  }
+
+  // Adds the partial value src onto dst.
+  KOKKOS_INLINE_FUNCTION
+  void join(volatile double& dst, const volatile double& src) const {
+    dst += src;
+  }
+
+  // Stores the value handed in into the result view.
+  KOKKOS_INLINE_FUNCTION
+  void final(double& total) const {
+    result() = total;
+  }
+
+  // Resets a reduction value to zero.
+  KOKKOS_INLINE_FUNCTION
+  void init(double& accum) const {
+    accum = 0.0;
+  }
+};
+// Minimal range functor: adds the iteration index to the running value.
+struct Functor1 {
+  KOKKOS_INLINE_FUNCTION
+  void operator() (const int& idx, double& accum) const {
+    accum += idx;
+  }
+};
+
+// Array-valued reduction functor whose number of entries (value_count) is
+// chosen at construction time.
+struct Functor2 {
+  using value_type = double[];
+  const unsigned value_count;
+
+  Functor2(unsigned n) : value_count(n) {}
+
+  // Adds the index i to every one of the value_count entries.
+  KOKKOS_INLINE_FUNCTION
+  void operator() (const unsigned& i, double update[]) const {
+    for (unsigned k = 0; k < value_count; ++k) {
+      update[k] += i;
+    }
+  }
+
+  // Zeroes all value_count entries of dst.
+  KOKKOS_INLINE_FUNCTION
+  void init( double dst[] ) const
+  {
+    for (unsigned k = 0; k < value_count; ++k) {
+      dst[k] = 0;
+    }
+  }
+
+  // Entry-wise accumulation of src onto dst.
+  KOKKOS_INLINE_FUNCTION
+  void join( volatile double dst[] ,
+             const volatile double src[] ) const
+  {
+    for (unsigned k = 0; k < value_count; ++k) {
+      dst[k] += src[k];
+    }
+  }
+};
+
+}
+}
+
+namespace Test {
+
+// Instantiates Kokkos::parallel_reduce with many combinations of
+// (optional label) x (policy form) x (functor or lambda) x (result argument)
+// and checks the reduced value each time. The argument list is built up
+// incrementally: AddLabel -> AddPolicy -> AddFunctorLambda{Range,Team} ->
+// AddFunctor / AddLambda{Range,Team} -> AddReturnArgument.
+template<class ExecSpace = Kokkos::DefaultExecutionSpace>
+struct TestReduceCombinatoricalInstantiation {
+  // Forwards the (by-value) argument pack to Kokkos::parallel_reduce.
+  template<class ... Args>
+  static void CallParallelReduce(Args... args) {
+    Kokkos::parallel_reduce(args...);
+  }
+
+  // Appends each supported kind of result argument to `args` and runs the
+  // reduction, expecting 1000*999/2 (the sum of 0..999).
+  template<class ... Args>
+  static void AddReturnArgument(Args... args) {
+    Kokkos::View<double,Kokkos::HostSpace> result_view("ResultView");
+    double expected_result = 1000.0*999.0/2.0;
+
+    // Plain double lvalue. Called directly rather than via CallParallelReduce,
+    // which takes its arguments by value and would therefore receive a copy.
+    double value = 0;
+    Kokkos::parallel_reduce(args...,value);
+    ASSERT_EQ(expected_result,value);
+
+    // Managed rank-0 HostSpace view.
+    result_view() = 0;
+    CallParallelReduce(args...,result_view);
+    ASSERT_EQ(expected_result,result_view());
+
+    // Temporary unmanaged view wrapping the local double.
+    value = 0;
+    CallParallelReduce(args...,Kokkos::View<double,Kokkos::HostSpace,Kokkos::MemoryTraits<Kokkos::Unmanaged>>(&value));
+    ASSERT_EQ(expected_result,value);
+
+    // Const unmanaged view constructed from result_view.
+    result_view() = 0;
+    const Kokkos::View<double,Kokkos::HostSpace,Kokkos::MemoryTraits<Kokkos::Unmanaged>> result_view_const_um = result_view;
+    CallParallelReduce(args...,result_view_const_um);
+    ASSERT_EQ(expected_result,result_view_const_um());
+
+    // Temporary AddPlus reducer (defined elsewhere in this file). The checks
+    // below expect a result strictly above expected_result when both the
+    // default and the tested execution space report concurrency > 1, at least
+    // expected_result when only one does, and exactly expected_result otherwise.
+    value = 0;
+    CallParallelReduce(args...,Test::ReduceCombinatorical::AddPlus<double>(value));
+    if((Kokkos::DefaultExecutionSpace::concurrency() > 1) && (ExecSpace::concurrency()>1))
+      ASSERT_TRUE(expected_result<value);
+    else if((Kokkos::DefaultExecutionSpace::concurrency() > 1) || (ExecSpace::concurrency()>1))
+      ASSERT_TRUE(expected_result<=value);
+    else
+      ASSERT_EQ(expected_result,value);
+
+    // Same AddPlus reducer, this time passed as a named lvalue.
+    value = 0;
+    Test::ReduceCombinatorical::AddPlus<double> add(value);
+    CallParallelReduce(args...,add);
+    if((Kokkos::DefaultExecutionSpace::concurrency() > 1) && (ExecSpace::concurrency()>1))
+      ASSERT_TRUE(expected_result<value);
+    else if((Kokkos::DefaultExecutionSpace::concurrency() > 1) || (ExecSpace::concurrency()>1))
+      ASSERT_TRUE(expected_result<=value);
+    else
+      ASSERT_EQ(expected_result,value);
+  }
+
+
+  // Enabled overload (tag void*): appends a range lambda that sums the index.
+  template<class ... Args>
+  static void AddLambdaRange(void*,Args... args) {
+    AddReturnArgument(args...,  KOKKOS_LAMBDA (const int&i , double& lsum) {
+      lsum += i;
+    });
+  }
+
+  // Enabled overload (tag void*): appends a team lambda in which every
+  // invocation adds league_rank scaled by 1/team_size.
+  template<class ... Args>
+  static void AddLambdaTeam(void*,Args... args) {
+    AddReturnArgument(args..., KOKKOS_LAMBDA (const Kokkos::TeamPolicy<>::member_type& team, double& update) {
+      update+=1.0/team.team_size()*team.league_rank();
+    });
+  }
+
+  // Disabled overload (tag Kokkos::InvalidType): intentionally does nothing.
+  template<class ... Args>
+  static void AddLambdaRange(Kokkos::InvalidType,Args... args) {
+  }
+
+  // Disabled overload (tag Kokkos::InvalidType): intentionally does nothing.
+  template<class ... Args>
+  static void AddLambdaTeam(Kokkos::InvalidType,Args... args) {
+  }
+
+  // Appends the FunctorScalar* family (ISTEAM selects range vs. team flavour)
+  // to `args`. Functors without final() go through AddReturnArgument; the
+  // *Final variants take no result argument and are checked through
+  // result_view instead.
+  template<int ISTEAM, class ... Args>
+  static void AddFunctor(Args... args) {
+    Kokkos::View<double> result_view("FunctorView");
+    auto h_r = Kokkos::create_mirror_view(result_view);
+    Test::ReduceCombinatorical::FunctorScalar<ISTEAM> functor(result_view);
+    double expected_result = 1000.0*999.0/2.0;
+
+    // Functor as a named lvalue, then as temporaries of each flavour.
+    AddReturnArgument(args..., functor);
+    AddReturnArgument(args..., Test::ReduceCombinatorical::FunctorScalar<ISTEAM>(result_view));
+    AddReturnArgument(args..., Test::ReduceCombinatorical::FunctorScalarInit<ISTEAM>(result_view));
+    AddReturnArgument(args..., Test::ReduceCombinatorical::FunctorScalarJoin<ISTEAM>(result_view));
+    AddReturnArgument(args..., Test::ReduceCombinatorical::FunctorScalarJoinInit<ISTEAM>(result_view));
+
+    // For each *Final functor: zero result_view, run without a result
+    // argument, copy result_view back to the host mirror and compare.
+    h_r() = 0;
+    Kokkos::deep_copy(result_view,h_r);
+    CallParallelReduce(args..., Test::ReduceCombinatorical::FunctorScalarFinal<ISTEAM>(result_view));
+    Kokkos::deep_copy(h_r,result_view);
+    ASSERT_EQ(expected_result,h_r());
+
+    h_r() = 0;
+    Kokkos::deep_copy(result_view,h_r);
+    CallParallelReduce(args..., Test::ReduceCombinatorical::FunctorScalarJoinFinal<ISTEAM>(result_view));
+    Kokkos::deep_copy(h_r,result_view);
+    ASSERT_EQ(expected_result,h_r());
+
+    h_r() = 0;
+    Kokkos::deep_copy(result_view,h_r);
+    CallParallelReduce(args..., Test::ReduceCombinatorical::FunctorScalarJoinFinalInit<ISTEAM>(result_view));
+    Kokkos::deep_copy(h_r,result_view);
+    ASSERT_EQ(expected_result,h_r());
+  }
+
+  // Range flavour: runs the functor combinations and, when
+  // KOKKOS_HAVE_CXX11_DISPATCH_LAMBDA is defined, the lambda combination.
+  // The tag is void* only if ExecSpace is the default execution space;
+  // otherwise Kokkos::InvalidType selects the empty AddLambdaRange overload.
+  template<class ... Args>
+  static void AddFunctorLambdaRange(Args... args) {
+    AddFunctor<0,Args...>(args...);
+    #ifdef  KOKKOS_HAVE_CXX11_DISPATCH_LAMBDA
+    AddLambdaRange(typename std::conditional<std::is_same<ExecSpace,Kokkos::DefaultExecutionSpace>::value,void*,Kokkos::InvalidType>::type(), args...);
+    #endif
+  }
+
+  // Team flavour of AddFunctorLambdaRange, with the same tag selection.
+  template<class ... Args>
+  static void AddFunctorLambdaTeam(Args... args) {
+    AddFunctor<1,Args...>(args...);
+    #ifdef  KOKKOS_HAVE_CXX11_DISPATCH_LAMBDA
+    AddLambdaTeam(typename std::conditional<std::is_same<ExecSpace,Kokkos::DefaultExecutionSpace>::value,void*,Kokkos::InvalidType>::type(), args...);
+    #endif
+  }
+
+  // Appends each policy form to `args`: integer literal, integer variable,
+  // RangePolicy lvalue and temporaries (default, Dynamic schedule, Static and
+  // Dynamic with chunk size 10), then TeamPolicy temporaries with league size
+  // N and Kokkos::AUTO team size in the same schedule/chunk variations.
+  template<class ... Args>
+  static void AddPolicy(Args... args) {
+    int N = 1000;
+    Kokkos::RangePolicy<ExecSpace> policy(0,N);
+
+    AddFunctorLambdaRange(args...,1000);
+    AddFunctorLambdaRange(args...,N);
+    AddFunctorLambdaRange(args...,policy);
+    AddFunctorLambdaRange(args...,Kokkos::RangePolicy<ExecSpace>(0,N));
+    AddFunctorLambdaRange(args...,Kokkos::RangePolicy<ExecSpace,Kokkos::Schedule<Kokkos::Dynamic> >(0,N));
+    AddFunctorLambdaRange(args...,Kokkos::RangePolicy<ExecSpace,Kokkos::Schedule<Kokkos::Static> >(0,N).set_chunk_size(10));
+    AddFunctorLambdaRange(args...,Kokkos::RangePolicy<ExecSpace,Kokkos::Schedule<Kokkos::Dynamic> >(0,N).set_chunk_size(10));
+
+    AddFunctorLambdaTeam(args...,Kokkos::TeamPolicy<ExecSpace>(N,Kokkos::AUTO));
+    AddFunctorLambdaTeam(args...,Kokkos::TeamPolicy<ExecSpace,Kokkos::Schedule<Kokkos::Dynamic> >(N,Kokkos::AUTO));
+    AddFunctorLambdaTeam(args...,Kokkos::TeamPolicy<ExecSpace,Kokkos::Schedule<Kokkos::Static> >(N,Kokkos::AUTO).set_chunk_size(10));
+    AddFunctorLambdaTeam(args...,Kokkos::TeamPolicy<ExecSpace,Kokkos::Schedule<Kokkos::Dynamic> >(N,Kokkos::AUTO).set_chunk_size(10));
+  }
+
+
+  // Starts the argument list with each label form: none, string literal,
+  // const char* obtained from a std::string, and the std::string itself.
+  static void AddLabel() {
+    std::string s("Std::String");
+    AddPolicy();
+    AddPolicy("Char Constant");
+    AddPolicy(s.c_str());
+    AddPolicy(s);
+  }
+
+  // Entry point: runs every combination.
+  static void execute() {
+    AddLabel();
+  }
+};
+
+template<class Scalar, class ExecSpace = Kokkos::DefaultExecutionSpace>
+struct TestReducers {
+
+  // Range functor: adds values(idx) to the running value.
+  struct SumFunctor {
+    Kokkos::View<const Scalar*,ExecSpace> values;
+    KOKKOS_INLINE_FUNCTION
+    void operator() (const int& idx, Scalar& accum) const {
+      accum += values(idx);
+    }
+  };
+
+  // Range functor: multiplies the running value by values(idx).
+  struct ProdFunctor {
+    Kokkos::View<const Scalar*,ExecSpace> values;
+    KOKKOS_INLINE_FUNCTION
+    void operator() (const int& idx, Scalar& accum) const {
+      accum *= values(idx);
+    }
+  };
+
+  // Range functor: lowers the running value to values(idx) when that entry
+  // is strictly smaller.
+  struct MinFunctor {
+    Kokkos::View<const Scalar*,ExecSpace> values;
+    KOKKOS_INLINE_FUNCTION
+    void operator() (const int& idx, Scalar& current) const {
+      const Scalar& candidate = values(idx);
+      if (candidate < current) {
+        current = candidate;
+      }
+    }
+  };
+
+  // Range functor: raises the running value to values(idx) when that entry
+  // is strictly larger.
+  struct MaxFunctor {
+    Kokkos::View<const Scalar*,ExecSpace> values;
+    KOKKOS_INLINE_FUNCTION
+    void operator() (const int& idx, Scalar& current) const {
+      const Scalar& candidate = values(idx);
+      if (candidate > current) {
+        current = candidate;
+      }
+    }
+  };
+
+  // Range functor tracking the smallest entry and its index: on a strictly
+  // smaller values(idx), both val and loc are updated.
+  struct MinLocFunctor {
+    Kokkos::View<const Scalar*,ExecSpace> values;
+    KOKKOS_INLINE_FUNCTION
+    void operator() (const int& idx,
+        typename Kokkos::Experimental::MinLoc<Scalar,int>::value_type& best) const {
+      const Scalar& candidate = values(idx);
+      if (candidate < best.val) {
+        best.val = candidate;
+        best.loc = idx;
+      }
+    }
+  };
+
+  // Range functor tracking the largest entry and its index: on a strictly
+  // larger values(idx), both val and loc are updated.
+  struct MaxLocFunctor {
+    Kokkos::View<const Scalar*,ExecSpace> values;
+    KOKKOS_INLINE_FUNCTION
+    void operator() (const int& idx,
+        typename Kokkos::Experimental::MaxLoc<Scalar,int>::value_type& best) const {
+      const Scalar& candidate = values(idx);
+      if (candidate > best.val) {
+        best.val = candidate;
+        best.loc = idx;
+      }
+    }
+  };
+
+  // Range functor tracking both extremes and their indices. The maximum and
+  // minimum are tested independently with strict comparisons.
+  struct MinMaxLocFunctor {
+    Kokkos::View<const Scalar*,ExecSpace> values;
+    KOKKOS_INLINE_FUNCTION
+    void operator() (const int& idx,
+        typename Kokkos::Experimental::MinMaxLoc<Scalar,int>::value_type& extremes) const {
+      const Scalar& candidate = values(idx);
+      if (candidate > extremes.max_val) {
+        extremes.max_val = candidate;
+        extremes.max_loc = idx;
+      }
+      if (candidate < extremes.min_val) {
+        extremes.min_val = candidate;
+        extremes.min_loc = idx;
+      }
+    }
+  };
+
+  // Range functor: bitwise-ANDs values(idx) into the running value.
+  struct BAndFunctor {
+    Kokkos::View<const Scalar*,ExecSpace> values;
+    KOKKOS_INLINE_FUNCTION
+    void operator() (const int& idx, Scalar& accum) const {
+      accum = accum & values(idx);
+    }
+  };
+
+  // Range functor: bitwise-ORs values(idx) into the running value.
+  struct BOrFunctor {
+    Kokkos::View<const Scalar*,ExecSpace> values;
+    KOKKOS_INLINE_FUNCTION
+    void operator() (const int& idx, Scalar& accum) const {
+      accum = accum | values(idx);
+    }
+  };
+
+  // Range functor: bitwise-XORs values(idx) into the running value.
+  struct BXorFunctor {
+    Kokkos::View<const Scalar*,ExecSpace> values;
+    KOKKOS_INLINE_FUNCTION
+    void operator() (const int& idx, Scalar& accum) const {
+      accum = accum ^ values(idx);
+    }
+  };
+
+  // Range functor: logical AND of the running value with values(idx).
+  struct LAndFunctor {
+    Kokkos::View<const Scalar*,ExecSpace> values;
+    KOKKOS_INLINE_FUNCTION
+    void operator() (const int& idx, Scalar& accum) const {
+      accum = accum && values(idx);
+    }
+  };
+
+  // Range functor: logical OR of the running value with values(idx).
+  struct LOrFunctor {
+    Kokkos::View<const Scalar*,ExecSpace> values;
+    KOKKOS_INLINE_FUNCTION
+    void operator() (const int& idx, Scalar& accum) const {
+      accum = accum || values(idx);
+    }
+  };
+
+  // Range functor: logical XOR. When the running value is true the result is
+  // the negation of values(idx); otherwise it is values(idx) itself.
+  struct LXorFunctor {
+    Kokkos::View<const Scalar*,ExecSpace> values;
+    KOKKOS_INLINE_FUNCTION
+    void operator() (const int& idx, Scalar& accum) const {
+      accum = accum ? (!values(idx)) : values(idx);
+    }
+  };
+
+  // Compares Kokkos::Experimental::Sum with a host-computed sum of N
+  // pseudo-random values in [0,100). Four ways of supplying the result are
+  // exercised: local scalar, local scalar plus init argument, HostSpace view,
+  // HostSpace view plus init argument. Each variant checks both the
+  // destination and the reducer's result_view().
+  static void test_sum(int N) {
+    typedef Kokkos::Experimental::Sum<Scalar> reducer_type;
+    typedef Kokkos::RangePolicy<ExecSpace> range_type;
+    typedef Kokkos::View<Scalar,Kokkos::HostSpace> host_result_type;
+
+    Kokkos::View<Scalar*,ExecSpace> d_values("Values",N);
+    auto h_values = Kokkos::create_mirror_view(d_values);
+
+    // Fill on the host and accumulate the reference alongside.
+    Scalar reference_sum = 0;
+    for(int k=0; k<N; ++k) {
+      h_values(k) = static_cast<Scalar>(rand()%100);
+      reference_sum += h_values(k);
+    }
+    Kokkos::deep_copy(d_values,h_values);
+
+    SumFunctor functor;
+    functor.values = d_values;
+    Scalar start = 0;
+
+    // Local scalar.
+    {
+      Scalar sum_scalar = start;
+      reducer_type reducer_scalar(sum_scalar);
+      Kokkos::parallel_reduce(range_type(0,N),functor,reducer_scalar);
+      ASSERT_EQ(sum_scalar,reference_sum);
+      Scalar sum_scalar_view = reducer_scalar.result_view()();
+      ASSERT_EQ(sum_scalar_view,reference_sum);
+    }
+    // Local scalar plus init argument.
+    {
+      Scalar sum_scalar_init = start;
+      reducer_type reducer_scalar_init(sum_scalar_init,start);
+      Kokkos::parallel_reduce(range_type(0,N),functor,reducer_scalar_init);
+      ASSERT_EQ(sum_scalar_init,reference_sum);
+      Scalar sum_scalar_init_view = reducer_scalar_init.result_view()();
+      ASSERT_EQ(sum_scalar_init_view,reference_sum);
+    }
+    // HostSpace view.
+    {
+      host_result_type sum_view("View");
+      sum_view() = start;
+      reducer_type reducer_view(sum_view);
+      Kokkos::parallel_reduce(range_type(0,N),functor,reducer_view);
+      Scalar sum_view_scalar = sum_view();
+      ASSERT_EQ(sum_view_scalar,reference_sum);
+      Scalar sum_view_view = reducer_view.result_view()();
+      ASSERT_EQ(sum_view_view,reference_sum);
+    }
+    // HostSpace view plus init argument.
+    {
+      host_result_type sum_view_init("View");
+      sum_view_init() = start;
+      reducer_type reducer_view_init(sum_view_init,start);
+      Kokkos::parallel_reduce(range_type(0,N),functor,reducer_view_init);
+      Scalar sum_view_init_scalar = sum_view_init();
+      ASSERT_EQ(sum_view_init_scalar,reference_sum);
+      Scalar sum_view_init_view = reducer_view_init.result_view()();
+      ASSERT_EQ(sum_view_init_view,reference_sum);
+    }
+  }
+
+  // Compares Kokkos::Experimental::Prod with a host-computed product of N
+  // pseudo-random values in [1,4]. The two variants that pass no init
+  // argument run only when Scalar is an arithmetic type; the variants with an
+  // init argument always run.
+  static void test_prod(int N) {
+    typedef Kokkos::Experimental::Prod<Scalar> reducer_type;
+    typedef Kokkos::RangePolicy<ExecSpace> range_type;
+    typedef Kokkos::View<Scalar,Kokkos::HostSpace> host_result_type;
+    const bool scalar_is_arithmetic = std::is_arithmetic<Scalar>::value;
+
+    Kokkos::View<Scalar*,ExecSpace> d_values("Values",N);
+    auto h_values = Kokkos::create_mirror_view(d_values);
+
+    // Fill on the host and accumulate the reference alongside.
+    Scalar reference_prod = 1;
+    for(int k=0; k<N; ++k) {
+      h_values(k) = static_cast<Scalar>(rand()%4+1);
+      reference_prod *= h_values(k);
+    }
+    Kokkos::deep_copy(d_values,h_values);
+
+    ProdFunctor functor;
+    functor.values = d_values;
+    Scalar start = 1;
+
+    // Local scalar (arithmetic Scalar only).
+    if(scalar_is_arithmetic)
+    {
+      Scalar prod_scalar = start;
+      reducer_type reducer_scalar(prod_scalar);
+      Kokkos::parallel_reduce(range_type(0,N),functor,reducer_scalar);
+      ASSERT_EQ(prod_scalar,reference_prod);
+      Scalar prod_scalar_view = reducer_scalar.result_view()();
+      ASSERT_EQ(prod_scalar_view,reference_prod);
+    }
+    // Local scalar plus init argument.
+    {
+      Scalar prod_scalar_init = start;
+      reducer_type reducer_scalar_init(prod_scalar_init,start);
+      Kokkos::parallel_reduce(range_type(0,N),functor,reducer_scalar_init);
+      ASSERT_EQ(prod_scalar_init,reference_prod);
+      Scalar prod_scalar_init_view = reducer_scalar_init.result_view()();
+      ASSERT_EQ(prod_scalar_init_view,reference_prod);
+    }
+
+    // HostSpace view (arithmetic Scalar only).
+    if(scalar_is_arithmetic)
+    {
+      host_result_type prod_view("View");
+      prod_view() = start;
+      reducer_type reducer_view(prod_view);
+      Kokkos::parallel_reduce(range_type(0,N),functor,reducer_view);
+      Scalar prod_view_scalar = prod_view();
+      ASSERT_EQ(prod_view_scalar,reference_prod);
+      Scalar prod_view_view = reducer_view.result_view()();
+      ASSERT_EQ(prod_view_view,reference_prod);
+    }
+    // HostSpace view plus init argument.
+    {
+      host_result_type prod_view_init("View");
+      prod_view_init() = start;
+      reducer_type reducer_view_init(prod_view_init,start);
+      Kokkos::parallel_reduce(range_type(0,N),functor,reducer_view_init);
+      Scalar prod_view_init_scalar = prod_view_init();
+      ASSERT_EQ(prod_view_init_scalar,reference_prod);
+      Scalar prod_view_init_view = reducer_view_init.result_view()();
+      ASSERT_EQ(prod_view_init_view,reference_prod);
+    }
+  }
+
+  // Compares Kokkos::Experimental::Min with a host-computed minimum of N
+  // pseudo-random values in [0,100000). Variants: local scalar, local scalar
+  // plus init argument, HostSpace view, HostSpace view plus init argument.
+  static void test_min(int N) {
+    typedef Kokkos::Experimental::Min<Scalar> reducer_type;
+    typedef Kokkos::RangePolicy<ExecSpace> range_type;
+    typedef Kokkos::View<Scalar,Kokkos::HostSpace> host_result_type;
+
+    Kokkos::View<Scalar*,ExecSpace> d_values("Values",N);
+    auto h_values = Kokkos::create_mirror_view(d_values);
+
+    // Fill on the host and track the reference minimum alongside.
+    Scalar reference_min = std::numeric_limits<Scalar>::max();
+    for(int k=0; k<N; ++k) {
+      h_values(k) = static_cast<Scalar>(rand()%100000);
+      if(h_values(k)<reference_min) {
+        reference_min = h_values(k);
+      }
+    }
+    Kokkos::deep_copy(d_values,h_values);
+
+    MinFunctor functor;
+    functor.values = d_values;
+    Scalar start = std::numeric_limits<Scalar>::max();
+
+    // Local scalar.
+    {
+      Scalar min_scalar = start;
+      reducer_type reducer_scalar(min_scalar);
+      Kokkos::parallel_reduce(range_type(0,N),functor,reducer_scalar);
+      ASSERT_EQ(min_scalar,reference_min);
+      Scalar min_scalar_view = reducer_scalar.result_view()();
+      ASSERT_EQ(min_scalar_view,reference_min);
+    }
+    // Local scalar plus init argument.
+    {
+      Scalar min_scalar_init = start;
+      reducer_type reducer_scalar_init(min_scalar_init,start);
+      Kokkos::parallel_reduce(range_type(0,N),functor,reducer_scalar_init);
+      ASSERT_EQ(min_scalar_init,reference_min);
+      Scalar min_scalar_init_view = reducer_scalar_init.result_view()();
+      ASSERT_EQ(min_scalar_init_view,reference_min);
+    }
+    // HostSpace view.
+    {
+      host_result_type min_view("View");
+      min_view() = start;
+      reducer_type reducer_view(min_view);
+      Kokkos::parallel_reduce(range_type(0,N),functor,reducer_view);
+      Scalar min_view_scalar = min_view();
+      ASSERT_EQ(min_view_scalar,reference_min);
+      Scalar min_view_view = reducer_view.result_view()();
+      ASSERT_EQ(min_view_view,reference_min);
+    }
+    // HostSpace view plus init argument.
+    {
+      host_result_type min_view_init("View");
+      min_view_init() = start;
+      reducer_type reducer_view_init(min_view_init,start);
+      Kokkos::parallel_reduce(range_type(0,N),functor,reducer_view_init);
+      Scalar min_view_init_scalar = min_view_init();
+      ASSERT_EQ(min_view_init_scalar,reference_min);
+      Scalar min_view_init_view = reducer_view_init.result_view()();
+      ASSERT_EQ(min_view_init_view,reference_min);
+    }
+  }
+
+  // Compares Kokkos::Experimental::Max with a host-computed maximum of N
+  // pseudo-random values in [1,100000]. Variants: local scalar, local scalar
+  // plus init argument, HostSpace view, HostSpace view plus init argument.
+  //
+  // NOTE(review): for floating-point Scalar, numeric_limits<Scalar>::min() is
+  // the smallest positive normalized value, not the most negative one. That
+  // is harmless here because every sample is >= 1.
+  static void test_max(int N) {
+    typedef Kokkos::Experimental::Max<Scalar> reducer_type;
+    typedef Kokkos::RangePolicy<ExecSpace> range_type;
+    typedef Kokkos::View<Scalar,Kokkos::HostSpace> host_result_type;
+
+    Kokkos::View<Scalar*,ExecSpace> d_values("Values",N);
+    auto h_values = Kokkos::create_mirror_view(d_values);
+
+    // Fill on the host and track the reference maximum alongside.
+    Scalar reference_max = std::numeric_limits<Scalar>::min();
+    for(int k=0; k<N; ++k) {
+      h_values(k) = static_cast<Scalar>(rand()%100000+1);
+      if(h_values(k)>reference_max) {
+        reference_max = h_values(k);
+      }
+    }
+    Kokkos::deep_copy(d_values,h_values);
+
+    MaxFunctor functor;
+    functor.values = d_values;
+    Scalar start = std::numeric_limits<Scalar>::min();
+
+    // Local scalar.
+    {
+      Scalar max_scalar = start;
+      reducer_type reducer_scalar(max_scalar);
+      Kokkos::parallel_reduce(range_type(0,N),functor,reducer_scalar);
+      ASSERT_EQ(max_scalar,reference_max);
+      Scalar max_scalar_view = reducer_scalar.result_view()();
+      ASSERT_EQ(max_scalar_view,reference_max);
+    }
+    // Local scalar plus init argument.
+    {
+      Scalar max_scalar_init = start;
+      reducer_type reducer_scalar_init(max_scalar_init,start);
+      Kokkos::parallel_reduce(range_type(0,N),functor,reducer_scalar_init);
+      ASSERT_EQ(max_scalar_init,reference_max);
+      Scalar max_scalar_init_view = reducer_scalar_init.result_view()();
+      ASSERT_EQ(max_scalar_init_view,reference_max);
+    }
+    // HostSpace view.
+    {
+      host_result_type max_view("View");
+      max_view() = start;
+      reducer_type reducer_view(max_view);
+      Kokkos::parallel_reduce(range_type(0,N),functor,reducer_view);
+      Scalar max_view_scalar = max_view();
+      ASSERT_EQ(max_view_scalar,reference_max);
+      Scalar max_view_view = reducer_view.result_view()();
+      ASSERT_EQ(max_view_view,reference_max);
+    }
+    // HostSpace view plus init argument.
+    {
+      host_result_type max_view_init("View");
+      max_view_init() = start;
+      reducer_type reducer_view_init(max_view_init,start);
+      Kokkos::parallel_reduce(range_type(0,N),functor,reducer_view_init);
+      Scalar max_view_init_scalar = max_view_init();
+      ASSERT_EQ(max_view_init_scalar,reference_max);
+      Scalar max_view_init_view = reducer_view_init.result_view()();
+      ASSERT_EQ(max_view_init_view,reference_max);
+    }
+  }
+
+  // Compares Kokkos::Experimental::MinLoc (value and index) with a
+  // host-computed reference over N pseudo-random values in [0,100000).
+  // The reference keeps the first index at which the minimum occurs (strict
+  // comparison). Variants: local value, local value plus init argument,
+  // HostSpace view, HostSpace view plus init argument.
+  static void test_minloc(int N) {
+    typedef Kokkos::Experimental::MinLoc<Scalar,int> reducer_type;
+    typedef typename reducer_type::value_type value_type;
+    typedef Kokkos::RangePolicy<ExecSpace> range_type;
+    typedef Kokkos::View<value_type,Kokkos::HostSpace> host_result_type;
+
+    Kokkos::View<Scalar*,ExecSpace> d_values("Values",N);
+    auto h_values = Kokkos::create_mirror_view(d_values);
+
+    // Fill on the host and track the reference minimum and its index.
+    Scalar reference_min = std::numeric_limits<Scalar>::max();
+    int reference_loc = -1;
+    for(int k=0; k<N; ++k) {
+      h_values(k) = static_cast<Scalar>(rand()%100000);
+      if(h_values(k)<reference_min) {
+        reference_min = h_values(k);
+        reference_loc = k;
+      }
+    }
+    Kokkos::deep_copy(d_values,h_values);
+
+    MinLocFunctor functor;
+    functor.values = d_values;
+    Scalar start = std::numeric_limits<Scalar>::max();
+
+    // Local value.
+    {
+      value_type min_scalar;
+      reducer_type reducer_scalar(min_scalar);
+      Kokkos::parallel_reduce(range_type(0,N),functor,reducer_scalar);
+      ASSERT_EQ(min_scalar.val,reference_min);
+      ASSERT_EQ(min_scalar.loc,reference_loc);
+      value_type min_scalar_view = reducer_scalar.result_view()();
+      ASSERT_EQ(min_scalar_view.val,reference_min);
+      ASSERT_EQ(min_scalar_view.loc,reference_loc);
+    }
+    // Local value plus init argument.
+    {
+      value_type min_scalar_init;
+      reducer_type reducer_scalar_init(min_scalar_init,start);
+      Kokkos::parallel_reduce(range_type(0,N),functor,reducer_scalar_init);
+      ASSERT_EQ(min_scalar_init.val,reference_min);
+      ASSERT_EQ(min_scalar_init.loc,reference_loc);
+      value_type min_scalar_init_view = reducer_scalar_init.result_view()();
+      ASSERT_EQ(min_scalar_init_view.val,reference_min);
+      ASSERT_EQ(min_scalar_init_view.loc,reference_loc);
+    }
+    // HostSpace view.
+    {
+      host_result_type min_view("View");
+      reducer_type reducer_view(min_view);
+      Kokkos::parallel_reduce(range_type(0,N),functor,reducer_view);
+      value_type min_view_scalar = min_view();
+      ASSERT_EQ(min_view_scalar.val,reference_min);
+      ASSERT_EQ(min_view_scalar.loc,reference_loc);
+      value_type min_view_view = reducer_view.result_view()();
+      ASSERT_EQ(min_view_view.val,reference_min);
+      ASSERT_EQ(min_view_view.loc,reference_loc);
+    }
+    // HostSpace view plus init argument.
+    {
+      host_result_type min_view_init("View");
+      reducer_type reducer_view_init(min_view_init,start);
+      Kokkos::parallel_reduce(range_type(0,N),functor,reducer_view_init);
+      value_type min_view_init_scalar = min_view_init();
+      ASSERT_EQ(min_view_init_scalar.val,reference_min);
+      ASSERT_EQ(min_view_init_scalar.loc,reference_loc);
+      value_type min_view_init_view = reducer_view_init.result_view()();
+      ASSERT_EQ(min_view_init_view.val,reference_min);
+      ASSERT_EQ(min_view_init_view.loc,reference_loc);
+    }
+  }
+
+  // Compares Kokkos::Experimental::MaxLoc (value and index) with a
+  // host-computed reference over N pseudo-random values in [0,100000).
+  // The reference keeps the first index at which the maximum occurs (strict
+  // comparison). Variants: local value, local value plus init argument,
+  // HostSpace view, HostSpace view plus init argument.
+  //
+  // NOTE(review): for floating-point Scalar, numeric_limits<Scalar>::min() is
+  // the smallest positive normalized value, so a sample equal to 0 can never
+  // replace the starting maximum. Confirm whether lowest() was intended.
+  static void test_maxloc(int N) {
+    typedef Kokkos::Experimental::MaxLoc<Scalar,int> reducer_type;
+    typedef typename reducer_type::value_type value_type;
+    typedef Kokkos::RangePolicy<ExecSpace> range_type;
+    typedef Kokkos::View<value_type,Kokkos::HostSpace> host_result_type;
+
+    Kokkos::View<Scalar*,ExecSpace> d_values("Values",N);
+    auto h_values = Kokkos::create_mirror_view(d_values);
+
+    // Fill on the host and track the reference maximum and its index.
+    Scalar reference_max = std::numeric_limits<Scalar>::min();
+    int reference_loc = -1;
+    for(int k=0; k<N; ++k) {
+      h_values(k) = static_cast<Scalar>(rand()%100000);
+      if(h_values(k)>reference_max) {
+        reference_max = h_values(k);
+        reference_loc = k;
+      }
+    }
+    Kokkos::deep_copy(d_values,h_values);
+
+    MaxLocFunctor functor;
+    functor.values = d_values;
+    Scalar start = std::numeric_limits<Scalar>::min();
+
+    // Local value.
+    {
+      value_type max_scalar;
+      reducer_type reducer_scalar(max_scalar);
+      Kokkos::parallel_reduce(range_type(0,N),functor,reducer_scalar);
+      ASSERT_EQ(max_scalar.val,reference_max);
+      ASSERT_EQ(max_scalar.loc,reference_loc);
+      value_type max_scalar_view = reducer_scalar.result_view()();
+      ASSERT_EQ(max_scalar_view.val,reference_max);
+      ASSERT_EQ(max_scalar_view.loc,reference_loc);
+    }
+    // Local value plus init argument.
+    {
+      value_type max_scalar_init;
+      reducer_type reducer_scalar_init(max_scalar_init,start);
+      Kokkos::parallel_reduce(range_type(0,N),functor,reducer_scalar_init);
+      ASSERT_EQ(max_scalar_init.val,reference_max);
+      ASSERT_EQ(max_scalar_init.loc,reference_loc);
+      value_type max_scalar_init_view = reducer_scalar_init.result_view()();
+      ASSERT_EQ(max_scalar_init_view.val,reference_max);
+      ASSERT_EQ(max_scalar_init_view.loc,reference_loc);
+    }
+    // HostSpace view.
+    {
+      host_result_type max_view("View");
+      reducer_type reducer_view(max_view);
+      Kokkos::parallel_reduce(range_type(0,N),functor,reducer_view);
+      value_type max_view_scalar = max_view();
+      ASSERT_EQ(max_view_scalar.val,reference_max);
+      ASSERT_EQ(max_view_scalar.loc,reference_loc);
+      value_type max_view_view = reducer_view.result_view()();
+      ASSERT_EQ(max_view_view.val,reference_max);
+      ASSERT_EQ(max_view_view.loc,reference_loc);
+    }
+    // HostSpace view plus init argument.
+    {
+      host_result_type max_view_init("View");
+      reducer_type reducer_view_init(max_view_init,start);
+      Kokkos::parallel_reduce(range_type(0,N),functor,reducer_view_init);
+      value_type max_view_init_scalar = max_view_init();
+      ASSERT_EQ(max_view_init_scalar.val,reference_max);
+      ASSERT_EQ(max_view_init_scalar.loc,reference_loc);
+      value_type max_view_init_view = reducer_view_init.result_view()();
+      ASSERT_EQ(max_view_init_view.val,reference_max);
+      ASSERT_EQ(max_view_init_view.loc,reference_loc);
+    }
+  }
+
+  // Compares Kokkos::Experimental::MinMaxLoc (both extremes and their
+  // indices) with a host-computed reference over N pseudo-random values in
+  // [0,100000). Variants: local value, local value plus init arguments,
+  // HostSpace view, HostSpace view plus init arguments.
+  //
+  // NOTE(review): for floating-point Scalar, numeric_limits<Scalar>::min() is
+  // the smallest positive normalized value, so a sample equal to 0 can never
+  // replace the starting maximum. Confirm whether lowest() was intended.
+  static void test_minmaxloc(int N) {
+    typedef Kokkos::Experimental::MinMaxLoc<Scalar,int> reducer_type;
+    typedef typename reducer_type::value_type value_type;
+    typedef Kokkos::RangePolicy<ExecSpace> range_type;
+    typedef Kokkos::View<value_type,Kokkos::HostSpace> host_result_type;
+
+    Kokkos::View<Scalar*,ExecSpace> d_values("Values",N);
+    auto h_values = Kokkos::create_mirror_view(d_values);
+
+    // Fill on the host and track both reference extremes and their indices.
+    Scalar reference_max = std::numeric_limits<Scalar>::min();
+    Scalar reference_min = std::numeric_limits<Scalar>::max();
+    int reference_minloc = -1;
+    int reference_maxloc = -1;
+    for(int k=0; k<N; ++k) {
+      h_values(k) = static_cast<Scalar>(rand()%100000);
+      if(h_values(k)>reference_max) {
+        reference_max = h_values(k);
+        reference_maxloc = k;
+      }
+      if(h_values(k)<reference_min) {
+        reference_min = h_values(k);
+        reference_minloc = k;
+      }
+    }
+    Kokkos::deep_copy(d_values,h_values);
+
+    MinMaxLocFunctor functor;
+    functor.values = d_values;
+    Scalar start_min = std::numeric_limits<Scalar>::max();
+    Scalar start_max = std::numeric_limits<Scalar>::min();
+
+    // Local value.
+    {
+      value_type minmax_scalar;
+      reducer_type reducer_scalar(minmax_scalar);
+      Kokkos::parallel_reduce(range_type(0,N),functor,reducer_scalar);
+      ASSERT_EQ(minmax_scalar.min_val,reference_min);
+      ASSERT_EQ(minmax_scalar.min_loc,reference_minloc);
+      ASSERT_EQ(minmax_scalar.max_val,reference_max);
+      ASSERT_EQ(minmax_scalar.max_loc,reference_maxloc);
+      value_type minmax_scalar_view = reducer_scalar.result_view()();
+      ASSERT_EQ(minmax_scalar_view.min_val,reference_min);
+      ASSERT_EQ(minmax_scalar_view.min_loc,reference_minloc);
+      ASSERT_EQ(minmax_scalar_view.max_val,reference_max);
+      ASSERT_EQ(minmax_scalar_view.max_loc,reference_maxloc);
+    }
+    // Local value plus init arguments.
+    {
+      value_type minmax_scalar_init;
+      reducer_type reducer_scalar_init(minmax_scalar_init,start_min,start_max);
+      Kokkos::parallel_reduce(range_type(0,N),functor,reducer_scalar_init);
+      ASSERT_EQ(minmax_scalar_init.min_val,reference_min);
+      ASSERT_EQ(minmax_scalar_init.min_loc,reference_minloc);
+      ASSERT_EQ(minmax_scalar_init.max_val,reference_max);
+      ASSERT_EQ(minmax_scalar_init.max_loc,reference_maxloc);
+      value_type minmax_scalar_init_view = reducer_scalar_init.result_view()();
+      ASSERT_EQ(minmax_scalar_init_view.min_val,reference_min);
+      ASSERT_EQ(minmax_scalar_init_view.min_loc,reference_minloc);
+      ASSERT_EQ(minmax_scalar_init_view.max_val,reference_max);
+      ASSERT_EQ(minmax_scalar_init_view.max_loc,reference_maxloc);
+    }
+    // HostSpace view.
+    {
+      host_result_type minmax_view("View");
+      reducer_type reducer_view(minmax_view);
+      Kokkos::parallel_reduce(range_type(0,N),functor,reducer_view);
+      value_type minmax_view_scalar = minmax_view();
+      ASSERT_EQ(minmax_view_scalar.min_val,reference_min);
+      ASSERT_EQ(minmax_view_scalar.min_loc,reference_minloc);
+      ASSERT_EQ(minmax_view_scalar.max_val,reference_max);
+      ASSERT_EQ(minmax_view_scalar.max_loc,reference_maxloc);
+      value_type minmax_view_view = reducer_view.result_view()();
+      ASSERT_EQ(minmax_view_view.min_val,reference_min);
+      ASSERT_EQ(minmax_view_view.min_loc,reference_minloc);
+      ASSERT_EQ(minmax_view_view.max_val,reference_max);
+      ASSERT_EQ(minmax_view_view.max_loc,reference_maxloc);
+    }
+    // HostSpace view plus init arguments.
+    {
+      host_result_type minmax_view_init("View");
+      reducer_type reducer_view_init(minmax_view_init,start_min,start_max);
+      Kokkos::parallel_reduce(range_type(0,N),functor,reducer_view_init);
+      value_type minmax_view_init_scalar = minmax_view_init();
+      ASSERT_EQ(minmax_view_init_scalar.min_val,reference_min);
+      ASSERT_EQ(minmax_view_init_scalar.min_loc,reference_minloc);
+      ASSERT_EQ(minmax_view_init_scalar.max_val,reference_max);
+      ASSERT_EQ(minmax_view_init_scalar.max_loc,reference_maxloc);
+      value_type minmax_view_init_view = reducer_view_init.result_view()();
+      ASSERT_EQ(minmax_view_init_view.min_val,reference_min);
+      ASSERT_EQ(minmax_view_init_view.min_loc,reference_minloc);
+      ASSERT_EQ(minmax_view_init_view.max_val,reference_max);
+      ASSERT_EQ(minmax_view_init_view.max_loc,reference_maxloc);
+    }
+  }
+
+  // Compares Kokkos::Experimental::BAnd with a host-computed bitwise AND of
+  // N pseudo-random values in [1,100000]. The starting value is
+  // Scalar() | ~Scalar(). Variants: local scalar and HostSpace view.
+  static void test_BAnd(int N) {
+    typedef Kokkos::Experimental::BAnd<Scalar> reducer_type;
+    typedef Kokkos::RangePolicy<ExecSpace> range_type;
+
+    Kokkos::View<Scalar*,ExecSpace> d_values("Values",N);
+    auto h_values = Kokkos::create_mirror_view(d_values);
+
+    // Fill on the host and fold the reference alongside.
+    Scalar reference_band = Scalar() | (~Scalar());
+    for(int k=0; k<N; ++k) {
+      h_values(k) = static_cast<Scalar>(rand()%100000+1);
+      reference_band = reference_band & h_values(k);
+    }
+    Kokkos::deep_copy(d_values,h_values);
+
+    BAndFunctor functor;
+    functor.values = d_values;
+    Scalar start = Scalar() | (~Scalar());
+
+    // Local scalar.
+    {
+      Scalar band_scalar = start;
+      reducer_type reducer_scalar(band_scalar);
+      Kokkos::parallel_reduce(range_type(0,N),functor,reducer_scalar);
+      ASSERT_EQ(band_scalar,reference_band);
+      Scalar band_scalar_view = reducer_scalar.result_view()();
+      ASSERT_EQ(band_scalar_view,reference_band);
+    }
+
+    // HostSpace view.
+    {
+      Kokkos::View<Scalar,Kokkos::HostSpace> band_view("View");
+      band_view() = start;
+      reducer_type reducer_view(band_view);
+      Kokkos::parallel_reduce(range_type(0,N),functor,reducer_view);
+      Scalar band_view_scalar = band_view();
+      ASSERT_EQ(band_view_scalar,reference_band);
+      Scalar band_view_view = reducer_view.result_view()();
+      ASSERT_EQ(band_view_view,reference_band);
+    }
+  }
+
+  // BOr over N random even values in [2,200000]; verified through scalar- and view-backed reducers.
+  static void test_BOr(int N) {
+    typedef Kokkos::Experimental::BOr<Scalar> reducer_type;
+    const Kokkos::RangePolicy<ExecSpace> policy(0,N);
+    const Scalar init = Scalar() & (~Scalar());  // no bit set
+    Kokkos::View<Scalar*,ExecSpace> values("Values",N);
+    auto h_values = Kokkos::create_mirror_view(values);
+    Scalar reference_bor = init;
+    for(int idx=0; idx<N; ++idx) {
+      h_values(idx) = (Scalar)((rand()%100000+1)*2);
+      reference_bor |= h_values(idx);
+    }
+    Kokkos::deep_copy(values,h_values);
+    BOrFunctor functor;
+    functor.values = values;
+    { // reducer bound to a plain scalar
+      Scalar bor_scalar = init;
+      reducer_type reducer_scalar(bor_scalar);
+      Kokkos::parallel_reduce(policy,functor,reducer_scalar);
+      ASSERT_EQ(bor_scalar,reference_bor);
+      Scalar bor_scalar_view = reducer_scalar.result_view()();
+      ASSERT_EQ(bor_scalar_view,reference_bor);
+    }
+    { // reducer bound to a rank-0 HostSpace view
+      Kokkos::View<Scalar,Kokkos::HostSpace> bor_view("View");
+      bor_view() = init;
+      reducer_type reducer_view(bor_view);
+      Kokkos::parallel_reduce(policy,functor,reducer_view);
+      Scalar bor_view_scalar = bor_view();
+      ASSERT_EQ(bor_view_scalar,reference_bor);
+      Scalar bor_view_view = reducer_view.result_view()();
+      ASSERT_EQ(bor_view_view,reference_bor);
+    }
+  }
+
+  // BXor over N random even values in [2,200000]; verified through scalar- and view-backed reducers.
+  static void test_BXor(int N) {
+    typedef Kokkos::Experimental::BXor<Scalar> reducer_type;
+    const Kokkos::RangePolicy<ExecSpace> policy(0,N);
+    const Scalar init = Scalar() & (~Scalar());  // no bit set
+    Kokkos::View<Scalar*,ExecSpace> values("Values",N);
+    auto h_values = Kokkos::create_mirror_view(values);
+    Scalar reference_bxor = init;
+    for(int idx=0; idx<N; ++idx) {
+      h_values(idx) = (Scalar)((rand()%100000+1)*2);
+      reference_bxor ^= h_values(idx);
+    }
+    Kokkos::deep_copy(values,h_values);
+    BXorFunctor functor;
+    functor.values = values;
+    { // reducer bound to a plain scalar
+      Scalar bxor_scalar = init;
+      reducer_type reducer_scalar(bxor_scalar);
+      Kokkos::parallel_reduce(policy,functor,reducer_scalar);
+      ASSERT_EQ(bxor_scalar,reference_bxor);
+      Scalar bxor_scalar_view = reducer_scalar.result_view()();
+      ASSERT_EQ(bxor_scalar_view,reference_bxor);
+    }
+    { // reducer bound to a rank-0 HostSpace view
+      Kokkos::View<Scalar,Kokkos::HostSpace> bxor_view("View");
+      bxor_view() = init;
+      reducer_type reducer_view(bxor_view);
+      Kokkos::parallel_reduce(policy,functor,reducer_view);
+      Scalar bxor_view_scalar = bxor_view();
+      ASSERT_EQ(bxor_view_scalar,reference_bxor);
+      Scalar bxor_view_view = reducer_view.result_view()();
+      ASSERT_EQ(bxor_view_view,reference_bxor);
+    }
+  }
+
+  // LAnd over N random 0/1 values; verified through scalar- and view-backed reducers.
+  static void test_LAnd(int N) {
+    typedef Kokkos::Experimental::LAnd<Scalar> reducer_type;
+    const Kokkos::RangePolicy<ExecSpace> policy(0,N);
+    const Scalar init = 1;  // logical "true"
+    Kokkos::View<Scalar*,ExecSpace> values("Values",N);
+    auto h_values = Kokkos::create_mirror_view(values);
+    Scalar reference_land = init;
+    for(int idx=0; idx<N; ++idx) {
+      h_values(idx) = (Scalar)(rand()%2);
+      reference_land = reference_land && h_values(idx);
+    }
+    Kokkos::deep_copy(values,h_values);
+    LAndFunctor functor;
+    functor.values = values;
+    { // reducer bound to a plain scalar
+      Scalar land_scalar = init;
+      reducer_type reducer_scalar(land_scalar);
+      Kokkos::parallel_reduce(policy,functor,reducer_scalar);
+      ASSERT_EQ(land_scalar,reference_land);
+      Scalar land_scalar_view = reducer_scalar.result_view()();
+      ASSERT_EQ(land_scalar_view,reference_land);
+    }
+    { // reducer bound to a rank-0 HostSpace view
+      Kokkos::View<Scalar,Kokkos::HostSpace> land_view("View");
+      land_view() = init;
+      reducer_type reducer_view(land_view);
+      Kokkos::parallel_reduce(policy,functor,reducer_view);
+      Scalar land_view_scalar = land_view();
+      ASSERT_EQ(land_view_scalar,reference_land);
+      Scalar land_view_view = reducer_view.result_view()();
+      ASSERT_EQ(land_view_view,reference_land);
+    }
+  }
+
+  // LOr over N random 0/1 values; verified through scalar- and view-backed reducers.
+  static void test_LOr(int N) {
+    typedef Kokkos::Experimental::LOr<Scalar> reducer_type;
+    const Kokkos::RangePolicy<ExecSpace> policy(0,N);
+    const Scalar init = 0;  // logical "false"
+    Kokkos::View<Scalar*,ExecSpace> values("Values",N);
+    auto h_values = Kokkos::create_mirror_view(values);
+    Scalar reference_lor = init;
+    for(int idx=0; idx<N; ++idx) {
+      h_values(idx) = (Scalar)(rand()%2);
+      reference_lor = reference_lor || h_values(idx);
+    }
+    Kokkos::deep_copy(values,h_values);
+    LOrFunctor functor;
+    functor.values = values;
+    { // reducer bound to a plain scalar
+      Scalar lor_scalar = init;
+      reducer_type reducer_scalar(lor_scalar);
+      Kokkos::parallel_reduce(policy,functor,reducer_scalar);
+      ASSERT_EQ(lor_scalar,reference_lor);
+      Scalar lor_scalar_view = reducer_scalar.result_view()();
+      ASSERT_EQ(lor_scalar_view,reference_lor);
+    }
+    { // reducer bound to a rank-0 HostSpace view
+      Kokkos::View<Scalar,Kokkos::HostSpace> lor_view("View");
+      lor_view() = init;
+      reducer_type reducer_view(lor_view);
+      Kokkos::parallel_reduce(policy,functor,reducer_view);
+      Scalar lor_view_scalar = lor_view();
+      ASSERT_EQ(lor_view_scalar,reference_lor);
+      Scalar lor_view_view = reducer_view.result_view()();
+      ASSERT_EQ(lor_view_view,reference_lor);
+    }
+  }
+
+  // LXor over N random 0/1 values; verified through scalar- and view-backed reducers.
+  static void test_LXor(int N) {
+    typedef Kokkos::Experimental::LXor<Scalar> reducer_type;
+    const Kokkos::RangePolicy<ExecSpace> policy(0,N);
+    const Scalar init = 0;  // logical "false"
+    Kokkos::View<Scalar*,ExecSpace> values("Values",N);
+    auto h_values = Kokkos::create_mirror_view(values);
+    Scalar reference_lxor = init;
+    for(int idx=0; idx<N; ++idx) {
+      h_values(idx) = (Scalar)(rand()%2);
+      reference_lxor = reference_lxor ? (!h_values(idx)) : h_values(idx);
+    }
+    Kokkos::deep_copy(values,h_values);
+    LXorFunctor functor;
+    functor.values = values;
+    { // reducer bound to a plain scalar
+      Scalar lxor_scalar = init;
+      reducer_type reducer_scalar(lxor_scalar);
+      Kokkos::parallel_reduce(policy,functor,reducer_scalar);
+      ASSERT_EQ(lxor_scalar,reference_lxor);
+      Scalar lxor_scalar_view = reducer_scalar.result_view()();
+      ASSERT_EQ(lxor_scalar_view,reference_lxor);
+    }
+    { // reducer bound to a rank-0 HostSpace view
+      Kokkos::View<Scalar,Kokkos::HostSpace> lxor_view("View");
+      lxor_view() = init;
+      reducer_type reducer_view(lxor_view);
+      Kokkos::parallel_reduce(policy,functor,reducer_view);
+      Scalar lxor_view_scalar = lxor_view();
+      ASSERT_EQ(lxor_view_scalar,reference_lxor);
+      Scalar lxor_view_view = reducer_view.result_view()();
+      ASSERT_EQ(lxor_view_view,reference_lxor);
+    }
+  }
+
+  // Suite for floating-point scalars: the basic pair plus the min/max family.
+  static void execute_float() {
+    execute_basic();  // test_sum(10001) then test_prod(35), same order as before
+    test_min(10003);
+    test_minloc(10003);
+    test_max(10007);
+    test_maxloc(10007);
+    test_minmaxloc(10007);
+  }
+
+  // Suite for integral scalars: everything execute_float() runs, followed by
+  // the bitwise and logical reducer tests (35 elements each).
+  static void execute_integer() {
+    // sum, prod, min, minloc, max, maxloc, minmaxloc -- identical calls and order
+    execute_float();
+
+    // bitwise reducers
+    test_BAnd(35);
+    test_BOr(35);
+    test_BXor(35);
+    // logical reducers
+    test_LAnd(35);
+    test_LOr(35);
+    test_LXor(35);
+  }
+
+  static void execute_basic() {  // smallest suite: only the sum and product reducer tests
+    test_sum(10001);
+    test_prod(35);
+  }
+};
+}
+
+/*--------------------------------------------------------------------------*/
+
diff --git a/lib/kokkos/core/unit_test/TestScan.hpp b/lib/kokkos/core/unit_test/TestScan.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..3eeea57043ece1142be96ed15dcbab3aa8a9285f
--- /dev/null
+++ b/lib/kokkos/core/unit_test/TestScan.hpp
@@ -0,0 +1,103 @@
+/*
+//@HEADER
+// ************************************************************************
+// 
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+// 
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+// 
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact  H. Carter Edwards (hcedwar@sandia.gov)
+// 
+// ************************************************************************
+//@HEADER
+*/
+
+/*--------------------------------------------------------------------------*/
+
+#include <stdio.h>
+
+namespace Test {
+
+// Functor-plus-driver for Kokkos parallel_scan: index i contributes i+1, so
+// the inclusive prefix at i must be (i+1)(i+2)/2 when the scan starts at 0.
+template< class Device , class WorkSpec = size_t >
+struct TestScan {
+  typedef  Device    execution_space ;
+  typedef  long int  value_type ;
+
+  // Scan body. A mismatch on the final pass is only reported through printf.
+  KOKKOS_INLINE_FUNCTION
+  void operator()( const int iwork , value_type & update , const bool final_pass ) const
+  {
+    const value_type n = iwork + 1 ;
+    const bool is_heavy = (1000 <= n) && (0 == n % 1000) ;
+    const value_type imbalance = is_heavy ? 1000 : 0 ;
+
+    // Artificial load imbalance: add `imbalance` one unit at a time, then
+    // the remainder in one step, so the net contribution is always n.
+    for ( value_type step = 0 ; step < imbalance ; ++step ) { ++update ; }
+    update += n - imbalance ;
+
+    if ( ! final_pass ) { return ; }
+
+    // n(n+1)/2, halving whichever factor is even.
+    const value_type answer = ( n & 1 ) ? n * ( ( n + 1 ) / 2 ) : ( n / 2 ) * ( n + 1 ) ;
+    if ( answer != update ) {
+      printf("TestScan(%d,%ld) != %ld\n",iwork,update,answer);
+    }
+  }
+
+  // Start value of every partial sum.
+  KOKKOS_INLINE_FUNCTION
+  void init( value_type & update ) const { update = 0 ; }
+
+  // Fold one partial sum into another.
+  KOKKOS_INLINE_FUNCTION
+  void join( volatile value_type & update , volatile const value_type & input ) const
+  { update += input ; }
+
+  // Constructing the object runs the scan over N work items.
+  TestScan( const WorkSpec & N ) { parallel_scan( N , *this ); }
+
+  // Same, but through an explicit RangePolicy built from ( Start , N ).
+  TestScan( const WorkSpec & Start , const WorkSpec & N )
+  { parallel_scan( Kokkos::RangePolicy<execution_space>( Start , N ) , *this ); }
+
+  // Run the single-argument scan once for every length in [begin, end).
+  static void test_range( const WorkSpec & begin , const WorkSpec & end )
+  {
+    for ( WorkSpec len = begin ; len < end ; ++len ) { (void) TestScan( len ); }
+  }
+};
+
+}
+
diff --git a/lib/kokkos/core/unit_test/TestSerial.cpp b/lib/kokkos/core/unit_test/TestSerial.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..d85614e66e67af2ccae9979d7f3869cbf5165c1d
--- /dev/null
+++ b/lib/kokkos/core/unit_test/TestSerial.cpp
@@ -0,0 +1,571 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+//
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact  H. Carter Edwards (hcedwar@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+#include <gtest/gtest.h>
+
+#include <Kokkos_Macros.hpp>
+#ifdef KOKKOS_LAMBDA  /* drop any definition that Kokkos_Macros.hpp supplied */
+#undef KOKKOS_LAMBDA
+#endif
+#define KOKKOS_LAMBDA [=]  /* from here on the macro is a plain by-value capture */
+
+#include <Kokkos_Core.hpp>
+
+#include <impl/Kokkos_ViewTileLeft.hpp>
+#include <TestTile.hpp>
+
+#include <impl/Kokkos_Serial_TaskPolicy.hpp>
+
+//----------------------------------------------------------------------------
+
+#include <TestSharedAlloc.hpp>
+#include <TestViewMapping.hpp>
+
+#include <TestViewImpl.hpp>
+
+#include <TestViewAPI.hpp>
+#include <TestViewOfClass.hpp>
+#include <TestViewSubview.hpp>
+#include <TestAtomic.hpp>
+#include <TestAtomicOperations.hpp>
+#include <TestRange.hpp>
+#include <TestTeam.hpp>
+#include <TestReduce.hpp>
+#include <TestScan.hpp>
+#include <TestAggregate.hpp>
+#include <TestAggregateReduction.hpp>
+#include <TestCompilerMacros.hpp>
+#include <TestTaskPolicy.hpp>
+#include <TestMemoryPool.hpp>
+
+
+#include <TestCXX11.hpp>
+#include <TestCXX11Deduction.hpp>
+#include <TestTeamVector.hpp>
+#include <TestMemorySpaceTracking.hpp>
+#include <TestTemplateMetaFunctions.hpp>
+
+#include <TestPolicyConstruction.hpp>
+
+#include <TestMDRange.hpp>
+
+namespace Test {
+
+// Fixture for every TEST_F( serial , ... ) below. gtest's per-test-case hooks
+// initialize and finalize Kokkos::HostSpace::execution_space.
+class serial : public ::testing::Test {
+protected:
+  static void SetUpTestCase() {
+    Kokkos::HostSpace::execution_space::initialize();
+  }
+  static void TearDownTestCase() {
+    Kokkos::HostSpace::execution_space::finalize();
+  }
+};
+
+TEST_F( serial , md_range ) {  // 2-D and 3-D TestMDRange helpers on Kokkos::Serial
+  TestMDRange_2D< Kokkos::Serial >::test_for2(100,100);  // arguments presumably the per-dimension extents
+
+  TestMDRange_3D< Kokkos::Serial >::test_for3(100,100,100);
+}
+
+TEST_F( serial , impl_shared_alloc ) {  // test_shared_alloc instantiated for < HostSpace , Serial >
+  test_shared_alloc< Kokkos::HostSpace , Kokkos::Serial >();
+}
+
+TEST_F( serial, policy_construction) {  // range-policy and team-policy construction helpers
+  TestRangePolicyConstruction< Kokkos::Serial >();
+  TestTeamPolicyConstruction< Kokkos::Serial >();
+}
+
+TEST_F( serial , impl_view_mapping ) {  // four view-mapping helpers, all instantiated for Kokkos::Serial
+  test_view_mapping< Kokkos::Serial >();
+  test_view_mapping_subview< Kokkos::Serial >();
+  test_view_mapping_operator< Kokkos::Serial >();
+  TestViewMappingAtomic< Kokkos::Serial >::run();
+}
+
+TEST_F( serial, view_impl) {  // runs test_view_impl for Kokkos::Serial
+  test_view_impl< Kokkos::Serial >();
+}
+
+TEST_F( serial, view_api) {  // TestViewAPI with double as its first template argument
+  TestViewAPI< double , Kokkos::Serial >();
+}
+
+TEST_F( serial , view_nested_view )  // helper is named through the global ::Test namespace explicitly
+{
+  ::Test::view_nested_view< Kokkos::Serial >();
+}
+
+TEST_F( serial, view_subview_auto_1d_left ) {  // test_auto_1d once per layout: Left, Right, Stride
+  TestViewSubview::test_auto_1d< Kokkos::LayoutLeft,Kokkos::Serial >();
+}
+
+TEST_F( serial, view_subview_auto_1d_right ) {
+  TestViewSubview::test_auto_1d< Kokkos::LayoutRight,Kokkos::Serial >();
+}
+
+TEST_F( serial, view_subview_auto_1d_stride ) {
+  TestViewSubview::test_auto_1d< Kokkos::LayoutStride,Kokkos::Serial >();
+}
+
+TEST_F( serial, view_subview_assign_strided ) {
+  TestViewSubview::test_1d_strided_assignment< Kokkos::Serial >();
+}
+
+TEST_F( serial, view_subview_left_0 ) {  // test_left_0 .. test_left_3 follow, one TEST_F each
+  TestViewSubview::test_left_0< Kokkos::Serial >();
+}
+
+TEST_F( serial, view_subview_left_1 ) {
+  TestViewSubview::test_left_1< Kokkos::Serial >();
+}
+
+TEST_F( serial, view_subview_left_2 ) {
+  TestViewSubview::test_left_2< Kokkos::Serial >();
+}
+
+TEST_F( serial, view_subview_left_3 ) {
+  TestViewSubview::test_left_3< Kokkos::Serial >();
+}
+
+TEST_F( serial, view_subview_right_0 ) {  // test_right_0, _1 and _3 follow, one TEST_F each
+  TestViewSubview::test_right_0< Kokkos::Serial >();
+}
+
+TEST_F( serial, view_subview_right_1 ) {
+  TestViewSubview::test_right_1< Kokkos::Serial >();
+}
+// NOTE(review): no view_subview_right_2 is registered, unlike the left_* family -- confirm this is intentional.
+TEST_F( serial, view_subview_right_3 ) {
+  TestViewSubview::test_right_3< Kokkos::Serial >();
+}
+
+TEST_F( serial , range_tag )  // TestRange for/reduce/scan under the Static and the Dynamic schedule
+{
+  TestRange< Kokkos::Serial , Kokkos::Schedule<Kokkos::Static> >::test_for(1000);
+  TestRange< Kokkos::Serial , Kokkos::Schedule<Kokkos::Static> >::test_reduce(1000);
+  TestRange< Kokkos::Serial , Kokkos::Schedule<Kokkos::Static> >::test_scan(1000);
+  TestRange< Kokkos::Serial , Kokkos::Schedule<Kokkos::Dynamic> >::test_for(1001);
+  TestRange< Kokkos::Serial , Kokkos::Schedule<Kokkos::Dynamic> >::test_reduce(1001);
+  TestRange< Kokkos::Serial , Kokkos::Schedule<Kokkos::Dynamic> >::test_scan(1001);
+  TestRange< Kokkos::Serial , Kokkos::Schedule<Kokkos::Dynamic> >::test_dynamic_policy(1000);  // run for the Dynamic schedule only
+}
+
+TEST_F( serial , team_tag )  // TestTeamPolicy for/reduce under the Static and the Dynamic schedule
+{
+  TestTeamPolicy< Kokkos::Serial , Kokkos::Schedule<Kokkos::Static> >::test_for(1000);
+  TestTeamPolicy< Kokkos::Serial , Kokkos::Schedule<Kokkos::Static> >::test_reduce(1000);
+  TestTeamPolicy< Kokkos::Serial , Kokkos::Schedule<Kokkos::Dynamic> >::test_for(1000);
+  TestTeamPolicy< Kokkos::Serial , Kokkos::Schedule<Kokkos::Dynamic> >::test_reduce(1000);
+}
+
+TEST_F( serial, long_reduce) {  // TestReduce with long values, argument 1000000
+  TestReduce< long ,   Kokkos::Serial >( 1000000 );
+}
+
+TEST_F( serial, double_reduce) {  // same argument with double values
+  TestReduce< double ,   Kokkos::Serial >( 1000000 );
+}
+
+TEST_F( serial , reducers )  // each scalar type runs the TestReducers suite that applies to it
+{
+  TestReducers<int, Kokkos::Serial>::execute_integer();  // full suite including bitwise and logical reducers
+  TestReducers<size_t, Kokkos::Serial>::execute_integer();
+  TestReducers<double, Kokkos::Serial>::execute_float();  // sum, prod and the min/max family
+  TestReducers<Kokkos::complex<double>, Kokkos::Serial>::execute_basic();  // sum and prod only
+}
+
+TEST_F( serial, long_reduce_dynamic ) {  // TestReduceDynamic with long values, argument 1000000
+  TestReduceDynamic< long ,   Kokkos::Serial >( 1000000 );
+}
+
+TEST_F( serial, double_reduce_dynamic ) {  // same argument with double values
+  TestReduceDynamic< double ,   Kokkos::Serial >( 1000000 );
+}
+
+TEST_F( serial, long_reduce_dynamic_view ) {  // TestReduceDynamicView, long values only
+  TestReduceDynamicView< long ,   Kokkos::Serial >( 1000000 );
+}
+
+TEST_F( serial , scan )
+{
+  TestScan< Kokkos::Serial >::test_range( 1 , 1000 );  // one scan per length 1 .. 999
+  TestScan< Kokkos::Serial >( 10 );  // constructing the temporary is what runs the scan
+  TestScan< Kokkos::Serial >( 10000 );
+}
+
+TEST_F( serial , team_long_reduce) {  // arguments 3 and 100000, each under both schedules
+  TestReduceTeam< long ,   Kokkos::Serial , Kokkos::Schedule<Kokkos::Static> >( 3 );
+  TestReduceTeam< long ,   Kokkos::Serial , Kokkos::Schedule<Kokkos::Dynamic> >( 3 );
+  TestReduceTeam< long ,   Kokkos::Serial , Kokkos::Schedule<Kokkos::Static> >( 100000 );
+  TestReduceTeam< long ,   Kokkos::Serial , Kokkos::Schedule<Kokkos::Dynamic> >( 100000 );
+}
+
+TEST_F( serial , team_double_reduce) {  // same four combinations with double values
+  TestReduceTeam< double ,   Kokkos::Serial , Kokkos::Schedule<Kokkos::Static> >( 3 );
+  TestReduceTeam< double ,   Kokkos::Serial , Kokkos::Schedule<Kokkos::Dynamic> >( 3 );
+  TestReduceTeam< double ,   Kokkos::Serial , Kokkos::Schedule<Kokkos::Static> >( 100000 );
+  TestReduceTeam< double ,   Kokkos::Serial , Kokkos::Schedule<Kokkos::Dynamic> >( 100000 );
+}
+
+TEST_F( serial , team_shared_request) {  // TestSharedTeam under the Static and the Dynamic schedule
+  TestSharedTeam< Kokkos::Serial , Kokkos::Schedule<Kokkos::Static> >();
+  TestSharedTeam< Kokkos::Serial , Kokkos::Schedule<Kokkos::Dynamic> >();
+}
+
+#if defined(KOKKOS_HAVE_CXX11_DISPATCH_LAMBDA)  /* test is compiled only when this macro is defined */
+TEST_F( serial , team_lambda_shared_request) {  // TestLambdaSharedTeam under both schedules
+  TestLambdaSharedTeam< Kokkos::HostSpace, Kokkos::Serial , Kokkos::Schedule<Kokkos::Static> >();
+  TestLambdaSharedTeam< Kokkos::HostSpace, Kokkos::Serial , Kokkos::Schedule<Kokkos::Dynamic> >();
+}
+#endif
+
+TEST_F( serial, shmem_size) {  // runs TestShmemSize for Kokkos::Serial
+  TestShmemSize< Kokkos::Serial >();
+}
+
+TEST_F( serial  , team_scan )  // TestScanTeam with arguments 10 and 10000, each under both schedules
+{
+  TestScanTeam< Kokkos::Serial , Kokkos::Schedule<Kokkos::Static> >( 10 );
+  TestScanTeam< Kokkos::Serial , Kokkos::Schedule<Kokkos::Dynamic> >( 10 );
+  TestScanTeam< Kokkos::Serial , Kokkos::Schedule<Kokkos::Static> >( 10000 );
+  TestScanTeam< Kokkos::Serial , Kokkos::Schedule<Kokkos::Dynamic> >( 10000 );
+}
+
+
+// deep_copy from an int LayoutLeft view into a double LayoutRight view of the
+// same extents must carry every entry across unchanged.
+TEST_F( serial , view_remap )
+{
+  enum { N0 = 3 , N1 = 2 , N2 = 8 , N3 = 9 };
+  typedef Kokkos::View< double*[N1][N2][N3] , Kokkos::LayoutRight , Kokkos::Serial > output_type ;
+  typedef Kokkos::View< int**[N2][N3] , Kokkos::LayoutLeft , Kokkos::Serial > input_type ;
+  typedef Kokkos::View< int*[N0][N2][N3] , Kokkos::LayoutLeft , Kokkos::Serial > diff_type ;
+
+  output_type output( "output" , N0 );
+  input_type  input ( "input" , N0 , N1 );
+  diff_type   diff  ( "diff" , N0 );
+
+  // Fill the source with 1, 2, 3, ... with i0 varying fastest.
+  int value = 0 ;
+  for ( size_t i3 = 0 ; i3 < N3 ; ++i3 ) {
+    for ( size_t i2 = 0 ; i2 < N2 ; ++i2 ) {
+      for ( size_t i1 = 0 ; i1 < N1 ; ++i1 ) {
+        for ( size_t i0 = 0 ; i0 < N0 ; ++i0 ) {
+          input(i0,i1,i2,i3) = ++value ;
+        }
+      }
+    }
+  }
+  // Copying `input` into `diff` (second extent N0, not N1) is left out: recorded as throwing on the incompatible shape.
+  Kokkos::deep_copy( output , input );
+
+  // Walk the destination in the same index order and expect the same sequence.
+  value = 0 ;
+  for ( size_t i3 = 0 ; i3 < N3 ; ++i3 ) {
+    for ( size_t i2 = 0 ; i2 < N2 ; ++i2 ) {
+      for ( size_t i1 = 0 ; i1 < N1 ; ++i1 ) {
+        for ( size_t i0 = 0 ; i0 < N0 ; ++i0 ) {
+          ++value ;
+          ASSERT_EQ( value , ((int) output(i0,i1,i2,i3) ) );
+        }
+      }
+    }
+  }
+}
+
+//----------------------------------------------------------------------------
+
+TEST_F( serial , view_aggregate )  // aggregate-view helper followed by its reduction counterpart
+{
+  TestViewAggregate< Kokkos::Serial >();
+  TestViewAggregateReduction< Kokkos::Serial >();
+}
+
+//----------------------------------------------------------------------------
+
+TEST_F( serial , atomics )  // TestAtomic::Loop per value type; second argument runs 1..3 (presumably a test selector)
+{
+  const int loop_count = 1e6 ;  // double literal converted to int 1000000
+
+  ASSERT_TRUE( ( TestAtomic::Loop<int,Kokkos::Serial>(loop_count,1) ) );
+  ASSERT_TRUE( ( TestAtomic::Loop<int,Kokkos::Serial>(loop_count,2) ) );
+  ASSERT_TRUE( ( TestAtomic::Loop<int,Kokkos::Serial>(loop_count,3) ) );
+
+  ASSERT_TRUE( ( TestAtomic::Loop<unsigned int,Kokkos::Serial>(loop_count,1) ) );
+  ASSERT_TRUE( ( TestAtomic::Loop<unsigned int,Kokkos::Serial>(loop_count,2) ) );
+  ASSERT_TRUE( ( TestAtomic::Loop<unsigned int,Kokkos::Serial>(loop_count,3) ) );
+
+  ASSERT_TRUE( ( TestAtomic::Loop<long int,Kokkos::Serial>(loop_count,1) ) );
+  ASSERT_TRUE( ( TestAtomic::Loop<long int,Kokkos::Serial>(loop_count,2) ) );
+  ASSERT_TRUE( ( TestAtomic::Loop<long int,Kokkos::Serial>(loop_count,3) ) );
+
+  ASSERT_TRUE( ( TestAtomic::Loop<unsigned long int,Kokkos::Serial>(loop_count,1) ) );
+  ASSERT_TRUE( ( TestAtomic::Loop<unsigned long int,Kokkos::Serial>(loop_count,2) ) );
+  ASSERT_TRUE( ( TestAtomic::Loop<unsigned long int,Kokkos::Serial>(loop_count,3) ) );
+
+  ASSERT_TRUE( ( TestAtomic::Loop<long long int,Kokkos::Serial>(loop_count,1) ) );
+  ASSERT_TRUE( ( TestAtomic::Loop<long long int,Kokkos::Serial>(loop_count,2) ) );
+  ASSERT_TRUE( ( TestAtomic::Loop<long long int,Kokkos::Serial>(loop_count,3) ) );
+
+  ASSERT_TRUE( ( TestAtomic::Loop<double,Kokkos::Serial>(loop_count,1) ) );
+  ASSERT_TRUE( ( TestAtomic::Loop<double,Kokkos::Serial>(loop_count,2) ) );
+  ASSERT_TRUE( ( TestAtomic::Loop<double,Kokkos::Serial>(loop_count,3) ) );
+
+  ASSERT_TRUE( ( TestAtomic::Loop<float,Kokkos::Serial>(100,1) ) );  // float, complex and SuperScalar use 100 instead of loop_count
+  ASSERT_TRUE( ( TestAtomic::Loop<float,Kokkos::Serial>(100,2) ) );
+  ASSERT_TRUE( ( TestAtomic::Loop<float,Kokkos::Serial>(100,3) ) );
+
+  ASSERT_TRUE( ( TestAtomic::Loop<Kokkos::complex<double> ,Kokkos::Serial>(100,1) ) );
+  ASSERT_TRUE( ( TestAtomic::Loop<Kokkos::complex<double> ,Kokkos::Serial>(100,2) ) );
+  ASSERT_TRUE( ( TestAtomic::Loop<Kokkos::complex<double> ,Kokkos::Serial>(100,3) ) );
+
+  ASSERT_TRUE( ( TestAtomic::Loop<TestAtomic::SuperScalar<4> ,Kokkos::Serial>(100,1) ) );
+  ASSERT_TRUE( ( TestAtomic::Loop<TestAtomic::SuperScalar<4> ,Kokkos::Serial>(100,2) ) );
+  ASSERT_TRUE( ( TestAtomic::Loop<TestAtomic::SuperScalar<4> ,Kokkos::Serial>(100,3) ) );
+}
+
+TEST_F( serial , atomic_operations )  // integral types run third argument 1..9, floating types 1..4 (presumably an operation selector)
+{
+  const int start = 1; //Avoid zero for division
+  const int end = 11;
+  for (int i = start; i < end; ++i)  // i = 1..10, so the second argument end-i runs from 10 down to 1
+  {
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<int,Kokkos::Serial>(start, end-i, 1 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<int,Kokkos::Serial>(start, end-i, 2 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<int,Kokkos::Serial>(start, end-i, 3 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<int,Kokkos::Serial>(start, end-i, 4 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<int,Kokkos::Serial>(start, end-i, 5 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<int,Kokkos::Serial>(start, end-i, 6 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<int,Kokkos::Serial>(start, end-i, 7 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<int,Kokkos::Serial>(start, end-i, 8 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<int,Kokkos::Serial>(start, end-i, 9 ) ) );
+
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<unsigned int,Kokkos::Serial>(start, end-i, 1 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<unsigned int,Kokkos::Serial>(start, end-i, 2 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<unsigned int,Kokkos::Serial>(start, end-i, 3 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<unsigned int,Kokkos::Serial>(start, end-i, 4 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<unsigned int,Kokkos::Serial>(start, end-i, 5 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<unsigned int,Kokkos::Serial>(start, end-i, 6 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<unsigned int,Kokkos::Serial>(start, end-i, 7 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<unsigned int,Kokkos::Serial>(start, end-i, 8 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<unsigned int,Kokkos::Serial>(start, end-i, 9 ) ) );
+
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<long int,Kokkos::Serial>(start, end-i, 1 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<long int,Kokkos::Serial>(start, end-i, 2 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<long int,Kokkos::Serial>(start, end-i, 3 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<long int,Kokkos::Serial>(start, end-i, 4 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<long int,Kokkos::Serial>(start, end-i, 5 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<long int,Kokkos::Serial>(start, end-i, 6 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<long int,Kokkos::Serial>(start, end-i, 7 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<long int,Kokkos::Serial>(start, end-i, 8 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<long int,Kokkos::Serial>(start, end-i, 9 ) ) );
+
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<unsigned long int,Kokkos::Serial>(start, end-i, 1 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<unsigned long int,Kokkos::Serial>(start, end-i, 2 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<unsigned long int,Kokkos::Serial>(start, end-i, 3 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<unsigned long int,Kokkos::Serial>(start, end-i, 4 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<unsigned long int,Kokkos::Serial>(start, end-i, 5 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<unsigned long int,Kokkos::Serial>(start, end-i, 6 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<unsigned long int,Kokkos::Serial>(start, end-i, 7 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<unsigned long int,Kokkos::Serial>(start, end-i, 8 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<unsigned long int,Kokkos::Serial>(start, end-i, 9 ) ) );
+
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<long long int,Kokkos::Serial>(start, end-i, 1 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<long long int,Kokkos::Serial>(start, end-i, 2 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<long long int,Kokkos::Serial>(start, end-i, 3 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<long long int,Kokkos::Serial>(start, end-i, 4 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<long long int,Kokkos::Serial>(start, end-i, 5 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<long long int,Kokkos::Serial>(start, end-i, 6 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<long long int,Kokkos::Serial>(start, end-i, 7 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<long long int,Kokkos::Serial>(start, end-i, 8 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<long long int,Kokkos::Serial>(start, end-i, 9 ) ) );
+
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestNonIntegralType<double,Kokkos::Serial>(start, end-i, 1 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestNonIntegralType<double,Kokkos::Serial>(start, end-i, 2 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestNonIntegralType<double,Kokkos::Serial>(start, end-i, 3 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestNonIntegralType<double,Kokkos::Serial>(start, end-i, 4 ) ) );
+
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestNonIntegralType<float,Kokkos::Serial>(start, end-i, 1 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestNonIntegralType<float,Kokkos::Serial>(start, end-i, 2 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestNonIntegralType<float,Kokkos::Serial>(start, end-i, 3 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestNonIntegralType<float,Kokkos::Serial>(start, end-i, 4 ) ) );
+  }
+
+}
+//----------------------------------------------------------------------------
+
+TEST_F( serial, tile_layout )
+{
+  typedef Kokkos::Serial Device ; // every instantiation below targets the Serial backend
+  TestTile::test< Device , 1 , 1 >( 1 , 1 );
+  TestTile::test< Device , 1 , 1 >( 2 , 3 );
+  TestTile::test< Device , 1 , 1 >( 9 , 10 );
+  // < 2 , 2 > cases, followed by the two non-square instantiations
+  TestTile::test< Device , 2 , 2 >( 1 , 1 );
+  TestTile::test< Device , 2 , 2 >( 2 , 3 );
+  TestTile::test< Device , 2 , 2 >( 4 , 4 );
+  TestTile::test< Device , 2 , 2 >( 9 , 9 );
+  TestTile::test< Device , 2 , 4 >( 9 , 9 );
+  TestTile::test< Device , 4 , 2 >( 9 , 9 );
+  // < 4 , 4 > cases
+  TestTile::test< Device , 4 , 4 >( 1 , 1 );
+  TestTile::test< Device , 4 , 4 >( 4 , 4 );
+  TestTile::test< Device , 4 , 4 >( 9 , 9 );
+  TestTile::test< Device , 4 , 4 >( 9 , 11 );
+  // < 8 , 8 > cases
+  TestTile::test< Device , 8 , 8 >( 1 , 1 );
+  TestTile::test< Device , 8 , 8 >( 4 , 4 );
+  TestTile::test< Device , 8 , 8 >( 9 , 9 );
+  TestTile::test< Device , 8 , 8 >( 9 , 11 );
+}
+
+//----------------------------------------------------------------------------
+
+TEST_F( serial , compiler_macros ) // compiler-macro check instantiated for Kokkos::Serial
+{
+  ASSERT_TRUE( ( TestCompilerMacros::Test< Kokkos::Serial >() ) ); // Test() must evaluate to true
+}
+
+//----------------------------------------------------------------------------
+
+TEST_F( serial , memory_space ) // TestMemorySpace instantiated for Kokkos::Serial
+{
+  TestMemorySpace< Kokkos::Serial >(); // nothing is asserted at this call site
+}
+
+TEST_F( serial , memory_pool )
+{
+  // A failing first check returns from the test before the remaining two calls run.
+  ASSERT_TRUE( ( TestMemoryPool::test_mempool< Kokkos::Serial >( 128, 128000000 ) ) );
+
+  TestMemoryPool::test_mempool2< Kokkos::Serial >( 64, 4, 1000000, 2000000 );
+
+  TestMemoryPool::test_memory_exhaustion< Kokkos::Serial >();
+}
+
+//----------------------------------------------------------------------------
+
+#if defined( KOKKOS_ENABLE_TASKPOLICY )
+
+TEST_F( serial , task_fib )
+{
+  // Runs TestFib once for every argument from 0 through 24, in ascending order.
+  for ( int n = 0 ; n != 25 ; ++n )
+    TestTaskPolicy::TestFib< Kokkos::Serial >::run( n );
+}
+
+TEST_F( serial , task_depend )
+{
+  // Runs TestTaskDependence once for every argument from 0 through 24, in ascending order.
+  for ( int n = 0 ; n != 25 ; ++n )
+    TestTaskPolicy::TestTaskDependence< Kokkos::Serial >::run( n );
+}
+
+TEST_F( serial , task_team ) // single TestTaskTeam run with argument 1000
+{
+  TestTaskPolicy::TestTaskTeam< Kokkos::Serial >::run(1000);
+  //TestTaskPolicy::TestTaskTeamValue< Kokkos::Serial >::run(1000); // TODO: currently disabled - put back after testing
+}
+
+TEST_F( serial , old_task_policy )
+{
+  TestTaskPolicy::test_task_dep< Kokkos::Serial >( 10 );
+
+  // Only the reduced-size runs are active. Disabled variants, kept for reference:
+  // test_norm2( 1000 ), test_fib for 0..29 and test_fib2 for 0..39.
+  for ( long n = 0 ; n < 20 ; ++n ) { TestTaskPolicy::test_fib< Kokkos::Serial >( n ); }
+  for ( long n = 0 ; n < 25 ; ++n ) { TestTaskPolicy::test_fib2< Kokkos::Serial >( n ); }
+}
+
+TEST_F( serial , old_task_team ) // test_task_team for Kokkos::Serial; only built when KOKKOS_ENABLE_TASKPOLICY is defined
+{
+  TestTaskPolicy::test_task_team< Kokkos::Serial >(1000); // nothing is asserted at this call site
+}
+
+#endif /* #if defined( KOKKOS_ENABLE_TASKPOLICY ) */
+
+//----------------------------------------------------------------------------
+
+TEST_F( serial , template_meta_functions ) // TestTemplateMetaFunctions instantiated with < int , Kokkos::Serial >
+{
+  TestTemplateMetaFunctions<int, Kokkos::Serial >(); // nothing is asserted at this call site
+}
+
+//----------------------------------------------------------------------------
+
+#if defined( KOKKOS_HAVE_DEFAULT_DEVICE_TYPE_SERIAL )
+TEST_F( serial , cxx11 )
+{
+  // The checks only run when Serial is the default execution space;
+  // otherwise the test body does nothing.
+  if ( ! Kokkos::Impl::is_same< Kokkos::DefaultExecutionSpace , Kokkos::Serial >::value ) return ;
+  for ( int test_id = 1 ; test_id <= 4 ; ++test_id ) {
+    ASSERT_TRUE( ( TestCXX11::Test< Kokkos::Serial >( test_id ) ) );
+  }
+}
+#endif
+
+TEST_F( serial , reduction_deduction ) // test_reduction_deduction instantiated for Kokkos::Serial
+{
+  TestCXX11::test_reduction_deduction< Kokkos::Serial >(); // nothing is asserted at this call site
+}
+
+TEST_F( serial , team_vector )
+{
+  // Calls TestTeamVector::Test for every test index from 0 through 10,
+  // in ascending order. ASSERT_TRUE returns from the test body on the
+  // first failing index, so later indices are not run after a failure,
+  // exactly as with one ASSERT_TRUE statement per index.
+
+  const int first_case = 0 ;
+  const int last_case  = 10 ;
+
+  for ( int test_case = first_case ; test_case <= last_case ; ++test_case ) {
+    ASSERT_TRUE( ( TestTeamVector::Test< Kokkos::Serial >( test_case ) ) );
+  }
+}
+
+} // namespace test
+
diff --git a/lib/kokkos/core/unit_test/TestSharedAlloc.hpp b/lib/kokkos/core/unit_test/TestSharedAlloc.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..61166888142e7f666b303dc1c837daa34c07a00c
--- /dev/null
+++ b/lib/kokkos/core/unit_test/TestSharedAlloc.hpp
@@ -0,0 +1,215 @@
+/*
+//@HEADER
+// ************************************************************************
+// 
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+// 
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+// 
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact  H. Carter Edwards (hcedwar@sandia.gov)
+// 
+// ************************************************************************
+//@HEADER
+*/
+
+#include <gtest/gtest.h>
+
+#include <stdexcept>
+#include <sstream>
+#include <iostream>
+
+#include <Kokkos_Core.hpp>
+
+/*--------------------------------------------------------------------------*/
+
+namespace Test {
+
+struct SharedAllocDestroy {
+  // Counter that destroy_shared_allocation() increments.
+  volatile int * count ;
+
+  // Defaulted constructor: does not explicitly initialize 'count'.
+  SharedAllocDestroy() = default ;
+  SharedAllocDestroy( int * counter_ptr ) : count( counter_ptr ) {}
+
+  // Atomically adds one to the counter that 'count' points to.
+  void destroy_shared_allocation() {
+    Kokkos::atomic_fetch_add( count , 1 );
+  }
+};
+
+template< class MemorySpace , class ExecutionSpace >
+void test_shared_alloc()
+{
+#if defined( KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST )
+  // Checks use counting of shared allocation records and trackers; the whole body is compiled out when the macro above is undefined.
+  typedef const Kokkos::Experimental::Impl::SharedAllocationHeader   Header ;
+  typedef Kokkos::Experimental::Impl::SharedAllocationTracker  Tracker ;
+  typedef Kokkos::Experimental::Impl::SharedAllocationRecord< void , void >                       RecordBase ;
+  typedef Kokkos::Experimental::Impl::SharedAllocationRecord< MemorySpace , void >                RecordMemS ;
+  typedef Kokkos::Experimental::Impl::SharedAllocationRecord< MemorySpace , SharedAllocDestroy >  RecordFull ;
+
+  static_assert( sizeof(Tracker) == sizeof(int*), "SharedAllocationTracker has wrong size!" );
+
+  MemorySpace s ;
+
+  const size_t N = 1200 ;
+  const size_t size = 8 ;
+
+  RecordMemS * rarray[ N ];
+  Header     * harray[ N ];
+
+  RecordMemS ** const r = rarray ;
+  Header     ** const h = harray ;
+
+  Kokkos::RangePolicy< ExecutionSpace > range(0,N);
+  // Part 1: records without a destroy functor. Record i is incremented ( i / 10 ) + 1 times, then decremented until decrement() returns null.
+  //----------------------------------------
+  {
+  // Since always executed on host space, leave [=]
+    Kokkos::parallel_for( range , [=]( size_t i ){
+      char name[64] ;
+      snprintf(name,sizeof(name),"test_%.2d",int(i)); // write is bounded by the buffer size
+
+      r[i] = RecordMemS::allocate( s , name , size * ( i + 1 ) );
+      h[i] = Header::get_header( r[i]->data() );
+
+      ASSERT_EQ( r[i]->use_count() , 0 );
+
+      for ( size_t j = 0 ; j < ( i / 10 ) + 1 ; ++j ) RecordBase::increment( r[i] );
+
+      ASSERT_EQ( r[i]->use_count() , int( i / 10 ) + 1 ); // cast keeps both sides signed, like the literal comparison above
+      ASSERT_EQ( r[i] , RecordMemS::get_record( r[i]->data() ) );
+    });
+
+    // Sanity check for the whole set of allocation records to which this record belongs.
+    RecordBase::is_sane( r[0] );
+    // RecordMemS::print_records( std::cout , s , true );
+
+    Kokkos::parallel_for( range , [=]( size_t i ){
+      while ( 0 != ( r[i] = static_cast< RecordMemS *>( RecordBase::decrement( r[i] ) ) ) ) {
+        if ( r[i]->use_count() == 1 ) RecordBase::is_sane( r[i] );
+      }
+    });
+  }
+  //----------------------------------------
+  {
+    int destroy_count = 0 ;
+    SharedAllocDestroy counter( & destroy_count );
+    // Part 2: same pattern with a SharedAllocDestroy functor attached; afterwards destroy_count must equal N.
+    Kokkos::parallel_for( range , [=]( size_t i ){
+      char name[64] ;
+      snprintf(name,sizeof(name),"test_%.2d",int(i)); // write is bounded by the buffer size
+
+      RecordFull * rec = RecordFull::allocate( s , name , size * ( i + 1 ) );
+
+      rec->m_destroy = counter ;
+
+      r[i] = rec ;
+      h[i] = Header::get_header( r[i]->data() );
+
+      ASSERT_EQ( r[i]->use_count() , 0 );
+
+      for ( size_t j = 0 ; j < ( i / 10 ) + 1 ; ++j ) RecordBase::increment( r[i] );
+
+      ASSERT_EQ( r[i]->use_count() , int( i / 10 ) + 1 ); // cast keeps both sides signed, like the literal comparison above
+      ASSERT_EQ( r[i] , RecordMemS::get_record( r[i]->data() ) );
+    });
+
+    RecordBase::is_sane( r[0] );
+
+    Kokkos::parallel_for( range , [=]( size_t i ){
+      while ( 0 != ( r[i] = static_cast< RecordMemS *>( RecordBase::decrement( r[i] ) ) ) ) {
+        if ( r[i]->use_count() == 1 ) RecordBase::is_sane( r[i] );
+      }
+    });
+
+    ASSERT_EQ( destroy_count , int(N) );
+  }
+
+  //----------------------------------------
+  {
+    int destroy_count = 0 ;
+    // Part 3: a Tracker raises the use count while it is alive; destroy_count must be 1 after the inner scope ends.
+    {
+      RecordFull * rec = RecordFull::allocate( s , "test" , size );
+
+      // ... Construction of the allocated { rec->data() , rec->size() }
+
+      // Copy destruction function object into the allocation record
+      rec->m_destroy = SharedAllocDestroy( & destroy_count );
+
+      ASSERT_EQ( rec->use_count() , 0 );
+
+      // Start tracking, increments the use count from 0 to 1
+      Tracker track ;
+
+      track.assign_allocated_record_to_uninitialized( rec );
+
+      ASSERT_EQ( rec->use_count() , 1 );
+      ASSERT_EQ( track.use_count() , 1 );
+
+      // Verify construction / destruction increment
+      for ( size_t i = 0 ; i < N ; ++i ) {
+        ASSERT_EQ( rec->use_count() , 1 );
+        {
+          Tracker local_tracker ;
+          local_tracker.assign_allocated_record_to_uninitialized( rec );
+          ASSERT_EQ( rec->use_count() , 2 );
+          ASSERT_EQ( local_tracker.use_count() , 2 );
+        }
+        ASSERT_EQ( rec->use_count() , 1 );
+        ASSERT_EQ( track.use_count() , 1 );
+      }
+
+      Kokkos::parallel_for( range , [=]( size_t i ){
+        Tracker local_tracker ;
+        local_tracker.assign_allocated_record_to_uninitialized( rec );
+        ASSERT_GT( rec->use_count() , 1 );
+      });
+
+      ASSERT_EQ( rec->use_count() , 1 );
+      ASSERT_EQ( track.use_count() , 1 );
+
+      // Destruction of 'track' object deallocates the 'rec' and invokes the destroy function object.
+    }
+
+    ASSERT_EQ( destroy_count , 1 );
+  }
+
+#endif /* #if defined( KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST ) */
+
+}
+
+
+}
+
diff --git a/lib/kokkos/core/unit_test/TestSynchronic.cpp b/lib/kokkos/core/unit_test/TestSynchronic.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..9121dc15a17ecead1895ce1df660c1d25a2deda2
--- /dev/null
+++ b/lib/kokkos/core/unit_test/TestSynchronic.cpp
@@ -0,0 +1,448 @@
+/*
+
+Copyright (c) 2014, NVIDIA Corporation
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without modification,
+are permitted provided that the following conditions are met:
+
+1. Redistributions of source code must retain the above copyright notice, this
+list of conditions and the following disclaimer.
+
+2. Redistributions in binary form must reproduce the above copyright notice,
+this list of conditions and the following disclaimer in the documentation
+and/or other materials provided with the distribution.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT,
+INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE
+OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED
+OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+//#undef _WIN32_WINNT
+//#define _WIN32_WINNT 0x0602
+
+#if defined(__powerpc__) || defined(__ppc__) || defined(__PPC__) || defined(__APPLE__)
+
+// Skip for now
+
+#else
+
+#include <gtest/gtest.h>
+
+#ifdef USEOMP
+#include <omp.h>
+#endif
+
+#include <iostream>
+#include <sstream>
+#include <algorithm>
+#include <string>
+#include <vector>
+#include <map>
+#include <cstring>
+#include <ctime>
+
+//#include <details/config>
+//#undef __SYNCHRONIC_COMPATIBLE
+
+#include <impl/Kokkos_Synchronic.hpp>
+#include <impl/Kokkos_Synchronic_n3998.hpp>
+
+#include "TestSynchronic.hpp"
+
+// Uncomment to allow test to dump output
+//#define VERBOSE_TEST
+
+namespace Test {
+
+unsigned next_table[] =   // next_table[x-1] is the thread count tried after x; the value 0 ends the sweep
+    {
+        0, 1, 2, 3,         //0-3
+        4, 4, 6, 6,         //4-7
+        8, 8, 8, 8,         //8-11
+        12, 12, 12, 12,     //12-15
+        16, 16, 16, 16,     //16-19
+        16, 16, 16, 16,     //20-23
+        24, 24, 24, 24,     //24-27
+        24, 24, 24, 24,     //28-31
+        32, 32, 32, 32,     //32-35
+        32, 32, 32, 32,     //36-39
+        40, 40, 40, 40,     //40-43
+        40, 40, 40, 40,     //44-47
+        48, 48, 48, 48,     //48-51
+        48, 48, 48, 48,     //52-55
+        56, 56, 56, 56,     //56-59
+        56, 56, 56, 56,     //60-63
+    };
+//FOR_GAUNTLET(x) sweeps x downward from min(8 * hardware_concurrency, table size) to 1, i.e. the system is oversubscribed up to 8x by default;
+//hardware_concurrency() is allowed to return 0 when it cannot be determined, so it is clamped to 1 to keep the sweep from being empty
+#define FOR_GAUNTLET(x) for(unsigned x = (std::min)((std::max)(std::thread::hardware_concurrency(),1u)*8,unsigned(sizeof(next_table)/sizeof(unsigned))); x; x = next_table[x-1])
+
+//define USEOMP to run the benchmark threads through OpenMP and use "#pragma omp barrier" instead of the n3998 std::barrier
+//#define USEOMP
+//NOTE(review): <omp.h> is only included by the "#ifdef USEOMP" near the top of this file, so USEOMP must be defined before that point
+#if defined(__SYNCHRONIC_COMPATIBLE)
+    #define PREFIX "futex-"
+#else
+    #define PREFIX "backoff-"
+#endif
+//PREFIX is prepended to the result names built in TEST( synchronic, main )
+//this test uses a custom Mersenne twister to eliminate implementation variation
+MersenneTwister mt;
+
+int dummya = 1, dummyb =1; //NOTE(review): no use of these two is visible in this part of the file
+
+int dummy1 = 1; //dummy1, dummy2 and dummy3 are the globals read and written by time_nil()
+std::atomic<int> dummy2(1);
+std::atomic<int> dummy3(1);
+
+double time_item(int const count = (int)1E8)  {
+    // Returns the clock() seconds spent on 'count' calls of mt.integer(), divided by 'count'.
+    clock_t const t_begin = clock();
+
+    for(int left = count; left > 0; --left) {
+        mt.integer();
+    }
+
+    // Tick difference first, then scaled to seconds, then averaged.
+    double const total_seconds = (clock() - t_begin) / double(CLOCKS_PER_SEC);
+    return total_seconds / count;
+}
+double time_nil(int const count = (int)1E08)  {
+    // Times the bookkeeping loop below and returns the clock() seconds divided by 'count'.
+    clock_t const t_begin = clock();
+
+    dummy3 = count;
+    for(int pass = 0; pass < (int)1E6; ++pass) {
+        if(!dummy1)
+            continue;           // nothing to do while dummy1 is zero
+
+        int const limit = dummy3;
+        for (int unit = 1; unit < limit; unit++)
+            dummy1 &= unit;     // one unit of work
+        dummy2.fetch_add(dummy1,std::memory_order_relaxed);
+    }
+
+    clock_t const t_end = clock();
+    double const total_seconds = (t_end - t_begin) / double(CLOCKS_PER_SEC);
+
+    return total_seconds / count;
+}
+
+
+template <class mutex_type>
+void testmutex_inner(mutex_type& m, std::atomic<int>& t,std::atomic<int>& wc,std::atomic<int>& wnc, int const num_iterations,
+                     int const num_items_critical, int const num_items_noncritical, MersenneTwister& mtc, MersenneTwister& mtnc, bool skip) {
+    // Per-thread benchmark body: every iteration does unlocked work, adds one to 't' and,
+    // unless 'skip' is set, locks 'm' and does the critical work while holding it.
+    for(int iter = 0; iter < num_iterations; ++iter) {
+        if(num_items_noncritical) {
+            // Work without the lock; the inner loop starts at 1 but the full count is recorded.
+            int const units = num_items_noncritical;
+            for (int u = 1; u < units; u++)
+                mtnc.integer();
+            wnc.fetch_add(units,std::memory_order_relaxed);
+        }
+
+        t.fetch_add(1,std::memory_order_relaxed);
+
+        if(skip)
+            continue;           // skip requested: the mutex is never locked
+
+        std::unique_lock<mutex_type> guard(m);
+        if(num_items_critical) {
+            int const units = num_items_critical;
+            for (int u = 1; u < units; u++)
+                mtc.integer();  // one unit of work while holding the lock
+            wc.fetch_add(units,std::memory_order_relaxed);
+        }
+    }
+}
+template <class mutex_type>
+void testmutex_outer(std::map<std::string,std::vector<double>>& results, std::string const& name, double critical_fraction, double critical_duration) {
+    // Benchmarks mutex_type for every FOR_GAUNTLET thread count and appends one per-iteration time per count to results["<name> (f=..,d=..)"].
+    std::ostringstream truename;
+    truename << name << " (f=" << critical_fraction << ",d=" << critical_duration << ")";
+
+    std::vector<double>& data = results[truename.str()];
+
+    double const workItemTime = time_item() ,
+                 nilTime = time_nil();
+    // Critical work items: 0 when no duration is requested, otherwise at least 100 * nilTime worth of items; non-critical items follow from critical_fraction.
+    int const num_items_critical = (critical_duration <= 0 ? 0 : (std::max)( int(critical_duration / workItemTime + 0.5), int(100 * nilTime / workItemTime + 0.5))),
+              num_items_noncritical = (num_items_critical <= 0 ? 0 : int( ( 1 - critical_fraction ) * num_items_critical / critical_fraction + 0.5 ));
+
+    FOR_GAUNTLET(num_threads) {
+        // NOTE(review): the two __SYNCHRONIC_JUST_YIELD branches below are textually identical.
+        //Kokkos::Impl::portable_sleep(std::chrono::microseconds(2000000));
+        // Iteration count: scaled by the work per iteration when there is any, otherwise by the square of the thread count.
+        int const num_iterations = (num_items_critical + num_items_noncritical != 0) ?
+#ifdef __SYNCHRONIC_JUST_YIELD
+                                        int( 1 / ( 8 * workItemTime ) / (num_items_critical + num_items_noncritical) / num_threads + 0.5 ) :
+#else
+                                        int( 1 / ( 8 * workItemTime ) / (num_items_critical + num_items_noncritical) / num_threads + 0.5 ) :
+#endif
+#ifdef WIN32
+                                        int( 1 / workItemTime / (20 * num_threads * num_threads) );
+#else
+                                        int( 1 / workItemTime / (200 * num_threads * num_threads) );
+#endif
+
+#ifdef VERBOSE_TEST
+        std::cerr << "running " << truename.str() << " #" << num_threads << ", " << num_iterations << " * " << num_items_noncritical << "\n" << std::flush;
+#endif
+
+        // Index 0 belongs to the pass run with skip=true (mutex never locked), index 1 to the pass that locks the mutex.
+        std::atomic<int> t[2], wc[2], wnc[2];
+
+        clock_t start[2], end[2];
+        for(int pass = 0; pass < 2; ++pass) {
+
+            t[pass] = 0;
+            wc[pass] = 0;
+            wnc[pass] = 0;
+            // NOTE(review): srand() seeds rand(); whether MersenneTwister's default constructor uses rand() is not visible here.
+            srand(num_threads);
+            std::vector<MersenneTwister> randomsnc(num_threads),
+                                         randomsc(num_threads);
+
+            mutex_type m;
+
+            start[pass] = clock();
+#ifdef USEOMP
+            omp_set_num_threads(num_threads);
+            std::atomic<int> _j(0);
+            #pragma omp parallel
+            {
+                int const j = _j.fetch_add(1,std::memory_order_relaxed);
+                testmutex_inner(m, t[pass], wc[pass], wnc[pass], num_iterations, num_items_critical, num_items_noncritical, randomsc[j], randomsnc[j], pass==0);
+                num_threads = omp_get_num_threads();
+            }
+#else
+            std::vector<std::thread*> threads(num_threads);
+            for(unsigned j = 0; j < num_threads; ++j)
+                threads[j] = new std::thread([&,j](){
+                        testmutex_inner(m, t[pass], wc[pass], wnc[pass], num_iterations, num_items_critical, num_items_noncritical, randomsc[j], randomsnc[j], pass==0);
+                    }
+                );
+            for(unsigned j = 0; j < num_threads; ++j) {
+                threads[j]->join();
+                delete threads[j];
+            }
+#endif
+            end[pass] = clock();
+        }
+        if(t[0] != t[1]) throw std::string("mismatched iteration counts");
+        if(wnc[0] != wnc[1]) throw std::string("mismatched work item counts");
+        // Reported value: (locking pass - skip pass - estimated critical-section work) per iteration.
+        double elapsed_seconds_0 = (end[0] - start[0]) / double(CLOCKS_PER_SEC),
+               elapsed_seconds_1 = (end[1] - start[1]) / double(CLOCKS_PER_SEC);
+        double time = (elapsed_seconds_1 - elapsed_seconds_0 - wc[1]*workItemTime) / num_iterations;
+
+        data.push_back(time);
+#ifdef VERBOSE_TEST
+        std::cerr << truename.str() << " : " << num_threads << "," << elapsed_seconds_1 / num_iterations << " - " << elapsed_seconds_0 / num_iterations << " - " << wc[1]*workItemTime/num_iterations << " = " << time << "                                                 \n";
+#endif
+    }
+}
+
+template <class barrier_type>
+void testbarrier_inner(barrier_type& b, int const num_threads, int const j, std::atomic<int>& t,std::atomic<int>& w,
+                       int const num_iterations_odd, int const num_iterations_even,
+                       int const num_items_noncritical, MersenneTwister& arg_mt, bool skip) {
+    // Per-thread body of the barrier benchmark. A thread whose index 'j' is even runs
+    // 'num_iterations_odd' phases, any other thread runs 'num_iterations_even'.
+    int const my_phases  = (~j & 0x1) ? num_iterations_odd : num_iterations_even;
+    int const max_phases = (std::max)(num_iterations_even,num_iterations_odd);
+    for(int phase = 0; phase < max_phases; ++phase) {
+        if(phase >= my_phases) {
+            if(!skip)
+                b.arrive_and_drop();
+            break;
+        }
+
+        if(num_items_noncritical) {
+            int const units = (int) (arg_mt.poissonInterval((float)num_items_noncritical) + 0.5f);
+            for (int u = 1; u < units; u++)
+                arg_mt.integer();       // one unit of work outside the barrier
+            w.fetch_add(units,std::memory_order_relaxed);
+        }
+        t.fetch_add(1,std::memory_order_relaxed);
+        if(skip)
+            continue;
+        // Value 't' must have reached once every thread has completed this phase.
+        int const expected = (std::min)(phase+1,num_iterations_odd)*((num_threads>>1)+(num_threads&1)) + (std::min)(phase+1,num_iterations_even)*(num_threads>>1);
+        if(t.load(std::memory_order_relaxed) > expected) {
+            std::cerr << "FAILURE: some threads have run ahead of the barrier (" << t.load(std::memory_order_relaxed) << ">" <<  expected << ").\n";
+            EXPECT_TRUE(false);
+        }
+#ifdef USEOMP
+        #pragma omp barrier
+#else
+        b.arrive_and_wait();
+#endif
+        if(t.load(std::memory_order_relaxed) < expected) {
+            std::cerr << "FAILURE: some threads have fallen behind the barrier (" << t.load(std::memory_order_relaxed) << "<" << expected << ").\n";
+            EXPECT_TRUE(false);
+        }
+    }
+}
+template <class barrier_type>
+void testbarrier_outer(std::map<std::string,std::vector<double>>& results, std::string const& name, double barrier_frequency, double phase_duration, bool randomIterations = false) {
+    // Benchmarks barrier_type for every FOR_GAUNTLET thread count and appends one per-phase time per count to results[name].
+    std::vector<double>& data = results[name];
+
+    double const workItemTime = time_item();
+    int const num_items_noncritical = int( phase_duration / workItemTime + 0.5 );
+
+    FOR_GAUNTLET(num_threads) {
+        // The phase count is barrier_frequency truncated to an int.
+        int const num_iterations = int( barrier_frequency );
+#ifdef VERBOSE_TEST
+        std::cerr << "running " << name << " #" << num_threads << ", " << num_iterations << " * " << num_items_noncritical << "\r" << std::flush;
+#endif
+
+        srand(num_threads);
+        // With randomIterations the two phase counts are drawn from local_mt.poissonInterval(); otherwise both equal num_iterations.
+        MersenneTwister local_mt;
+        int const num_iterations_odd = randomIterations ? int(local_mt.poissonInterval((float)num_iterations)+0.5f) : num_iterations,
+                  num_iterations_even = randomIterations ? int(local_mt.poissonInterval((float)num_iterations)+0.5f) : num_iterations;
+
+        std::atomic<int> t[2], w[2];
+        std::chrono::time_point<std::chrono::high_resolution_clock> start[2], end[2];
+        for(int pass = 0; pass < 2; ++pass) {
+            // Pass 0 is run with skip=true (the barrier is never used); pass 1 synchronizes on the barrier.
+            t[pass] = 0;
+            w[pass] = 0;
+
+            srand(num_threads);
+            std::vector<MersenneTwister> randoms(num_threads);
+
+            barrier_type b(num_threads);
+
+            start[pass] = std::chrono::high_resolution_clock::now();
+#ifdef USEOMP
+            omp_set_num_threads(num_threads);
+            std::atomic<int> _j(0);
+            #pragma omp parallel
+            {
+                int const j = _j.fetch_add(1,std::memory_order_relaxed);
+                testbarrier_inner(b, num_threads, j, t[pass], w[pass], num_iterations_odd, num_iterations_even, num_items_noncritical, randoms[j], pass==0);
+                num_threads = omp_get_num_threads();
+            }
+#else
+            std::vector<std::thread*> threads(num_threads);
+            for(unsigned j = 0; j < num_threads; ++j)
+                threads[j] = new std::thread([&,j](){
+                    testbarrier_inner(b, num_threads, j, t[pass], w[pass], num_iterations_odd, num_iterations_even, num_items_noncritical, randoms[j], pass==0);
+                });
+            for(unsigned j = 0; j < num_threads; ++j) {
+                threads[j]->join();
+                delete threads[j];
+            }
+#endif
+            end[pass] = std::chrono::high_resolution_clock::now();
+        }
+
+        if(t[0] != t[1]) throw std::string("mismatched iteration counts");
+        if(w[0] != w[1]) throw std::string("mismatched work item counts");
+
+        int const phases = (std::max)(num_iterations_odd, num_iterations_even);
+        // Reported value: (barrier pass - skip pass) wall-clock seconds per phase.
+        std::chrono::duration<double> elapsed_seconds_0 = end[0]-start[0],
+                                      elapsed_seconds_1 = end[1]-start[1];
+        double const time = (elapsed_seconds_1.count() - elapsed_seconds_0.count()) / phases;
+
+        data.push_back(time);
+#ifdef VERBOSE_TEST
+        std::cerr << name << " : " << num_threads << "," << elapsed_seconds_1.count() / phases << " - " << elapsed_seconds_0.count() / phases << " = " << time << "                                                 \n";
+#endif
+    }
+}
+
+// Runs testmutex_outer<> once per mutex type in the pack; the k-th type is paired with name[k].
+template <class... T> struct mutex_tester;
+// Single type: benchmark it under the first name.
+template <class F> struct mutex_tester<F> {
+    static void run(std::map<std::string,std::vector<double>>& results, std::string const name[], double critical_fraction, double critical_duration) {
+        testmutex_outer<F>(results, name[0], critical_fraction, critical_duration);
+    }
+};
+// Several types: benchmark F under name[0], then recurse on the remaining types and names.
+template <class F, class... T> struct mutex_tester<F,T...> {
+    static void run(std::map<std::string,std::vector<double>>& results, std::string const name[], double critical_fraction, double critical_duration) {
+        mutex_tester<F>::run(results, name, critical_fraction, critical_duration);
+        mutex_tester<T...>::run(results, name + 1, critical_fraction, critical_duration);
+    }
+};
+
+TEST( synchronic, main )
+{
+    //warm up: the result of this first timing run is discarded
+    time_item();
+
+    //report the measured per-item and nil costs (only when VERBOSE_TEST is defined)
+#ifdef VERBOSE_TEST
+    std::cerr << "measuring work item speed...\r";
+    std::cerr << "work item speed is " << time_item() << " per item, nil is " << time_nil() << "\n";
+#endif
+    try {
+      // Each test point is passed on as {critical_fraction, critical_duration}; only {1, 0} is currently enabled.
+      std::pair<double,double> testpoints[] = { {1, 0}, /*{1E-1, 10E-3}, {5E-1, 2E-6},  {3E-1, 50E-9},*/ };
+        for(auto x : testpoints ) {
+
+            std::map<std::string,std::vector<double>> results;
+
+            //testbarrier_outer<std::barrier>(results, PREFIX"bar 1khz 100us", 1E3, x.second);
+            // names[k] labels the k-th mutex type of the mutex_tester<> list below, so both lists must keep the same order.
+            std::string const names[] = {
+                PREFIX"tkt", PREFIX"mcs", PREFIX"ttas", PREFIX"std"
+#ifdef WIN32
+                ,PREFIX"srw"
+#endif
+            };
+
+            //run -->
+
+            mutex_tester<
+                ticket_mutex, mcs_mutex, ttas_mutex, std::mutex
+#ifdef WIN32
+                ,srw_mutex
+#endif
+            >::run(results, names, x.first, x.second);
+
+            //<-- run
+            // With VERBOSE_TEST: prints a header row, then one comma-separated row per thread count with one column per result series.
+#ifdef VERBOSE_TEST
+            std::cout << "threads";
+            for(auto & i : results)
+                std::cout << ",\"" << i.first << '\"';
+            std::cout << std::endl;
+            int j = 0;
+            FOR_GAUNTLET(num_threads) {
+                std::cout << num_threads;
+                for(auto & i : results)
+                    std::cout << ',' << i.second[j];
+                std::cout << std::endl;
+                ++j;
+            }
+#endif
+        }
+    }
+    catch(std::string & e) { // testmutex_outer reports count mismatches by throwing std::string
+        std::cerr << "EXCEPTION : " << e << std::endl;
+        EXPECT_TRUE( false );
+    }
+}
+
+} // namespace Test
+
+#endif
diff --git a/lib/kokkos/core/unit_test/TestSynchronic.hpp b/lib/kokkos/core/unit_test/TestSynchronic.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..d820129e8b571fa5eac2dc7f8d5016c47cd589f4
--- /dev/null
+++ b/lib/kokkos/core/unit_test/TestSynchronic.hpp
@@ -0,0 +1,240 @@
+/*
+
+Copyright (c) 2014, NVIDIA Corporation
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without modification,
+are permitted provided that the following conditions are met:
+
+1. Redistributions of source code must retain the above copyright notice, this
+list of conditions and the following disclaimer.
+
+2. Redistributions in binary form must reproduce the above copyright notice,
+this list of conditions and the following disclaimer in the documentation
+and/or other materials provided with the distribution.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT,
+INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE
+OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED
+OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+#ifndef TEST_SYNCHRONIC_HPP
+#define TEST_SYNCHRONIC_HPP
+
+#include <impl/Kokkos_Synchronic.hpp>
+#include <mutex>
+
+namespace Test {
+
+template <bool truly>
+struct dumb_mutex {
+    // Spin lock over a single atomic flag; with truly == true the wait loop never yields.
+    dumb_mutex () : locked(0) {
+    }
+    // Acquire: try to flip the flag false -> true, otherwise spin on plain loads and retry.
+    void lock() {
+        while(1) {
+            bool state = false; // expected value; a failed CAS overwrites it, so it is reset on every attempt
+            if (locked.compare_exchange_weak(state,true,std::memory_order_acquire)) {
+                break;
+            }
+            while (locked.load(std::memory_order_relaxed)) { // wait until the flag reads false before retrying the CAS
+              if (!truly) {
+                Kokkos::Impl::portable_yield();
+              }
+            }
+        }
+    }
+    // Release: clear the flag with release ordering.
+    void unlock() {
+        locked.store(false,std::memory_order_release);
+    }
+
+private :
+    std::atomic<bool> locked;
+};
+
+#ifdef WIN32
+} // namespace Test -- closed so the system headers below are included at global scope
+#include <winsock2.h>
+#include <windows.h>
+#include <synchapi.h>
+namespace Test {
+// Mutex adapter over a Windows SRWLOCK; it is only ever acquired in exclusive mode.
+struct srw_mutex {
+    srw_mutex () {
+        InitializeSRWLock(&_lock);
+    }
+    void lock() {
+        AcquireSRWLockExclusive(&_lock);
+    }
+    void unlock() {
+        ReleaseSRWLockExclusive(&_lock);
+    }
+private :
+    SRWLOCK _lock;
+};
+#endif
+
+struct ttas_mutex {
+    // Lock built on a synchronic<bool> flag: false = free, true = held.
+    ttas_mutex() : locked(false) {
+    }
+
+    ttas_mutex(const ttas_mutex&) = delete;
+    ttas_mutex& operator=(const ttas_mutex&) = delete;
+
+    void lock() {
+        for(;;) { // retry until the CAS wins; no iteration counter is needed
+            bool state = false; // expected value, reset before every attempt
+            if(locked.compare_exchange_weak(state,true,std::memory_order_relaxed,Kokkos::Impl::notify_none))
+                break;
+            locked.expect_update(true); // NOTE(review): presumably waits while the flag still reads true -- confirm against synchronic
+        }
+        std::atomic_thread_fence(std::memory_order_acquire); // the CAS above is relaxed, so acquire ordering is supplied here
+    }
+    void unlock() {
+        locked.store(false,std::memory_order_release);
+    }
+
+private :
+    Kokkos::Impl::synchronic<bool> locked;
+};
+
+struct ticket_mutex {
+    // Ticket lock: queue hands out ticket numbers, active holds the ticket currently allowed in.
+    ticket_mutex() : active(0), queue(0) {
+    }
+
+	ticket_mutex(const ticket_mutex&) = delete;
+	ticket_mutex& operator=(const ticket_mutex&) = delete;
+
+    void lock() {
+        int const me = queue.fetch_add(1, std::memory_order_relaxed); // take the next ticket
+        while(me != active.load_when_equal(me, std::memory_order_acquire)) // loop until active is observed equal to our ticket
+            ;
+    }
+
+    void unlock() {
+        active.fetch_add(1,std::memory_order_release); // advance active so the next ticket holder may enter
+    }
+private :
+    Kokkos::Impl::synchronic<int> active;
+    std::atomic<int> queue;
+};
+
+struct mcs_mutex {
+    // Queue lock without lock()/unlock(): it is taken by constructing the nested unique_lock and released by destroying it.
+    mcs_mutex() : head(nullptr) {
+    }
+
+	mcs_mutex(const mcs_mutex&) = delete;
+	mcs_mutex& operator=(const mcs_mutex&) = delete;
+
+    struct unique_lock {
+        // Acquire: swap this node into m.head; if a previous node was there, link behind it and wait until ready reads true.
+        unique_lock(mcs_mutex & arg_m) : m(arg_m), next(nullptr), ready(false) {
+
+            unique_lock * const h = m.head.exchange(this,std::memory_order_acquire);
+            if(__builtin_expect(h != nullptr,0)) { // hinted as the unlikely (contended) path
+                h->next.store(this,std::memory_order_seq_cst,Kokkos::Impl::notify_one); // publish ourselves to the predecessor
+                while(!ready.load_when_not_equal(false,std::memory_order_acquire))
+                    ;
+            }
+        }
+
+	    unique_lock(const unique_lock&) = delete;
+	    unique_lock& operator=(const unique_lock&) = delete;
+        // Release: if m.head is still this node, reset it to nullptr; otherwise hand the lock to the successor.
+        ~unique_lock() {
+            unique_lock * h = this;
+            if(__builtin_expect(!m.head.compare_exchange_strong(h,nullptr,std::memory_order_release, std::memory_order_relaxed),0)) {
+                unique_lock * n = next.load(std::memory_order_relaxed);
+                while(!n) // a successor swaps itself into head before it stores our next, so next may still be null
+                    n = next.load_when_not_equal(n,std::memory_order_relaxed);
+                n->ready.store(true,std::memory_order_release,Kokkos::Impl::notify_one); // the successor's constructor loops until it sees this
+            }
+        }
+
+    private:
+        mcs_mutex & m;
+        Kokkos::Impl::synchronic<unique_lock*> next;
+        Kokkos::Impl::synchronic<bool> ready;
+    };
+
+private :
+    std::atomic<unique_lock*> head;
+};
+
+}
+
+namespace std {
+template<> // explicit specialization: std::unique_lock<Test::mcs_mutex> is the mutex's own nested unique_lock
+struct unique_lock<Test::mcs_mutex> : Test::mcs_mutex::unique_lock {
+  unique_lock(Test::mcs_mutex & arg_m) : Test::mcs_mutex::unique_lock(arg_m) { // the base constructor does the locking
+  }
+  unique_lock(const unique_lock&) = delete;
+  unique_lock& operator=(const unique_lock&) = delete;
+};
+
+}
+
+/* #include <cmath> */
+#include <stdlib.h>
+
+namespace Test {
+
+//-------------------------------------
+//  MersenneTwister
+//-------------------------------------
+#define MT_IA  397
+#define MT_LEN 624
+
+class MersenneTwister
+{
+    volatile unsigned long m_buffer[MT_LEN][64/sizeof(unsigned long)]; // each row spans 64 bytes; only column 0 is read or written
+    volatile int m_index; // row that the next call to integer() will overwrite
+
+public:
+    MersenneTwister() { // seeds every row from rand(), then discards the first MT_LEN * 100 outputs
+        for (int i = 0; i < MT_LEN; i++)
+            m_buffer[i][0] = rand();
+        m_index = 0;
+        for (int i = 0; i < MT_LEN * 100; i++)
+            integer();
+    }
+    unsigned long integer() { // advances the state by one word and returns the swizzled value
+        // Indices
+        int i = m_index;
+        int i2 = m_index + 1; if (i2 >= MT_LEN) i2 = 0; // wrap-around
+        int j = m_index + MT_IA; if (j >= MT_LEN) j -= MT_LEN; // wrap-around
+
+        // Twist
+        unsigned long s = (m_buffer[i][0] & 0x80000000) | (m_buffer[i2][0] & 0x7fffffff); // top bit of row i joined with the low 31 bits of row i2
+        unsigned long r = m_buffer[j][0] ^ (s >> 1) ^ ((s & 1) * 0x9908B0DF);
+        m_buffer[m_index][0] = r;
+        m_index = i2;
+
+        // Swizzle
+        r ^= (r >> 11);
+        r ^= (r << 7) & 0x9d2c5680UL;
+        r ^= (r << 15) & 0xefc60000UL;
+        r ^= (r >> 18);
+        return r;
+    }
+    float poissonInterval(float ooLambda) { // returns -logf(1 - u) * ooLambda with u = integer() * 2.3283e-10f (about 2^-32)
+        return -logf(1.0f - integer() * 2.3283e-10f) * ooLambda; // NOTE(review): logf comes from <math.h>/<cmath>, whose include is commented out in this file -- confirm another header provides it
+    }
+};
+
+} // namespace Test
+
+#endif //TEST_SYNCHRONIC_HPP
diff --git a/lib/kokkos/core/unit_test/TestTaskPolicy.hpp b/lib/kokkos/core/unit_test/TestTaskPolicy.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..71790f6def82d50a12d37d88e0b0e7d17f28799f
--- /dev/null
+++ b/lib/kokkos/core/unit_test/TestTaskPolicy.hpp
@@ -0,0 +1,1145 @@
+/*
+//@HEADER
+// ************************************************************************
+// 
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+// 
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+// 
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact  H. Carter Edwards (hcedwar@sandia.gov)
+// 
+// ************************************************************************
+//@HEADER
+*/
+
+
+#ifndef KOKKOS_UNITTEST_TASKPOLICY_HPP
+#define KOKKOS_UNITTEST_TASKPOLICY_HPP
+
+#include <stdio.h>
+#include <iostream>
+#include <cmath>
+#include <Kokkos_TaskPolicy.hpp>
+
+#if defined( KOKKOS_ENABLE_TASKPOLICY )
+
+//----------------------------------------------------------------------------
+//----------------------------------------------------------------------------
+
+namespace TestTaskPolicy {
+
+namespace {
+
+long eval_fib( long n )
+{
+  // Fibonacci via a four-slot ring indexed by the low two bits of the term number.
+  constexpr long ring_mask = 0x03 ;
+  long ring[4] = { 0 , 1 , 1 , 2 }; // fib(0) .. fib(3)
+
+  for ( long term = 2 ; term <= n ; ++term ) {
+    const long prev1 = ring[ ( term - 1 ) & ring_mask ];
+    ring[ term & ring_mask ] = prev1 + ring[ ( term - 2 ) & ring_mask ];
+  }
+  return ring[ n & ring_mask ];
+}
+
+}
+
+template< typename Space >
+struct TestFib
+{
+  typedef Kokkos::TaskPolicy<Space>  policy_type ;
+  typedef Kokkos::Future<long,Space> future_type ;
+  typedef long value_type ;
+
+  policy_type policy ;
+  future_type fib_m1 ;
+  future_type fib_m2 ;
+  const value_type n ;
+
+  KOKKOS_INLINE_FUNCTION
+  TestFib( const policy_type & arg_policy , const value_type arg_n )
+    : policy(arg_policy)
+    , fib_m1() , fib_m2()
+    , n( arg_n )
+    {}
+
+  KOKKOS_INLINE_FUNCTION
+  void operator()( typename policy_type::member_type & , value_type & result )
+    {
+#if 0
+      printf( "\nTestFib(%ld) %d %d\n"
+             , n
+             , int( ! fib_m1.is_null() )
+             , int( ! fib_m2.is_null() )
+             );
+#endif
+
+      if ( n < 2 ) {
+        result = n ;
+      }
+      else if ( ! fib_m2.is_null() && ! fib_m1.is_null() ) {
+        result = fib_m1.get() + fib_m2.get();
+      }
+      else {
+
+        // Spawn new children and respawn myself to sum their results:
+        // Spawn lower value at higher priority as it has a shorter
+        // path to completion.
+
+        fib_m2 = policy.task_spawn( TestFib(policy,n-2)
+                                  , Kokkos::TaskSingle
+                                  , Kokkos::TaskHighPriority );
+
+        fib_m1 = policy.task_spawn( TestFib(policy,n-1)
+                                  , Kokkos::TaskSingle );
+
+        Kokkos::Future<Space> dep[] = { fib_m1 , fib_m2 };
+
+        Kokkos::Future<Space> fib_all = policy.when_all( 2 , dep );
+
+        if ( ! fib_m2.is_null() && ! fib_m1.is_null() && ! fib_all.is_null() ) {
+          // High priority to retire this branch
+          policy.respawn( this , Kokkos::TaskHighPriority , fib_all );
+        }
+        else {
+#if 0
+      printf( "TestFib(%ld) insufficient memory alloc_capacity(%d) task_max(%d) task_accum(%ld)\n"
+             , n
+             , policy.allocation_capacity()
+             , policy.allocated_task_count_max()
+             , policy.allocated_task_count_accum()
+             );
+#endif
+          Kokkos::abort("TestFib insufficient memory");
+
+        }
+      }
+    }
+
+  static void run( int i , size_t MemoryCapacity = 16000 )
+    {
+      typedef typename policy_type::memory_space memory_space ;
+
+      enum { Log2_SuperBlockSize = 12 };
+
+      policy_type root_policy( memory_space() , MemoryCapacity , Log2_SuperBlockSize );
+
+      future_type f = root_policy.host_spawn( TestFib(root_policy,i) , Kokkos::TaskSingle );
+      Kokkos::wait( root_policy );
+      ASSERT_EQ( eval_fib(i) , f.get() );
+
+#if 0
+      fprintf( stdout , "\nTestFib::run(%d) spawn_size(%d) when_all_size(%d) alloc_capacity(%d) task_max(%d) task_accum(%ld)\n"
+             , i
+             , int(root_policy.template spawn_allocation_size<TestFib>())
+             , int(root_policy.when_all_allocation_size(2))
+             , root_policy.allocation_capacity()
+             , root_policy.allocated_task_count_max()
+             , root_policy.allocated_task_count_accum()
+             );
+      fflush( stdout );
+#endif
+    }
+
+};
+
+} // namespace TestTaskPolicy
+
+//----------------------------------------------------------------------------
+
+namespace TestTaskPolicy {
+
+template< class Space >
+struct TestTaskDependence {
+  // Task functor that splits a count across child tasks until single units remain; each unit increments m_accum once.
+  typedef Kokkos::TaskPolicy<Space>  policy_type ;
+  typedef Kokkos::Future<Space>      future_type ;
+  typedef Kokkos::View<long,Space>   accum_type ;
+  typedef void value_type ;
+
+  policy_type  m_policy ;
+  accum_type   m_accum ;
+  long         m_count ;
+
+  KOKKOS_INLINE_FUNCTION
+  TestTaskDependence( long n
+                    , const policy_type & arg_policy
+                    , const accum_type  & arg_accum )
+    : m_policy( arg_policy )
+    , m_accum( arg_accum )
+    , m_count( n )
+    {}
+
+  KOKKOS_INLINE_FUNCTION
+  void operator()( typename policy_type::member_type & )
+    {
+       enum { CHUNK = 8 };
+       const int n = CHUNK < m_count ? CHUNK : m_count ; // number of children: min( CHUNK , m_count )
+
+       if ( 1 < m_count ) {
+         future_type f[ CHUNK ] ;
+
+         const int inc = ( m_count + n - 1 ) / n ; // ceil( m_count / n ) units per child
+
+         for ( int i = 0 ; i < n ; ++i ) {
+           long begin = i * inc ;
+           long count = begin + inc < m_count ? inc : m_count - begin ; // zero or negative once begin reaches m_count; such a child matches neither branch below
+           f[i] = m_policy.task_spawn( TestTaskDependence(count,m_policy,m_accum) , Kokkos::TaskSingle );
+         }
+
+         m_count = 0 ; // a later invocation of this task then falls through both branches
+
+         m_policy.respawn( this , m_policy.when_all( n , f ) );
+       }
+       else if ( 1 == m_count ) {
+         Kokkos::atomic_increment( & m_accum() );
+       }
+    }
+
+  static void run( int n )
+    {
+      typedef typename policy_type::memory_space memory_space ;
+
+      // enum { MemoryCapacity = 4000 }; // Triggers infinite loop in memory pool
+      enum { MemoryCapacity = 16000 };
+      enum { Log2_SuperBlockSize = 12 };
+      policy_type policy( memory_space() , MemoryCapacity , Log2_SuperBlockSize );
+
+      accum_type accum("accum");
+
+      typename accum_type::HostMirror host_accum =
+        Kokkos::create_mirror_view( accum );
+
+      policy.host_spawn( TestTaskDependence(n,policy,accum) , Kokkos::TaskSingle );
+
+      Kokkos::wait( policy );
+
+      Kokkos::deep_copy( host_accum , accum );
+
+      ASSERT_EQ( host_accum() , n ); // the accumulated value is expected to equal n
+    }
+};
+
+} // namespace TestTaskPolicy
+
+//----------------------------------------------------------------------------
+
+namespace TestTaskPolicy {
+
+template< class ExecSpace >
+struct TestTaskTeam {
+
+  //enum { SPAN = 8 };
+  enum { SPAN = 33 };
+  //enum { SPAN = 1 };
+
+  typedef void value_type ;
+  typedef Kokkos::TaskPolicy<ExecSpace>  policy_type ;
+  typedef Kokkos::Future<ExecSpace>      future_type ;
+  typedef Kokkos::View<long*,ExecSpace>  view_type ;
+
+  policy_type  policy ;
+  future_type  future ;
+
+  view_type  parfor_result ;
+  view_type  parreduce_check ;
+  view_type  parscan_result ;
+  view_type  parscan_check ;
+  const long nvalue ;
+
+  KOKKOS_INLINE_FUNCTION
+  TestTaskTeam( const policy_type & arg_policy
+              , const view_type   & arg_parfor_result
+              , const view_type   & arg_parreduce_check
+              , const view_type   & arg_parscan_result
+              , const view_type   & arg_parscan_check
+              , const long          arg_nvalue )
+    : policy(arg_policy)
+    , future()
+    , parfor_result( arg_parfor_result )
+    , parreduce_check( arg_parreduce_check )
+    , parscan_result( arg_parscan_result )
+    , parscan_check( arg_parscan_check )
+    , nvalue( arg_nvalue )
+    {}
+
+  KOKKOS_INLINE_FUNCTION
+  void operator()( typename policy_type::member_type & member )
+    {
+      const long end   = nvalue + 1 ;
+      const long begin = 0 < end - SPAN ? end - SPAN : 0 ;
+
+      if ( 0 < begin && future.is_null() ) {
+        if ( member.team_rank() == 0 ) {
+          future = policy.task_spawn
+            ( TestTaskTeam( policy ,
+                            parfor_result ,
+                            parreduce_check,
+                            parscan_result,
+                            parscan_check,
+                            begin - 1 )
+            , Kokkos::TaskTeam );
+
+          assert( ! future.is_null() );
+
+          policy.respawn( this , future );
+        }
+        return ;
+      }
+
+      Kokkos::parallel_for( Kokkos::TeamThreadRange(member,begin,end)
+                          , [&]( int i ) { parfor_result[i] = i ; }
+                          );
+
+      // test parallel_reduce without join
+    
+      long tot = 0;
+      long expected = (begin+end-1)*(end-begin)*0.5;
+      
+      Kokkos::parallel_reduce( Kokkos::TeamThreadRange(member,begin,end)
+                          , [&]( int i, long &res) { res += parfor_result[i]; }
+                          , tot);
+      Kokkos::parallel_for( Kokkos::TeamThreadRange(member,begin,end)
+                          , [&]( int i ) { parreduce_check[i] = expected-tot ; }
+                          );
+
+      // test parallel_reduce with join
+
+      tot = 0;
+      Kokkos::parallel_reduce( Kokkos::TeamThreadRange(member,begin,end)
+                          , [&]( int i, long &res) { res += parfor_result[i]; }
+                          , [&]( long& val1, const long& val2) { val1 += val2; }
+                          , tot);
+      Kokkos::parallel_for( Kokkos::TeamThreadRange(member,begin,end)
+                          , [&]( int i ) { parreduce_check[i] += expected-tot ; }
+                          );
+
+#if 0
+      // test parallel_scan
+
+      // Exclusive scan
+      Kokkos::parallel_scan<long>( Kokkos::TeamThreadRange(member,begin,end)
+                          , [&]( int i, long &val , const bool final ) {
+                              if ( final ) { parscan_result[i] = val; }
+                              val += i;
+                            }
+                          );
+
+      if ( member.team_rank() == 0 ) {
+        for ( long i = begin ; i < end ; ++i ) {
+          parscan_check[i] = (i*(i-1)-begin*(begin-1))*0.5-parscan_result[i];
+        }
+      }
+
+      // Inclusive scan
+      Kokkos::parallel_scan<long>( Kokkos::TeamThreadRange(member,begin,end)
+                          , [&]( int i, long &val , const bool final ) {
+                              val += i;
+                              if ( final ) { parscan_result[i] = val; }
+                            }
+                          );
+
+      if ( member.team_rank() == 0 ) {
+        for ( long i = begin ; i < end ; ++i ) {
+          parscan_check[i] += (i*(i+1)-begin*(begin-1))*0.5-parscan_result[i];
+        }
+      }
+#endif
+
+    }
+
+  static void run( long n )
+    {
+      // const unsigned memory_capacity = 10000 ; // causes memory pool infinite loop
+      // const unsigned memory_capacity = 100000 ; // fails with SPAN=1 for serial and OMP
+      const unsigned memory_capacity = 400000 ;
+
+      policy_type root_policy( typename policy_type::memory_space()
+                        , memory_capacity );
+
+      view_type   root_parfor_result("parfor_result",n+1);
+      view_type   root_parreduce_check("parreduce_check",n+1);
+      view_type   root_parscan_result("parscan_result",n+1);
+      view_type   root_parscan_check("parscan_check",n+1);
+
+      typename view_type::HostMirror
+        host_parfor_result = Kokkos::create_mirror_view( root_parfor_result );
+      typename view_type::HostMirror
+        host_parreduce_check = Kokkos::create_mirror_view( root_parreduce_check );
+      typename view_type::HostMirror
+        host_parscan_result = Kokkos::create_mirror_view( root_parscan_result );
+      typename view_type::HostMirror
+        host_parscan_check = Kokkos::create_mirror_view( root_parscan_check );
+
+      future_type f = root_policy.host_spawn(
+                        TestTaskTeam( root_policy ,
+                                      root_parfor_result ,
+                                      root_parreduce_check ,
+                                      root_parscan_result,
+                                      root_parscan_check,
+                                      n ) ,
+                        Kokkos::TaskTeam );
+
+      Kokkos::wait( root_policy );
+
+      Kokkos::deep_copy( host_parfor_result , root_parfor_result );
+      Kokkos::deep_copy( host_parreduce_check , root_parreduce_check );
+      Kokkos::deep_copy( host_parscan_result , root_parscan_result );
+      Kokkos::deep_copy( host_parscan_check , root_parscan_check );
+
+      for ( long i = 0 ; i <= n ; ++i ) {
+        const long answer = i ;
+        if ( host_parfor_result(i) != answer ) {
+          std::cerr << "TestTaskTeam::run ERROR parallel_for result(" << i << ") = "
+                    << host_parfor_result(i) << " != " << answer << std::endl ;
+        }
+        if ( host_parreduce_check(i) != 0 ) {
+          std::cerr << "TestTaskTeam::run ERROR parallel_reduce check(" << i << ") = "
+                    << host_parreduce_check(i) << " != 0" << std::endl ;
+        } //TODO
+        if ( host_parscan_check(i) != 0 ) {
+          std::cerr << "TestTaskTeam::run ERROR parallel_scan check(" << i << ") = "
+                    << host_parscan_check(i) << " != 0" << std::endl ;
+        }
+      }
+    }
+};
+
+template< class ExecSpace >
+struct TestTaskTeamValue {
+
+  enum { SPAN = 8 };
+
+  typedef long value_type ;
+  typedef Kokkos::TaskPolicy<ExecSpace>         policy_type ;
+  typedef Kokkos::Future<value_type,ExecSpace>  future_type ;
+  typedef Kokkos::View<long*,ExecSpace>         view_type ;
+
+  policy_type  policy ;
+  future_type  future ;
+
+  view_type  result ;
+  const long nvalue ;
+
+  KOKKOS_INLINE_FUNCTION
+  TestTaskTeamValue( const policy_type & arg_policy
+                   , const view_type   & arg_result
+                   , const long          arg_nvalue )
+    : policy(arg_policy)
+    , future()
+    , result( arg_result )
+    , nvalue( arg_nvalue )
+    {}
+
+  KOKKOS_INLINE_FUNCTION
+  void operator()( typename policy_type::member_type const & member
+                 , value_type & final )
+    {
+      const long end   = nvalue + 1 ;
+      const long begin = 0 < end - SPAN ? end - SPAN : 0 ;
+
+      if ( 0 < begin && future.is_null() ) {
+        if ( member.team_rank() == 0 ) {
+
+          future = policy.task_spawn
+            ( TestTaskTeamValue( policy , result , begin - 1 )
+            , Kokkos::TaskTeam );
+
+          assert( ! future.is_null() );
+
+          policy.respawn( this , future );
+        }
+        return ;
+      }
+
+      Kokkos::parallel_for( Kokkos::TeamThreadRange(member,begin,end)
+                          , [&]( int i ) { result[i] = i + 1 ; }
+                          );
+
+      if ( member.team_rank() == 0 ) {
+        final = result[nvalue] ;
+      }
+
+      Kokkos::memory_fence();
+    }
+
+  static void run( long n )
+    {
+      // const unsigned memory_capacity = 10000 ; // causes memory pool infinite loop
+      const unsigned memory_capacity = 100000 ;
+
+      policy_type root_policy( typename policy_type::memory_space()
+                             , memory_capacity );
+
+      view_type   root_result("result",n+1);
+
+      typename view_type::HostMirror
+        host_result = Kokkos::create_mirror_view( root_result );
+
+      future_type fv = root_policy.host_spawn
+        ( TestTaskTeamValue( root_policy, root_result, n ) , Kokkos::TaskTeam );
+
+      Kokkos::wait( root_policy );
+
+      Kokkos::deep_copy( host_result , root_result );
+
+      if ( fv.get() != n + 1 ) {
+        std::cerr << "TestTaskTeamValue ERROR future = "
+                  << fv.get() << " != " << n + 1 << std::endl ;
+      }
+      for ( long i = 0 ; i <= n ; ++i ) {
+        const long answer = i + 1 ;
+        if ( host_result(i) != answer ) {
+          std::cerr << "TestTaskTeamValue ERROR result(" << i << ") = "
+                    << host_result(i) << " != " << answer << std::endl ;
+        }
+      }
+    }
+};
+} // namespace TestTaskPolicy
+
+//----------------------------------------------------------------------------
+//----------------------------------------------------------------------------
+
+namespace TestTaskPolicy {
+
+template< class ExecSpace >
+struct FibChild {
+
+  typedef long value_type ;
+
+  Kokkos::Experimental::TaskPolicy<ExecSpace> policy ;
+  Kokkos::Experimental::Future<long,ExecSpace> fib_1 ;
+  Kokkos::Experimental::Future<long,ExecSpace> fib_2 ;
+  const value_type n ;
+  int has_nested ;
+
+  KOKKOS_INLINE_FUNCTION
+  FibChild( const Kokkos::Experimental::TaskPolicy<ExecSpace> & arg_policy
+          , const value_type arg_n )
+    : policy(arg_policy)
+    , fib_1() , fib_2()
+    , n( arg_n ), has_nested(0) {}
+
+  KOKKOS_INLINE_FUNCTION
+  void apply( value_type & result )
+    {
+      typedef Kokkos::Experimental::Future<long,ExecSpace> future_type ;
+
+      if ( n < 2 ) {
+
+        has_nested = -1 ;
+
+        result = n ;
+      }
+      else {
+        if ( has_nested == 0 ) {
+          // Spawn new children and respawn myself to sum their results:
+          // Spawn lower value at higher priority as it has a shorter
+          // path to completion.
+          if ( fib_2.is_null() ) {
+            fib_2 = policy.task_create( FibChild(policy,n-2) );
+          }
+
+          if ( ! fib_2.is_null() && fib_1.is_null() ) {
+            fib_1 = policy.task_create( FibChild(policy,n-1) );
+          }
+
+          if ( ! fib_1.is_null() ) {
+            has_nested = 2 ;
+
+            policy.spawn( fib_2 , true /* high priority */ );
+            policy.spawn( fib_1 );
+            policy.add_dependence( this , fib_1 );
+            policy.add_dependence( this , fib_2 );
+            policy.respawn( this );
+          }
+          else {
+            // Release task memory before spawning the task,
+            // after spawning memory cannot be released.
+            fib_2 = future_type();
+            // Respawn when more memory is available
+            policy.respawn_needing_memory( this );
+          }
+        }
+        else if ( has_nested == 2 ) {
+
+          has_nested = -1 ;
+
+          result = fib_1.get() + fib_2.get();
+
+if ( false ) {
+  printf("FibChild %ld = fib(%ld), task_count(%d)\n"
+        , long(n), long(result), policy.allocated_task_count());
+}
+
+        }
+        else {
+          printf("FibChild(%ld) execution error\n",(long)n);
+          Kokkos::abort("FibChild execution error");
+        }
+      }
+    }
+};
+
+template< class ExecSpace >
+struct FibChild2 {
+
+  typedef long value_type ;
+
+  Kokkos::Experimental::TaskPolicy<ExecSpace> policy ;
+  Kokkos::Experimental::Future<long,ExecSpace> fib_a ;
+  Kokkos::Experimental::Future<long,ExecSpace> fib_b ;
+  const value_type n ;
+  int has_nested ;
+
+  KOKKOS_INLINE_FUNCTION
+  FibChild2( const Kokkos::Experimental::TaskPolicy<ExecSpace> & arg_policy
+           , const value_type arg_n )
+    : policy(arg_policy)
+    , n( arg_n ), has_nested(0) {}
+
+  KOKKOS_INLINE_FUNCTION
+  void apply( value_type & result )
+    {
+      if ( 0 == has_nested ) {
+        if ( n < 2 ) {
+
+          has_nested = -1 ;
+
+          result = n ;
+        }
+        else if ( n < 4 ) {
+          // Spawn new children and respawn myself to sum their results:
+          // result = Fib(n-1) + Fib(n-2)
+          has_nested = 2 ;
+
+          // Spawn lower value at higher priority as it has a shorter
+          // path to completion.
+
+          policy.clear_dependence( this );
+          fib_a = policy.spawn( policy.task_create( FibChild2(policy,n-1) ) );
+          fib_b = policy.spawn( policy.task_create( FibChild2(policy,n-2) ) , true );
+          policy.add_dependence( this , fib_a );
+          policy.add_dependence( this , fib_b );
+          policy.respawn( this );
+        }
+        else {
+          // Spawn new children and respawn myself to sum their results:
+          // result = Fib(n-1) + Fib(n-2)
+          // result = ( Fib(n-2) + Fib(n-3) ) + ( Fib(n-3) + Fib(n-4) )
+          // result = ( ( Fib(n-3) + Fib(n-4) ) + Fib(n-3) ) + ( Fib(n-3) + Fib(n-4) )
+          // result = 3 * Fib(n-3) + 2 * Fib(n-4)
+          has_nested = 4 ;
+
+          // Spawn lower value at higher priority as it has a shorter
+          // path to completion.
+
+          policy.clear_dependence( this );
+          fib_a = policy.spawn( policy.task_create( FibChild2(policy,n-3) ) );
+          fib_b = policy.spawn( policy.task_create( FibChild2(policy,n-4) ) , true );
+          policy.add_dependence( this , fib_a );
+          policy.add_dependence( this , fib_b );
+          policy.respawn( this );
+        }
+     }
+     else if ( 2 == has_nested || 4 == has_nested ) {
+        result = ( has_nested == 2 ) ? fib_a.get() + fib_b.get()
+                                     : 3 * fib_a.get() + 2 * fib_b.get() ;
+
+        has_nested = -1 ;
+      }
+      else {
+        printf("FibChild2(%ld) execution error\n",(long)n);
+        Kokkos::abort("FibChild2 execution error");
+      }
+    }
+};
+
+template< class ExecSpace >
+void test_fib( long n , const unsigned task_max_count = 4096 )
+{
+  const unsigned task_max_size   = 256 ;
+  const unsigned task_dependence = 4 ;
+
+  Kokkos::Experimental::TaskPolicy<ExecSpace>
+    policy( task_max_count
+          , task_max_size
+          , task_dependence );
+
+  Kokkos::Experimental::Future<long,ExecSpace> f =
+    policy.spawn( policy.proc_create( FibChild<ExecSpace>(policy,n) ) );
+
+  Kokkos::Experimental::wait( policy );
+
+  if ( f.get() != eval_fib(n) ) { // a mismatch is only printed to std::cout; nothing is asserted here
+    std::cout << "Fib(" << n << ") = " << f.get();
+    std::cout << " != " << eval_fib(n);
+    std::cout << std::endl ;
+  }
+}
+
+template< class ExecSpace >
+void test_fib2( long n , const unsigned task_max_count = 1024 )
+{
+  const unsigned task_max_size   = 256 ;
+  const unsigned task_dependence = 4 ;
+
+  Kokkos::Experimental::TaskPolicy<ExecSpace>
+    policy( task_max_count
+          , task_max_size
+          , task_dependence );
+
+  Kokkos::Experimental::Future<long,ExecSpace> f =
+    policy.spawn( policy.proc_create( FibChild2<ExecSpace>(policy,n) ) );
+
+  Kokkos::Experimental::wait( policy );
+
+  if ( f.get() != eval_fib(n) ) { // a mismatch is only printed to std::cout; nothing is asserted here
+    std::cout << "Fib2(" << n << ") = " << f.get();
+    std::cout << " != " << eval_fib(n);
+    std::cout << std::endl ;
+  }
+}
+
+//----------------------------------------------------------------------------
+
+template< class ExecSpace >
+struct Norm2 {
+  // Reduction functor over the array m_x: sums x[i]*x[i], then apply() replaces the sum with its square root.
+  typedef double value_type ;
+
+  const double * const m_x ; // input array; not owned, never freed by this functor
+
+  Norm2( const double * x ) : m_x(x) {}
+
+  inline
+  void init( double & val ) const { val = 0 ; } // starting value of the sum
+
+  KOKKOS_INLINE_FUNCTION
+  void operator()( int i , double & val ) const { val += m_x[i] * m_x[i] ; } // adds the square of element i
+
+  void apply( double & dst ) const { dst = std::sqrt( dst ); } // overwrites dst with its square root
+};
+
+template< class ExecSpace >
+void test_norm2( const int n )
+{
+  const unsigned task_max_count  = 1024 ;
+  const unsigned task_max_size   = 256 ;
+  const unsigned task_dependence = 4 ;
+
+  Kokkos::Experimental::TaskPolicy<ExecSpace>
+    policy( task_max_count
+          , task_max_size
+          , task_dependence );
+
+  double * const x = new double[n]; // released by the delete[] at the end of this function
+
+  for ( int i = 0 ; i < n ; ++i ) x[i] = 1 ; // every element is 1
+
+  Kokkos::RangePolicy<ExecSpace> r(0,n);
+
+  Kokkos::Experimental::Future<double,ExecSpace> f =
+    Kokkos::Experimental::spawn_reduce( policy , r , Norm2<ExecSpace>(x) );
+
+  Kokkos::Experimental::wait( policy );
+  // The result is only printed when PRINT is defined; no value is asserted.
+#if defined(PRINT)
+  std::cout << "Norm2: " << f.get() << std::endl ;
+#endif
+
+  delete[] x ;
+}
+
+//----------------------------------------------------------------------------
+
+template< class Space >
+struct TaskDep {
+  // Task whose value is 'input' plus the values of all futures it depends on.
+  typedef int value_type ;
+  typedef Kokkos::Experimental::TaskPolicy< Space > policy_type ;
+
+  const policy_type policy ;
+  const int         input ;
+
+  TaskDep( const policy_type & arg_p , const int arg_i )
+    : policy( arg_p ), input( arg_i ) {}
+
+  KOKKOS_INLINE_FUNCTION
+  void apply( int & val )
+  {
+    typedef Kokkos::Experimental::Future<int,Space> dep_future ;
+    val = input ;
+    const int ndep = policy.get_dependence( this );
+    for ( int k = 0 ; k < ndep ; ++k ) {
+      dep_future dep = policy.get_dependence( this , k );
+      val += dep.get();
+    }
+  }
+};
+
+
+template< class Space >
+void test_task_dep( const int n )
+{
+  // Build NTEST tasks that each depend on n nested tasks holding 1..n and check that
+  // every result equals n*(n+1)/2.  Only the first mismatch is printed.
+  enum { NTEST = 64 };
+  const unsigned task_max_count  = 1024 ;
+  const unsigned task_max_size   = 64 ;
+  const unsigned task_dependence = 4 ;
+
+  Kokkos::Experimental::TaskPolicy<Space>
+    policy( task_max_count
+          , task_max_size
+          , task_dependence );
+
+  Kokkos::Experimental::Future<int,Space> f[ NTEST ];
+
+  for ( int i = 0 ; i < NTEST ; ++i ) {
+    // Create task in the "constructing" state with capacity for 'n+1' dependences
+    f[i] = policy.proc_create( TaskDep<Space>(policy,0) , n + 1 );
+
+    if ( f[i].get_task_state() != Kokkos::Experimental::TASK_STATE_CONSTRUCTING ) {
+      Kokkos::Impl::throw_runtime_exception("get_task_state() != Kokkos::Experimental::TASK_STATE_CONSTRUCTING");
+    }
+
+    // Only use 'n' dependences
+    for ( int j = 0 ; j < n ; ++j ) {
+      Kokkos::Experimental::Future<int,Space> nested =
+        policy.proc_create( TaskDep<Space>(policy,j+1) );
+
+      policy.spawn( nested );
+
+      // Add dependence to a "constructing" task
+      policy.add_dependence( f[i] , nested );
+    }
+
+    // Spawn task from the "constructing" to the "waiting" state
+    policy.spawn( f[i] );
+  }
+
+  const int answer = n % 2 ? n * ( ( n + 1 ) / 2 ) : ( n / 2 ) * ( n + 1 );
+
+  Kokkos::Experimental::wait( policy );
+
+  int error = 0 ;
+  for ( int i = 0 ; i < NTEST ; ++i ) {
+    if ( f[i].get_task_state() != Kokkos::Experimental::TASK_STATE_COMPLETE ) {
+      Kokkos::Impl::throw_runtime_exception("get_task_state() != Kokkos::Experimental::TASK_STATE_COMPLETE");
+    }
+    if ( answer != f[i].get() && 0 == error ) {
+      std::cout << "test_task_dep(" << n << ") ERROR at[" << i << "]"
+                << " answer(" << answer << ") != result(" << f[i].get() << ")" << std::endl ;
+      error = 1 ; // latch the flag so later mismatches are not printed again
+    }
+  }
+}
+
+//----------------------------------------------------------------------------
+
+template< class ExecSpace >
+struct TaskTeam {
+  // Team task: fills result[i] = i + 1 for the (up to) SPAN indices ending at nvalue, after a child task has handled the lower indices.
+  enum { SPAN = 8 }; // maximum number of entries written by one task
+
+  typedef void value_type ;
+  typedef Kokkos::Experimental::TaskPolicy<ExecSpace>  policy_type ;
+  typedef Kokkos::Experimental::Future<void,ExecSpace> future_type ;
+  typedef Kokkos::View<long*,ExecSpace>                view_type ;
+
+  policy_type  policy ;
+  future_type  future ; // child task covering indices below 'begin'; default-constructed until spawned
+
+  view_type  result ;
+  const long nvalue ; // last index this task is responsible for
+
+  KOKKOS_INLINE_FUNCTION
+  TaskTeam( const policy_type & arg_policy
+          , const view_type   & arg_result
+          , const long          arg_nvalue )
+    : policy(arg_policy)
+    , future()
+    , result( arg_result )
+    , nvalue( arg_nvalue )
+    {}
+
+  KOKKOS_INLINE_FUNCTION
+  void apply( const typename policy_type::member_type & member )
+    {
+      const long end   = nvalue + 1 ;
+      const long begin = 0 < end - SPAN ? end - SPAN : 0 ; // clamp the chunk start at zero
+      // First pass with lower indices remaining: rank 0 spawns the child and respawns this task behind it.
+      if ( 0 < begin && future.get_task_state() == Kokkos::Experimental::TASK_STATE_NULL ) {
+        if ( member.team_rank() == 0 ) {
+          future = policy.spawn( policy.task_create_team( TaskTeam( policy , result , begin - 1 ) ) );
+          policy.clear_dependence( this );
+          policy.add_dependence( this , future );
+          policy.respawn( this );
+        }
+        return ;
+      }
+      // Otherwise (no lower indices, or the child future is already set): write this task's own chunk.
+      Kokkos::parallel_for( Kokkos::TeamThreadRange(member,begin,end)
+                          , [&]( int i ) { result[i] = i + 1 ; }
+                          );
+    }
+};
+
+template< class ExecSpace >
+struct TaskTeamValue {
+  // Like TaskTeam, but the task also returns result[nvalue] through its 'final' output argument.
+  enum { SPAN = 8 }; // maximum number of entries written by one task
+
+  typedef long value_type ;
+  typedef Kokkos::Experimental::TaskPolicy<ExecSpace>         policy_type ;
+  typedef Kokkos::Experimental::Future<value_type,ExecSpace>  future_type ;
+  typedef Kokkos::View<long*,ExecSpace>                       view_type ;
+
+  policy_type  policy ;
+  future_type  future ; // child task covering indices below 'begin'; null until created
+
+  view_type  result ;
+  const long nvalue ; // last index this task is responsible for
+
+  KOKKOS_INLINE_FUNCTION
+  TaskTeamValue( const policy_type & arg_policy
+               , const view_type   & arg_result
+               , const long          arg_nvalue )
+    : policy(arg_policy)
+    , future()
+    , result( arg_result )
+    , nvalue( arg_nvalue )
+    {}
+
+  KOKKOS_INLINE_FUNCTION
+  void apply( const typename policy_type::member_type & member , value_type & final )
+    {
+      const long end   = nvalue + 1 ;
+      const long begin = 0 < end - SPAN ? end - SPAN : 0 ; // clamp the chunk start at zero
+      // First pass with lower indices remaining: rank 0 creates and spawns the child, then respawns this task behind it.
+      if ( 0 < begin && future.is_null() ) {
+        if ( member.team_rank() == 0 ) {
+
+          future = policy.task_create_team( TaskTeamValue( policy , result , begin - 1 ) );
+
+          policy.spawn( future );
+          policy.add_dependence( this , future );
+          policy.respawn( this );
+        }
+        return ;
+      }
+      // Otherwise (no lower indices, or the child future is already set): write this task's own chunk.
+      Kokkos::parallel_for( Kokkos::TeamThreadRange(member,begin,end)
+                          , [&]( int i ) { result[i] = i + 1 ; }
+                          );
+      // Only rank 0 stores the task's return value.
+      if ( member.team_rank() == 0 ) {
+        final = result[nvalue] ;
+      }
+
+      Kokkos::memory_fence();
+    }
+};
+
+template< class ExecSpace >
+void test_task_team( long n )
+{
+  typedef TaskTeam< ExecSpace >            task_type ;
+  typedef TaskTeamValue< ExecSpace >       task_value_type ;
+  typedef typename task_type::view_type    view_type ;
+  typedef typename task_type::policy_type  policy_type ;
+  // Runs TaskTeam (void result) and then TaskTeamValue (long result) on the same policy and view.
+  typedef typename task_type::future_type        future_type ;
+  typedef typename task_value_type::future_type  future_value_type ;
+
+  const unsigned task_max_count  = 1024 ;
+  const unsigned task_max_size   = 256 ;
+  const unsigned task_dependence = 4 ;
+
+  policy_type
+    policy( task_max_count
+          , task_max_size
+          , task_dependence );
+
+  view_type    result("result",n+1);
+
+  typename view_type::HostMirror
+    host_result = Kokkos::create_mirror_view( result );
+  // Phase 1: void-valued team task; afterwards result(i) is expected to be i + 1 for i in [0,n].
+  future_type f = policy.proc_create_team( task_type( policy , result , n ) );
+
+  ASSERT_FALSE( f.is_null() );
+
+  policy.spawn( f );
+
+  Kokkos::Experimental::wait( policy );
+
+  Kokkos::deep_copy( host_result , result );
+  // Mismatches are only reported on std::cerr; they do not fail the test.
+  for ( long i = 0 ; i <= n ; ++i ) {
+    const long answer = i + 1 ;
+    if ( host_result(i) != answer ) {
+      std::cerr << "test_task_team void ERROR result(" << i << ") = "
+                << host_result(i) << " != " << answer << std::endl ;
+    }
+  }
+  // Phase 2: value-returning team task on the same view; the future is expected to hold n + 1.
+  future_value_type fv = policy.proc_create_team( task_value_type( policy , result , n ) );
+
+  ASSERT_FALSE( fv.is_null() );
+
+  policy.spawn( fv );
+
+  Kokkos::Experimental::wait( policy );
+
+  Kokkos::deep_copy( host_result , result );
+
+  if ( fv.get() != n + 1 ) {
+    std::cerr << "test_task_team value ERROR future = "
+              << fv.get() << " != " << n + 1 << std::endl ;
+  }
+  for ( long i = 0 ; i <= n ; ++i ) {
+    const long answer = i + 1 ;
+    if ( host_result(i) != answer ) {
+      std::cerr << "test_task_team value ERROR result(" << i << ") = "
+                << host_result(i) << " != " << answer << std::endl ;
+    }
+  }
+}
+
+//----------------------------------------------------------------------------
+
+template< class ExecSpace >
+struct TaskLatchAdd {
+  // Task that atomically increments a shared counter and then calls add(1) on the latch future.
+  typedef void value_type ;
+  typedef Kokkos::Experimental::Future< Kokkos::Experimental::Latch , ExecSpace >  future_type ;
+
+  future_type     latch ;
+  volatile int *  count ; // counter shared with the task that created this one
+
+  KOKKOS_INLINE_FUNCTION
+  TaskLatchAdd( const future_type & arg_latch // latch on which apply() calls add(1)
+              , volatile int * const arg_count )
+    : latch( arg_latch )
+    , count( arg_count )
+    {}
+
+  KOKKOS_INLINE_FUNCTION
+  void apply()
+    {
+      Kokkos::atomic_fetch_add( count , 1 ); // increment first, then signal the latch
+      latch.add(1); // NOTE(review): presumably counts toward releasing tasks that depend on the latch - confirm
+    }
+};
+
+template< class ExecSpace >
+struct TaskLatchRun {
+  // Root task: spawns 'total' TaskLatchAdd tasks behind a latch, respawns itself after the latch, then checks the counter.
+  typedef void value_type ;
+  typedef Kokkos::Experimental::TaskPolicy< ExecSpace >      policy_type ;
+  typedef Kokkos::Experimental::Future< Kokkos::Experimental::Latch , ExecSpace >  future_type ;
+
+  policy_type policy ;
+  int total ;           // number of TaskLatchAdd tasks to spawn
+  volatile int count ;  // incremented once by each TaskLatchAdd task
+
+  KOKKOS_INLINE_FUNCTION
+  TaskLatchRun( const policy_type & arg_policy , const int arg_total )
+    : policy(arg_policy), total(arg_total), count(0) {}
+
+  KOKKOS_INLINE_FUNCTION
+  void apply()
+    {
+      if ( 0 == count && 0 < total ) {
+        future_type latch = policy.create_latch( total );
+        // First execution: create and spawn one TaskLatchAdd per unit of 'total'; abort on any failure.
+        for ( int i = 0 ; i < total ; ++i ) {
+          auto f = policy.task_create( TaskLatchAdd<ExecSpace>(latch,&count) , 0 );
+          if ( f.is_null() ) {
+            Kokkos::abort("TaskLatchAdd allocation FAILED" );
+          }
+
+          if ( policy.spawn( f ).is_null() ) {
+            Kokkos::abort("TaskLatchAdd spawning FAILED" );
+          }
+        }
+        // Run this task again once the latch is a satisfied dependence.
+        policy.add_dependence( this , latch );
+        policy.respawn( this );
+      }
+      else if ( count != total ) {
+        printf("TaskLatchRun FAILED %d != %d\n",count,total);
+      }
+    }
+};
+
+
+template< class ExecSpace >
+void test_latch( int n )
+{
+  typedef TaskLatchRun< ExecSpace >        task_type ;
+  typedef typename task_type::policy_type  policy_type ;
+
+  // Task budget: primary task + latch + n TaskLatchAdd tasks.
+  //
+  // This test uses two different block sizes for allocation from the memory
+  // pool, so the memory size requested must be big enough to cause two or
+  // more superblocks to be used.  Currently, the superblock size in the task
+  // policy is 2^16, so make the minimum requested memory size greater than
+  // this.
+  const int      task_needed     = n + 2 ;
+  const unsigned task_max_count  = task_needed < 256 ? 256 : task_needed ;
+  const unsigned task_max_size   = 256;
+  const unsigned task_dependence = 4 ;
+
+  policy_type
+    policy( task_max_count , task_max_size , task_dependence );
+
+  // Run a single TaskLatchRun root task to completion.
+  policy.spawn( policy.proc_create( TaskLatchRun<ExecSpace>(policy,n) ) );
+
+  wait( policy );
+}
+
+//----------------------------------------------------------------------------
+//----------------------------------------------------------------------------
+
+} // namespace TestTaskPolicy
+
+#endif /* #if defined( KOKKOS_ENABLE_TASKPOLICY ) */
+#endif /* #ifndef KOKKOS_UNITTEST_TASKPOLICY_HPP */
+
+
diff --git a/lib/kokkos/core/unit_test/TestTeam.hpp b/lib/kokkos/core/unit_test/TestTeam.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..db6b0cff7e21654f7ba17b531e63fbc63deb2b06
--- /dev/null
+++ b/lib/kokkos/core/unit_test/TestTeam.hpp
@@ -0,0 +1,910 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+//
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact  H. Carter Edwards (hcedwar@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#include <stdio.h>
+#include <stdexcept>
+#include <sstream>
+#include <iostream>
+
+#include <Kokkos_Core.hpp>
+
+/*--------------------------------------------------------------------------*/
+
+namespace Test {
+namespace {
+
+template< class ExecSpace, class ScheduleType >
+struct TestTeamPolicy {
+  // Exercises TeamPolicy parallel_for / parallel_reduce dispatch, including the tag-selected operator() overloads.
+  typedef typename Kokkos::TeamPolicy< ScheduleType,  ExecSpace >::member_type team_member ;
+  typedef Kokkos::View<int**,ExecSpace> view_type ;
+
+  view_type m_flags ; // indexed (team_rank, league_rank); holds the flat thread id written by the untagged operator()
+
+  TestTeamPolicy( const size_t league_size )
+    : m_flags( Kokkos::ViewAllocateWithoutInitializing("flags")
+             , Kokkos::TeamPolicy< ScheduleType,  ExecSpace >::team_size_max( *this )
+             , league_size )
+    {}
+
+  struct VerifyInitTag {};
+  // Untagged parallel_for body: record the flat thread id of every (team_rank, league_rank) pair.
+  KOKKOS_INLINE_FUNCTION
+  void operator()( const team_member & member ) const
+    {
+      const int tid = member.team_rank() + member.team_size() * member.league_rank();
+
+      m_flags( member.team_rank() , member.league_rank() ) = tid ;
+    }
+  // VerifyInitTag body: recompute the id and print a message if the stored flag differs.
+  KOKKOS_INLINE_FUNCTION
+  void operator()( const VerifyInitTag & , const team_member & member ) const
+    {
+      const int tid = member.team_rank() + member.team_size() * member.league_rank();
+
+      if ( tid != m_flags( member.team_rank() , member.league_rank() ) ) {
+        printf("TestTeamPolicy member(%d,%d) error %d != %d\n"
+              , member.league_rank() , member.team_rank()
+              , tid , m_flags( member.team_rank() , member.league_rank() ) );
+      }
+    }
+
+  // included for test_small_league_size
+  TestTeamPolicy()
+    : m_flags()
+  {}
+
+  // included for test_small_league_size
+  struct NoOpTag {} ;
+  KOKKOS_INLINE_FUNCTION
+  void operator()( const NoOpTag & , const team_member & member ) const
+    {}
+
+  // Launches the no-op kernel on ns / bs = 2 teams with a per-team scratch request of 960 at level 0.
+  static void test_small_league_size() {
+
+    int bs = 8; // batch size (number of elements per batch)
+    int ns = 16; // total number of "problems" to process
+
+    // calculate total scratch memory space size
+    const int level = 0;
+    int mem_size = 960;
+    const int num_teams = ns/bs;
+    const Kokkos::TeamPolicy< ExecSpace, NoOpTag > policy(num_teams, Kokkos::AUTO()); // NOTE(review): ScheduleType is not passed here, unlike the other policies in this struct - confirm intent
+
+    Kokkos::parallel_for ( policy.set_scratch_size(level, Kokkos::PerTeam(mem_size), Kokkos::PerThread(0))
+                         , TestTeamPolicy()
+                         );
+  }
+  // Fill the flags, verify them with the tagged kernel, then run the small-league launch.
+  static void test_for( const size_t league_size )
+    {
+      TestTeamPolicy functor( league_size );
+
+      const int team_size = Kokkos::TeamPolicy< ScheduleType,  ExecSpace >::team_size_max( functor );
+
+      Kokkos::parallel_for( Kokkos::TeamPolicy< ScheduleType,  ExecSpace >( league_size , team_size ) , functor );
+      Kokkos::parallel_for( Kokkos::TeamPolicy< ScheduleType,  ExecSpace , VerifyInitTag >( league_size , team_size ) , functor );
+
+      test_small_league_size();
+    }
+
+  struct ReduceTag {};
+
+  typedef long value_type ;
+  // Untagged reduction body: contributes the flat thread id (0-based).
+  KOKKOS_INLINE_FUNCTION
+  void operator()( const team_member & member , value_type & update ) const
+    {
+      update += member.team_rank() + member.team_size() * member.league_rank();
+    }
+  // ReduceTag reduction body: contributes the flat thread id plus one (1-based).
+  KOKKOS_INLINE_FUNCTION
+  void operator()( const ReduceTag & , const team_member & member , value_type & update ) const
+    {
+      update += 1 + member.team_rank() + member.team_size() * member.league_rank();
+    }
+  // With N = team_size * league_size, expects N*(N-1)/2 from the untagged reduce and N*(N+1)/2 from the tagged one.
+  static void test_reduce( const size_t league_size )
+    {
+      TestTeamPolicy functor( league_size );
+
+      const int team_size = Kokkos::TeamPolicy< ScheduleType,  ExecSpace >::team_size_max( functor );
+      const long N = team_size * league_size ;
+
+      long total = 0 ;
+
+      Kokkos::parallel_reduce( Kokkos::TeamPolicy< ScheduleType,  ExecSpace >( league_size , team_size ) , functor , total );
+      ASSERT_EQ( size_t((N-1)*(N))/2 , size_t(total) );
+
+      Kokkos::parallel_reduce( Kokkos::TeamPolicy< ScheduleType,  ExecSpace , ReduceTag >( league_size , team_size ) , functor , total );
+      ASSERT_EQ( (size_t(N)*size_t(N+1))/2 , size_t(total) );
+    }
+};
+
+}
+}
+
+/*--------------------------------------------------------------------------*/
+
+namespace Test {
+
+template< typename ScalarType , class DeviceType, class ScheduleType >
+class ReduceTeamFunctor
+{
+public:
+  typedef DeviceType execution_space ;
+  typedef Kokkos::TeamPolicy< ScheduleType,  execution_space >  policy_type ;
+  typedef typename execution_space::size_type        size_type ;
+
+  // Three running sums: item count, sum of (index + 1), sum of (nwork - index).
+  struct value_type {
+    ScalarType value[3] ;
+  };
+
+  const size_type nwork ;
+
+  ReduceTeamFunctor( const size_type & arg_nwork ) : nwork( arg_nwork ) {}
+
+  ReduceTeamFunctor( const ReduceTeamFunctor & rhs )
+    : nwork( rhs.nwork ) {}
+
+  // Zero all three sums.
+  KOKKOS_INLINE_FUNCTION
+  void init( value_type & dst ) const
+  {
+    for ( int k = 0 ; k < 3 ; ++k ) { dst.value[k] = 0 ; }
+  }
+
+  // Add the three sums of 'src' into 'dst'.
+  KOKKOS_INLINE_FUNCTION
+  void join( volatile value_type & dst ,
+             const volatile value_type & src ) const
+  {
+    for ( int k = 0 ; k < 3 ; ++k ) { dst.value[k] += src.value[k] ; }
+  }
+
+  // Each thread accumulates its own contiguous chunk of the nwork items.
+  KOKKOS_INLINE_FUNCTION
+  void operator()( const typename policy_type::member_type ind , value_type & dst ) const
+  {
+    const int thread_rank = ind.team_rank() + ind.team_size() * ind.league_rank();
+    const int thread_size = ind.team_size() * ind.league_size();
+    const int chunk = ( nwork + thread_size - 1 ) / thread_size ;
+
+    const size_type first = chunk * thread_rank ;
+    const size_type last  = first + chunk < nwork ? first + chunk : nwork ;
+
+    for ( size_type iwork = first ; iwork < last ; ++iwork ) {
+      dst.value[0] += 1 ;
+      dst.value[1] += iwork + 1 ;
+      dst.value[2] += nwork - iwork ;
+    }
+  }
+};
+
+} // namespace Test
+
+namespace {
+
+template< typename ScalarType , class DeviceType, class ScheduleType >
+class TestReduceTeam
+{
+public:
+  typedef DeviceType    execution_space ;
+  typedef Kokkos::TeamPolicy< ScheduleType,  execution_space >  policy_type ;
+  typedef typename execution_space::size_type    size_type ;
+
+  // Constructing the object runs the whole test for 'nwork' work items.
+
+  TestReduceTeam( const size_type & nwork )
+    { run_test(nwork); }
+
+  // Repeat the team reduction and check all three sums of every repetition:
+  // value[0] must equal nwork; value[1] and value[2] must equal nwork*(nwork+1)/2.
+  void run_test( const size_type & nwork )
+  {
+    typedef Test::ReduceTeamFunctor< ScalarType , execution_space , ScheduleType> functor_type ;
+    typedef typename functor_type::value_type value_type ;
+    typedef Kokkos::View< value_type, Kokkos::HostSpace, Kokkos::MemoryUnmanaged > result_type ;
+
+    enum { Count = 3 };
+    enum { Repeat = 100 };
+
+    value_type result[ Repeat ];
+
+    const unsigned long nw   = nwork ;
+    const unsigned long nsum = ( nw % 2 != 0 ) ? nw * (( nw + 1 )/2 )
+                                               : (nw/2) * ( nw + 1 );
+
+    const unsigned team_size   = policy_type::team_size_recommended( functor_type(nwork) );
+    const unsigned league_size = ( nwork + team_size - 1 ) / team_size ;
+
+    policy_type team_exec( league_size , team_size );
+
+    for ( unsigned rep = 0 ; rep < Repeat ; ++rep ) {
+      result_type tmp( & result[rep] );
+      Kokkos::parallel_reduce( team_exec , functor_type(nwork) , tmp );
+    }
+
+    execution_space::fence();
+
+    for ( unsigned i = 0 ; i < Repeat ; ++i ) {
+      for ( unsigned j = 0 ; j < Count ; ++j ) {
+        const unsigned long correct = ( j % 3 != 0 ) ? nsum : nw ;
+        ASSERT_EQ( (ScalarType) correct , result[i].value[j] );
+      }
+    }
+  }
+};
+
+}
+
+/*--------------------------------------------------------------------------*/
+
+namespace Test {
+
+template< class DeviceType, class ScheduleType >
+class ScanTeamFunctor
+{
+public:
+  typedef DeviceType  execution_space ;
+  typedef Kokkos::TeamPolicy< ScheduleType,  execution_space >  policy_type ;
+  // Reduction functor that self-checks team_reduce and team_scan; the reduced value is an error flag (0 or 1).
+  typedef long int    value_type ;
+  Kokkos::View< value_type , execution_space > accum ; // target of the final team_scan call
+  Kokkos::View< value_type , execution_space > total ; // expected value, written by thread (0,0)
+
+  ScanTeamFunctor() : accum("accum"), total("total") {}
+
+  KOKKOS_INLINE_FUNCTION
+  void init( value_type & error ) const { error = 0 ; }
+  // Joining keeps the flag sticky: any non-zero input sets it.
+  KOKKOS_INLINE_FUNCTION
+  void join( value_type volatile & error ,
+             value_type volatile const & input ) const
+    { if ( input ) error = 1 ; }
+  // Join operation handed to team_reduce: keeps the larger of the two values.
+  struct JoinMax {
+    typedef long int value_type ;
+    KOKKOS_INLINE_FUNCTION
+    void join( value_type volatile & dst
+             , value_type volatile const & input ) const
+      { if ( dst < input ) dst = input ; }
+  };
+
+  KOKKOS_INLINE_FUNCTION
+  void operator()( const typename policy_type::member_type ind , value_type & error ) const
+  {
+    if ( 0 == ind.league_rank() && 0 == ind.team_rank() ) {
+      const long int thread_count = ind.league_size() * ind.team_size();
+      total() = ( thread_count * ( thread_count + 1 ) ) / 2 ; // sum of 1..thread_count
+    }
+
+    // Team max:
+    const int long m = ind.team_reduce( (long int) ( ind.league_rank() + ind.team_rank() ) , JoinMax() );
+    // The largest contribution comes from the last team rank; a mismatch is printed but does not set 'error'.
+    if ( m != ind.league_rank() + ( ind.team_size() - 1 ) ) {
+      printf("ScanTeamFunctor[%d.%d of %d.%d] reduce_max_answer(%ld) != reduce_max(%ld)\n"
+            , ind.league_rank(), ind.team_rank()
+            , ind.league_size(), ind.team_size()
+            , (long int)(ind.league_rank() + ( ind.team_size() - 1 )) , m );
+    }
+
+    // Scan:
+    const long int answer =
+      ( ind.league_rank() + 1 ) * ind.team_rank() +
+      ( ind.team_rank() * ( ind.team_rank() + 1 ) ) / 2 ;
+    // 'answer' is the sum of (league_rank + 1 + r + 1) over all lower team ranks r, i.e. an exclusive scan.
+    const long int result =
+      ind.team_scan( ind.league_rank() + 1 + ind.team_rank() + 1 );
+    // The same scan is issued a second time and must give the same answer.
+    const long int result2 =
+      ind.team_scan( ind.league_rank() + 1 + ind.team_rank() + 1 );
+
+    if ( answer != result || answer != result2 ) {
+      printf("ScanTeamFunctor[%d.%d of %d.%d] answer(%ld) != scan_first(%ld) or scan_second(%ld)\n",
+             ind.league_rank(), ind.team_rank(),
+             ind.league_size(), ind.team_size(),
+             answer,result,result2);
+      error = 1 ;
+    }
+
+    const long int thread_rank = ind.team_rank() +
+                                 ind.team_size() * ind.league_rank();
+    ind.team_scan( 1 + thread_rank , accum.ptr_on_device() ); // NOTE(review): presumably adds each team's total into accum - confirm against team_scan
+  }
+};
+
+template< class DeviceType, class ScheduleType >
+class TestScanTeam
+{
+public:
+  typedef DeviceType  execution_space ;
+  typedef long int    value_type ;
+
+  typedef Kokkos::TeamPolicy< ScheduleType,  execution_space > policy_type ;
+  typedef Test::ScanTeamFunctor<DeviceType, ScheduleType> functor_type ;
+
+  // Constructing the object runs the whole test with 'nteam' teams.
+
+  TestScanTeam( const size_t nteam )
+    { run_test(nteam); }
+
+  // Launch the scan functor about 100000 / nteam times (rounded up) and check,
+  // after each launch, that no error was flagged and that accum equals total.
+  void run_test( const size_t nteam )
+  {
+    typedef Kokkos::View< long int , Kokkos::HostSpace , Kokkos::MemoryUnmanaged >  result_type ;
+
+    const unsigned launch_budget = 100000 ;
+    const unsigned launch_count  = ( launch_budget + nteam - 1 ) / nteam ;
+
+    functor_type functor ;
+
+    policy_type team_exec( nteam , policy_type::team_size_max( functor ) );
+
+    for ( unsigned pass = 0 ; pass < launch_count ; ++pass ) {
+      long int accum = 0 ;
+      long int total = 0 ;
+      long int error = 0 ;
+      Kokkos::deep_copy( functor.accum , total );  // total is still zero here: resets the device accumulator
+      Kokkos::parallel_reduce( team_exec , functor , result_type( & error ) );
+      DeviceType::fence();
+      Kokkos::deep_copy( accum , functor.accum );
+      Kokkos::deep_copy( total , functor.total );
+
+      ASSERT_EQ( error , 0 );
+      ASSERT_EQ( total , accum );
+    }
+
+    execution_space::fence();
+  }
+};
+
+} // namespace Test
+
+/*--------------------------------------------------------------------------*/
+
+namespace Test {
+
+template< class ExecSpace, class ScheduleType >
+struct SharedTeamFunctor {
+  // Reduction functor: each team fills two team-shared arrays and its last rank verifies them; 'update' counts errors.
+  typedef ExecSpace  execution_space ;
+  typedef int        value_type ;
+  typedef Kokkos::TeamPolicy< ScheduleType,  execution_space >  policy_type ;
+
+  enum { SHARED_COUNT = 1000 }; // number of entries in each shared array
+
+  typedef typename ExecSpace::scratch_memory_space shmem_space ;
+
+  // tbd: MemoryUnmanaged should be the default for shared memory space
+  typedef Kokkos::View<int*,shmem_space,Kokkos::MemoryUnmanaged> shared_int_array_type ;
+
+  // Tell how much shared memory will be required by this functor:
+  inline
+  unsigned team_shmem_size( int team_size ) const
+  {
+    return shared_int_array_type::shmem_size( SHARED_COUNT ) +
+           shared_int_array_type::shmem_size( SHARED_COUNT );
+  }
+
+  KOKKOS_INLINE_FUNCTION
+  void operator()( const typename policy_type::member_type & ind , value_type & update ) const
+  {
+    const shared_int_array_type shared_A( ind.team_shmem() , SHARED_COUNT );
+    const shared_int_array_type shared_B( ind.team_shmem() , SHARED_COUNT );
+
+    if ((shared_A.ptr_on_device () == NULL && SHARED_COUNT > 0) ||
+        (shared_B.ptr_on_device () == NULL && SHARED_COUNT > 0)) {
+      printf ("Failed to allocate shared memory of size %lu\n",
+              static_cast<unsigned long> (SHARED_COUNT));
+      ++update; // failure to allocate is an error
+    }
+    else {
+      for ( int i = ind.team_rank() ; i < SHARED_COUNT ; i += ind.team_size() ) { // entries are strided across the team
+        shared_A[i] = i + ind.league_rank();
+        shared_B[i] = 2 * i + ind.league_rank();
+      }
+
+      ind.team_barrier(); // all writes must be complete before the last rank reads them
+
+      if ( ind.team_rank() + 1 == ind.team_size() ) { // only the last team rank verifies
+        for ( int i = 0 ; i < SHARED_COUNT ; ++i ) {
+          if ( shared_A[i] != i + ind.league_rank() ) {
+            ++update ;
+          }
+          if ( shared_B[i] != 2 * i + ind.league_rank() ) {
+            ++update ;
+          }
+        }
+      }
+    }
+  }
+};
+
+}
+
+namespace {
+
+template< class ExecSpace, class ScheduleType >
+struct TestSharedTeam {
+  // Runs SharedTeamFunctor over 8192 / team_size teams and expects an error count of zero.
+  TestSharedTeam() { run(); }
+
+  void run()
+  {
+    typedef Test::SharedTeamFunctor<ExecSpace, ScheduleType>  Functor ;
+    typedef Kokkos::TeamPolicy< ScheduleType,  ExecSpace >    policy_type ;
+    typedef typename Functor::value_type                      count_type ;
+    typedef Kokkos::View< count_type , Kokkos::HostSpace , Kokkos::MemoryUnmanaged >  result_type ;
+
+    const size_t team_size = policy_type::team_size_max( Functor() );
+
+    policy_type team_exec( 8192 / team_size , team_size );
+
+    count_type error_count = 0 ;
+    Kokkos::parallel_reduce( team_exec , Functor() , result_type( & error_count ) );
+
+    ASSERT_EQ( error_count , 0 );
+  }
+};
+}
+
+namespace Test {
+
+#if defined (KOKKOS_HAVE_CXX11_DISPATCH_LAMBDA)
+template< class MemorySpace, class ExecSpace, class ScheduleType >
+struct TestLambdaSharedTeam {
+  // Lambda version of the SharedTeamFunctor test: same fill-and-verify body, scratch size set on the policy.
+  TestLambdaSharedTeam()
+  { run(); }
+
+  void run()
+  {
+    typedef Test::SharedTeamFunctor<ExecSpace, ScheduleType> Functor ;
+    //typedef Kokkos::View< typename Functor::value_type , Kokkos::HostSpace , Kokkos::MemoryUnmanaged >  result_type ;
+    typedef Kokkos::View< typename Functor::value_type , MemorySpace, Kokkos::MemoryUnmanaged >  result_type ;
+
+    typedef typename ExecSpace::scratch_memory_space shmem_space ;
+
+    // tbd: MemoryUnmanaged should be the default for shared memory space
+    typedef Kokkos::View<int*,shmem_space,Kokkos::MemoryUnmanaged> shared_int_array_type ;
+
+    const int SHARED_COUNT = 1000;
+    int team_size = 1; // default; raised to 128 below when ExecSpace is Kokkos::Cuda
+#ifdef KOKKOS_HAVE_CUDA
+    if(std::is_same<ExecSpace,Kokkos::Cuda>::value)
+      team_size = 128;
+#endif
+    Kokkos::TeamPolicy< ScheduleType,  ExecSpace > team_exec( 8192 / team_size , team_size);
+    team_exec = team_exec.set_scratch_size(0,Kokkos::PerTeam(SHARED_COUNT*2*sizeof(int))); // room for two int arrays of SHARED_COUNT entries
+
+    typename Functor::value_type error_count = 0 ;
+
+    Kokkos::parallel_reduce( team_exec , KOKKOS_LAMBDA
+        ( const typename Kokkos::TeamPolicy< ScheduleType,  ExecSpace >::member_type & ind , int & update ) {
+
+      const shared_int_array_type shared_A( ind.team_shmem() , SHARED_COUNT );
+      const shared_int_array_type shared_B( ind.team_shmem() , SHARED_COUNT );
+
+      if ((shared_A.ptr_on_device () == NULL && SHARED_COUNT > 0) ||
+          (shared_B.ptr_on_device () == NULL && SHARED_COUNT > 0)) {
+        printf ("Failed to allocate shared memory of size %lu\n",
+                static_cast<unsigned long> (SHARED_COUNT));
+        ++update; // failure to allocate is an error
+      } else {
+        for ( int i = ind.team_rank() ; i < SHARED_COUNT ; i += ind.team_size() ) { // entries are strided across the team
+          shared_A[i] = i + ind.league_rank();
+          shared_B[i] = 2 * i + ind.league_rank();
+        }
+
+        ind.team_barrier(); // all writes must be complete before the last rank reads them
+
+        if ( ind.team_rank() + 1 == ind.team_size() ) { // only the last team rank verifies
+          for ( int i = 0 ; i < SHARED_COUNT ; ++i ) {
+            if ( shared_A[i] != i + ind.league_rank() ) {
+              ++update ;
+            }
+            if ( shared_B[i] != 2 * i + ind.league_rank() ) {
+              ++update ;
+            }
+          }
+        }
+      }
+    }, result_type( & error_count ) );
+
+    ASSERT_EQ( error_count , 0 );
+  }
+};
+#endif
+}
+
+namespace Test {
+
+template< class ExecSpace, class ScheduleType >
+struct ScratchTeamFunctor {
+  // Reduction functor that checks per-team and per-thread scratch allocations; 'update' counts detected errors.
+  typedef ExecSpace  execution_space ;
+  typedef int        value_type ;
+  typedef Kokkos::TeamPolicy< ScheduleType,  execution_space >  policy_type ;
+
+  enum { SHARED_TEAM_COUNT = 100 };   // entries in the per-team array
+  enum { SHARED_THREAD_COUNT = 10 };  // entries in each per-thread array
+
+  typedef typename ExecSpace::scratch_memory_space shmem_space ;
+
+  // tbd: MemoryUnmanaged should be the default for shared memory space
+  typedef Kokkos::View<size_t*,shmem_space,Kokkos::MemoryUnmanaged> shared_int_array_type ;
+
+  KOKKOS_INLINE_FUNCTION
+  void operator()( const typename policy_type::member_type & ind , value_type & update ) const
+  {
+    const shared_int_array_type scratch_ptr( ind.team_scratch(1) , 2*ind.team_size() );
+    const shared_int_array_type scratch_A( ind.team_scratch(1) , SHARED_TEAM_COUNT );
+    const shared_int_array_type scratch_B( ind.thread_scratch(1) , SHARED_THREAD_COUNT );
+
+    if ((scratch_ptr.ptr_on_device () == NULL ) ||
+        (scratch_A.  ptr_on_device () == NULL && SHARED_TEAM_COUNT > 0) ||
+        (scratch_B.  ptr_on_device () == NULL && SHARED_THREAD_COUNT > 0)) {
+      printf ("Failed to allocate shared memory of size %lu\n",
+              static_cast<unsigned long> (SHARED_TEAM_COUNT));
+      ++update; // failure to allocate is an error
+    }
+    else {
+      Kokkos::parallel_for(Kokkos::TeamThreadRange(ind,0,(int)SHARED_TEAM_COUNT),[&] (const int &i) {
+        scratch_A[i] = i + ind.league_rank();
+      });
+      for(int i=0; i<SHARED_THREAD_COUNT; i++)
+        scratch_B[i] = 10000*ind.league_rank() + 100*ind.team_rank() + i;
+      // Publish the addresses: slots [0,team_size) hold scratch_A, slots [team_size,2*team_size) hold scratch_B.
+      scratch_ptr[ind.team_rank()] = (size_t) scratch_A.ptr_on_device();
+      scratch_ptr[ind.team_rank() + ind.team_size()] = (size_t) scratch_B.ptr_on_device();
+
+      ind.team_barrier();
+      // Team array must hold what was written, and every thread must have seen the same team array address.
+      for( int i = 0; i<SHARED_TEAM_COUNT; i++) {
+        if(scratch_A[i] != size_t(i + ind.league_rank()))
+          ++update;
+      }
+      for( int i = 0; i < ind.team_size(); i++) {
+        if(scratch_ptr[0]!=scratch_ptr[i]) ++update;
+      }
+      // The stride check needs two threads: with team_size == 1 slot [1+team_size] lies past the end of scratch_ptr.
+      if(ind.team_size() > 1 &&
+         scratch_ptr[1+ind.team_size()] - scratch_ptr[0 + ind.team_size()] <
+         SHARED_THREAD_COUNT*sizeof(size_t))
+        ++update;
+      for( int i = 1; i < ind.team_size(); i++) {
+        if((scratch_ptr[i+ind.team_size()] - scratch_ptr[i-1+ind.team_size()]) !=
+           (scratch_ptr[1+ind.team_size()] - scratch_ptr[0 + ind.team_size()])) ++update;
+      }
+    }
+  }
+};
+
+}
+
+namespace {
+
+template< class ExecSpace, class ScheduleType >
+struct TestScratchTeam {
+  // Constructing the object runs the test.
+  TestScratchTeam() { run(); }
+
+  // Launches ScratchTeamFunctor as a team reduction and expects zero reported errors.
+  void run()
+  {
+    typedef Test::ScratchTeamFunctor<ExecSpace, ScheduleType>  functor_type ;
+    typedef typename functor_type::shared_int_array_type       scratch_view ;
+    typedef typename functor_type::value_type                  count_type ;
+    typedef Kokkos::TeamPolicy< ScheduleType,  ExecSpace >     policy_type ;
+    typedef Kokkos::View< count_type , Kokkos::HostSpace , Kokkos::MemoryUnmanaged >  result_type ;
+
+    const size_t team_size = policy_type::team_size_max( functor_type() );
+    policy_type team_exec( 8192 / team_size , team_size );
+
+    // Team scratch: the SHARED_TEAM_COUNT array plus 2*team_size address slots; thread scratch: SHARED_THREAD_COUNT entries.
+    int team_scratch_size   = scratch_view::shmem_size(functor_type::SHARED_TEAM_COUNT) + scratch_view::shmem_size(2*team_size);
+    int thread_scratch_size = scratch_view::shmem_size(functor_type::SHARED_THREAD_COUNT);
+
+    count_type error_count = 0 ;
+    Kokkos::parallel_reduce( team_exec.set_scratch_size(0,Kokkos::PerTeam(team_scratch_size),Kokkos::PerThread(thread_scratch_size)) ,
+                             functor_type() , result_type( & error_count ) );
+    ASSERT_EQ( error_count , 0 );
+  }
+};
+}
+
+namespace Test {
+template< class ExecSpace>  // Body shared by the multi-level scratch tests; returns the mismatch count seen by the calling thread.
+KOKKOS_INLINE_FUNCTION
+int test_team_mulit_level_scratch_loop_body(const typename Kokkos::TeamPolicy<ExecSpace>::member_type& team) {
+      Kokkos::View<double*,ExecSpace,Kokkos::MemoryTraits<Kokkos::Unmanaged>> a_team1(team.team_scratch(0),128);
+      Kokkos::View<double*,ExecSpace,Kokkos::MemoryTraits<Kokkos::Unmanaged>> a_thread1(team.thread_scratch(0),16);
+      Kokkos::View<double*,ExecSpace,Kokkos::MemoryTraits<Kokkos::Unmanaged>> a_team2(team.team_scratch(0),128);
+      Kokkos::View<double*,ExecSpace,Kokkos::MemoryTraits<Kokkos::Unmanaged>> a_thread2(team.thread_scratch(0),16);
+      // Same pattern on scratch level 1, with 1000x larger views (128000 per team, 16000 per thread).
+      Kokkos::View<double*,ExecSpace,Kokkos::MemoryTraits<Kokkos::Unmanaged>> b_team1(team.team_scratch(1),128000);
+      Kokkos::View<double*,ExecSpace,Kokkos::MemoryTraits<Kokkos::Unmanaged>> b_thread1(team.thread_scratch(1),16000);
+      Kokkos::View<double*,ExecSpace,Kokkos::MemoryTraits<Kokkos::Unmanaged>> b_team2(team.team_scratch(1),128000);
+      Kokkos::View<double*,ExecSpace,Kokkos::MemoryTraits<Kokkos::Unmanaged>> b_thread2(team.thread_scratch(1),16000);
+      // A third set is requested last: two more level-0 views followed by two more level-1 views.
+      Kokkos::View<double*,ExecSpace,Kokkos::MemoryTraits<Kokkos::Unmanaged>> a_team3(team.team_scratch(0),128);
+      Kokkos::View<double*,ExecSpace,Kokkos::MemoryTraits<Kokkos::Unmanaged>> a_thread3(team.thread_scratch(0),16);
+      Kokkos::View<double*,ExecSpace,Kokkos::MemoryTraits<Kokkos::Unmanaged>> b_team3(team.team_scratch(1),128000);
+      Kokkos::View<double*,ExecSpace,Kokkos::MemoryTraits<Kokkos::Unmanaged>> b_thread3(team.thread_scratch(1),16000);
+      // Write phase: each view gets a pattern whose leading offset (1000000, 2000000, 3000000) identifies the view.
+
+      Kokkos::parallel_for(Kokkos::TeamThreadRange(team,0,128), [&] (const int& i) {
+        a_team1(i) = 1000000 + i;
+        a_team2(i) = 2000000 + i;
+        a_team3(i) = 3000000 + i;
+      });
+      team.team_barrier();
+      Kokkos::parallel_for(Kokkos::ThreadVectorRange(team,16), [&] (const int& i){
+        a_thread1(i) = 1000000 + 100000*team.team_rank() + 16-i;
+        a_thread2(i) = 2000000 + 100000*team.team_rank() + 16-i;
+        a_thread3(i) = 3000000 + 100000*team.team_rank() + 16-i;
+      });
+
+      Kokkos::parallel_for(Kokkos::TeamThreadRange(team,0,128000), [&] (const int& i) {
+        b_team1(i) = 1000000 + i;
+        b_team2(i) = 2000000 + i;
+        b_team3(i) = 3000000 + i;
+      });
+      team.team_barrier();
+      Kokkos::parallel_for(Kokkos::ThreadVectorRange(team,16000), [&] (const int& i){
+        b_thread1(i) = 1000000 + 100000*team.team_rank() + 16-i;
+        b_thread2(i) = 2000000 + 100000*team.team_rank() + 16-i;
+        b_thread3(i) = 3000000 + 100000*team.team_rank() + 16-i;
+      });
+      // Read-back phase: count every element that no longer holds the value written above.
+      team.team_barrier();
+      int error = 0;
+      Kokkos::parallel_for(Kokkos::TeamThreadRange(team,0,128), [&] (const int& i) {
+        if(a_team1(i) != 1000000 + i) error++;
+        if(a_team2(i) != 2000000 + i) error++;
+        if(a_team3(i) != 3000000 + i) error++;
+      });
+      team.team_barrier();
+      Kokkos::parallel_for(Kokkos::ThreadVectorRange(team,16), [&] (const int& i){
+        if(a_thread1(i) != 1000000 + 100000*team.team_rank() + 16-i) error++;
+        if(a_thread2(i) != 2000000 + 100000*team.team_rank() + 16-i) error++;
+        if(a_thread3(i) != 3000000 + 100000*team.team_rank() + 16-i) error++;
+      });
+
+      Kokkos::parallel_for(Kokkos::TeamThreadRange(team,0,128000), [&] (const int& i) {
+        if(b_team1(i) != 1000000 + i) error++;
+        if(b_team2(i) != 2000000 + i) error++;
+        if(b_team3(i) != 3000000 + i) error++;
+      });
+      team.team_barrier();
+      Kokkos::parallel_for(Kokkos::ThreadVectorRange(team,16000), [&] (const int& i){
+        if(b_thread1(i) != 1000000 + 100000*team.team_rank() + 16-i) error++;
+        if(b_thread2(i) != 2000000 + 100000*team.team_rank() + 16-i) error++;
+        if( b_thread3(i) != 3000000 + 100000*team.team_rank() + 16-i) error++;
+      });
+
+  return error;
+}
+
+
+struct TagReduce {};  // work tag: selects the operator() overloads that take a reduction argument
+struct TagFor {};     // work tag: selects the operator() overloads used with parallel_for
+
+template< class ExecSpace, class ScheduleType >
+struct ClassNoShmemSizeFunction {
+  // Error counter written by the TagFor kernel; declared with the Atomic memory trait.
+  Kokkos::View<int,ExecSpace,Kokkos::MemoryTraits<Kokkos::Atomic> > errors;
+
+  KOKKOS_INLINE_FUNCTION
+  void operator() (const TagFor&, const typename Kokkos::TeamPolicy<ExecSpace,ScheduleType>::member_type& team) const {
+    errors() += test_team_mulit_level_scratch_loop_body<ExecSpace>(team);  // parallel_for entry point
+  }
+
+  KOKKOS_INLINE_FUNCTION
+  void operator() (const TagReduce&, const typename Kokkos::TeamPolicy<ExecSpace,ScheduleType>::member_type& team, int& error) const {
+    error += test_team_mulit_level_scratch_loop_body<ExecSpace>(team);  // parallel_reduce entry point
+  }
+
+  // Runs the loop body through parallel_for, then parallel_reduce; both scratch levels are sized on the policy.
+  void run() {
+    typedef Kokkos::View<double*,ExecSpace,Kokkos::MemoryTraits<Kokkos::Unmanaged> > scratch_view;
+    typedef Kokkos::View<int,ExecSpace> counter_view;
+    counter_view d_errors = counter_view("Errors");
+    errors = d_errors;
+    const int per_team0   = 3*scratch_view::shmem_size(128);
+    const int per_thread0 = 3*scratch_view::shmem_size(16);
+    const int per_team1   = 3*scratch_view::shmem_size(128000);
+    const int per_thread1 = 3*scratch_view::shmem_size(16000);
+    {
+      Kokkos::TeamPolicy<TagFor,ExecSpace,ScheduleType> policy(10,8,16);
+      Kokkos::parallel_for(policy.set_scratch_size(0,Kokkos::PerTeam(per_team0),Kokkos::PerThread(per_thread0))
+                                 .set_scratch_size(1,Kokkos::PerTeam(per_team1),Kokkos::PerThread(per_thread1)), *this);
+      Kokkos::fence();
+      typename counter_view::HostMirror h_errors = Kokkos::create_mirror_view(d_errors);
+      Kokkos::deep_copy(h_errors,d_errors);
+      ASSERT_EQ(h_errors(),0);
+    }
+    {
+      int error = 0;
+      Kokkos::TeamPolicy<TagReduce,ExecSpace,ScheduleType> policy(10,8,16);
+      Kokkos::parallel_reduce(policy.set_scratch_size(0,Kokkos::PerTeam(per_team0),Kokkos::PerThread(per_thread0))
+                                    .set_scratch_size(1,Kokkos::PerTeam(per_team1),Kokkos::PerThread(per_thread1)), *this,error);
+      Kokkos::fence();
+      ASSERT_EQ(error,0);
+    }
+  }
+};
+
+template< class ExecSpace, class ScheduleType >
+struct ClassWithShmemSizeFunction {
+  // Error counter written by the TagFor kernel; declared with the Atomic memory trait.
+  Kokkos::View<int,ExecSpace,Kokkos::MemoryTraits<Kokkos::Atomic> > errors;
+  KOKKOS_INLINE_FUNCTION
+  void operator() (const TagFor&, const typename Kokkos::TeamPolicy<ExecSpace,ScheduleType>::member_type& team) const {
+    errors() += test_team_mulit_level_scratch_loop_body<ExecSpace>(team);  // parallel_for entry point
+  }
+
+  KOKKOS_INLINE_FUNCTION
+  void operator() (const TagReduce&, const typename Kokkos::TeamPolicy<ExecSpace,ScheduleType>::member_type& team, int& error) const {
+    error += test_team_mulit_level_scratch_loop_body<ExecSpace>(team);  // parallel_reduce entry point
+  }
+
+  // Scratch bytes for a team of team_size threads: three 128-double views per team plus three 16-double views per thread.
+  unsigned team_shmem_size(int team_size) const {
+    typedef Kokkos::View<double*,ExecSpace,Kokkos::MemoryTraits<Kokkos::Unmanaged> > scratch_view;
+    const int team_bytes   = 3*scratch_view::shmem_size(128);
+    const int thread_bytes = 3*scratch_view::shmem_size(16);
+    return team_bytes + team_size * thread_bytes;
+  }
+
+  // Only level-1 sizes are set on the policy here; level-0 space is presumably supplied through team_shmem_size().
+  void run() {
+    typedef Kokkos::View<double*,ExecSpace,Kokkos::MemoryTraits<Kokkos::Unmanaged> > scratch_view;
+    typedef Kokkos::View<int,ExecSpace> counter_view;
+    counter_view d_errors = counter_view("Errors");
+    errors = d_errors;
+    const int per_team1   = 3*scratch_view::shmem_size(128000);
+    const int per_thread1 = 3*scratch_view::shmem_size(16000);
+    {
+      Kokkos::TeamPolicy<TagFor,ExecSpace,ScheduleType> policy(10,8,16);
+      Kokkos::parallel_for(policy.set_scratch_size(1,Kokkos::PerTeam(per_team1),Kokkos::PerThread(per_thread1)), *this);
+      Kokkos::fence();
+      typename counter_view::HostMirror h_errors = Kokkos::create_mirror_view(d_errors);
+      Kokkos::deep_copy(h_errors,d_errors);
+      ASSERT_EQ(h_errors(),0);
+    }
+    {
+      int error = 0;
+      Kokkos::TeamPolicy<TagReduce,ExecSpace,ScheduleType> policy(10,8,16);
+      Kokkos::parallel_reduce(policy.set_scratch_size(1,Kokkos::PerTeam(per_team1),Kokkos::PerThread(per_thread1)), *this,error);
+      Kokkos::fence();
+      ASSERT_EQ(error,0);
+    }
+  }
+};
+
+// Lambda variant of the multi-level scratch test: runs the shared loop body through parallel_for and parallel_reduce.
+template< class ExecSpace, class ScheduleType >
+void test_team_mulit_level_scratch_test_lambda() {
+#ifdef KOKKOS_HAVE_CXX11_DISPATCH_LAMBDA
+  Kokkos::View<int,ExecSpace,Kokkos::MemoryTraits<Kokkos::Atomic> > errors;
+  Kokkos::View<int,ExecSpace> d_errors("Errors");
+  errors = d_errors;
+  // The loop body requests three views of each size per scratch level, hence the factor 3.
+  const int per_team0 = 3*Kokkos::View<double*,ExecSpace,Kokkos::MemoryTraits<Kokkos::Unmanaged>>::shmem_size(128);
+  const int per_thread0 = 3*Kokkos::View<double*,ExecSpace,Kokkos::MemoryTraits<Kokkos::Unmanaged>>::shmem_size(16);
+  const int per_team1 = 3*Kokkos::View<double*,ExecSpace,Kokkos::MemoryTraits<Kokkos::Unmanaged>>::shmem_size(128000);
+  const int per_thread1 = 3*Kokkos::View<double*,ExecSpace,Kokkos::MemoryTraits<Kokkos::Unmanaged>>::shmem_size(16000);
+
+  Kokkos::TeamPolicy<ExecSpace,ScheduleType> policy(10,8,16);
+  Kokkos::parallel_for(policy.set_scratch_size(0,Kokkos::PerTeam(per_team0),Kokkos::PerThread(per_thread0)).set_scratch_size(1,Kokkos::PerTeam(per_team1),Kokkos::PerThread(per_thread1)),
+    KOKKOS_LAMBDA(const typename Kokkos::TeamPolicy<ExecSpace>::member_type& team) {
+    int error = test_team_mulit_level_scratch_loop_body<ExecSpace>(team);
+    errors() += error;
+  });
+  Kokkos::fence();
+  typename Kokkos::View<int,ExecSpace>::HostMirror h_errors= Kokkos::create_mirror_view(d_errors);
+  Kokkos::deep_copy(h_errors,d_errors);
+  ASSERT_EQ(h_errors(),0);
+  // Fence before asserting: ASSERT_EQ returns from this function on failure and would otherwise skip the fence.
+  int error = 0;
+  Kokkos::parallel_reduce(policy.set_scratch_size(0,Kokkos::PerTeam(per_team0),Kokkos::PerThread(per_thread0)).set_scratch_size(1,Kokkos::PerTeam(per_team1),Kokkos::PerThread(per_thread1)),
+    KOKKOS_LAMBDA(const typename Kokkos::TeamPolicy<ExecSpace>::member_type& team, int& count) {
+      count += test_team_mulit_level_scratch_loop_body<ExecSpace>(team);
+  },error);
+  Kokkos::fence();
+  ASSERT_EQ(error,0);
+#endif
+}
+
+
+}
+
+namespace {
+template< class ExecSpace, class ScheduleType >
+struct TestMultiLevelScratchTeam {
+
+  // Constructing the object runs every variant of the test.
+  TestMultiLevelScratchTeam() { run(); }
+
+  // Order: lambda variant (only when lambda dispatch is enabled), then the two functor-class variants.
+  void run()
+  {
+#ifdef KOKKOS_HAVE_CXX11_DISPATCH_LAMBDA
+    Test::test_team_mulit_level_scratch_test_lambda<ExecSpace, ScheduleType>();
+#endif
+    Test::ClassNoShmemSizeFunction<ExecSpace, ScheduleType> without_shmem_fn;
+    without_shmem_fn.run();
+
+    Test::ClassWithShmemSizeFunction<ExecSpace, ScheduleType> with_shmem_fn;
+    with_shmem_fn.run();
+  }
+};
+}
+
+namespace Test {
+
+template< class ExecSpace >
+struct TestShmemSize {
+
+  TestShmemSize() { run(); }
+
+  // Checks that shmem_size() of a rank-3 view of long equals the product of the extents times sizeof(long).
+  void run()
+  {
+    typedef Kokkos::View< long***, ExecSpace > view_type;
+    const size_t extent0 = 5;
+    const size_t extent1 = 6;
+    const size_t extent2 = 7;
+    const size_t expected_bytes = extent0 * extent1 * extent2 * sizeof(long);
+
+    const size_t size = view_type::shmem_size( extent0, extent1, extent2 );
+    ASSERT_EQ( size, expected_bytes );
+  }
+};
+}
+
+/*--------------------------------------------------------------------------*/
diff --git a/lib/kokkos/core/unit_test/TestTeamVector.hpp b/lib/kokkos/core/unit_test/TestTeamVector.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..48187f036844ccfda2d186f245b1673c7ffe5fd4
--- /dev/null
+++ b/lib/kokkos/core/unit_test/TestTeamVector.hpp
@@ -0,0 +1,646 @@
+/*
+//@HEADER
+// ************************************************************************
+// 
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+// 
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+// 
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact  H. Carter Edwards (hcedwar@sandia.gov)
+// 
+// ************************************************************************
+//@HEADER
+*/
+
+#include <Kokkos_Core.hpp>
+
+#include <impl/Kokkos_Timer.hpp>
+#include <iostream>
+#include <cstdlib>
+
+namespace TestTeamVector {
+
+struct my_complex {  // User-defined scalar: complex-style arithmetic on (re, im) plus an extra int member.
+  double re,im;
+  int dummy;
+  KOKKOS_INLINE_FUNCTION
+  my_complex() {
+    re = 0.0;
+    im = 0.0;
+    dummy = 0;
+  }
+  KOKKOS_INLINE_FUNCTION
+  my_complex(const my_complex& src) {
+    re = src.re;
+    im = src.im;
+    dummy = src.dummy;
+  }
+  // Copy from a volatile source.
+  KOKKOS_INLINE_FUNCTION
+  my_complex(const volatile my_complex& src) {
+    re = src.re;
+    im = src.im;
+    dummy = src.dummy;
+  }
+  // Implicit conversion from double: sets the real part only, im and dummy become zero.
+  KOKKOS_INLINE_FUNCTION
+  my_complex(const double& val) {
+    re = val;
+    im = 0.0;
+    dummy = 0;
+  }
+  KOKKOS_INLINE_FUNCTION
+  my_complex& operator += (const my_complex& src) {
+    re += src.re;
+    im += src.im;
+    dummy += src.dummy;
+    return *this;
+  }
+  // Volatile overload of += ; returns void.
+  KOKKOS_INLINE_FUNCTION
+  void operator += (const volatile my_complex& src) volatile {
+    re += src.re;
+    im += src.im;
+    dummy += src.dummy;
+  }
+  KOKKOS_INLINE_FUNCTION
+  my_complex& operator *= (const my_complex& src) {
+    double re_tmp = re*src.re - im*src.im;  // temporaries: both products need the old re and im
+    double im_tmp = re * src.im + im * src.re;
+    re = re_tmp;
+    im = im_tmp;
+    dummy *= src.dummy;
+    return *this;
+  }
+  KOKKOS_INLINE_FUNCTION
+  void operator *= (const volatile my_complex& src) volatile {
+    double re_tmp = re*src.re - im*src.im;
+    double im_tmp = re * src.im + im * src.re;
+    re = re_tmp;
+    im = im_tmp;
+    dummy *= src.dummy;
+  }
+  KOKKOS_INLINE_FUNCTION
+  bool operator == (const my_complex& src) const {  // const: comparison does not modify *this
+    return (re == src.re) && (im == src.im) && ( dummy == src.dummy );
+  }
+  KOKKOS_INLINE_FUNCTION
+  bool operator != (const my_complex& src) const {
+      return (re != src.re) || (im != src.im) || ( dummy != src.dummy );
+  }
+  KOKKOS_INLINE_FUNCTION
+  bool operator != (const double& val) const {  // equal only to (val, 0, 0)
+    return (re != val) ||
+           (im != 0) || (dummy != 0);
+  }
+  KOKKOS_INLINE_FUNCTION
+  my_complex& operator= (const int& val) {
+    re = val;
+    im = 0.0;
+    dummy = 0;
+    return *this;
+  }
+  KOKKOS_INLINE_FUNCTION
+  my_complex& operator= (const double& val) {
+    re = val;
+    im = 0.0;
+    dummy = 0;
+    return *this;
+  }
+  KOKKOS_INLINE_FUNCTION
+  operator double() const {  // yields the real part; im and dummy are dropped
+    return re;
+  }
+};
+
+// TeamThreadRange parallel_for test: each thread accumulates its iterations into its own scratch slot, then one thread checks the team total.
+template<typename Scalar, class ExecutionSpace>
+struct functor_team_for {
+  typedef Kokkos::TeamPolicy<ExecutionSpace> policy_type;
+  typedef ExecutionSpace execution_space;
+  // Set to 1 by any team that detects a failure.
+  Kokkos::View<int,Kokkos::LayoutLeft,ExecutionSpace> flag;
+  functor_team_for(Kokkos::View<int,Kokkos::LayoutLeft,ExecutionSpace> flag_):flag(flag_) {}
+  // Scratch bytes requested per team: 13 Scalars per thread plus 8 bytes.
+  unsigned team_shmem_size(int team_size) const {return team_size*13*sizeof(Scalar)+8;}
+
+  KOKKOS_INLINE_FUNCTION
+  void operator() (typename policy_type::member_type team) const {
+    typedef typename ExecutionSpace::scratch_memory_space shmem_space ;
+    typedef Kokkos::View<Scalar*,shmem_space,Kokkos::MemoryUnmanaged> shared_int;
+    typedef typename shared_int::size_type size_type;
+
+    const size_type shmemSize = team.team_size () * 13;
+    shared_int values = shared_int (team.team_shmem (), shmemSize);
+
+    if (values.ptr_on_device () == NULL || values.dimension_0 () < shmemSize) {
+      printf ("FAILED to allocate shared memory of size %u\n",
+              static_cast<unsigned int> (shmemSize));
+      flag() = 1;  // an allocation failure must fail the test, not just print
+    }
+    else {
+      // Initialize shared memory
+      values(team.team_rank ()) = 0;
+
+      // Accumulate value into per thread shared memory
+      // This is non blocking
+      Kokkos::parallel_for(Kokkos::TeamThreadRange(team,131),[&] (int i) {
+        values(team.team_rank ()) += i - team.league_rank () + team.league_size () + team.team_size ();
+      });
+      // Wait for all memory to be written
+      team.team_barrier ();
+      // One thread per team executes the comparison
+      Kokkos::single(Kokkos::PerTeam(team),[&]() {
+            Scalar test = 0;
+            Scalar value = 0;
+            for (int i = 0; i < 131; ++i) {
+              test += i - team.league_rank () + team.league_size () + team.team_size ();
+            }
+            for (int i = 0; i < team.team_size (); ++i) {
+              value += values(i);
+            }
+            if (test != value) {
+              printf ("FAILED team_parallel_for %i %i %f %f\n",
+                      team.league_rank (), team.team_rank (),
+                      static_cast<double> (test), static_cast<double> (value));
+              flag() = 1;
+            }
+      });
+    }
+  }
+};
+
+// TeamThreadRange parallel_reduce test: the reduced value is compared with a serial sum of the same 131 terms.
+template<typename Scalar, class ExecutionSpace>
+struct functor_team_reduce {
+  typedef Kokkos::TeamPolicy<ExecutionSpace> policy_type;
+  typedef ExecutionSpace execution_space;
+
+  Kokkos::View<int,Kokkos::LayoutLeft,ExecutionSpace> flag;
+  functor_team_reduce(Kokkos::View<int,Kokkos::LayoutLeft,ExecutionSpace> flag_):flag(flag_) {}
+
+  unsigned team_shmem_size(int team_size) const {return team_size*13*sizeof(Scalar)+8;}
+
+  KOKKOS_INLINE_FUNCTION
+  void operator() (typename policy_type::member_type team) const {
+    Scalar value = Scalar();
+    Kokkos::parallel_reduce(Kokkos::TeamThreadRange(team,131),[&] (int i, Scalar& val) {
+      val += i - team.league_rank () + team.league_size () + team.team_size ();
+    },value);
+
+    team.team_barrier ();
+    Kokkos::single(Kokkos::PerTeam(team),[&]() {
+         Scalar test = 0;
+         for (int i = 0; i < 131; ++i) {
+           test += i - team.league_rank () + team.league_size () + team.team_size ();
+         }
+         if (test != value) {
+           if(team.league_rank() == 0) {  // only league rank 0 prints
+             printf ("FAILED team_parallel_reduce %i %i %f %f %lu\n", team.league_rank (), team.team_rank (),
+               static_cast<double> (test), static_cast<double> (value), static_cast<unsigned long> (sizeof(Scalar)));
+           }
+           flag() = 1;  // every failing team sets the flag, whether or not it printed
+         }
+    });
+  }
+};
+
+// TeamThreadRange parallel_reduce test with an explicit join lambda, checked against a serial sum of the same 131 terms.
+template<typename Scalar, class ExecutionSpace>
+struct functor_team_reduce_join {
+  typedef Kokkos::TeamPolicy<ExecutionSpace> policy_type;
+  typedef ExecutionSpace execution_space;
+
+  Kokkos::View<int,Kokkos::LayoutLeft,ExecutionSpace> flag;
+  functor_team_reduce_join(Kokkos::View<int,Kokkos::LayoutLeft,ExecutionSpace> flag_):flag(flag_) {}
+
+  unsigned team_shmem_size(int team_size) const {return team_size*13*sizeof(Scalar)+8;}
+
+  KOKKOS_INLINE_FUNCTION
+  void operator() (typename policy_type::member_type team) const {
+    Scalar value = 0;
+
+    Kokkos::parallel_reduce(Kokkos::TeamThreadRange(team,131)
+      , [&] (int i, Scalar& val) {
+        val += i - team.league_rank () + team.league_size () + team.team_size ();
+      }
+      , [&] (volatile Scalar& val, const volatile Scalar& src) {val+=src;}
+      , value
+    );
+
+    team.team_barrier ();
+    Kokkos::single(Kokkos::PerTeam(team),[&]() {
+         Scalar test = 0;
+         for (int i = 0; i < 131; ++i) {
+           test += i - team.league_rank () + team.league_size () + team.team_size ();
+         }
+         if (test != value) {
+           printf ("FAILED team_parallel_reduce_join %i %i %f %f\n",
+             team.league_rank (), team.team_rank (),
+             static_cast<double> (test), static_cast<double> (value));
+              flag() = 1;
+         }
+    });
+  }
+};
+
+// Like functor_team_for, but every scratch write goes through Kokkos::single(PerThread) inside the TeamThreadRange loop.
+template<typename Scalar, class ExecutionSpace>
+struct functor_team_vector_for {
+  typedef Kokkos::TeamPolicy<ExecutionSpace> policy_type;
+  typedef ExecutionSpace execution_space;
+
+  Kokkos::View<int,Kokkos::LayoutLeft,ExecutionSpace> flag;
+  functor_team_vector_for(Kokkos::View<int,Kokkos::LayoutLeft,ExecutionSpace> flag_):flag(flag_) {}
+
+  unsigned team_shmem_size(int team_size) const {return team_size*13*sizeof(Scalar)+8;}
+
+  KOKKOS_INLINE_FUNCTION
+  void operator() (typename policy_type::member_type team) const {
+    typedef typename ExecutionSpace::scratch_memory_space shmem_space ;
+    typedef Kokkos::View<Scalar*,shmem_space,Kokkos::MemoryUnmanaged> shared_int;
+    typedef typename shared_int::size_type size_type;
+
+    const size_type shmemSize = team.team_size () * 13;
+    shared_int values = shared_int (team.team_shmem (), shmemSize);
+    if (values.ptr_on_device () == NULL || values.dimension_0 () < shmemSize) {
+      printf ("FAILED to allocate shared memory of size %u\n",
+              static_cast<unsigned int> (shmemSize));
+      flag() = 1;  // an allocation failure must fail the test, not just print
+    }
+    else {
+      Kokkos::single(Kokkos::PerThread(team),[&] () {
+        values(team.team_rank ()) = 0;
+      });
+
+      Kokkos::parallel_for(Kokkos::TeamThreadRange(team,131),[&] (int i) {
+        Kokkos::single(Kokkos::PerThread(team),[&] () {
+          values(team.team_rank ()) += i - team.league_rank () + team.league_size () + team.team_size ();
+        });
+      });
+
+      team.team_barrier ();
+      Kokkos::single(Kokkos::PerTeam(team),[&]() {
+        Scalar test = 0;
+        Scalar value = 0;
+        for (int i = 0; i < 131; ++i) {
+          test += i - team.league_rank () + team.league_size () + team.team_size ();
+        }
+        for (int i = 0; i < team.team_size (); ++i) {
+          value += values(i);
+        }
+        if (test != value) {
+          printf ("FAILED team_vector_parallel_for %i %i %f %f\n",
+                  team.league_rank (), team.team_rank (),
+                  static_cast<double> (test), static_cast<double> (value));
+          flag() = 1;
+        }
+      });
+    }
+  }
+};
+
+// TeamThreadRange parallel_reduce test: the reduced value is compared with a serial sum of the same 131 terms.
+template<typename Scalar, class ExecutionSpace>
+struct functor_team_vector_reduce {
+  typedef Kokkos::TeamPolicy<ExecutionSpace> policy_type;
+  typedef ExecutionSpace execution_space;
+
+  Kokkos::View<int,Kokkos::LayoutLeft,ExecutionSpace> flag;
+  functor_team_vector_reduce(Kokkos::View<int,Kokkos::LayoutLeft,ExecutionSpace> flag_):flag(flag_) {}
+
+  unsigned team_shmem_size(int team_size) const {return team_size*13*sizeof(Scalar)+8;}
+
+  KOKKOS_INLINE_FUNCTION
+  void operator() (typename policy_type::member_type team) const {
+    Scalar value = Scalar();
+    Kokkos::parallel_reduce(Kokkos::TeamThreadRange(team,131),[&] (int i, Scalar& val) {
+        val += i - team.league_rank () + team.league_size () + team.team_size ();
+    },value);
+
+    team.team_barrier ();
+    Kokkos::single(Kokkos::PerTeam(team),[&]() {
+      Scalar test = 0;
+      for (int i = 0; i < 131; ++i) {
+        test += i - team.league_rank () + team.league_size () + team.team_size ();
+      }
+      if (test != value) {
+        if(team.league_rank() == 0) {  // only league rank 0 prints
+          printf ("FAILED team_vector_parallel_reduce %i %i %f %f %lu\n", team.league_rank (), team.team_rank (),
+            static_cast<double> (test), static_cast<double> (value), static_cast<unsigned long> (sizeof(Scalar)));
+        }
+        flag() = 1;  // every failing team sets the flag, whether or not it printed
+      }
+    });
+  }
+};
+
+// Team-level reduction with an explicit join lambda, checked against a serial sum of the same 131 terms.
+template<typename Scalar, class ExecutionSpace>
+struct functor_team_vector_reduce_join {
+  typedef Kokkos::TeamPolicy<ExecutionSpace> policy_type;
+  typedef ExecutionSpace execution_space;
+
+  Kokkos::View<int,Kokkos::LayoutLeft,ExecutionSpace> flag;  // set to 1 on any mismatch
+  functor_team_vector_reduce_join(Kokkos::View<int,Kokkos::LayoutLeft,ExecutionSpace> error_flag)
+    : flag(error_flag) {}
+
+  unsigned team_shmem_size(int team_size) const { return team_size*13*sizeof(Scalar)+8; }
+
+  KOKKOS_INLINE_FUNCTION
+  void operator() (typename policy_type::member_type team) const {
+    Scalar reduced = 0;
+    Kokkos::parallel_reduce(Kokkos::TeamThreadRange(team,131),
+      [&] (int i, Scalar& partial) {
+        partial += i - team.league_rank () + team.league_size () + team.team_size ();
+      },
+      [&] (volatile Scalar& dst, const volatile Scalar& src) { dst += src; },
+      reduced);
+
+    team.team_barrier ();
+    Kokkos::single(Kokkos::PerTeam(team),[&]() {
+      Scalar expected = 0;
+      for (int k = 0; k < 131; ++k) {
+        expected += k - team.league_rank () + team.league_size () + team.team_size ();
+      }
+      if (expected != reduced) {
+        printf ("FAILED team_vector_parallel_reduce_join %i %i %f %f\n",
+          team.league_rank (), team.team_rank (),
+          static_cast<double> (expected), static_cast<double> (reduced));
+        flag() = 1;
+      }
+    });
+  }
+};
+
+template<typename Scalar, class ExecutionSpace>
+struct functor_vec_single {  // Checks that a value set inside Kokkos::single(PerThread) is seen by a following vector reduction.
+  typedef Kokkos::TeamPolicy<ExecutionSpace> policy_type;
+  typedef ExecutionSpace execution_space;
+  // Set to 1 when the check below fails.
+  Kokkos::View<int,Kokkos::LayoutLeft,ExecutionSpace> flag;
+  functor_vec_single(Kokkos::View<int,Kokkos::LayoutLeft,ExecutionSpace> flag_):flag(flag_) {}
+
+  KOKKOS_INLINE_FUNCTION
+  void operator() (typename policy_type::member_type team) const {
+
+    // Warning: this test case intentionally violates permissible semantics
+    // It is not valid to get references to members of the enclosing region
+    // inside a parallel_for and write to it.
+    Scalar value = 0;
+
+    Kokkos::parallel_for(Kokkos::ThreadVectorRange(team,13),[&] (int i) {
+      value = i; // This write is violating Kokkos semantics for nested parallelism
+    });
+    // Overwrite value with 1 through single(); presumably the result is broadcast to all vector lanes - confirm.
+    Kokkos::single(Kokkos::PerThread(team),[&] (Scalar& val) {
+      val = 1;
+    },value);
+    // Sum `value` over 13 vector iterations; the result must equal value*13.
+    Scalar value2 = 0;
+    Kokkos::parallel_reduce(Kokkos::ThreadVectorRange(team,13), [&] (int i, Scalar& val) {
+      val += value;
+    },value2);
+
+    if(value2!=(value*13)) {
+      printf("FAILED vector_single broadcast %i %i %f %f\n",team.league_rank(),team.team_rank(),(double) value2,(double) value);
+      flag()=1;
+    }
+  }
+};
+
+// Each thread fills its own 13-element scratch segment through ThreadVectorRange and verifies the segment sum.
+template<typename Scalar, class ExecutionSpace>
+struct functor_vec_for {
+  typedef Kokkos::TeamPolicy<ExecutionSpace> policy_type;
+  typedef ExecutionSpace execution_space;
+
+  Kokkos::View<int,Kokkos::LayoutLeft,ExecutionSpace> flag;  // set to 1 on any failure
+  functor_vec_for(Kokkos::View<int,Kokkos::LayoutLeft,ExecutionSpace> error_flag) : flag(error_flag) {}
+
+  unsigned team_shmem_size(int team_size) const { return team_size*13*sizeof(Scalar)+8; }
+
+  KOKKOS_INLINE_FUNCTION
+  void operator() (typename policy_type::member_type team) const {
+    typedef typename ExecutionSpace::scratch_memory_space scratch_space ;
+    typedef Kokkos::View<Scalar*,scratch_space,Kokkos::MemoryUnmanaged> scratch_array;
+    scratch_array values = scratch_array(team.team_shmem(),team.team_size()*13);
+    const bool allocation_failed = values.ptr_on_device () == NULL ||
+                                   values.dimension_0() < (unsigned) team.team_size() * 13;
+    if (allocation_failed) {
+      printf ("FAILED to allocate memory of size %i\n",
+              static_cast<int> (team.team_size () * 13));
+      flag() = 1;
+      return;
+    }
+
+    Kokkos::parallel_for(Kokkos::ThreadVectorRange(team,13), [&] (int i) {
+      values(13*team.team_rank() + i) = i - team.team_rank() - team.league_rank() + team.league_size() + team.team_size();
+    });
+
+    Kokkos::single(Kokkos::PerThread(team),[&] () {
+      Scalar expected = 0;
+      Scalar stored = 0;
+      for (int k = 0; k < 13; ++k) {
+        expected += k - team.team_rank() - team.league_rank() + team.league_size() + team.team_size();
+        stored += values(13*team.team_rank() + k);
+      }
+      if (expected != stored) {
+        printf ("FAILED vector_par_for %i %i %f %f\n",
+                team.league_rank (), team.team_rank (),
+                static_cast<double> (expected), static_cast<double> (stored));
+        flag() = 1;
+      }
+    });
+  }
+};
+
+// Sums the indices of a 13-iteration ThreadVectorRange reduction and compares against a serial loop over 0..12.
+template<typename Scalar, class ExecutionSpace>
+struct functor_vec_red {
+  typedef Kokkos::TeamPolicy<ExecutionSpace> policy_type;
+  typedef ExecutionSpace execution_space;
+
+  Kokkos::View<int,Kokkos::LayoutLeft,ExecutionSpace> flag;  // set to 1 on mismatch
+  functor_vec_red(Kokkos::View<int,Kokkos::LayoutLeft,ExecutionSpace> error_flag) : flag(error_flag) {}
+
+  KOKKOS_INLINE_FUNCTION
+  void operator() (typename policy_type::member_type team) const {
+    Scalar reduced = 0;
+    Kokkos::parallel_reduce(Kokkos::ThreadVectorRange(team,13),
+      [&] (int i, Scalar& partial) { partial += i; },
+      reduced);
+    Kokkos::single(Kokkos::PerThread(team),[&] () {
+      Scalar expected = 0;
+      for(int k = 0; k < 13; k++) {
+        expected += k;
+      }
+      if(expected != reduced) {
+        printf("FAILED vector_par_reduce %i %i %f %f\n",
+               team.league_rank(),team.team_rank(),(double) expected,(double) reduced);
+        flag()=1;
+      }
+    });
+  }
+};
+
+// Multiplies the indices of a 13-iteration ThreadVectorRange with a custom join and compares with a serial product.
+template<typename Scalar, class ExecutionSpace>
+struct functor_vec_red_join {
+  typedef Kokkos::TeamPolicy<ExecutionSpace> policy_type;
+  typedef ExecutionSpace execution_space;
+
+  Kokkos::View<int,Kokkos::LayoutLeft,ExecutionSpace> flag;  // set to 1 on mismatch
+  functor_vec_red_join(Kokkos::View<int,Kokkos::LayoutLeft,ExecutionSpace> error_flag) : flag(error_flag) {}
+
+  KOKKOS_INLINE_FUNCTION
+  void operator() (typename policy_type::member_type team) const {
+    Scalar product = 1;
+    Kokkos::parallel_reduce(Kokkos::ThreadVectorRange(team,13),
+      [&] (int i, Scalar& partial) { partial *= i; },
+      [&] (Scalar& dst, const Scalar& src) { dst *= src; },
+      product);
+
+    Kokkos::single(Kokkos::PerThread(team),[&] () {
+      Scalar expected = 1;
+      for(int k = 0; k < 13; k++) {  // starts at 0, so the expected product is 0
+        expected *= k;
+      }
+      if(expected != product) {
+        printf("FAILED vector_par_reduce_join %i %i %f %f\n",
+               team.league_rank(),team.team_rank(),(double) expected,(double) product);
+        flag()=1;
+      }
+    });
+  }
+};
+
+template<typename Scalar, class ExecutionSpace>
+struct functor_vec_scan {
+  typedef Kokkos::TeamPolicy<ExecutionSpace> policy_type;
+  typedef ExecutionSpace execution_space;
+
+  // Rank-0 view used as a failure flag; written with 1 when the check below fails.
+  Kokkos::View<int,Kokkos::LayoutLeft,ExecutionSpace> flag;
+  functor_vec_scan( Kokkos::View<int,Kokkos::LayoutLeft,ExecutionSpace> flag_ ) : flag( flag_ ) {}
+
+  // Scans i = 0..12 over ThreadVectorRange(team,13). On the final pass, val (after
+  // adding i) is compared with a serially computed 0 + 1 + ... + i.
+  KOKKOS_INLINE_FUNCTION
+  void operator() (typename policy_type::member_type team) const {
+    Kokkos::parallel_scan( Kokkos::ThreadVectorRange( team, 13 ), [&] ( int i, Scalar& val, bool final ) {
+      val += i;
+      if ( !final ) { return; }
+      Scalar test = 0;
+      for ( int k = 0; k <= i; ++k ) { test += k; }
+      if ( test != val ) {
+        printf("FAILED vector_par_scan %i %i %f %f\n",team.league_rank(),team.team_rank(),(double) test,(double) val);
+        flag()=1;
+      }
+    });
+  }
+};
+
+template<typename Scalar, class ExecutionSpace>
+struct functor_reduce {
+  typedef double value_type;
+  typedef Kokkos::TeamPolicy<ExecutionSpace> policy_type;
+  typedef ExecutionSpace execution_space;
+  // Rank-0 flag view; stored by the constructor but not written by operator().
+  Kokkos::View<int,Kokkos::LayoutLeft,ExecutionSpace> flag;
+  functor_reduce(Kokkos::View<int,Kokkos::LayoutLeft,ExecutionSpace> flag_):flag(flag_) {}
+  // Adds league_rank()*100 + team_rank() to the running sum; team_rank() is the rank accessor the other functors in this file use.
+  KOKKOS_INLINE_FUNCTION
+  void operator() (typename policy_type::member_type team, double& sum) const {
+    sum += team.league_rank() * 100 + team.team_rank();
+  }
+};
+
+template<typename Scalar,class ExecutionSpace>
+bool test_scalar(int nteams, int team_size, int test) {
+  // Launches the functor selected by `test` (0..10) on a TeamPolicy of nteams teams
+  // of team_size threads and returns true when the device-side flag is still 0.
+  // Any other value of `test` launches nothing, so the function returns true.
+  typedef Kokkos::View<int,Kokkos::LayoutLeft,ExecutionSpace> flag_view;
+  typedef Kokkos::TeamPolicy<ExecutionSpace> team_policy;
+
+  flag_view d_flag("flag");
+  typename flag_view::HostMirror h_flag("h_flag");
+  h_flag() = 0 ;
+  Kokkos::deep_copy(d_flag,h_flag);
+
+  // Cases 0-4 and 8-10 pass 8 as a third policy argument; cases 5-7 pass only two.
+  switch (test) {
+    case 0:  Kokkos::parallel_for( std::string("A") , team_policy(nteams,team_size,8),
+                                   functor_vec_red<Scalar, ExecutionSpace>(d_flag)); break;
+    case 1:  Kokkos::parallel_for( team_policy(nteams,team_size,8),
+                                   functor_vec_red_join<Scalar, ExecutionSpace>(d_flag)); break;
+    case 2:  Kokkos::parallel_for( team_policy(nteams,team_size,8),
+                                   functor_vec_scan<Scalar, ExecutionSpace>(d_flag)); break;
+    case 3:  Kokkos::parallel_for( team_policy(nteams,team_size,8),
+                                   functor_vec_for<Scalar, ExecutionSpace>(d_flag)); break;
+    case 4:  Kokkos::parallel_for( "B" , team_policy(nteams,team_size,8),
+                                   functor_vec_single<Scalar, ExecutionSpace>(d_flag)); break;
+    case 5:  Kokkos::parallel_for( team_policy(nteams,team_size),
+                                   functor_team_for<Scalar, ExecutionSpace>(d_flag)); break;
+    case 6:  Kokkos::parallel_for( team_policy(nteams,team_size),
+                                   functor_team_reduce<Scalar, ExecutionSpace>(d_flag)); break;
+    case 7:  Kokkos::parallel_for( team_policy(nteams,team_size),
+                                   functor_team_reduce_join<Scalar, ExecutionSpace>(d_flag)); break;
+    case 8:  Kokkos::parallel_for( team_policy(nteams,team_size,8),
+                                   functor_team_vector_for<Scalar, ExecutionSpace>(d_flag)); break;
+    case 9:  Kokkos::parallel_for( team_policy(nteams,team_size,8),
+                                   functor_team_vector_reduce<Scalar, ExecutionSpace>(d_flag)); break;
+    case 10: Kokkos::parallel_for( team_policy(nteams,team_size,8),
+                                   functor_team_vector_reduce_join<Scalar, ExecutionSpace>(d_flag)); break;
+    default: break;
+  }
+
+  Kokkos::deep_copy(h_flag,d_flag);
+
+  const bool no_failure = (h_flag() == 0);
+  return no_failure;
+}
+
+template<class ExecutionSpace>
+bool Test(int test) {
+  // Runs test case `test` for each Scalar type with 317 teams of 33 threads; the
+  // && chain short-circuits, so Scalar types after the first failure are skipped.
+  return test_scalar<int, ExecutionSpace>(317,33,test)
+      && test_scalar<long long int, ExecutionSpace>(317,33,test)
+      && test_scalar<float, ExecutionSpace>(317,33,test)
+      && test_scalar<double, ExecutionSpace>(317,33,test)
+      && test_scalar<my_complex, ExecutionSpace>(317,33,test);
+}
+
+}
+
diff --git a/lib/kokkos/core/unit_test/TestTemplateMetaFunctions.hpp b/lib/kokkos/core/unit_test/TestTemplateMetaFunctions.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..4f136bc64b977e3243b9aaf789d4837e7e5ca793
--- /dev/null
+++ b/lib/kokkos/core/unit_test/TestTemplateMetaFunctions.hpp
@@ -0,0 +1,219 @@
+/*
+//@HEADER
+// ************************************************************************
+// 
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+// 
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+// 
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact  H. Carter Edwards (hcedwar@sandia.gov)
+// 
+// ************************************************************************
+//@HEADER
+*/
+
+#include <Kokkos_Core.hpp>
+
+#define KOKKOS_PRAGMA_UNROLL(a)
+
+namespace {
+
+template<class Scalar, class ExecutionSpace>
+struct SumPlain {
+  typedef ExecutionSpace execution_space;
+  typedef typename Kokkos::View<Scalar*,execution_space> type;
+  type view;
+  SumPlain(type view_):view(view_) {}
+  // Fixture with no value_type/init/join members; operator() is const like its sibling fixtures.
+  KOKKOS_INLINE_FUNCTION
+  void operator() (int i, Scalar& val) const {
+    val += Scalar();  // contributes a value-initialized Scalar; the index i is not used
+  }
+};
+
+template<class Scalar, class ExecutionSpace>
+struct SumInitJoinFinalValueType {
+  // Fixture declaring value_type, init() and join(); init takes a plain reference
+  // and join takes two volatile references (src is not const).
+  typedef ExecutionSpace execution_space;
+  typedef Scalar value_type;
+  typedef typename Kokkos::View<Scalar*,execution_space> type;
+
+  type view;
+
+  SumInitJoinFinalValueType( type view_ ) : view( view_ ) {}
+
+  // Resets val to a value-initialized value_type.
+  KOKKOS_INLINE_FUNCTION
+  void init( value_type& val ) const { val = value_type(); }
+
+  // Accumulates src into val.
+  KOKKOS_INLINE_FUNCTION
+  void join( volatile value_type& val, volatile value_type& src ) const { val += src; }
+
+  // Adds a value-initialized value_type to val; the index i is not used.
+  KOKKOS_INLINE_FUNCTION
+  void operator() ( int i, value_type& val ) const { val += value_type(); }
+};
+
+template<class Scalar, class ExecutionSpace>
+struct SumInitJoinFinalValueType2 {
+  // Fixture declaring value_type, init() and join(); init takes a volatile reference
+  // and join takes (volatile&, const volatile&).
+  typedef ExecutionSpace execution_space;
+  typedef Scalar value_type;
+  typedef typename Kokkos::View<Scalar*,execution_space> type;
+
+  type view;
+
+  SumInitJoinFinalValueType2( type view_ ) : view( view_ ) {}
+
+  // Resets val to a value-initialized value_type.
+  KOKKOS_INLINE_FUNCTION
+  void init( volatile value_type& val ) const { val = value_type(); }
+
+  // Accumulates src into val.
+  KOKKOS_INLINE_FUNCTION
+  void join( volatile value_type& val, const volatile value_type& src ) const { val += src; }
+
+  // Adds a value-initialized value_type to val; the index i is not used.
+  KOKKOS_INLINE_FUNCTION
+  void operator() ( int i, value_type& val ) const { val += value_type(); }
+};
+
+template<class Scalar, class ExecutionSpace>
+struct SumInitJoinFinalValueTypeArray {
+  typedef ExecutionSpace execution_space;
+  typedef Scalar value_type[];
+  typedef typename Kokkos::View<Scalar*,execution_space> type;
+
+  type view;
+  int n;  // number of leading array entries touched by init/join/operator()
+  SumInitJoinFinalValueTypeArray( type view_, int n_ ) : view( view_ ), n( n_ ) {}
+
+  // Zeroes the first n entries of val.
+  KOKKOS_INLINE_FUNCTION
+  void init( value_type val ) const {
+    for ( int idx = 0; idx < n; ++idx ) { val[idx] = 0; }
+  }
+
+  // Element-wise val[idx] += src[idx] over the first n entries.
+  KOKKOS_INLINE_FUNCTION
+  void join( volatile value_type val, const volatile value_type src ) const {
+    for ( int idx = 0; idx < n; ++idx ) { val[idx] += src[idx]; }
+  }
+
+  // Adds idx*i to entry idx for each of the first n entries.
+  KOKKOS_INLINE_FUNCTION
+  void operator() ( int i, value_type val ) const {
+    for ( int idx = 0; idx < n; ++idx ) { val[idx] += idx*i; }
+  }
+};
+
+template<class Scalar, class ExecutionSpace>
+struct SumWrongInitJoinFinalValueType {
+  // Fixture whose init() takes double& regardless of Scalar and whose join() takes
+  // a non-volatile const reference as src.
+  typedef ExecutionSpace execution_space;
+  typedef Scalar value_type;
+  typedef typename Kokkos::View<Scalar*,execution_space> type;
+
+  type view;
+
+  SumWrongInitJoinFinalValueType( type view_ ) : view( view_ ) {}
+
+  // Resets a double (not a value_type) to a value-initialized double.
+  KOKKOS_INLINE_FUNCTION
+  void init( double& val ) const { val = double(); }
+
+  // Accumulates src into val.
+  KOKKOS_INLINE_FUNCTION
+  void join( volatile value_type& val, const value_type& src ) const { val += src; }
+
+  // Adds a value-initialized value_type to val; the index i is not used.
+  KOKKOS_INLINE_FUNCTION
+  void operator() ( int i, value_type& val ) const { val += value_type(); }
+};
+
+template<class Scalar, class ExecutionSpace>
+void TestTemplateMetaFunctions() {  // Only constructs a view; every FunctorHasInit/FunctorHasJoin check below is commented out.
+  typedef typename Kokkos::View<Scalar*,ExecutionSpace> type;
+  type a("A",100);  // constructed with label "A" and argument 100; not used by any live statement below
+/*  #ifdef KOKKOS_HAVE_CXX11
+  int sum_plain_has_init_arg = Kokkos::Impl::FunctorHasInit<SumPlain<Scalar,ExecutionSpace>, Scalar& >::value;
+  ASSERT_EQ(sum_plain_has_init_arg,0);
+  int sum_initjoinfinalvaluetype_has_init_arg = Kokkos::Impl::FunctorHasInit<SumInitJoinFinalValueType<Scalar,ExecutionSpace>, Scalar >::value;
+  ASSERT_EQ(sum_initjoinfinalvaluetype_has_init_arg,1);
+  int sum_initjoinfinalvaluetype_has_init_arg2 = Kokkos::Impl::FunctorHasInit<SumInitJoinFinalValueType2<Scalar,ExecutionSpace>, Scalar >::value;
+  ASSERT_EQ(sum_initjoinfinalvaluetype_has_init_arg2,1);
+  int sum_wronginitjoinfinalvaluetype_has_init_arg = Kokkos::Impl::FunctorHasInit<SumWrongInitJoinFinalValueType<Scalar,ExecutionSpace>, Scalar >::value;
+  ASSERT_EQ(sum_wronginitjoinfinalvaluetype_has_init_arg,0);
+
+  //int sum_initjoinfinalvaluetypearray_has_init_arg = Kokkos::Impl::FunctorHasInit<SumInitJoinFinalValueTypeArray<Scalar,ExecutionSpace>, Scalar[] >::value;
+  //ASSERT_EQ(sum_initjoinfinalvaluetypearray_has_init_arg,1);
+
+  #else
+
+  int sum_plain_has_init_arg = Kokkos::Impl::FunctorHasInit<SumPlain<Scalar,ExecutionSpace>, Scalar& >::value;
+  ASSERT_EQ(sum_plain_has_init_arg,0);
+  int sum_initjoinfinalvaluetype_has_init_arg = Kokkos::Impl::FunctorHasInit<SumInitJoinFinalValueType<Scalar,ExecutionSpace>, Scalar& >::value;
+  ASSERT_EQ(sum_initjoinfinalvaluetype_has_init_arg,1);
+  int sum_wronginitjoinfinalvaluetype_has_init_arg = Kokkos::Impl::FunctorHasInit<SumWrongInitJoinFinalValueType<Scalar,ExecutionSpace>, Scalar& >::value;
+  ASSERT_EQ(sum_wronginitjoinfinalvaluetype_has_init_arg,1);
+
+  #endif
+
+  //printf("Values Init: %i %i %i\n",sum_plain_has_init_arg,sum_initjoinfinalvaluetype_has_init_arg,sum_wronginitjoinfinalvaluetype_has_init_arg);
+
+#ifdef KOKKOS_HAVE_CXX11
+  int sum_plain_has_join_arg = Kokkos::Impl::FunctorHasJoin<SumPlain<Scalar,ExecutionSpace>, Scalar >::value;
+  ASSERT_EQ(sum_plain_has_join_arg,0);
+  int sum_initjoinfinalvaluetype_has_join_arg = Kokkos::Impl::FunctorHasJoin<SumInitJoinFinalValueType<Scalar,ExecutionSpace>, Scalar >::value;
+  ASSERT_EQ(sum_initjoinfinalvaluetype_has_join_arg,1);
+  int sum_initjoinfinalvaluetype_has_join_arg2 = Kokkos::Impl::FunctorHasJoin<SumInitJoinFinalValueType2<Scalar,ExecutionSpace>, Scalar >::value;
+  ASSERT_EQ(sum_initjoinfinalvaluetype_has_join_arg2,1);
+  int sum_wronginitjoinfinalvaluetype_has_join_arg = Kokkos::Impl::FunctorHasJoin<SumWrongInitJoinFinalValueType<Scalar,ExecutionSpace>, Scalar >::value;
+  ASSERT_EQ(sum_wronginitjoinfinalvaluetype_has_join_arg,0);
+#else
+  int sum_plain_has_join_arg = Kokkos::Impl::FunctorHasJoin<SumPlain<Scalar,ExecutionSpace>, Scalar& >::value;
+  ASSERT_EQ(sum_plain_has_join_arg,0);
+  int sum_initjoinfinalvaluetype_has_join_arg = Kokkos::Impl::FunctorHasJoin<SumInitJoinFinalValueType<Scalar,ExecutionSpace>, Scalar& >::value;
+  ASSERT_EQ(sum_initjoinfinalvaluetype_has_join_arg,1);
+  int sum_initjoinfinalvaluetype_has_join_arg2 = Kokkos::Impl::FunctorHasJoin<SumInitJoinFinalValueType2<Scalar,ExecutionSpace>, Scalar& >::value;
+  ASSERT_EQ(sum_initjoinfinalvaluetype_has_join_arg2,1);
+  int sum_wronginitjoinfinalvaluetype_has_join_arg = Kokkos::Impl::FunctorHasJoin<SumWrongInitJoinFinalValueType<Scalar,ExecutionSpace>, Scalar& >::value;
+  ASSERT_EQ(sum_wronginitjoinfinalvaluetype_has_join_arg,1);
+#endif*/
+  //printf("Values Join: %i %i %i\n",sum_plain_has_join_arg,sum_initjoinfinalvaluetype_has_join_arg,sum_wronginitjoinfinalvaluetype_has_join_arg);
+}
+
+}
diff --git a/lib/kokkos/core/unit_test/TestThreads.cpp b/lib/kokkos/core/unit_test/TestThreads.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..93049b95dd7c75bcd88b8d6408e8a0249f905855
--- /dev/null
+++ b/lib/kokkos/core/unit_test/TestThreads.cpp
@@ -0,0 +1,614 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+//
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact  H. Carter Edwards (hcedwar@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#include <gtest/gtest.h>
+
+#include <Kokkos_Macros.hpp>
+
+#if defined( KOKKOS_HAVE_PTHREAD )
+#ifdef KOKKOS_LAMBDA
+#undef KOKKOS_LAMBDA
+#endif
+#define KOKKOS_LAMBDA [=]
+
+#include <Kokkos_Core.hpp>
+
+#include <Threads/Kokkos_Threads_TaskPolicy.hpp>
+
+//----------------------------------------------------------------------------
+
+#include <TestSharedAlloc.hpp>
+#include <TestViewMapping.hpp>
+
+#include <TestViewImpl.hpp>
+
+#include <TestViewAPI.hpp>
+#include <TestViewSubview.hpp>
+#include <TestViewOfClass.hpp>
+#include <TestAtomic.hpp>
+#include <TestAtomicOperations.hpp>
+
+#include <TestReduce.hpp>
+#include <TestScan.hpp>
+#include <TestRange.hpp>
+#include <TestTeam.hpp>
+#include <TestAggregate.hpp>
+#include <TestAggregateReduction.hpp>
+#include <TestCompilerMacros.hpp>
+#include <TestTaskPolicy.hpp>
+#include <TestMemoryPool.hpp>
+
+
+#include <TestCXX11.hpp>
+#include <TestCXX11Deduction.hpp>
+#include <TestTeamVector.hpp>
+#include <TestMemorySpaceTracking.hpp>
+#include <TestTemplateMetaFunctions.hpp>
+
+
+#include <TestPolicyConstruction.hpp>
+
+#include <TestMDRange.hpp>
+
+namespace Test {
+
+class threads : public ::testing::Test {
+protected:
+  // Exercises repeated Threads initialize/finalize cycles with several thread
+  // counts, then leaves the backend initialized for the tests of this fixture.
+  static void SetUpTestCase()
+  {
+    // Finalize without initialize is a no-op:
+    Kokkos::Threads::finalize();
+
+    const unsigned numa_count       = Kokkos::hwloc::get_available_numa_count();
+    const unsigned cores_per_numa   = Kokkos::hwloc::get_available_cores_per_numa();
+    const unsigned threads_per_core = Kokkos::hwloc::get_available_threads_per_core();
+
+    const unsigned numa_or_one = std::max( 1u , numa_count );
+    const unsigned hw_per_numa = cores_per_numa * threads_per_core ;
+    const unsigned full_count  = numa_or_one * std::max( 2u , hw_per_numa );
+    const unsigned split_count = std::max( 1u , numa_count * 2 ) * std::max( 2u , hw_per_numa / 2 );
+    const unsigned half_count  = numa_or_one * std::max( 2u , hw_per_numa / 2 );
+
+    // Initialize and finalize with a thread count of 1:
+    Kokkos::Threads::initialize( 1u );
+    Kokkos::Threads::finalize();
+
+    Kokkos::Threads::initialize( full_count );
+    Kokkos::Threads::finalize();
+
+    Kokkos::Threads::initialize( split_count );
+    Kokkos::Threads::finalize();
+
+    // Quick attempt to verify thread start/terminate don't have race condition:
+    for ( unsigned cycle = 0 ; cycle < 10 ; ++cycle ) {
+      Kokkos::Threads::initialize( half_count );
+      Kokkos::Threads::sleep();
+      Kokkos::Threads::wake();
+      Kokkos::Threads::finalize();
+    }
+
+    // Left initialized on exit; TearDownTestCase() performs the matching finalize.
+    Kokkos::Threads::initialize( half_count );
+    Kokkos::Threads::print_configuration( std::cout , true /* detailed */ );
+  }
+
+  // Finalizes the Threads backend that SetUpTestCase() left initialized.
+  static void TearDownTestCase()
+  {
+    Kokkos::Threads::finalize();
+  }
+};
+
+TEST_F( threads , init ) {
+  // Intentionally empty: only the fixture's SetUpTestCase()/TearDownTestCase() run.
+}
+
+TEST_F( threads , md_range ) {
+  typedef Kokkos::Threads exec_space ;  // execution space under test
+  TestMDRange_2D< exec_space >::test_for2( 100 , 100 );
+  TestMDRange_3D< exec_space >::test_for3( 100 , 100 , 100 );
+}
+
+TEST_F( threads , dispatch ) {
+  // Launches 100 x 100 empty parallel_for kernels over RangePolicy(0,len), len = 0..99.
+  const int repeat = 100 ;
+  for ( int pass = 0 ; pass < repeat ; ++pass ) {
+    for ( int len = 0 ; len < repeat ; ++len ) {
+      Kokkos::parallel_for( Kokkos::RangePolicy< Kokkos::Threads >( 0 , len ) , KOKKOS_LAMBDA( int ) {} );
+    }
+  }
+}
+
+// Runs test_shared_alloc for HostSpace with the Threads execution space.
+TEST_F( threads , impl_shared_alloc )
+{ test_shared_alloc< Kokkos::HostSpace , Kokkos::Threads >(); }
+// Runs the RangePolicy and TeamPolicy construction tests for Threads.
+TEST_F( threads , policy_construction ) {
+  TestRangePolicyConstruction< Kokkos::Threads >();
+  TestTeamPolicyConstruction< Kokkos::Threads >();
+}
+// Runs the view-mapping helpers in order: base, subview, operator, then TestViewMappingAtomic::run().
+TEST_F( threads , impl_view_mapping ) {
+  test_view_mapping< Kokkos::Threads >();
+  test_view_mapping_subview< Kokkos::Threads >();
+  test_view_mapping_operator< Kokkos::Threads >();
+  TestViewMappingAtomic< Kokkos::Threads >::run();
+}
+
+
+// Runs test_view_impl for the Threads execution space.
+TEST_F( threads , view_impl )
+{ test_view_impl< Kokkos::Threads >(); }
+
+// Runs TestViewAPI with double as the first template argument.
+TEST_F( threads , view_api )
+{ TestViewAPI< double , Kokkos::Threads >(); }
+
+// Runs view_nested_view; the leading ::Test:: names the helper by its
+// fully qualified namespace.
+TEST_F( threads , view_nested_view )
+{ ::Test::view_nested_view< Kokkos::Threads >(); }
+
+// TestViewSubview::test_auto_1d with LayoutLeft.
+TEST_F( threads , view_subview_auto_1d_left )
+{ TestViewSubview::test_auto_1d< Kokkos::LayoutLeft,Kokkos::Threads >(); }
+
+// TestViewSubview::test_auto_1d with LayoutRight.
+TEST_F( threads , view_subview_auto_1d_right )
+{ TestViewSubview::test_auto_1d< Kokkos::LayoutRight,Kokkos::Threads >(); }
+
+// TestViewSubview::test_auto_1d with LayoutStride.
+TEST_F( threads , view_subview_auto_1d_stride )
+{ TestViewSubview::test_auto_1d< Kokkos::LayoutStride,Kokkos::Threads >(); }
+
+// TestViewSubview::test_1d_strided_assignment.
+TEST_F( threads , view_subview_assign_strided )
+{ TestViewSubview::test_1d_strided_assignment< Kokkos::Threads >(); }
+
+// TestViewSubview::test_left_0.
+TEST_F( threads , view_subview_left_0 )
+{ TestViewSubview::test_left_0< Kokkos::Threads >(); }
+
+// TestViewSubview::test_left_1.
+TEST_F( threads , view_subview_left_1 )
+{ TestViewSubview::test_left_1< Kokkos::Threads >(); }
+
+// TestViewSubview::test_left_2.
+TEST_F( threads , view_subview_left_2 )
+{ TestViewSubview::test_left_2< Kokkos::Threads >(); }
+
+// TestViewSubview::test_left_3.
+TEST_F( threads , view_subview_left_3 )
+{ TestViewSubview::test_left_3< Kokkos::Threads >(); }
+
+// TestViewSubview::test_right_0.
+TEST_F( threads , view_subview_right_0 )
+{ TestViewSubview::test_right_0< Kokkos::Threads >(); }
+
+// TestViewSubview::test_right_1.
+TEST_F( threads , view_subview_right_1 )
+{ TestViewSubview::test_right_1< Kokkos::Threads >(); }
+
+// TestViewSubview::test_right_3 (this file has no test_right_2 counterpart here).
+TEST_F( threads , view_subview_right_3 )
+{ TestViewSubview::test_right_3< Kokkos::Threads >(); }
+
+// Runs TestViewAggregate, then TestViewAggregateReduction, for Threads.
+TEST_F( threads , view_aggregate ) {
+  TestViewAggregate< Kokkos::Threads >();
+  TestViewAggregateReduction< Kokkos::Threads >();
+}
+
+TEST_F( threads , range_tag ) {
+  // For n = 2 and then n = 1000: static-schedule for/reduce/scan at n, dynamic-schedule
+  // for/reduce/scan at n+1, then test_dynamic_policy at n.
+  typedef TestRange< Kokkos::Threads , Kokkos::Schedule<Kokkos::Static> >  static_range ;
+  typedef TestRange< Kokkos::Threads , Kokkos::Schedule<Kokkos::Dynamic> > dynamic_range ;
+  const int sizes[2] = { 2 , 1000 };
+  for ( int s = 0 ; s < 2 ; ++s ) {
+    const int n = sizes[s] ;
+    static_range::test_for( n );
+    static_range::test_reduce( n );
+    static_range::test_scan( n );
+    dynamic_range::test_for( n + 1 );
+    dynamic_range::test_reduce( n + 1 );
+    dynamic_range::test_scan( n + 1 );
+    dynamic_range::test_dynamic_policy( n );
+  }
+}
+
+TEST_F( threads , team_tag ) {
+  typedef TestTeamPolicy< Kokkos::Threads , Kokkos::Schedule<Kokkos::Static> >  static_team ;
+  typedef TestTeamPolicy< Kokkos::Threads , Kokkos::Schedule<Kokkos::Dynamic> > dynamic_team ;
+  const int sizes[2] = { 2 , 1000 };
+  for ( int s = 0 ; s < 2 ; ++s ) {  // per size: static for/reduce, then dynamic for/reduce
+    static_team::test_for( sizes[s] );
+    static_team::test_reduce( sizes[s] );
+    dynamic_team::test_for( sizes[s] );
+    dynamic_team::test_reduce( sizes[s] );
+  }
+}
+
+// Runs TestReduce for long with argument 1000000.
+TEST_F( threads , long_reduce )
+{ TestReduce< long ,   Kokkos::Threads >( 1000000 ); }
+
+// Runs TestReduce for double with argument 1000000.
+TEST_F( threads , double_reduce )
+{ TestReduce< double ,   Kokkos::Threads >( 1000000 ); }
+
+// Runs execute_integer for int and size_t, execute_float for double, execute_basic for complex<double>.
+TEST_F( threads , reducers ) {
+  TestReducers< int , Kokkos::Threads >::execute_integer();
+  TestReducers< size_t , Kokkos::Threads >::execute_integer();
+  TestReducers< double , Kokkos::Threads >::execute_float();
+  TestReducers< Kokkos::complex<double> , Kokkos::Threads >::execute_basic();
+}
+
+TEST_F( threads, team_long_reduce) {  // TestReduceTeam<long>: static then dynamic schedule, with arguments 3 and 100000
+  TestReduceTeam< long ,   Kokkos::Threads , Kokkos::Schedule<Kokkos::Static> >( 3 );
+  TestReduceTeam< long ,   Kokkos::Threads , Kokkos::Schedule<Kokkos::Dynamic> >( 3 );
+  TestReduceTeam< long ,   Kokkos::Threads , Kokkos::Schedule<Kokkos::Static> >( 100000 );
+  TestReduceTeam< long ,   Kokkos::Threads , Kokkos::Schedule<Kokkos::Dynamic> >( 100000 );
+}
+
+TEST_F( threads, team_double_reduce) {  // TestReduceTeam<double>: static then dynamic schedule, with arguments 3 and 100000
+  TestReduceTeam< double ,   Kokkos::Threads , Kokkos::Schedule<Kokkos::Static> >( 3 );
+  TestReduceTeam< double ,   Kokkos::Threads , Kokkos::Schedule<Kokkos::Dynamic> >( 3 );
+  TestReduceTeam< double ,   Kokkos::Threads , Kokkos::Schedule<Kokkos::Static> >( 100000 );
+  TestReduceTeam< double ,   Kokkos::Threads , Kokkos::Schedule<Kokkos::Dynamic> >( 100000 );
+}
+
+// Runs TestReduceDynamic for long with argument 1000000.
+TEST_F( threads , long_reduce_dynamic )
+{ TestReduceDynamic< long ,   Kokkos::Threads >( 1000000 ); }
+
+// Runs TestReduceDynamic for double with argument 1000000.
+TEST_F( threads , double_reduce_dynamic )
+{ TestReduceDynamic< double ,   Kokkos::Threads >( 1000000 ); }
+
+// Runs TestReduceDynamicView for long with argument 1000000.
+TEST_F( threads , long_reduce_dynamic_view )
+{ TestReduceDynamicView< long ,   Kokkos::Threads >( 1000000 ); }
+
+// Runs TestSharedTeam with the static schedule, then the dynamic schedule.
+TEST_F( threads , team_shared_request ) {
+  TestSharedTeam< Kokkos::Threads , Kokkos::Schedule<Kokkos::Static> >();
+  TestSharedTeam< Kokkos::Threads , Kokkos::Schedule<Kokkos::Dynamic> >();
+}
+#ifdef KOKKOS_HAVE_CXX11_DISPATCH_LAMBDA
+// Lambda variant; compiled only when KOKKOS_HAVE_CXX11_DISPATCH_LAMBDA is defined.
+TEST_F( threads , team_lambda_shared_request ) {
+  TestLambdaSharedTeam< Kokkos::HostSpace, Kokkos::Threads , Kokkos::Schedule<Kokkos::Static> >();
+  TestLambdaSharedTeam< Kokkos::HostSpace, Kokkos::Threads , Kokkos::Schedule<Kokkos::Dynamic> >();
+}
+#endif
+// Runs TestShmemSize for the Threads execution space.
+TEST_F( threads , shmem_size )
+{ TestShmemSize< Kokkos::Threads >(); }
+
+TEST_F( threads , view_remap )
+{
+  // Fills a LayoutLeft int view with 1,2,3,... (i0 varying fastest), deep-copies it into
+  // a LayoutRight double view of the same extents, and checks every entry by index.
+  enum { N0 = 3 , N1 = 2 , N2 = 8 , N3 = 9 };
+
+  typedef Kokkos::View< double*[N1][N2][N3] , Kokkos::LayoutRight , Kokkos::Threads > output_type ;
+  typedef Kokkos::View< int**[N2][N3] , Kokkos::LayoutLeft , Kokkos::Threads > input_type ;
+  typedef Kokkos::View< int*[N0][N2][N3] , Kokkos::LayoutLeft , Kokkos::Threads > diff_type ;
+
+  output_type output( "output" , N0 );
+  input_type  input ( "input" , N0 , N1 );
+  diff_type   diff  ( "diff" , N0 );  // constructed, but only named by the disabled copy below
+
+  int value = 0 ;
+  for ( size_t i3 = 0 ; i3 < N3 ; ++i3 ) {
+    for ( size_t i2 = 0 ; i2 < N2 ; ++i2 ) {
+      for ( size_t i1 = 0 ; i1 < N1 ; ++i1 ) {
+        for ( size_t i0 = 0 ; i0 < N0 ; ++i0 ) {
+          input(i0,i1,i2,i3) = ++value ;
+        }
+      }
+    }
+  }
+
+  // Kokkos::deep_copy( diff , input ); // throw with incompatible shape
+  Kokkos::deep_copy( output , input );
+
+  value = 0 ;
+  for ( size_t i3 = 0 ; i3 < N3 ; ++i3 ) {
+    for ( size_t i2 = 0 ; i2 < N2 ; ++i2 ) {
+      for ( size_t i1 = 0 ; i1 < N1 ; ++i1 ) {
+        for ( size_t i0 = 0 ; i0 < N0 ; ++i0 ) {
+          ++value ;
+          ASSERT_EQ( value , ((int) output(i0,i1,i2,i3) ) );
+        }
+      }
+    }
+  }
+}
+
+//----------------------------------------------------------------------------
+
+TEST_F( threads , atomics )  // Asserts TestAtomic::Loop for each scalar type with second argument 1, 2 and 3.
+{
+  const int loop_count = 1e6 ;  // double literal converted to int (1000000)
+
+  ASSERT_TRUE( ( TestAtomic::Loop<int,Kokkos::Threads>(loop_count,1) ) );
+  ASSERT_TRUE( ( TestAtomic::Loop<int,Kokkos::Threads>(loop_count,2) ) );
+  ASSERT_TRUE( ( TestAtomic::Loop<int,Kokkos::Threads>(loop_count,3) ) );
+
+  ASSERT_TRUE( ( TestAtomic::Loop<unsigned int,Kokkos::Threads>(loop_count,1) ) );
+  ASSERT_TRUE( ( TestAtomic::Loop<unsigned int,Kokkos::Threads>(loop_count,2) ) );
+  ASSERT_TRUE( ( TestAtomic::Loop<unsigned int,Kokkos::Threads>(loop_count,3) ) );
+
+  ASSERT_TRUE( ( TestAtomic::Loop<long int,Kokkos::Threads>(loop_count,1) ) );
+  ASSERT_TRUE( ( TestAtomic::Loop<long int,Kokkos::Threads>(loop_count,2) ) );
+  ASSERT_TRUE( ( TestAtomic::Loop<long int,Kokkos::Threads>(loop_count,3) ) );
+
+  ASSERT_TRUE( ( TestAtomic::Loop<unsigned long int,Kokkos::Threads>(loop_count,1) ) );
+  ASSERT_TRUE( ( TestAtomic::Loop<unsigned long int,Kokkos::Threads>(loop_count,2) ) );
+  ASSERT_TRUE( ( TestAtomic::Loop<unsigned long int,Kokkos::Threads>(loop_count,3) ) );
+
+  ASSERT_TRUE( ( TestAtomic::Loop<long long int,Kokkos::Threads>(loop_count,1) ) );
+  ASSERT_TRUE( ( TestAtomic::Loop<long long int,Kokkos::Threads>(loop_count,2) ) );
+  ASSERT_TRUE( ( TestAtomic::Loop<long long int,Kokkos::Threads>(loop_count,3) ) );
+
+  ASSERT_TRUE( ( TestAtomic::Loop<double,Kokkos::Threads>(loop_count,1) ) );
+  ASSERT_TRUE( ( TestAtomic::Loop<double,Kokkos::Threads>(loop_count,2) ) );
+  ASSERT_TRUE( ( TestAtomic::Loop<double,Kokkos::Threads>(loop_count,3) ) );
+
+  ASSERT_TRUE( ( TestAtomic::Loop<float,Kokkos::Threads>(100,1) ) );  // float passes 100 rather than loop_count
+  ASSERT_TRUE( ( TestAtomic::Loop<float,Kokkos::Threads>(100,2) ) );
+  ASSERT_TRUE( ( TestAtomic::Loop<float,Kokkos::Threads>(100,3) ) );
+
+  ASSERT_TRUE( ( TestAtomic::Loop<Kokkos::complex<double> ,Kokkos::Threads>(100,1) ) );  // complex<double> also passes 100
+  ASSERT_TRUE( ( TestAtomic::Loop<Kokkos::complex<double> ,Kokkos::Threads>(100,2) ) );
+  ASSERT_TRUE( ( TestAtomic::Loop<Kokkos::complex<double> ,Kokkos::Threads>(100,3) ) );
+
+  ASSERT_TRUE( ( TestAtomic::Loop<TestAtomic::SuperScalar<3>, Kokkos::Threads>(loop_count,1) ) );
+  ASSERT_TRUE( ( TestAtomic::Loop<TestAtomic::SuperScalar<3>, Kokkos::Threads>(loop_count,2) ) );
+  ASSERT_TRUE( ( TestAtomic::Loop<TestAtomic::SuperScalar<3>, Kokkos::Threads>(loop_count,3) ) );
+}
+
+TEST_F( threads , atomic_operations )
+{
+  const int start = 1; //Avoid zero for division
+  const int end = 11;
+  for (int i = start; i < end; ++i)
+  {
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<int,Kokkos::Threads>(start, end-i, 1 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<int,Kokkos::Threads>(start, end-i, 2 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<int,Kokkos::Threads>(start, end-i, 3 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<int,Kokkos::Threads>(start, end-i, 4 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<int,Kokkos::Threads>(start, end-i, 5 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<int,Kokkos::Threads>(start, end-i, 6 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<int,Kokkos::Threads>(start, end-i, 7 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<int,Kokkos::Threads>(start, end-i, 8 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<int,Kokkos::Threads>(start, end-i, 9 ) ) );
+
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<unsigned int,Kokkos::Threads>(start, end-i, 1 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<unsigned int,Kokkos::Threads>(start, end-i, 2 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<unsigned int,Kokkos::Threads>(start, end-i, 3 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<unsigned int,Kokkos::Threads>(start, end-i, 4 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<unsigned int,Kokkos::Threads>(start, end-i, 5 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<unsigned int,Kokkos::Threads>(start, end-i, 6 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<unsigned int,Kokkos::Threads>(start, end-i, 7 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<unsigned int,Kokkos::Threads>(start, end-i, 8 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<unsigned int,Kokkos::Threads>(start, end-i, 9 ) ) );
+
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<long int,Kokkos::Threads>(start, end-i, 1 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<long int,Kokkos::Threads>(start, end-i, 2 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<long int,Kokkos::Threads>(start, end-i, 3 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<long int,Kokkos::Threads>(start, end-i, 4 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<long int,Kokkos::Threads>(start, end-i, 5 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<long int,Kokkos::Threads>(start, end-i, 6 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<long int,Kokkos::Threads>(start, end-i, 7 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<long int,Kokkos::Threads>(start, end-i, 8 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<long int,Kokkos::Threads>(start, end-i, 9 ) ) );
+
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<unsigned long int,Kokkos::Threads>(start, end-i, 1 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<unsigned long int,Kokkos::Threads>(start, end-i, 2 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<unsigned long int,Kokkos::Threads>(start, end-i, 3 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<unsigned long int,Kokkos::Threads>(start, end-i, 4 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<unsigned long int,Kokkos::Threads>(start, end-i, 5 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<unsigned long int,Kokkos::Threads>(start, end-i, 6 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<unsigned long int,Kokkos::Threads>(start, end-i, 7 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<unsigned long int,Kokkos::Threads>(start, end-i, 8 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<unsigned long int,Kokkos::Threads>(start, end-i, 9 ) ) );
+
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<long long int,Kokkos::Threads>(start, end-i, 1 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<long long int,Kokkos::Threads>(start, end-i, 2 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<long long int,Kokkos::Threads>(start, end-i, 3 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<long long int,Kokkos::Threads>(start, end-i, 4 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<long long int,Kokkos::Threads>(start, end-i, 5 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<long long int,Kokkos::Threads>(start, end-i, 6 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<long long int,Kokkos::Threads>(start, end-i, 7 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<long long int,Kokkos::Threads>(start, end-i, 8 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<long long int,Kokkos::Threads>(start, end-i, 9 ) ) );
+
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestNonIntegralType<double,Kokkos::Threads>(start, end-i, 1 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestNonIntegralType<double,Kokkos::Threads>(start, end-i, 2 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestNonIntegralType<double,Kokkos::Threads>(start, end-i, 3 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestNonIntegralType<double,Kokkos::Threads>(start, end-i, 4 ) ) );
+
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestNonIntegralType<float,Kokkos::Threads>(start, end-i, 1 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestNonIntegralType<float,Kokkos::Threads>(start, end-i, 2 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestNonIntegralType<float,Kokkos::Threads>(start, end-i, 3 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestNonIntegralType<float,Kokkos::Threads>(start, end-i, 4 ) ) );
+  }
+
+}
+
+//----------------------------------------------------------------------------
+
+#if 0 /* Compiled out: the scan_small test in this region is never built. */
+TEST_F( threads , scan_small )
+{
+  typedef TestScan< Kokkos::Threads , Kokkos::Impl::ThreadsExecUseScanSmall > TestScanFunctor ;
+  for ( int i = 0 ; i < 1000 ; ++i ) { // 1000 repetitions with the two smaller arguments
+    TestScanFunctor( 10 );
+    TestScanFunctor( 10000 );
+  }
+  TestScanFunctor( 1000000 ); // one temporary each with the two larger arguments
+  TestScanFunctor( 10000000 );
+  // Fence the Threads execution space before the test body ends.
+  Kokkos::Threads::fence();
+}
+#endif /* #if 0 */
+
+TEST_F( threads , scan ) // Drives TestScan instantiated for Kokkos::Threads.
+{
+  TestScan< Kokkos::Threads >::test_range( 1 , 1000 ); // presumably covers every size in [1,1000] -- confirm in TestScan
+  TestScan< Kokkos::Threads >( 1000000 ); // temporaries constructed with two larger arguments and discarded
+  TestScan< Kokkos::Threads >( 10000000 );
+  Kokkos::Threads::fence(); // fence the Threads execution space before the test returns
+}
+
+//----------------------------------------------------------------------------
+
+TEST_F( threads , team_scan ) // Drives TestScanTeam with both Static and Dynamic schedules.
+{
+  TestScanTeam< Kokkos::Threads , Kokkos::Schedule<Kokkos::Static> >( 10 ); // argument 10, each schedule
+  TestScanTeam< Kokkos::Threads , Kokkos::Schedule<Kokkos::Dynamic> >( 10 );
+  TestScanTeam< Kokkos::Threads , Kokkos::Schedule<Kokkos::Static> >( 10000 ); // argument 10000, each schedule
+  TestScanTeam< Kokkos::Threads , Kokkos::Schedule<Kokkos::Dynamic> >( 10000 );
+}
+
+//----------------------------------------------------------------------------
+
+TEST_F( threads , compiler_macros ) // Passes only if TestCompilerMacros::Test returns true for Kokkos::Threads.
+{
+  ASSERT_TRUE( ( TestCompilerMacros::Test< Kokkos::Threads >() ) ); // extra parentheses keep the template comma out of the macro argument list
+}
+
+TEST_F( threads , memory_space ) // Calls TestMemorySpace for Kokkos::Threads; nothing is asserted in this body.
+{
+  TestMemorySpace< Kokkos::Threads >(); // any checks would have to live inside TestMemorySpace itself
+}
+
+TEST_F( threads , memory_pool ) // Memory-pool tests instantiated for Kokkos::Threads.
+{
+  bool val = TestMemoryPool::test_mempool< Kokkos::Threads >( 128, 128000000 );
+  ASSERT_TRUE( val ); // only test_mempool's result is asserted in this body
+  // The two calls below are made for their effects; no return value is examined here.
+  TestMemoryPool::test_mempool2< Kokkos::Threads >( 64, 4, 1000000, 2000000 );
+  // NOTE(review): argument meanings are defined by TestMemoryPool, outside this file.
+  TestMemoryPool::test_memory_exhaustion< Kokkos::Threads >();
+}
+
+//----------------------------------------------------------------------------
+
+TEST_F( threads , template_meta_functions ) // Calls TestTemplateMetaFunctions with int and Kokkos::Threads.
+{
+  TestTemplateMetaFunctions<int, Kokkos::Threads >(); // nothing is asserted in this body
+}
+
+//----------------------------------------------------------------------------
+
+#if defined( KOKKOS_HAVE_DEFAULT_DEVICE_TYPE_THREADS ) /* both tests below are built only with this macro defined */
+TEST_F( threads , cxx11 )
+{
+  if ( Kokkos::Impl::is_same< Kokkos::DefaultExecutionSpace , Kokkos::Threads >::value ) { // skip unless Threads is the default execution space
+    ASSERT_TRUE( ( TestCXX11::Test< Kokkos::Threads >(1) ) ); // arguments 1..4 must each yield true
+    ASSERT_TRUE( ( TestCXX11::Test< Kokkos::Threads >(2) ) );
+    ASSERT_TRUE( ( TestCXX11::Test< Kokkos::Threads >(3) ) );
+    ASSERT_TRUE( ( TestCXX11::Test< Kokkos::Threads >(4) ) );
+  }
+}
+// reduction_deduction: calls TestCXX11::test_reduction_deduction; nothing is asserted in this body.
+TEST_F( threads , reduction_deduction )
+{
+  TestCXX11::test_reduction_deduction< Kokkos::Threads >();
+}
+#endif /* #if defined( KOKKOS_HAVE_DEFAULT_DEVICE_TYPE_THREADS ) */
+
+TEST_F( threads , team_vector ) // TestTeamVector::Test must return true for every argument 0 through 10.
+{
+  ASSERT_TRUE( ( TestTeamVector::Test< Kokkos::Threads >(0) ) ); // ASSERT_TRUE: the first failing argument ends the test
+  ASSERT_TRUE( ( TestTeamVector::Test< Kokkos::Threads >(1) ) );
+  ASSERT_TRUE( ( TestTeamVector::Test< Kokkos::Threads >(2) ) );
+  ASSERT_TRUE( ( TestTeamVector::Test< Kokkos::Threads >(3) ) );
+  ASSERT_TRUE( ( TestTeamVector::Test< Kokkos::Threads >(4) ) );
+  ASSERT_TRUE( ( TestTeamVector::Test< Kokkos::Threads >(5) ) );
+  ASSERT_TRUE( ( TestTeamVector::Test< Kokkos::Threads >(6) ) );
+  ASSERT_TRUE( ( TestTeamVector::Test< Kokkos::Threads >(7) ) );
+  ASSERT_TRUE( ( TestTeamVector::Test< Kokkos::Threads >(8) ) );
+  ASSERT_TRUE( ( TestTeamVector::Test< Kokkos::Threads >(9) ) );
+  ASSERT_TRUE( ( TestTeamVector::Test< Kokkos::Threads >(10) ) );
+}
+
+#if defined( KOKKOS_ENABLE_TASKPOLICY )
+// The three task tests below are built only when KOKKOS_ENABLE_TASKPOLICY is defined.
+TEST_F( threads , task_policy )
+{
+  TestTaskPolicy::test_task_dep< Kokkos::Threads >( 10 );
+  // test_fib is called for i = 0..24 and test_fib2 for i = 0..34; no result is asserted in this body.
+  for ( long i = 0 ; i < 25 ; ++i ) {
+//    printf( "test_fib():  %2ld\n", i );
+    TestTaskPolicy::test_fib< Kokkos::Threads >(i);
+  }
+  for ( long i = 0 ; i < 35 ; ++i ) {
+//    printf( "test_fib2(): %2ld\n", i );
+    TestTaskPolicy::test_fib2< Kokkos::Threads >(i);
+  }
+}
+// task_team: a single call of test_task_team with argument 1000.
+TEST_F( threads , task_team )
+{
+  TestTaskPolicy::test_task_team< Kokkos::Threads >(1000);
+}
+// task_latch: test_latch is called with 10 and then with 1000.
+TEST_F( threads , task_latch )
+{
+  TestTaskPolicy::test_latch< Kokkos::Threads >(10);
+  TestTaskPolicy::test_latch< Kokkos::Threads >(1000);
+}
+
+#endif /* #if defined( KOKKOS_ENABLE_TASKPOLICY ) */
+
+} // namespace Test
+
+#endif /* #if defined( KOKKOS_HAVE_PTHREAD ) */
diff --git a/lib/kokkos/core/unit_test/TestTile.hpp b/lib/kokkos/core/unit_test/TestTile.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..dfb2bd81b3dec3485688f9827d3f1f7ad24ddb9d
--- /dev/null
+++ b/lib/kokkos/core/unit_test/TestTile.hpp
@@ -0,0 +1,153 @@
+//@HEADER
+// ************************************************************************
+// 
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+// 
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+// 
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact  H. Carter Edwards (hcedwar@sandia.gov)
+// 
+// ************************************************************************
+//@HEADER
+
+#ifndef TEST_TILE_HPP
+#define TEST_TILE_HPP
+
+#include <Kokkos_Core.hpp>
+
+namespace TestTile {
+
+template < typename Device , typename TileLayout>
+struct ReduceTileErrors // Functor with two call operators: one fills a tiled view, one counts layout errors in it.
+{
+  typedef Device execution_space ;
+  // array_type: rank-2 view of ptrdiff_t using TileLayout; tile_type: a fixed N0 x N1 LayoutLeft view of one tile.
+  typedef Kokkos::View< ptrdiff_t**, TileLayout, Device>  array_type;
+  typedef Kokkos::View< ptrdiff_t[ TileLayout::N0 ][ TileLayout::N1 ], Kokkos::LayoutLeft , Device >  tile_type ;
+
+  array_type m_array ; // the view that is filled and then verified
+
+  typedef ptrdiff_t value_type; // reduction value: number of errors found
+
+  ReduceTileErrors( array_type a )
+    : m_array(a)
+  {}
+
+  // init/join: the error count starts at zero and partial counts are added together.
+  KOKKOS_INLINE_FUNCTION
+  static void init( value_type & errors )
+  {
+    errors = 0;
+  }
+
+  KOKKOS_INLINE_FUNCTION
+  static void join( volatile value_type & errors ,
+                    const volatile value_type & src_errors )
+  {
+    errors += src_errors;
+  }
+
+  // Initialize: store in each element its own offset (in elements) from m_array(0,0).
+  KOKKOS_INLINE_FUNCTION
+  void operator()( size_t iwork ) const
+  {
+    const size_t i = iwork % m_array.dimension_0(); // decode the flat work index into (i,j)
+    const size_t j = iwork / m_array.dimension_0();
+    if ( j < m_array.dimension_1() ) { // work items past the last column do nothing
+      m_array(i,j) = & m_array(i,j) - & m_array(0,0);
+
+// printf("m_array(%d,%d) = %d\n",int(i),int(j),int(m_array(i,j)));
+
+    }
+  }
+
+  // Verify: one work item per tile; counts tiles or in-bounds elements whose stored offset is not the expected one.
+  KOKKOS_INLINE_FUNCTION
+  void operator()( size_t iwork , value_type & errors ) const
+  {
+    const size_t tile_dim0 = ( m_array.dimension_0() + TileLayout::N0 - 1 ) / TileLayout::N0 ; // tiles per dimension, rounded up
+    const size_t tile_dim1 = ( m_array.dimension_1() + TileLayout::N1 - 1 ) / TileLayout::N1 ;
+
+    const size_t itile = iwork % tile_dim0 ; // decode the flat work index into tile coordinates
+    const size_t jtile = iwork / tile_dim0 ;
+
+    if ( jtile < tile_dim1 ) {
+      // presumably the (itile,jtile) tile of m_array -- tile_subview is defined outside this file
+      tile_type tile = Kokkos::tile_subview( m_array , itile , jtile );
+      // The tile's first element is expected at offset (itile + jtile*tile_dim0) * N0 * N1.
+      if ( tile(0,0) != ptrdiff_t(( itile + jtile * tile_dim0 ) * TileLayout::N0 * TileLayout::N1 ) ) {
+        ++errors ;
+      }
+      else {
+        // Inside the tile, element (i,j) is expected at tile(0,0) + i + j*N0; only in-bounds elements are checked.
+        for ( size_t j = 0 ; j < size_t(TileLayout::N1) ; ++j ) {
+        for ( size_t i = 0 ; i < size_t(TileLayout::N0) ; ++i ) {
+          const size_t iglobal = i + itile * TileLayout::N0 ;
+          const size_t jglobal = j + jtile * TileLayout::N1 ;
+
+          if ( iglobal < m_array.dimension_0() && jglobal < m_array.dimension_1() ) {
+            if ( tile(i,j) != ptrdiff_t( tile(0,0) + i + j * TileLayout::N0 ) ) ++errors ;
+
+// printf("tile(%d,%d)(%d,%d) = %d\n",int(itile),int(jtile),int(i),int(j),int(tile(i,j)));
+
+          }
+        }
+        }
+      }
+    }
+  }
+};
+
+template< class Space , unsigned N0 , unsigned N1 >
+void test( const size_t dim0 , const size_t dim1 ) // Fills a dim0 x dim1 LayoutTileLeft<N0,N1> view, then expects zero errors.
+{
+  typedef Kokkos::LayoutTileLeft<N0,N1>  array_layout ;
+  typedef ReduceTileErrors< Space , array_layout > functor_type ;
+
+  const size_t tile_dim0 = ( dim0 + N0 - 1 ) / N0 ; // tiles per dimension, rounded up
+  const size_t tile_dim1 = ( dim1 + N1 - 1 ) / N1 ;
+  // The view is created with an empty label.
+  typename functor_type::array_type array("",dim0,dim1);
+  // Pass 1: one work item per element, using the functor's single-argument operator().
+  Kokkos::parallel_for( Kokkos::RangePolicy<Space,size_t>(0,dim0*dim1) , functor_type( array ) );
+
+  ptrdiff_t error = 0 ;
+  // Pass 2: one work item per tile; the functor's error counts are reduced into `error`.
+  Kokkos::parallel_reduce( Kokkos::RangePolicy<Space,size_t>(0,tile_dim0*tile_dim1) , functor_type( array ) , error );
+
+  EXPECT_EQ( error , ptrdiff_t(0) ); // non-fatal gtest check
+}
+
+} /* namespace TestTile */
+
+#endif //TEST_TILE_HPP
+
diff --git a/lib/kokkos/core/unit_test/TestViewAPI.hpp b/lib/kokkos/core/unit_test/TestViewAPI.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..ae4c6d2185d12bdf1f61ab66c73244e6b38bb50b
--- /dev/null
+++ b/lib/kokkos/core/unit_test/TestViewAPI.hpp
@@ -0,0 +1,1416 @@
+/*
+//@HEADER
+// ************************************************************************
+// 
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+// 
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+// 
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact  H. Carter Edwards (hcedwar@sandia.gov)
+// 
+// ************************************************************************
+//@HEADER
+*/
+
+#include <gtest/gtest.h>
+
+#include <Kokkos_Core.hpp>
+#include <stdexcept>
+#include <sstream>
+#include <iostream>
+
+/*--------------------------------------------------------------------------*/
+
+
+/*--------------------------------------------------------------------------*/
+
+namespace Test {
+
+#if KOKKOS_USING_EXP_VIEW
+// Variant for the variadic View<T,P...>: returns view.span(), or 0 when a sanity check fails.
+template< class T , class ... P >
+size_t allocation_count( const Kokkos::View<T,P...> & view )
+{
+  const size_t card  = view.size();
+  const size_t alloc = view.span();
+  // Independent check on View<int*>; 400 presumably means 100 * sizeof(int) with 4-byte int -- TODO confirm.
+  const int memory_span = Kokkos::View<int*>::required_allocation_size(100);
+  // Return 0 if the span is smaller than the element count or the check above did not give 400.
+  return (card <= alloc && memory_span == 400) ? alloc : 0 ;
+}
+
+#else
+// Variant for the five-parameter View<T,L,D,M,S>: returns view.capacity(), or 0 if it is below the element count.
+template< class T , class L , class D , class M , class S >
+size_t allocation_count( const Kokkos::View<T,L,D,M,S> & view )
+{
+  const size_t card  = Kokkos::Impl::cardinality_count( view.shape() );
+  const size_t alloc = view.capacity();
+
+  return card <= alloc ? alloc : 0 ;
+}
+
+#endif
+
+/*--------------------------------------------------------------------------*/
+
+template< typename T, class DeviceType>
+struct TestViewOperator // Functor that copies one N x D view into another, element by element, via operator().
+{
+  typedef typename DeviceType::execution_space  execution_space ;
+
+  static const unsigned N = 100 ; // runtime extent and parallel_for work count
+  static const unsigned D = 3 ; // compile-time extent of the second dimension
+
+  typedef Kokkos::View< T*[D] , execution_space > view_type ;
+  // Both members are const views; operator() below still assigns through v2(i,...).
+  const view_type v1 ;
+  const view_type v2 ;
+  // Each view is constructed with a label and runtime extent N.
+  TestViewOperator()
+    : v1( "v1" , N )
+    , v2( "v2" , N )
+    {}
+  // testit: dispatch a freshly constructed functor over N work items.
+  static void testit()
+  {
+    Kokkos::parallel_for( N , TestViewOperator() );
+  }
+  // Copies the three entries of row i from v1 to v2.
+  KOKKOS_INLINE_FUNCTION
+  void operator()( const unsigned i ) const
+  {
+    const unsigned X = 0 ;
+    const unsigned Y = 1 ;
+    const unsigned Z = 2 ;
+
+    v2(i,X) = v1(i,X);
+    v2(i,Y) = v1(i,Y);
+    v2(i,Z) = v1(i,Z);
+  }
+};
+
+/*--------------------------------------------------------------------------*/
+
+template< class DataType ,
+          class DeviceType ,
+          unsigned Rank = Kokkos::ViewTraits< DataType >::rank > // Rank defaults to the rank that ViewTraits reports for DataType
+struct TestViewOperator_LeftAndRight ; // declaration only; specializations on Rank supply the definitions
+
+template< class DataType , class DeviceType >
+struct TestViewOperator_LeftAndRight< DataType , DeviceType , 8 > // rank-8 specialization
+{
+  typedef typename DeviceType::execution_space    execution_space ;
+  typedef typename DeviceType::memory_space       memory_space ;
+  typedef typename execution_space::size_type     size_type ;
+  // Reduction value: an int used as a set of error-flag bits (1, 2, 4 and 8 are set in operator() below).
+  typedef int value_type ;
+  // join: merge two partial results by OR-ing their flag bits.
+  KOKKOS_INLINE_FUNCTION
+  static void join( volatile value_type & update ,
+                    const volatile value_type & input )
+    { update |= input ; }
+  // init: start with no flag bits set.
+  KOKKOS_INLINE_FUNCTION
+  static void init( value_type & update )
+    { update = 0 ; }
+
+  // The same DataType viewed with LayoutLeft, LayoutRight and LayoutStride.
+  typedef Kokkos::
+    View< DataType, Kokkos::LayoutLeft, execution_space > left_view ;
+
+  typedef Kokkos::
+    View< DataType, Kokkos::LayoutRight, execution_space > right_view ;
+
+  typedef Kokkos::
+    View< DataType, Kokkos::LayoutStride, execution_space > stride_view ;
+  // left_stride/right_stride are constructed from left/right; the *_alloc members hold allocation_count() of each.
+  left_view    left ;
+  right_view   right ;
+  stride_view  left_stride ;
+  stride_view  right_stride ;
+  long         left_alloc ;
+  long         right_alloc ;
+
+  TestViewOperator_LeftAndRight()
+    : left(  "left" )
+    , right( "right" )
+    , left_stride( left )
+    , right_stride( right )
+    , left_alloc( allocation_count( left ) )
+    , right_alloc( allocation_count( right ) )
+    {}
+  // testit: run operator() through a parallel_reduce with work count 1 and require that no flag bit was set.
+  static void testit()
+  {
+    TestViewOperator_LeftAndRight driver ;
+
+    int error_flag = 0 ;
+
+    Kokkos::parallel_reduce( 1 , driver , error_flag );
+
+    ASSERT_EQ( error_flag , 0 );
+  }
+  // Visits every index tuple of both views and records violations in `update`; the work index is unused.
+  KOKKOS_INLINE_FUNCTION
+  void operator()( const size_type , value_type & update ) const
+  {
+    long offset ;
+    // `left` pass, i0 innermost: flag 1 if an element offset does not exceed the previous one or is >= left_alloc.
+    offset = -1 ;
+    for ( unsigned i7 = 0 ; i7 < unsigned(left.dimension_7()) ; ++i7 )
+    for ( unsigned i6 = 0 ; i6 < unsigned(left.dimension_6()) ; ++i6 )
+    for ( unsigned i5 = 0 ; i5 < unsigned(left.dimension_5()) ; ++i5 )
+    for ( unsigned i4 = 0 ; i4 < unsigned(left.dimension_4()) ; ++i4 )
+    for ( unsigned i3 = 0 ; i3 < unsigned(left.dimension_3()) ; ++i3 )
+    for ( unsigned i2 = 0 ; i2 < unsigned(left.dimension_2()) ; ++i2 )
+    for ( unsigned i1 = 0 ; i1 < unsigned(left.dimension_1()) ; ++i1 )
+    for ( unsigned i0 = 0 ; i0 < unsigned(left.dimension_0()) ; ++i0 )
+    {
+      const long j = & left( i0, i1, i2, i3, i4, i5, i6, i7 ) -
+                     & left(  0,  0,  0,  0,  0,  0,  0,  0 );
+      if ( j <= offset || left_alloc <= j ) { update |= 1 ; }
+      offset = j ;
+      // Flag 4: left_stride must yield the same element address as left.
+      if ( & left(i0,i1,i2,i3,i4,i5,i6,i7) !=
+           & left_stride(i0,i1,i2,i3,i4,i5,i6,i7) ) {
+        update |= 4 ;
+      }
+    }
+    // `right` pass, i7 innermost: flag 2 if an element offset does not exceed the previous one or is >= right_alloc.
+    offset = -1 ;
+    for ( unsigned i0 = 0 ; i0 < unsigned(right.dimension_0()) ; ++i0 )
+    for ( unsigned i1 = 0 ; i1 < unsigned(right.dimension_1()) ; ++i1 )
+    for ( unsigned i2 = 0 ; i2 < unsigned(right.dimension_2()) ; ++i2 )
+    for ( unsigned i3 = 0 ; i3 < unsigned(right.dimension_3()) ; ++i3 )
+    for ( unsigned i4 = 0 ; i4 < unsigned(right.dimension_4()) ; ++i4 )
+    for ( unsigned i5 = 0 ; i5 < unsigned(right.dimension_5()) ; ++i5 )
+    for ( unsigned i6 = 0 ; i6 < unsigned(right.dimension_6()) ; ++i6 )
+    for ( unsigned i7 = 0 ; i7 < unsigned(right.dimension_7()) ; ++i7 )
+    {
+      const long j = & right( i0, i1, i2, i3, i4, i5, i6, i7 ) -
+                     & right(  0,  0,  0,  0,  0,  0,  0,  0 );
+      if ( j <= offset || right_alloc <= j ) { update |= 2 ; }
+      offset = j ;
+      // Flag 8: right_stride must yield the same element address as right.
+      if ( & right(i0,i1,i2,i3,i4,i5,i6,i7) !=
+           & right_stride(i0,i1,i2,i3,i4,i5,i6,i7) ) {
+        update |= 8 ;
+      }
+    }
+  }
+};
+
+template< class DataType , class DeviceType >
+struct TestViewOperator_LeftAndRight< DataType , DeviceType , 7 > // rank-7 specialization
+{
+  typedef typename DeviceType::execution_space  execution_space ;
+  typedef typename DeviceType::memory_space     memory_space ;
+  typedef typename execution_space::size_type   size_type ;
+  // Reduction value: an int used as a set of error-flag bits (1 and 2 are set in operator() below).
+  typedef int value_type ;
+  // join: merge two partial results by OR-ing their flag bits.
+  KOKKOS_INLINE_FUNCTION
+  static void join( volatile value_type & update ,
+                    const volatile value_type & input )
+    { update |= input ; }
+  // init: start with no flag bits set.
+  KOKKOS_INLINE_FUNCTION
+  static void init( value_type & update )
+    { update = 0 ; }
+
+  // The same DataType viewed with LayoutLeft and with LayoutRight.
+  typedef Kokkos::
+    View< DataType, Kokkos::LayoutLeft, execution_space > left_view ;
+
+  typedef Kokkos::
+    View< DataType, Kokkos::LayoutRight, execution_space > right_view ;
+  // The *_alloc members hold allocation_count() of the corresponding view.
+  left_view    left ;
+  right_view   right ;
+  long         left_alloc ;
+  long         right_alloc ;
+
+  TestViewOperator_LeftAndRight()
+    : left(  "left" )
+    , right( "right" )
+    , left_alloc( allocation_count( left ) )
+    , right_alloc( allocation_count( right ) )
+    {}
+  // testit: run operator() through a parallel_reduce with work count 1 and require that no flag bit was set.
+  static void testit()
+  {
+    TestViewOperator_LeftAndRight driver ;
+
+    int error_flag = 0 ;
+
+    Kokkos::parallel_reduce( 1 , driver , error_flag );
+
+    ASSERT_EQ( error_flag , 0 );
+  }
+  // Visits every index tuple of both views and records violations in `update`; the work index is unused.
+  KOKKOS_INLINE_FUNCTION
+  void operator()( const size_type , value_type & update ) const
+  {
+    long offset ;
+    // `left` pass, i0 innermost: flag 1 if an element offset does not exceed the previous one or is >= left_alloc.
+    offset = -1 ;
+    for ( unsigned i6 = 0 ; i6 < unsigned(left.dimension_6()) ; ++i6 )
+    for ( unsigned i5 = 0 ; i5 < unsigned(left.dimension_5()) ; ++i5 )
+    for ( unsigned i4 = 0 ; i4 < unsigned(left.dimension_4()) ; ++i4 )
+    for ( unsigned i3 = 0 ; i3 < unsigned(left.dimension_3()) ; ++i3 )
+    for ( unsigned i2 = 0 ; i2 < unsigned(left.dimension_2()) ; ++i2 )
+    for ( unsigned i1 = 0 ; i1 < unsigned(left.dimension_1()) ; ++i1 )
+    for ( unsigned i0 = 0 ; i0 < unsigned(left.dimension_0()) ; ++i0 )
+    {
+      const long j = & left( i0, i1, i2, i3, i4, i5, i6 ) -
+                     & left(  0,  0,  0,  0,  0,  0,  0 );
+      if ( j <= offset || left_alloc <= j ) { update |= 1 ; }
+      offset = j ;
+    }
+    // `right` pass, i6 innermost: flag 2 if an element offset does not exceed the previous one or is >= right_alloc.
+    offset = -1 ;
+    for ( unsigned i0 = 0 ; i0 < unsigned(right.dimension_0()) ; ++i0 )
+    for ( unsigned i1 = 0 ; i1 < unsigned(right.dimension_1()) ; ++i1 )
+    for ( unsigned i2 = 0 ; i2 < unsigned(right.dimension_2()) ; ++i2 )
+    for ( unsigned i3 = 0 ; i3 < unsigned(right.dimension_3()) ; ++i3 )
+    for ( unsigned i4 = 0 ; i4 < unsigned(right.dimension_4()) ; ++i4 )
+    for ( unsigned i5 = 0 ; i5 < unsigned(right.dimension_5()) ; ++i5 )
+    for ( unsigned i6 = 0 ; i6 < unsigned(right.dimension_6()) ; ++i6 )
+    {
+      const long j = & right( i0, i1, i2, i3, i4, i5, i6 ) -
+                     & right(  0,  0,  0,  0,  0,  0,  0 );
+      if ( j <= offset || right_alloc <= j ) { update |= 2 ; }
+      offset = j ;
+    }
+  }
+};
+
+template< class DataType , class DeviceType >
+struct TestViewOperator_LeftAndRight< DataType , DeviceType , 6 > // rank-6 specialization
+{
+  typedef typename DeviceType::execution_space  execution_space ;
+  typedef typename DeviceType::memory_space     memory_space ;
+  typedef typename execution_space::size_type   size_type ;
+  // Reduction value: an int used as a set of error-flag bits (1 and 2 are set in operator() below).
+  typedef int value_type ;
+  // join: merge two partial results by OR-ing their flag bits.
+  KOKKOS_INLINE_FUNCTION
+  static void join( volatile value_type & update ,
+                    const volatile value_type & input )
+    { update |= input ; }
+  // init: start with no flag bits set.
+  KOKKOS_INLINE_FUNCTION
+  static void init( value_type & update )
+    { update = 0 ; }
+
+  // The same DataType viewed with LayoutLeft and with LayoutRight.
+  typedef Kokkos::
+    View< DataType, Kokkos::LayoutLeft, execution_space > left_view ;
+
+  typedef Kokkos::
+    View< DataType, Kokkos::LayoutRight, execution_space > right_view ;
+  // The *_alloc members hold allocation_count() of the corresponding view.
+  left_view    left ;
+  right_view   right ;
+  long         left_alloc ;
+  long         right_alloc ;
+
+  TestViewOperator_LeftAndRight()
+    : left(  "left" )
+    , right( "right" )
+    , left_alloc( allocation_count( left ) )
+    , right_alloc( allocation_count( right ) )
+    {}
+  // testit: run operator() through a parallel_reduce with work count 1 and require that no flag bit was set.
+  static void testit()
+  {
+    TestViewOperator_LeftAndRight driver ;
+
+    int error_flag = 0 ;
+
+    Kokkos::parallel_reduce( 1 , driver , error_flag );
+
+    ASSERT_EQ( error_flag , 0 );
+  }
+  // Visits every index tuple of both views and records violations in `update`; the work index is unused.
+  KOKKOS_INLINE_FUNCTION
+  void operator()( const size_type , value_type & update ) const
+  {
+    long offset ;
+    // `left` pass, i0 innermost: flag 1 if an element offset does not exceed the previous one or is >= left_alloc.
+    offset = -1 ;
+    for ( unsigned i5 = 0 ; i5 < unsigned(left.dimension_5()) ; ++i5 )
+    for ( unsigned i4 = 0 ; i4 < unsigned(left.dimension_4()) ; ++i4 )
+    for ( unsigned i3 = 0 ; i3 < unsigned(left.dimension_3()) ; ++i3 )
+    for ( unsigned i2 = 0 ; i2 < unsigned(left.dimension_2()) ; ++i2 )
+    for ( unsigned i1 = 0 ; i1 < unsigned(left.dimension_1()) ; ++i1 )
+    for ( unsigned i0 = 0 ; i0 < unsigned(left.dimension_0()) ; ++i0 )
+    {
+      const long j = & left( i0, i1, i2, i3, i4, i5 ) -
+                     & left(  0,  0,  0,  0,  0,  0 );
+      if ( j <= offset || left_alloc <= j ) { update |= 1 ; }
+      offset = j ;
+    }
+    // `right` pass, i5 innermost: flag 2 if an element offset does not exceed the previous one or is >= right_alloc.
+    offset = -1 ;
+    for ( unsigned i0 = 0 ; i0 < unsigned(right.dimension_0()) ; ++i0 )
+    for ( unsigned i1 = 0 ; i1 < unsigned(right.dimension_1()) ; ++i1 )
+    for ( unsigned i2 = 0 ; i2 < unsigned(right.dimension_2()) ; ++i2 )
+    for ( unsigned i3 = 0 ; i3 < unsigned(right.dimension_3()) ; ++i3 )
+    for ( unsigned i4 = 0 ; i4 < unsigned(right.dimension_4()) ; ++i4 )
+    for ( unsigned i5 = 0 ; i5 < unsigned(right.dimension_5()) ; ++i5 )
+    {
+      const long j = & right( i0, i1, i2, i3, i4, i5 ) -
+                     & right(  0,  0,  0,  0,  0,  0 );
+      if ( j <= offset || right_alloc <= j ) { update |= 2 ; }
+      offset = j ;
+    }
+  }
+};
+
+template< class DataType , class DeviceType >
+struct TestViewOperator_LeftAndRight< DataType , DeviceType , 5 > // rank-5 specialization
+{
+  typedef typename DeviceType::execution_space  execution_space ;
+  typedef typename DeviceType::memory_space     memory_space ;
+  typedef typename execution_space::size_type   size_type ;
+  // Reduction value: an int used as a set of error-flag bits (1, 2, 4 and 8 are set in operator() below).
+  typedef int value_type ;
+  // join: merge two partial results by OR-ing their flag bits.
+  KOKKOS_INLINE_FUNCTION
+  static void join( volatile value_type & update ,
+                    const volatile value_type & input )
+    { update |= input ; }
+  // init: start with no flag bits set.
+  KOKKOS_INLINE_FUNCTION
+  static void init( value_type & update )
+    { update = 0 ; }
+
+  // The same DataType viewed with LayoutLeft, LayoutRight and LayoutStride.
+  typedef Kokkos::
+    View< DataType, Kokkos::LayoutLeft, execution_space > left_view ;
+
+  typedef Kokkos::
+    View< DataType, Kokkos::LayoutRight, execution_space > right_view ;
+
+  typedef Kokkos::
+    View< DataType, Kokkos::LayoutStride, execution_space > stride_view ;
+  // left_stride/right_stride are constructed from left/right; the *_alloc members hold allocation_count() of each.
+  left_view    left ;
+  right_view   right ;
+  stride_view  left_stride ;
+  stride_view  right_stride ;
+  long         left_alloc ;
+  long         right_alloc ;
+
+  TestViewOperator_LeftAndRight()
+    : left(  "left" )
+    , right( "right" )
+    , left_stride( left )
+    , right_stride( right )
+    , left_alloc( allocation_count( left ) )
+    , right_alloc( allocation_count( right ) )
+    {}
+  // testit: run operator() through a parallel_reduce with work count 1 and require that no flag bit was set.
+  static void testit()
+  {
+    TestViewOperator_LeftAndRight driver ;
+
+    int error_flag = 0 ;
+
+    Kokkos::parallel_reduce( 1 , driver , error_flag );
+
+    ASSERT_EQ( error_flag , 0 );
+  }
+  // Visits every index tuple of both views and records violations in `update`; the work index is unused.
+  KOKKOS_INLINE_FUNCTION
+  void operator()( const size_type , value_type & update ) const
+  {
+    long offset ;
+    // `left` pass, i0 innermost: flag 1 if an element offset does not exceed the previous one or is >= left_alloc.
+    offset = -1 ;
+    for ( unsigned i4 = 0 ; i4 < unsigned(left.dimension_4()) ; ++i4 )
+    for ( unsigned i3 = 0 ; i3 < unsigned(left.dimension_3()) ; ++i3 )
+    for ( unsigned i2 = 0 ; i2 < unsigned(left.dimension_2()) ; ++i2 )
+    for ( unsigned i1 = 0 ; i1 < unsigned(left.dimension_1()) ; ++i1 )
+    for ( unsigned i0 = 0 ; i0 < unsigned(left.dimension_0()) ; ++i0 )
+    {
+      const long j = & left( i0, i1, i2, i3, i4 ) -
+                     & left(  0,  0,  0,  0,  0 );
+      if ( j <= offset || left_alloc <= j ) { update |= 1 ; }
+      offset = j ;
+      // Flag 4: left_stride must yield the same element address as left.
+      if ( & left( i0, i1, i2, i3, i4 ) !=
+           & left_stride( i0, i1, i2, i3, i4 ) ) { update |= 4 ; }
+    }
+    // `right` pass, i4 innermost: flag 2 if an element offset does not exceed the previous one or is >= right_alloc.
+    offset = -1 ;
+    for ( unsigned i0 = 0 ; i0 < unsigned(right.dimension_0()) ; ++i0 )
+    for ( unsigned i1 = 0 ; i1 < unsigned(right.dimension_1()) ; ++i1 )
+    for ( unsigned i2 = 0 ; i2 < unsigned(right.dimension_2()) ; ++i2 )
+    for ( unsigned i3 = 0 ; i3 < unsigned(right.dimension_3()) ; ++i3 )
+    for ( unsigned i4 = 0 ; i4 < unsigned(right.dimension_4()) ; ++i4 )
+    {
+      const long j = & right( i0, i1, i2, i3, i4 ) -
+                     & right(  0,  0,  0,  0,  0 );
+      if ( j <= offset || right_alloc <= j ) { update |= 2 ; }
+      offset = j ;
+      // Flag 8: right_stride must yield the same element address as right.
+      if ( & right( i0, i1, i2, i3, i4 ) !=
+           & right_stride( i0, i1, i2, i3, i4 ) ) { update |= 8 ; }
+    }
+  }
+};
+
+template< class DataType , class DeviceType >
+struct TestViewOperator_LeftAndRight< DataType , DeviceType , 4 > // rank-4 specialization
+{
+  typedef typename DeviceType::execution_space  execution_space ;
+  typedef typename DeviceType::memory_space     memory_space ;
+  typedef typename execution_space::size_type   size_type ;
+  // Reduction value: an int used as a set of error-flag bits (1 and 2 are set in operator() below).
+  typedef int value_type ;
+  // join: merge two partial results by OR-ing their flag bits.
+  KOKKOS_INLINE_FUNCTION
+  static void join( volatile value_type & update ,
+                    const volatile value_type & input )
+    { update |= input ; }
+  // init: start with no flag bits set.
+  KOKKOS_INLINE_FUNCTION
+  static void init( value_type & update )
+    { update = 0 ; }
+
+  // The same DataType viewed with LayoutLeft and with LayoutRight.
+  typedef Kokkos::
+    View< DataType, Kokkos::LayoutLeft, execution_space > left_view ;
+
+  typedef Kokkos::
+    View< DataType, Kokkos::LayoutRight, execution_space > right_view ;
+  // The *_alloc members hold allocation_count() of the corresponding view.
+  left_view    left ;
+  right_view   right ;
+  long         left_alloc ;
+  long         right_alloc ;
+
+  TestViewOperator_LeftAndRight()
+    : left(  "left" )
+    , right( "right" )
+    , left_alloc( allocation_count( left ) )
+    , right_alloc( allocation_count( right ) )
+    {}
+  // testit: run operator() through a parallel_reduce with work count 1 and require that no flag bit was set.
+  static void testit()
+  {
+    TestViewOperator_LeftAndRight driver ;
+
+    int error_flag = 0 ;
+
+    Kokkos::parallel_reduce( 1 , driver , error_flag );
+
+    ASSERT_EQ( error_flag , 0 );
+  }
+  // Visits every index tuple of both views and records violations in `update`; the work index is unused.
+  KOKKOS_INLINE_FUNCTION
+  void operator()( const size_type , value_type & update ) const
+  {
+    long offset ;
+    // `left` pass, i0 innermost: flag 1 if an element offset does not exceed the previous one or is >= left_alloc.
+    offset = -1 ;
+    for ( unsigned i3 = 0 ; i3 < unsigned(left.dimension_3()) ; ++i3 )
+    for ( unsigned i2 = 0 ; i2 < unsigned(left.dimension_2()) ; ++i2 )
+    for ( unsigned i1 = 0 ; i1 < unsigned(left.dimension_1()) ; ++i1 )
+    for ( unsigned i0 = 0 ; i0 < unsigned(left.dimension_0()) ; ++i0 )
+    {
+      const long j = & left( i0, i1, i2, i3 ) -
+                     & left(  0,  0,  0,  0 );
+      if ( j <= offset || left_alloc <= j ) { update |= 1 ; }
+      offset = j ;
+    }
+    // `right` pass, i3 innermost: flag 2 if an element offset does not exceed the previous one or is >= right_alloc.
+    offset = -1 ;
+    for ( unsigned i0 = 0 ; i0 < unsigned(right.dimension_0()) ; ++i0 )
+    for ( unsigned i1 = 0 ; i1 < unsigned(right.dimension_1()) ; ++i1 )
+    for ( unsigned i2 = 0 ; i2 < unsigned(right.dimension_2()) ; ++i2 )
+    for ( unsigned i3 = 0 ; i3 < unsigned(right.dimension_3()) ; ++i3 )
+    {
+      const long j = & right( i0, i1, i2, i3 ) -
+                     & right(  0,  0,  0,  0 );
+      if ( j <= offset || right_alloc <= j ) { update |= 2 ; }
+      offset = j ;
+    }
+  }
+};
+
+template< class DataType , class DeviceType >
+struct TestViewOperator_LeftAndRight< DataType , DeviceType , 3 >
+{
+  // Rank-3 specialization: besides the LayoutLeft / LayoutRight traversal
+  // checks it compares each view with a LayoutStride view constructed from it.
+  using execution_space = typename DeviceType::execution_space ;
+  using memory_space    = typename DeviceType::memory_space ;
+  using size_type       = typename execution_space::size_type ;
+  using value_type      = int ;
+
+  using left_view   = Kokkos::View< DataType, Kokkos::LayoutLeft, execution_space > ;
+  using right_view  = Kokkos::View< DataType, Kokkos::LayoutRight, execution_space > ;
+  using stride_view = Kokkos::View< DataType, Kokkos::LayoutStride, execution_space > ;
+
+  // Reduction start value: no error bits set.
+  KOKKOS_INLINE_FUNCTION
+  static void init( value_type & update ) { update = 0 ; }
+
+  // Reduction join: keep the error bits of both contributions.
+  KOKKOS_INLINE_FUNCTION
+  static void join( volatile value_type & update ,
+                    const volatile value_type & input )
+  {
+    update |= input ;
+  }
+
+  left_view    left ;
+  right_view   right ;
+  stride_view  left_stride ;   // constructed from left
+  stride_view  right_stride ;  // constructed from right
+  long         left_alloc ;    // allocation_count( left )
+  long         right_alloc ;   // allocation_count( right )
+
+  TestViewOperator_LeftAndRight()
+    : left(  std::string("left") )
+    , right( std::string("right") )
+    , left_stride( left )
+    , right_stride( right )
+    , left_alloc( allocation_count( left ) )
+    , right_alloc( allocation_count( right ) )
+  {
+  }
+
+  // Runs the functor for a single work item and requires a zero error mask.
+  static void testit()
+  {
+    int error_flag = 0 ;
+    TestViewOperator_LeftAndRight functor ;
+    Kokkos::parallel_reduce( 1 , functor , error_flag );
+    ASSERT_EQ( error_flag , 0 );
+  }
+
+  // Error bits ORed into update:
+  //   1 : left offsets (i0 innermost) not strictly increasing or >= left_alloc
+  //   2 : right offsets (i2 innermost) not strictly increasing or >= right_alloc
+  //   4 : left and left_stride disagree on an entry's address
+  //   8 : right and right_stride disagree on an entry's address
+  //   3 : (KOKKOS_USING_EXP_VIEW only) padding with zero indices moved an entry
+  KOKKOS_INLINE_FUNCTION
+  void operator()( const size_type , value_type & update ) const
+  {
+    const unsigned nl0 = unsigned(left.dimension_0()) ;
+    const unsigned nl1 = unsigned(left.dimension_1()) ;
+    const unsigned nl2 = unsigned(left.dimension_2()) ;
+    long previous = -1 ;
+    for ( unsigned i2 = 0 ; i2 < nl2 ; ++i2 ) {
+    for ( unsigned i1 = 0 ; i1 < nl1 ; ++i1 ) {
+    for ( unsigned i0 = 0 ; i0 < nl0 ; ++i0 ) {
+      const long here = & left( i0, i1, i2 ) - & left( 0, 0, 0 );
+      if ( ! ( previous < here && here < left_alloc ) ) { update |= 1 ; }
+      previous = here ;
+      if ( & left(i0,i1,i2) != & left_stride(i0,i1,i2) ) { update |= 4 ; }
+    }}}
+
+    const unsigned nr0 = unsigned(right.dimension_0()) ;
+    const unsigned nr1 = unsigned(right.dimension_1()) ;
+    const unsigned nr2 = unsigned(right.dimension_2()) ;
+    previous = -1 ;
+    for ( unsigned i0 = 0 ; i0 < nr0 ; ++i0 ) {
+    for ( unsigned i1 = 0 ; i1 < nr1 ; ++i1 ) {
+    for ( unsigned i2 = 0 ; i2 < nr2 ; ++i2 ) {
+      const long here = & right( i0, i1, i2 ) - & right( 0, 0, 0 );
+      if ( ! ( previous < here && here < right_alloc ) ) { update |= 2 ; }
+      previous = here ;
+      if ( & right(i0,i1,i2) != & right_stride(i0,i1,i2) ) { update |= 8 ; }
+    }}}
+
+#if KOKKOS_USING_EXP_VIEW
+    for ( unsigned i0 = 0 ; i0 < nl0 ; ++i0 ) {
+    for ( unsigned i1 = 0 ; i1 < nl1 ; ++i1 ) {
+    for ( unsigned i2 = 0 ; i2 < nl2 ; ++i2 ) {
+      if ( & left(i0,i1,i2)  != & left(i0,i1,i2,0,0,0,0,0) )  { update |= 3 ; }
+      if ( & right(i0,i1,i2) != & right(i0,i1,i2,0,0,0,0,0) ) { update |= 3 ; }
+    }}}
+#endif
+  }
+};
+
+template< class DataType , class DeviceType >
+struct TestViewOperator_LeftAndRight< DataType , DeviceType , 2 >
+{
+  // Rank-2 specialization: a reduction functor that visits every entry of a
+  // LayoutLeft view and a LayoutRight view and ORs error bits into the result.
+  using execution_space = typename DeviceType::execution_space ;
+  using memory_space    = typename DeviceType::memory_space ;
+  using size_type       = typename execution_space::size_type ;
+  using value_type      = int ;
+
+  using left_view  = Kokkos::View< DataType, Kokkos::LayoutLeft, execution_space > ;
+  using right_view = Kokkos::View< DataType, Kokkos::LayoutRight, execution_space > ;
+
+  // Reduction start value: no error bits set.
+  KOKKOS_INLINE_FUNCTION
+  static void init( value_type & update ) { update = 0 ; }
+
+  // Reduction join: keep the error bits of both contributions.
+  KOKKOS_INLINE_FUNCTION
+  static void join( volatile value_type & update ,
+                    const volatile value_type & input )
+  {
+    update |= input ;
+  }
+
+  left_view    left ;
+  right_view   right ;
+  long         left_alloc ;   // allocation_count( left )
+  long         right_alloc ;  // allocation_count( right )
+
+  TestViewOperator_LeftAndRight()
+    : left(  "left" )
+    , right( "right" )
+    , left_alloc( allocation_count( left ) )
+    , right_alloc( allocation_count( right ) )
+  {
+  }
+
+  // Runs the functor for a single work item and requires a zero error mask.
+  static void testit()
+  {
+    int error_flag = 0 ;
+    TestViewOperator_LeftAndRight functor ;
+    Kokkos::parallel_reduce( 1 , functor , error_flag );
+    ASSERT_EQ( error_flag , 0 );
+  }
+
+  // Error bits ORed into update:
+  //   1 : left offsets (i0 innermost) not strictly increasing or >= left_alloc
+  //   2 : right offsets (i1 innermost) not strictly increasing or >= right_alloc
+  //   3 : (KOKKOS_USING_EXP_VIEW only) padding with zero indices moved an entry
+  KOKKOS_INLINE_FUNCTION
+  void operator()( const size_type , value_type & update ) const
+  {
+    const unsigned nl0 = unsigned(left.dimension_0()) ;
+    const unsigned nl1 = unsigned(left.dimension_1()) ;
+    long previous = -1 ;
+    for ( unsigned i1 = 0 ; i1 < nl1 ; ++i1 ) {
+    for ( unsigned i0 = 0 ; i0 < nl0 ; ++i0 ) {
+      const long here = & left( i0, i1 ) - & left( 0, 0 );
+      if ( ! ( previous < here && here < left_alloc ) ) { update |= 1 ; }
+      previous = here ;
+    }}
+
+    const unsigned nr0 = unsigned(right.dimension_0()) ;
+    const unsigned nr1 = unsigned(right.dimension_1()) ;
+    previous = -1 ;
+    for ( unsigned i0 = 0 ; i0 < nr0 ; ++i0 ) {
+    for ( unsigned i1 = 0 ; i1 < nr1 ; ++i1 ) {
+      const long here = & right( i0, i1 ) - & right( 0, 0 );
+      if ( ! ( previous < here && here < right_alloc ) ) { update |= 2 ; }
+      previous = here ;
+    }}
+
+#if KOKKOS_USING_EXP_VIEW
+    for ( unsigned i0 = 0 ; i0 < nl0 ; ++i0 ) {
+    for ( unsigned i1 = 0 ; i1 < nl1 ; ++i1 ) {
+      if ( & left(i0,i1)  != & left(i0,i1,0,0,0,0,0,0) )  { update |= 3 ; }
+      if ( & right(i0,i1) != & right(i0,i1,0,0,0,0,0,0) ) { update |= 3 ; }
+    }}
+#endif
+  }
+};
+
+template< class DataType , class DeviceType >
+struct TestViewOperator_LeftAndRight< DataType , DeviceType , 1 >
+{
+  // Rank-1 specialization: compares entry addresses of a LayoutLeft and a
+  // LayoutRight view with those of LayoutStride views constructed from them.
+  using execution_space = typename DeviceType::execution_space ;
+  using memory_space    = typename DeviceType::memory_space ;
+  using size_type       = typename execution_space::size_type ;
+  using value_type      = int ;
+
+  using left_view   = Kokkos::View< DataType, Kokkos::LayoutLeft, execution_space > ;
+  using right_view  = Kokkos::View< DataType, Kokkos::LayoutRight, execution_space > ;
+  using stride_view = Kokkos::View< DataType, Kokkos::LayoutStride, execution_space > ;
+
+  // Reduction start value: no error bits set.
+  KOKKOS_INLINE_FUNCTION
+  static void init( value_type & update ) { update = 0 ; }
+
+  // Reduction join: keep the error bits of both contributions.
+  KOKKOS_INLINE_FUNCTION
+  static void join( volatile value_type & update ,
+                    const volatile value_type & input )
+  {
+    update |= input ;
+  }
+
+  left_view    left ;
+  right_view   right ;
+  stride_view  left_stride ;   // constructed from left
+  stride_view  right_stride ;  // constructed from right
+  long         left_alloc ;    // allocation_count( left )
+  long         right_alloc ;   // allocation_count( right )
+
+  TestViewOperator_LeftAndRight()
+    : left(  "left" )
+    , right( "right" )
+    , left_stride( left )
+    , right_stride( right )
+    , left_alloc( allocation_count( left ) )
+    , right_alloc( allocation_count( right ) )
+    {}
+
+  // Runs the functor for a single work item and requires a zero error mask.
+  static void testit()
+  {
+    int error_flag = 0 ;
+    TestViewOperator_LeftAndRight functor ;
+    Kokkos::parallel_reduce( 1 , functor , error_flag );
+    ASSERT_EQ( error_flag , 0 );
+  }
+
+  // Error bits ORed into update:
+  //   3 : (KOKKOS_USING_EXP_VIEW only) padding with zero indices moved an entry
+  //   4 : left and left_stride disagree on an entry's address
+  //   8 : right and right_stride disagree on an entry's address
+  KOKKOS_INLINE_FUNCTION
+  void operator()( const size_type , value_type & update ) const
+  {
+    const unsigned n0 = unsigned(left.dimension_0()) ;
+    for ( unsigned i0 = 0 ; i0 < n0 ; ++i0 ) {
+#if KOKKOS_USING_EXP_VIEW
+      if ( & left(i0)  != & left(i0,0,0,0,0,0,0,0) )  { update |= 3 ; }
+      if ( & right(i0) != & right(i0,0,0,0,0,0,0,0) ) { update |= 3 ; }
+#endif
+      if ( & left(i0)  != & left_stride(i0) ) { update |= 4 ; }
+      if ( & right(i0) != & right_stride(i0) ) { update |= 8 ; }
+    }
+  }
+};
+
+template<class Layout, class DeviceType>
+struct TestViewMirror {
+  // Pointer-aliasing and extent checks for Kokkos::create_mirror and
+  // Kokkos::create_mirror_view, starting from a host view of 1000 doubles.
+  // create_mirror: neither result may share storage with the source view or
+  // with the other result; both must keep the source's extent.
+  template<class MemoryTraits>
+  static void test_mirror() {
+    Kokkos::View<double*, Layout, Kokkos::HostSpace> a_org("A",1000);
+    Kokkos::View<double*, Layout, Kokkos::HostSpace, MemoryTraits> a_h = a_org;
+    auto a_h2 = Kokkos::create_mirror(Kokkos::HostSpace(),a_h);
+    auto a_d  = Kokkos::create_mirror(DeviceType(),a_h);
+
+    const int equal_ptr_h_h2 = int( a_h.data()  == a_h2.data() );
+    const int equal_ptr_h_d  = int( a_h.data()  == a_d.data()  );
+    const int equal_ptr_h2_d = int( a_h2.data() == a_d.data()  );
+    ASSERT_EQ(equal_ptr_h_h2,0);
+    ASSERT_EQ(equal_ptr_h_d ,0);
+    ASSERT_EQ(equal_ptr_h2_d,0);
+
+    ASSERT_EQ(a_h.dimension_0(),a_h2.dimension_0());
+    ASSERT_EQ(a_h.dimension_0(),a_d .dimension_0());
+  }
+
+  // create_mirror_view: the HostSpace result must alias the source; the
+  // DeviceType result aliases it only if DeviceType::memory_space is HostSpace.
+  template<class MemoryTraits>
+  static void test_mirror_view() {
+    Kokkos::View<double*, Layout, Kokkos::HostSpace> a_org("A",1000);
+    Kokkos::View<double*, Layout, Kokkos::HostSpace, MemoryTraits> a_h = a_org;
+    auto a_h2 = Kokkos::create_mirror_view(Kokkos::HostSpace(),a_h);
+    auto a_d  = Kokkos::create_mirror_view(DeviceType(),a_h);
+
+    const int is_same_memspace = int( std::is_same<Kokkos::HostSpace,typename DeviceType::memory_space>::value );
+    const int equal_ptr_h_h2 = int( a_h.data()  == a_h2.data() );
+    const int equal_ptr_h_d  = int( a_h.data()  == a_d.data()  );
+    const int equal_ptr_h2_d = int( a_h2.data() == a_d.data()  );
+    ASSERT_EQ(equal_ptr_h_h2,1);
+    ASSERT_EQ(equal_ptr_h_d ,is_same_memspace);
+    ASSERT_EQ(equal_ptr_h2_d ,is_same_memspace);
+
+    ASSERT_EQ(a_h.dimension_0(),a_h2.dimension_0());
+    ASSERT_EQ(a_h.dimension_0(),a_d .dimension_0());
+  }
+
+  // Runs both checks with default and with Unmanaged memory traits.
+  static void testit() {
+    test_mirror< Kokkos::MemoryTraits<0> >();
+    test_mirror< Kokkos::MemoryTraits<Kokkos::Unmanaged> >();
+    test_mirror_view< Kokkos::MemoryTraits<0> >();
+    test_mirror_view< Kokkos::MemoryTraits<Kokkos::Unmanaged> >();
+  }
+};
+
+/*--------------------------------------------------------------------------*/
+
+template< typename T, class DeviceType >
+class TestViewAPI  // Constructing an instance runs every check defined in this class.
+{
+public:
+  typedef DeviceType        device ;
+  // Extents of the test views: N0 is passed at construction, N1..N3 are fixed in the types.
+  enum { N0 = 1000 ,
+         N1 = 3 ,
+         N2 = 5 ,
+         N3 = 7 };
+  // Device views of rank 0 to 4, plus a const-valued rank-4 view.
+  typedef Kokkos::View< T , device > dView0 ;
+  typedef Kokkos::View< T* , device > dView1 ;
+  typedef Kokkos::View< T*[N1] , device > dView2 ;
+  typedef Kokkos::View< T*[N1][N2] , device > dView3 ;
+  typedef Kokkos::View< T*[N1][N2][N3] , device > dView4 ;
+  typedef Kokkos::View< const T*[N1][N2][N3] , device > const_dView4 ;
+  // Rank-4 device view with the MemoryUnmanaged trait.
+  typedef Kokkos::View< T****, device, Kokkos::MemoryUnmanaged > dView4_unmanaged ;
+  // Space used for the host-side views in the host-only tests below.
+  typedef typename dView0::host_mirror_space host ;
+  // Runs every run_test_* member, then the helper test structs' testit().
+  TestViewAPI()
+  {
+    run_test_mirror();
+    run_test();
+    run_test_scalar();
+    run_test_const();
+    run_test_subview();
+    run_test_subview_strided();
+    run_test_vector();
+    // Helper test structs: TestViewOperator, the rank 8..1 layout checks, and mirror checks.
+    TestViewOperator< T , device >::testit();
+    TestViewOperator_LeftAndRight< int[2][3][4][2][3][4][2][3] , device >::testit();
+    TestViewOperator_LeftAndRight< int[2][3][4][2][3][4][2] , device >::testit();
+    TestViewOperator_LeftAndRight< int[2][3][4][2][3][4] , device >::testit();
+    TestViewOperator_LeftAndRight< int[2][3][4][2][3] , device >::testit();
+    TestViewOperator_LeftAndRight< int[2][3][4][2] , device >::testit();
+    TestViewOperator_LeftAndRight< int[2][3][4] , device >::testit();
+    TestViewOperator_LeftAndRight< int[2][3] , device >::testit();
+    TestViewOperator_LeftAndRight< int[2] , device >::testit();
+    TestViewMirror<Kokkos::LayoutLeft, device >::testit(); 
+    TestViewMirror<Kokkos::LayoutRight, device >::testit(); 
+
+  }
+  // A host-space view: HostMirror shares its memory space; create_mirror_view aliases it.
+  static void run_test_mirror()
+  {
+    typedef Kokkos::View< int , host > view_type ;
+    typedef typename view_type::HostMirror mirror_type ;
+
+    static_assert( std::is_same< typename view_type::memory_space
+                               , typename mirror_type::memory_space
+                               >::value , "" );
+
+    view_type a("a");
+    mirror_type am = Kokkos::create_mirror_view(a);
+    mirror_type ax = Kokkos::create_mirror(a);
+    ASSERT_EQ( & a() , & am() );
+  }
+  // Copies a rank-0 value host -> device -> device -> host and compares both ends.
+  static void run_test_scalar()
+  {
+    typedef typename dView0::HostMirror  hView0 ;
+
+    dView0 dx , dy ;
+    hView0 hx , hy ;
+
+    dx = dView0( "dx" );
+    dy = dView0( "dy" );
+
+    hx = Kokkos::create_mirror( dx );
+    hy = Kokkos::create_mirror( dy );
+
+    hx() = 1 ;
+
+    Kokkos::deep_copy( dx , hx );
+    Kokkos::deep_copy( dy , dx );
+    Kokkos::deep_copy( hy , dy );
+
+    ASSERT_EQ( hx(), hy() );
+  }
+  // Rank-4 views: default state, allocation, reference counts, unmanaged views, deep_copy.
+  static void run_test()
+  {
+    // mfh 14 Feb 2014: This test doesn't actually create instances of
+    // these types.  In order to avoid "declared but unused typedef"
+    // warnings, we declare empty instances of these types, with the
+    // usual "(void)" marker to avoid compiler warnings for unused
+    // variables.
+
+    typedef typename dView0::HostMirror  hView0 ;
+    typedef typename dView1::HostMirror  hView1 ;
+    typedef typename dView2::HostMirror  hView2 ;
+    typedef typename dView3::HostMirror  hView3 ;
+    typedef typename dView4::HostMirror  hView4 ;
+
+    {
+      hView0 thing;
+      (void) thing;
+    }
+    {
+      hView1 thing;
+      (void) thing;
+    }
+    {
+      hView2 thing;
+      (void) thing;
+    }
+    {
+      hView3 thing;
+      (void) thing;
+    }
+    {
+      hView4 thing;
+      (void) thing;
+    }
+
+    dView4 dx , dy , dz ;
+    hView4 hx , hy , hz ;
+
+    ASSERT_TRUE( dx.ptr_on_device() == 0 );
+    ASSERT_TRUE( dy.ptr_on_device() == 0 );
+    ASSERT_TRUE( dz.ptr_on_device() == 0 );
+    ASSERT_TRUE( hx.ptr_on_device() == 0 );
+    ASSERT_TRUE( hy.ptr_on_device() == 0 );
+    ASSERT_TRUE( hz.ptr_on_device() == 0 );
+    ASSERT_EQ( dx.dimension_0() , 0u );
+    ASSERT_EQ( dy.dimension_0() , 0u );
+    ASSERT_EQ( dz.dimension_0() , 0u );
+    ASSERT_EQ( hx.dimension_0() , 0u );
+    ASSERT_EQ( hy.dimension_0() , 0u );
+    ASSERT_EQ( hz.dimension_0() , 0u );
+    ASSERT_EQ( dx.dimension_1() , unsigned(N1) );
+    ASSERT_EQ( dy.dimension_1() , unsigned(N1) );
+    ASSERT_EQ( dz.dimension_1() , unsigned(N1) );
+    ASSERT_EQ( hx.dimension_1() , unsigned(N1) );
+    ASSERT_EQ( hy.dimension_1() , unsigned(N1) );
+    ASSERT_EQ( hz.dimension_1() , unsigned(N1) );
+
+    dx = dView4( "dx" , N0 );
+    dy = dView4( "dy" , N0 );
+
+    #if KOKKOS_USING_EXP_VIEW
+    ASSERT_EQ( dx.use_count() , size_t(1) );
+    #else
+    ASSERT_EQ( dx.tracker().ref_count() , size_t(1) );
+    #endif
+
+    dView4_unmanaged unmanaged_dx = dx; // unmanaged copy: the count checked below stays 1
+    #if KOKKOS_USING_EXP_VIEW
+    ASSERT_EQ( dx.use_count() , size_t(1) );
+    #else
+    ASSERT_EQ( dx.tracker().ref_count() , size_t(1) );
+    #endif
+
+    dView4_unmanaged unmanaged_from_ptr_dx = dView4_unmanaged(dx.ptr_on_device(),
+                                                              dx.dimension_0(),
+                                                              dx.dimension_1(),
+                                                              dx.dimension_2(),
+                                                              dx.dimension_3());
+
+    {
+      // Destruction of this view should be harmless
+      const_dView4 unmanaged_from_ptr_const_dx( dx.ptr_on_device() ,
+                                                dx.dimension_0() ,
+                                                dx.dimension_1() ,
+                                                dx.dimension_2() ,
+                                                dx.dimension_3() );
+    }
+
+    const_dView4 const_dx = dx ; // managed const copy: the count checked below becomes 2
+    #if KOKKOS_USING_EXP_VIEW
+    ASSERT_EQ( dx.use_count() , size_t(2) );
+    #else
+    ASSERT_EQ( dx.tracker().ref_count() , size_t(2) );
+    #endif
+
+    {
+      const_dView4 const_dx2;
+      const_dx2 = const_dx;
+      #if KOKKOS_USING_EXP_VIEW
+      ASSERT_EQ( dx.use_count() , size_t(3) );
+      #else
+      ASSERT_EQ( dx.tracker().ref_count() , size_t(3) );
+      #endif
+
+      const_dx2 = dy; // rebinding to dy drops one reference to dx's allocation
+      #if KOKKOS_USING_EXP_VIEW
+      ASSERT_EQ( dx.use_count() , size_t(2) );
+      #else
+      ASSERT_EQ( dx.tracker().ref_count() , size_t(2) );
+      #endif
+
+      const_dView4 const_dx3(dx);
+      #if KOKKOS_USING_EXP_VIEW
+      ASSERT_EQ( dx.use_count() , size_t(3) );
+      #else
+      ASSERT_EQ( dx.tracker().ref_count() , size_t(3) );
+      #endif
+      
+      dView4_unmanaged dx4_unmanaged(dx); // unmanaged copy: the count checked below stays 3
+      #if KOKKOS_USING_EXP_VIEW
+      ASSERT_EQ( dx.use_count() , size_t(3) );
+      #else
+      ASSERT_EQ( dx.tracker().ref_count() , size_t(3) );
+      #endif
+    }
+    // const_dx3 went out of scope above, so the count is expected to be back at 2.
+    #if KOKKOS_USING_EXP_VIEW
+    ASSERT_EQ( dx.use_count() , size_t(2) );
+    #else
+    ASSERT_EQ( dx.tracker().ref_count() , size_t(2) );
+    #endif
+
+
+    ASSERT_FALSE( dx.ptr_on_device() == 0 );
+    ASSERT_FALSE( const_dx.ptr_on_device() == 0 );
+    ASSERT_FALSE( unmanaged_dx.ptr_on_device() == 0 );
+    ASSERT_FALSE( unmanaged_from_ptr_dx.ptr_on_device() == 0 );
+    ASSERT_FALSE( dy.ptr_on_device() == 0 );
+    ASSERT_NE( dx , dy );
+
+    ASSERT_EQ( dx.dimension_0() , unsigned(N0) );
+    ASSERT_EQ( dx.dimension_1() , unsigned(N1) );
+    ASSERT_EQ( dx.dimension_2() , unsigned(N2) );
+    ASSERT_EQ( dx.dimension_3() , unsigned(N3) );
+
+    ASSERT_EQ( dy.dimension_0() , unsigned(N0) );
+    ASSERT_EQ( dy.dimension_1() , unsigned(N1) );
+    ASSERT_EQ( dy.dimension_2() , unsigned(N2) );
+    ASSERT_EQ( dy.dimension_3() , unsigned(N3) );
+
+    ASSERT_EQ( unmanaged_from_ptr_dx.capacity(),unsigned(N0)*unsigned(N1)*unsigned(N2)*unsigned(N3) );
+
+    hx = Kokkos::create_mirror( dx );
+    hy = Kokkos::create_mirror( dy );
+
+    // T v1 = hx() ;    // Generates compile error as intended
+    // T v2 = hx(0,0) ; // Generates compile error as intended
+    // hx(0,0) = v2 ;   // Generates compile error as intended
+
+#if ! KOKKOS_USING_EXP_VIEW
+    // Testing with asynchronous deep copy with respect to device
+    {
+      size_t count = 0 ;
+      for ( size_t ip = 0 ; ip < N0 ; ++ip ) {
+      for ( size_t i1 = 0 ; i1 < hx.dimension_1() ; ++i1 ) {
+      for ( size_t i2 = 0 ; i2 < hx.dimension_2() ; ++i2 ) {
+      for ( size_t i3 = 0 ; i3 < hx.dimension_3() ; ++i3 ) {
+        hx(ip,i1,i2,i3) = ++count ;
+      }}}}
+
+
+      Kokkos::deep_copy(typename hView4::execution_space(), dx , hx );
+      Kokkos::deep_copy(typename hView4::execution_space(), dy , dx );
+      Kokkos::deep_copy(typename hView4::execution_space(), hy , dy );
+
+      for ( size_t ip = 0 ; ip < N0 ; ++ip ) {
+      for ( size_t i1 = 0 ; i1 < N1 ; ++i1 ) {
+      for ( size_t i2 = 0 ; i2 < N2 ; ++i2 ) {
+      for ( size_t i3 = 0 ; i3 < N3 ; ++i3 ) {
+        { ASSERT_EQ( hx(ip,i1,i2,i3) , hy(ip,i1,i2,i3) ); }
+      }}}}
+
+      Kokkos::deep_copy(typename hView4::execution_space(), dx , T(0) );
+      Kokkos::deep_copy(typename hView4::execution_space(), hx , dx );
+
+      for ( size_t ip = 0 ; ip < N0 ; ++ip ) {
+      for ( size_t i1 = 0 ; i1 < N1 ; ++i1 ) {
+      for ( size_t i2 = 0 ; i2 < N2 ; ++i2 ) {
+      for ( size_t i3 = 0 ; i3 < N3 ; ++i3 ) {
+        { ASSERT_EQ( hx(ip,i1,i2,i3) , T(0) ); }
+      }}}}
+    }
+
+    // Testing with asynchronous deep copy with respect to host
+    {
+      size_t count = 0 ;
+      for ( size_t ip = 0 ; ip < N0 ; ++ip ) {
+      for ( size_t i1 = 0 ; i1 < hx.dimension_1() ; ++i1 ) {
+      for ( size_t i2 = 0 ; i2 < hx.dimension_2() ; ++i2 ) {
+      for ( size_t i3 = 0 ; i3 < hx.dimension_3() ; ++i3 ) {
+        hx(ip,i1,i2,i3) = ++count ;
+      }}}}
+
+      Kokkos::deep_copy(typename dView4::execution_space(), dx , hx );
+      Kokkos::deep_copy(typename dView4::execution_space(), dy , dx );
+      Kokkos::deep_copy(typename dView4::execution_space(), hy , dy );
+
+      for ( size_t ip = 0 ; ip < N0 ; ++ip ) {
+      for ( size_t i1 = 0 ; i1 < N1 ; ++i1 ) {
+      for ( size_t i2 = 0 ; i2 < N2 ; ++i2 ) {
+      for ( size_t i3 = 0 ; i3 < N3 ; ++i3 ) {
+        { ASSERT_EQ( hx(ip,i1,i2,i3) , hy(ip,i1,i2,i3) ); }
+      }}}}
+
+      Kokkos::deep_copy(typename dView4::execution_space(), dx , T(0) );
+      Kokkos::deep_copy(typename dView4::execution_space(), hx , dx );
+
+      for ( size_t ip = 0 ; ip < N0 ; ++ip ) {
+      for ( size_t i1 = 0 ; i1 < N1 ; ++i1 ) {
+      for ( size_t i2 = 0 ; i2 < N2 ; ++i2 ) {
+      for ( size_t i3 = 0 ; i3 < N3 ; ++i3 ) {
+        { ASSERT_EQ( hx(ip,i1,i2,i3) , T(0) ); }
+      }}}}
+    }
+#endif /* #if ! KOKKOS_USING_EXP_VIEW */
+
+    // Testing with synchronous deep copy
+    {
+      size_t count = 0 ;
+      for ( size_t ip = 0 ; ip < N0 ; ++ip ) {
+      for ( size_t i1 = 0 ; i1 < hx.dimension_1() ; ++i1 ) {
+      for ( size_t i2 = 0 ; i2 < hx.dimension_2() ; ++i2 ) {
+      for ( size_t i3 = 0 ; i3 < hx.dimension_3() ; ++i3 ) {
+        hx(ip,i1,i2,i3) = ++count ;
+      }}}}
+
+      Kokkos::deep_copy( dx , hx );
+      Kokkos::deep_copy( dy , dx );
+      Kokkos::deep_copy( hy , dy );
+
+      for ( size_t ip = 0 ; ip < N0 ; ++ip ) {
+      for ( size_t i1 = 0 ; i1 < N1 ; ++i1 ) {
+      for ( size_t i2 = 0 ; i2 < N2 ; ++i2 ) {
+      for ( size_t i3 = 0 ; i3 < N3 ; ++i3 ) {
+        { ASSERT_EQ( hx(ip,i1,i2,i3) , hy(ip,i1,i2,i3) ); }
+      }}}}
+
+      Kokkos::deep_copy( dx , T(0) );
+      Kokkos::deep_copy( hx , dx );
+
+      for ( size_t ip = 0 ; ip < N0 ; ++ip ) {
+      for ( size_t i1 = 0 ; i1 < N1 ; ++i1 ) {
+      for ( size_t i2 = 0 ; i2 < N2 ; ++i2 ) {
+      for ( size_t i3 = 0 ; i3 < N3 ; ++i3 ) {
+        { ASSERT_EQ( hx(ip,i1,i2,i3) , T(0) ); }
+      }}}}
+    }
+    dz = dx ; ASSERT_EQ( dx, dz); ASSERT_NE( dy, dz);
+    dz = dy ; ASSERT_EQ( dy, dz); ASSERT_NE( dx, dz);
+
+    dx = dView4();
+    ASSERT_TRUE( dx.ptr_on_device() == 0 );
+    ASSERT_FALSE( dy.ptr_on_device() == 0 );
+    ASSERT_FALSE( dz.ptr_on_device() == 0 );
+    dy = dView4();
+    ASSERT_TRUE( dx.ptr_on_device() == 0 );
+    ASSERT_TRUE( dy.ptr_on_device() == 0 );
+    ASSERT_FALSE( dz.ptr_on_device() == 0 );
+    dz = dView4();
+    ASSERT_TRUE( dx.ptr_on_device() == 0 );
+    ASSERT_TRUE( dy.ptr_on_device() == 0 );
+    ASSERT_TRUE( dz.ptr_on_device() == 0 );
+  }
+  // Rank-1 data type of fixed extent 2, used by the const-conversion test.
+  typedef T DataType[2] ;
+  // Takes a const-data view and a mutable view and requires them to compare equal.
+  static void
+  check_auto_conversion_to_const(
+     const Kokkos::View< const DataType , device > & arg_const ,
+     const Kokkos::View< DataType , device > & arg )
+  {
+    ASSERT_TRUE( arg_const == arg );
+  }
+  // Conversions of a view to const-data and const random-access views of the same data.
+  static void run_test_const()
+  {
+    typedef Kokkos::View< DataType , device > typeX ;
+    typedef Kokkos::View< const DataType , device > const_typeX ;
+    typedef Kokkos::View< const DataType , device , Kokkos::MemoryRandomAccess > const_typeR ;
+    typeX x( "X" );
+    const_typeX xc = x ;
+    const_typeR xr = x ;
+
+    ASSERT_TRUE( xc == x );
+    ASSERT_TRUE( x == xc );
+
+    // For CUDA the constant random access View does not return
+    // an lvalue reference due to retrieving through texture cache
+    // therefore not allowed to query the underlying pointer.
+#if defined( KOKKOS_HAVE_CUDA )
+    if ( ! std::is_same< typename device::execution_space , Kokkos::Cuda >::value )
+#endif
+    {
+      ASSERT_TRUE( x.ptr_on_device() == xr.ptr_on_device() );
+    }
+
+    // typeX xf = xc ; // setting non-const from const must not compile
+
+    check_auto_conversion_to_const( x , x ); // first argument converts to the const-data view type
+  }
+  // Rank-0 subviews of rank-1..4 views must be assignable to View< const T >.
+  static void run_test_subview()
+  {
+    typedef Kokkos::View< const T , device > sView ;
+
+    dView0 d0( "d0" );
+    dView1 d1( "d1" , N0 );
+    dView2 d2( "d2" , N0 );
+    dView3 d3( "d3" , N0 );
+    dView4 d4( "d4" , N0 );
+
+    sView s0 = d0 ;
+    sView s1 = Kokkos::subview( d1 , 1 );
+    sView s2 = Kokkos::subview( d2 , 1 , 1 );
+    sView s3 = Kokkos::subview( d3 , 1 , 1 , 1 );
+    sView s4 = Kokkos::subview( d4 , 1 , 1 , 1 , 1 );
+  }
+  // Subviews mixing integer and ALL() arguments, held as LayoutStride views.
+  static void run_test_subview_strided()
+  {
+    typedef Kokkos::View< int **** , Kokkos::LayoutLeft  , host >  view_left_4 ;
+    typedef Kokkos::View< int **** , Kokkos::LayoutRight , host >  view_right_4 ;
+    typedef Kokkos::View< int **   , Kokkos::LayoutLeft  , host >  view_left_2 ;
+    typedef Kokkos::View< int **   , Kokkos::LayoutRight , host >  view_right_2 ;
+
+    typedef Kokkos::View< int * ,  Kokkos::LayoutStride , host >  view_stride_1 ;
+    typedef Kokkos::View< int ** ,  Kokkos::LayoutStride , host >  view_stride_2 ;
+
+    view_left_2  xl2("xl2", 100 , 200 );
+    view_right_2 xr2("xr2", 100 , 200 );
+    view_stride_1  yl1 = Kokkos::subview( xl2 , 0 , Kokkos::ALL() );
+    view_stride_1  yl2 = Kokkos::subview( xl2 , 1 , Kokkos::ALL() );
+    view_stride_1  yr1 = Kokkos::subview( xr2 , 0 , Kokkos::ALL() );
+    view_stride_1  yr2 = Kokkos::subview( xr2 , 1 , Kokkos::ALL() );
+
+    ASSERT_EQ( yl1.dimension_0() , xl2.dimension_1() );
+    ASSERT_EQ( yl2.dimension_0() , xl2.dimension_1() );
+    ASSERT_EQ( yr1.dimension_0() , xr2.dimension_1() );
+    ASSERT_EQ( yr2.dimension_0() , xr2.dimension_1() );
+
+    ASSERT_EQ( & yl1(0) - & xl2(0,0) , 0 );
+    ASSERT_EQ( & yl2(0) - & xl2(1,0) , 0 );
+    ASSERT_EQ( & yr1(0) - & xr2(0,0) , 0 );
+    ASSERT_EQ( & yr2(0) - & xr2(1,0) , 0 );
+
+    view_left_4 xl4( "xl4" , 10 , 20 , 30 , 40 );
+    view_right_4 xr4( "xr4" , 10 , 20 , 30 , 40 );
+
+    view_stride_2 yl4 = Kokkos::subview( xl4 , 1 , Kokkos::ALL() , 2 , Kokkos::ALL() );
+    view_stride_2 yr4 = Kokkos::subview( xr4 , 1 , Kokkos::ALL() , 2 , Kokkos::ALL() );
+
+    ASSERT_EQ( yl4.dimension_0() , xl4.dimension_1() );
+    ASSERT_EQ( yl4.dimension_1() , xl4.dimension_3() );
+    ASSERT_EQ( yr4.dimension_0() , xr4.dimension_1() );
+    ASSERT_EQ( yr4.dimension_1() , xr4.dimension_3() );
+
+    ASSERT_EQ( & yl4(4,4) - & xl4(1,4,2,4) , 0 );
+    ASSERT_EQ( & yr4(4,4) - & xr4(1,4,2,4) , 0 );
+  }
+  // Rank-1 and range subviews of rank-2 host views must alias the source entries.
+  static void run_test_vector()
+  {
+    static const unsigned Length = 1000 , Count = 8 ;
+
+    typedef Kokkos::View< T* ,  Kokkos::LayoutLeft , host > vector_type ;
+    typedef Kokkos::View< T** , Kokkos::LayoutLeft , host > multivector_type ;
+
+    typedef Kokkos::View< T* ,  Kokkos::LayoutRight , host > vector_right_type ;
+    typedef Kokkos::View< T** , Kokkos::LayoutRight , host > multivector_right_type ;
+
+    typedef Kokkos::View< const T* , Kokkos::LayoutRight, host > const_vector_right_type ;
+    typedef Kokkos::View< const T* , Kokkos::LayoutLeft , host > const_vector_type ;
+    typedef Kokkos::View< const T** , Kokkos::LayoutLeft , host > const_multivector_type ;
+
+    multivector_type mv = multivector_type( "mv" , Length , Count );
+    multivector_right_type mv_right = multivector_right_type( "mv" , Length , Count );
+
+    vector_type v1 = Kokkos::subview( mv , Kokkos::ALL() , 0 );
+    vector_type v2 = Kokkos::subview( mv , Kokkos::ALL() , 1 );
+    vector_type v3 = Kokkos::subview( mv , Kokkos::ALL() , 2 );
+
+    vector_type rv1 = Kokkos::subview( mv_right , 0 , Kokkos::ALL() );
+    vector_type rv2 = Kokkos::subview( mv_right , 1 , Kokkos::ALL() );
+    vector_type rv3 = Kokkos::subview( mv_right , 2 , Kokkos::ALL() );
+
+    multivector_type mv1 = Kokkos::subview( mv , std::make_pair( 1 , 998 ) ,
+                                                 std::make_pair( 2 , 5 ) );
+
+    multivector_right_type mvr1 =
+      Kokkos::subview( mv_right ,
+                       std::make_pair( 1 , 998 ) ,
+                       std::make_pair( 2 , 5 ) );
+
+    const_vector_type cv1 = Kokkos::subview( mv , Kokkos::ALL(), 0 );
+    const_vector_type cv2 = Kokkos::subview( mv , Kokkos::ALL(), 1 );
+    const_vector_type cv3 = Kokkos::subview( mv , Kokkos::ALL(), 2 );
+
+    vector_right_type vr1 = Kokkos::subview( mv , Kokkos::ALL() , 0 );
+    vector_right_type vr2 = Kokkos::subview( mv , Kokkos::ALL() , 1 );
+    vector_right_type vr3 = Kokkos::subview( mv , Kokkos::ALL() , 2 );
+
+    const_vector_right_type cvr1 = Kokkos::subview( mv , Kokkos::ALL() , 0 );
+    const_vector_right_type cvr2 = Kokkos::subview( mv , Kokkos::ALL() , 1 );
+    const_vector_right_type cvr3 = Kokkos::subview( mv , Kokkos::ALL() , 2 );
+
+    ASSERT_TRUE( & v1[0] == & v1(0) );
+    ASSERT_TRUE( & v1[0] == & mv(0,0) );
+    ASSERT_TRUE( & v2[0] == & mv(0,1) );
+    ASSERT_TRUE( & v3[0] == & mv(0,2) );
+
+    ASSERT_TRUE( & cv1[0] == & mv(0,0) );
+    ASSERT_TRUE( & cv2[0] == & mv(0,1) );
+    ASSERT_TRUE( & cv3[0] == & mv(0,2) );
+
+    ASSERT_TRUE( & vr1[0] == & mv(0,0) );
+    ASSERT_TRUE( & vr2[0] == & mv(0,1) );
+    ASSERT_TRUE( & vr3[0] == & mv(0,2) );
+
+    ASSERT_TRUE( & cvr1[0] == & mv(0,0) );
+    ASSERT_TRUE( & cvr2[0] == & mv(0,1) );
+    ASSERT_TRUE( & cvr3[0] == & mv(0,2) );
+
+    ASSERT_TRUE( & mv1(0,0) == & mv( 1 , 2 ) );
+    ASSERT_TRUE( & mv1(1,1) == & mv( 2 , 3 ) );
+    ASSERT_TRUE( & mv1(3,2) == & mv( 4 , 4 ) );
+    ASSERT_TRUE( & mvr1(0,0) == & mv_right( 1 , 2 ) );
+    ASSERT_TRUE( & mvr1(1,1) == & mv_right( 2 , 3 ) );
+    ASSERT_TRUE( & mvr1(3,2) == & mv_right( 4 , 4 ) );
+
+    const_vector_type c_cv1( v1 );
+    typename vector_type::const_type c_cv2( v2 );
+    typename const_vector_type::const_type c_ccv2( v2 );
+
+    const_multivector_type cmv( mv );
+    typename multivector_type::const_type cmvX( cmv );
+    typename const_multivector_type::const_type ccmvX( cmv );
+  }
+};
+
+} // namespace Test
+
+/*--------------------------------------------------------------------------*/
+
diff --git a/lib/kokkos/core/unit_test/TestViewImpl.hpp b/lib/kokkos/core/unit_test/TestViewImpl.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..c34ef759d1dd41bbb9238ccdb37f2aa28955af6d
--- /dev/null
+++ b/lib/kokkos/core/unit_test/TestViewImpl.hpp
@@ -0,0 +1,289 @@
+/*
+//@HEADER
+// ************************************************************************
+// 
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+// 
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+// 
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact  H. Carter Edwards (hcedwar@sandia.gov)
+// 
+// ************************************************************************
+//@HEADER
+*/
+
+#include <gtest/gtest.h>
+
+#include <stdexcept>
+#include <sstream>
+#include <iostream>
+
+#include <Kokkos_Core.hpp>
+
+/*--------------------------------------------------------------------------*/
+
+#if KOKKOS_USING_EXP_VIEW
+
+namespace Test {
+
+template < class Device >
+void test_view_impl() {} // Empty stand-in compiled when KOKKOS_USING_EXP_VIEW is nonzero; the #else branch holds the real test.
+
+}
+
+#else
+
+/*--------------------------------------------------------------------------*/
+
+namespace Test {
+
+struct DummyMemorySpace // Placeholder type exposing only typedefs. NOTE(review): not referenced elsewhere in this header - confirm it is still needed.
+{
+  typedef DummyMemorySpace memory_space ; // names itself as its own memory space
+  typedef unsigned size_type ;
+};
+
+/*--------------------------------------------------------------------------*/
+
+template< class Type >
+struct DefineShape { // Shorthand: DefineShape<T>::type is the 'shape' typedef of Kokkos::Impl::AnalyzeShape<T>.
+  typedef typename Kokkos::Impl::AnalyzeShape<Type>::shape type ;
+};
+
+template< class Type >
+struct ExtractValueType { // Shorthand: ExtractValueType<T>::type is the 'value_type' typedef of Kokkos::Impl::AnalyzeShape<T>.
+  typedef typename Kokkos::Impl::AnalyzeShape<Type>::value_type type ;
+};
+
+template< class Type >
+struct ArrayType { typedef Type type ; }; // Identity metafunction: ArrayType<T>::type is T itself.
+
+template < class Device >
+void test_view_impl() // Checks Kokkos::Impl shape analysis and ViewOffset behavior; Device is not used in the body.
+{
+  //typedef typename Device::memory_space memory_space ; // unused
+  // Array types under test: in type_XY, X is the number of dynamic (pointer) dimensions and Y is the total rank.
+  typedef ArrayType< int[100]                >::type type_01 ;
+  typedef ArrayType< int*                    >::type type_11 ;
+  typedef ArrayType< int[5][6][700]          >::type type_03 ;
+  typedef ArrayType< double*[8][9][900]      >::type type_14 ;
+  typedef ArrayType< long**                  >::type type_22 ;
+  typedef ArrayType< short **[5][6][7]       >::type type_25 ;
+  typedef ArrayType< const short **[5][6][7] >::type const_type_25 ;
+  typedef ArrayType< short***[5][6][7]       >::type type_36 ;
+  typedef ArrayType< const short***[5][6][7] >::type const_type_36 ;
+
+  // mfh 14 Feb 2014: With gcc 4.8.2 -Wall, this emits a warning:
+  //
+  // typedef ‘ok_const_25’ locally defined but not used [-Wunused-local-typedefs]
+  //
+  // It's unfortunate that this is the case, because the typedef is
+  // being used for a compile-time check!  We deal with this by
+  // declaring an instance of ok_const_25, and marking it with
+  // "(void)" so that instance doesn't emit an "unused variable"
+  // warning.
+  //
+  // typedef typename Kokkos::Impl::StaticAssertSame<
+  //    typename Kokkos::Impl::AnalyzeShape<type_25>::const_type ,
+  //    typename Kokkos::Impl::AnalyzeShape<const_type_25>::type
+  //      > ok_const_25 ;
+
+  typedef typename Kokkos::Impl::StaticAssertSame<
+    typename Kokkos::Impl::AnalyzeShape<type_25>::const_type,
+    typename Kokkos::Impl::AnalyzeShape<const_type_25>::type
+      > ok_const_25 ;
+
+  typedef typename Kokkos::Impl::StaticAssertSame<
+    typename Kokkos::Impl::AnalyzeShape<type_36>::const_type,
+    typename Kokkos::Impl::AnalyzeShape<const_type_36>::type
+      > ok_const_36 ;
+  {
+    ok_const_25 thing_25 ;
+    ok_const_36 thing_36 ;
+    (void) thing_25 ; // silence warning for unused variable
+    (void) thing_36 ; // silence warning for unused variable
+  }
+  // The value type is expected to be the scalar element type with every array extent stripped.
+  ASSERT_TRUE( ( Kokkos::Impl::is_same< ExtractValueType<type_03>::type , int >::value ) );
+  ASSERT_TRUE( ( Kokkos::Impl::is_same< ExtractValueType<type_14>::type , double >::value ) );
+  ASSERT_TRUE( ( Kokkos::Impl::is_same< ExtractValueType<type_22>::type , long >::value ) );
+  ASSERT_TRUE( ( Kokkos::Impl::is_same< ExtractValueType<type_36>::type , short >::value ) );
+
+  ASSERT_FALSE( ( Kokkos::Impl::is_same< ExtractValueType<type_36>::type , int >::value ) );
+
+  typedef typename DefineShape< type_01 >::type  shape_01_type ;
+  typedef typename DefineShape< type_11 >::type  shape_11_type ;
+  typedef typename DefineShape< type_03 >::type  shape_03_type ;
+  typedef typename DefineShape< type_14 >::type  shape_14_type ;
+  typedef typename DefineShape< type_22 >::type  shape_22_type ;
+  typedef typename DefineShape< type_36 >::type  shape_36_type ;
+
+  ASSERT_TRUE( ( Kokkos::Impl::StaticAssert< shape_36_type::rank == 6 >::value ) );
+  ASSERT_TRUE( ( Kokkos::Impl::StaticAssert< shape_03_type::rank == 3 >::value ) );
+  // assign() receives one value per dynamic dimension; the assertions below expect static extents to come from the type.
+  shape_01_type shape_01 ; shape_01_type::assign( shape_01 );
+  shape_11_type shape_11 ; shape_11_type::assign( shape_11, 1000 );
+  shape_03_type shape_03 ; shape_03_type::assign( shape_03 );
+  shape_14_type shape_14 ; shape_14_type::assign( shape_14 , 0 );
+  shape_22_type shape_22 ; shape_22_type::assign( shape_22 , 0 , 0 );
+  shape_36_type shape_36 ; shape_36_type::assign( shape_36 , 10 , 20 , 30 );
+
+  ASSERT_TRUE( shape_01.rank_dynamic == 0u );
+  ASSERT_TRUE( shape_01.rank         == 1u );
+  ASSERT_TRUE( shape_01.N0           == 100u );
+
+  ASSERT_TRUE( shape_11.rank_dynamic == 1u );
+  ASSERT_TRUE( shape_11.rank         == 1u );
+  ASSERT_TRUE( shape_11.N0           == 1000u );
+
+  ASSERT_TRUE( shape_03.rank_dynamic == 0u );
+  ASSERT_TRUE( shape_03.rank         == 3u );
+  ASSERT_TRUE( shape_03.N0           == 5u );
+  ASSERT_TRUE( shape_03.N1           == 6u );
+  ASSERT_TRUE( shape_03.N2           == 700u );
+
+  ASSERT_TRUE( shape_14.rank_dynamic == 1u );
+  ASSERT_TRUE( shape_14.rank         == 4u );
+  ASSERT_TRUE( shape_14.N0           == 0u );
+  ASSERT_TRUE( shape_14.N1           == 8u );
+  ASSERT_TRUE( shape_14.N2           == 9u );
+  ASSERT_TRUE( shape_14.N3           == 900u );
+
+  ASSERT_TRUE( shape_22.rank_dynamic == 2u );
+  ASSERT_TRUE( shape_22.rank         == 2u );
+  ASSERT_TRUE( shape_22.N0           == 0u );
+  ASSERT_TRUE( shape_22.N1           == 0u );
+
+  ASSERT_TRUE( shape_36.rank_dynamic == 3u );
+  ASSERT_TRUE( shape_36.rank         == 6u );
+  ASSERT_TRUE( shape_36.N0           == 10u );
+  ASSERT_TRUE( shape_36.N1           == 20u );
+  ASSERT_TRUE( shape_36.N2           == 30u );
+  ASSERT_TRUE( shape_36.N3           == 5u  );
+  ASSERT_TRUE( shape_36.N4           == 6u  );
+  ASSERT_TRUE( shape_36.N5           == 7u  );
+
+
+  ASSERT_TRUE( shape_01 == shape_01 );
+  ASSERT_TRUE( shape_11 == shape_11 );
+  ASSERT_TRUE( shape_36 == shape_36 );
+  ASSERT_TRUE( shape_01 != shape_36 );
+  ASSERT_TRUE( shape_22 != shape_36 );
+
+  //------------------------------------------------------------------------
+  // Expected has_padding flag of ViewOffset for each shape, first under LayoutLeft and then under LayoutRight.
+  typedef Kokkos::Impl::ViewOffset< shape_01_type , Kokkos::LayoutLeft > shape_01_left_offset ;
+  typedef Kokkos::Impl::ViewOffset< shape_11_type , Kokkos::LayoutLeft > shape_11_left_offset ;
+  typedef Kokkos::Impl::ViewOffset< shape_03_type , Kokkos::LayoutLeft > shape_03_left_offset ;
+  typedef Kokkos::Impl::ViewOffset< shape_14_type , Kokkos::LayoutLeft > shape_14_left_offset ;
+  typedef Kokkos::Impl::ViewOffset< shape_22_type , Kokkos::LayoutLeft > shape_22_left_offset ;
+  typedef Kokkos::Impl::ViewOffset< shape_36_type , Kokkos::LayoutLeft > shape_36_left_offset ;
+
+  typedef Kokkos::Impl::ViewOffset< shape_01_type , Kokkos::LayoutRight > shape_01_right_offset ;
+  typedef Kokkos::Impl::ViewOffset< shape_11_type , Kokkos::LayoutRight > shape_11_right_offset ;
+  typedef Kokkos::Impl::ViewOffset< shape_03_type , Kokkos::LayoutRight > shape_03_right_offset ;
+  typedef Kokkos::Impl::ViewOffset< shape_14_type , Kokkos::LayoutRight > shape_14_right_offset ;
+  typedef Kokkos::Impl::ViewOffset< shape_22_type , Kokkos::LayoutRight > shape_22_right_offset ;
+  typedef Kokkos::Impl::ViewOffset< shape_36_type , Kokkos::LayoutRight > shape_36_right_offset ;
+
+  ASSERT_TRUE( ! shape_01_left_offset::has_padding );
+  ASSERT_TRUE( ! shape_11_left_offset::has_padding );
+  ASSERT_TRUE( ! shape_03_left_offset::has_padding );
+  ASSERT_TRUE(   shape_14_left_offset::has_padding );
+  ASSERT_TRUE(   shape_22_left_offset::has_padding );
+  ASSERT_TRUE(   shape_36_left_offset::has_padding );
+
+  ASSERT_TRUE( ! shape_01_right_offset::has_padding );
+  ASSERT_TRUE( ! shape_11_right_offset::has_padding );
+  ASSERT_TRUE( ! shape_03_right_offset::has_padding );
+  ASSERT_TRUE( ! shape_14_right_offset::has_padding );
+  ASSERT_TRUE(   shape_22_right_offset::has_padding );
+  ASSERT_TRUE(   shape_36_right_offset::has_padding );
+
+  //------------------------------------------------------------------------
+  // LayoutStride offsets: values passed to assign() are expected to show up in S[], and dynamic extents in N0..N2.
+  typedef Kokkos::Impl::ViewOffset< shape_01_type , Kokkos::LayoutStride > shape_01_stride_offset ;
+  typedef Kokkos::Impl::ViewOffset< shape_36_type , Kokkos::LayoutStride > shape_36_stride_offset ;
+
+  {
+    shape_01_stride_offset stride_offset_01 ;
+
+    stride_offset_01.assign( 1, stride_offset_01.N0, 0,0,0,0,0,0,0 );
+
+    ASSERT_EQ( int(stride_offset_01.S[0]) , int(1) );
+    ASSERT_EQ( int(stride_offset_01.S[1]) , int(stride_offset_01.N0) );
+  }
+
+  {
+    shape_36_stride_offset stride_offset_36 ;
+
+    size_t str[7] ;
+    str[5] = 1 ;
+    str[4] = str[5] * stride_offset_36.N5 ;
+    str[3] = str[4] * stride_offset_36.N4 ;
+    str[2] = str[3] * stride_offset_36.N3 ;
+    str[1] = str[2] * 100 ; // 100 is asserted below as N2
+    str[0] = str[1] * 200 ; // 200 is asserted below as N1
+    str[6] = str[0] * 300 ; // 300 is asserted below as N0
+
+    stride_offset_36.assign( str[0] , str[1] , str[2] , str[3] , str[4] , str[5] , str[6] , 0 , 0 );
+
+    ASSERT_EQ( size_t(stride_offset_36.S[6]) , size_t(str[6]) );
+    ASSERT_EQ( size_t(stride_offset_36.N2)   , size_t(100) );
+    ASSERT_EQ( size_t(stride_offset_36.N1)   , size_t(200) );
+    ASSERT_EQ( size_t(stride_offset_36.N0)   , size_t(300) );
+  }
+
+  //------------------------------------------------------------------------
+  // LayoutStride::order_dimensions: dimension[] must echo dim[]; visiting order[] in sequence, each stride must equal the running product of the extents already visited.
+  {
+    const int rank = 6 ;
+    const int order[] = { 5 , 3 , 1 , 0 , 2 , 4 };
+    const unsigned dim[] = { 2 , 3 , 5 , 7 , 11 , 13 };
+    Kokkos::LayoutStride stride_6 = Kokkos::LayoutStride::order_dimensions( rank , order , dim );
+    size_t n = 1 ;
+    for ( int i = 0 ; i < rank ; ++i ) {
+      ASSERT_EQ( size_t(dim[i]) , size_t( stride_6.dimension[i] ) );
+      ASSERT_EQ( size_t(n) , size_t( stride_6.stride[ order[i] ] ) );
+      n *= dim[order[i]] ;
+    }
+  }
+
+  //------------------------------------------------------------------------
+}
+
+} /* namespace Test */
+
+#endif
+
+/*--------------------------------------------------------------------------*/
+
diff --git a/lib/kokkos/core/unit_test/TestViewMapping.hpp b/lib/kokkos/core/unit_test/TestViewMapping.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..eddb81bed5cfaa855dc51a43d4a560bc69030543
--- /dev/null
+++ b/lib/kokkos/core/unit_test/TestViewMapping.hpp
@@ -0,0 +1,1307 @@
+/*
+//@HEADER
+// ************************************************************************
+// 
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+// 
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+// 
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact  H. Carter Edwards (hcedwar@sandia.gov)
+// 
+// ************************************************************************
+//@HEADER
+*/
+
+#include <gtest/gtest.h>
+
+#include <stdexcept>
+#include <sstream>
+#include <iostream>
+
+#include <Kokkos_Core.hpp>
+
+/*--------------------------------------------------------------------------*/
+
+namespace Test {
+
+template< class Space >
+void test_view_mapping()
+{
+  typedef typename Space::execution_space ExecSpace ;
+
+  typedef Kokkos::Experimental::Impl::ViewDimension<>  dim_0 ;
+  typedef Kokkos::Experimental::Impl::ViewDimension<2> dim_s2 ;
+  typedef Kokkos::Experimental::Impl::ViewDimension<2,3> dim_s2_s3 ;
+  typedef Kokkos::Experimental::Impl::ViewDimension<2,3,4> dim_s2_s3_s4 ;
+
+  typedef Kokkos::Experimental::Impl::ViewDimension<0> dim_s0 ;
+  typedef Kokkos::Experimental::Impl::ViewDimension<0,3> dim_s0_s3 ;
+  typedef Kokkos::Experimental::Impl::ViewDimension<0,3,4> dim_s0_s3_s4 ;
+
+  typedef Kokkos::Experimental::Impl::ViewDimension<0,0> dim_s0_s0 ;
+  typedef Kokkos::Experimental::Impl::ViewDimension<0,0,4> dim_s0_s0_s4 ;
+
+  typedef Kokkos::Experimental::Impl::ViewDimension<0,0,0> dim_s0_s0_s0 ;
+  typedef Kokkos::Experimental::Impl::ViewDimension<0,0,0,0> dim_s0_s0_s0_s0 ;
+  typedef Kokkos::Experimental::Impl::ViewDimension<0,0,0,0,0> dim_s0_s0_s0_s0_s0 ;
+  typedef Kokkos::Experimental::Impl::ViewDimension<0,0,0,0,0,0> dim_s0_s0_s0_s0_s0_s0 ;
+  typedef Kokkos::Experimental::Impl::ViewDimension<0,0,0,0,0,0,0> dim_s0_s0_s0_s0_s0_s0_s0 ;
+  typedef Kokkos::Experimental::Impl::ViewDimension<0,0,0,0,0,0,0,0> dim_s0_s0_s0_s0_s0_s0_s0_s0 ;
+
+  // Fully static dimensions should not be larger than an int
+  ASSERT_LE( sizeof(dim_0) , sizeof(int) );
+  ASSERT_LE( sizeof(dim_s2) , sizeof(int) );
+  ASSERT_LE( sizeof(dim_s2_s3) , sizeof(int) );
+  ASSERT_LE( sizeof(dim_s2_s3_s4) , sizeof(int) );
+
+  // Dynamic rank 1 is a single size_t
+  ASSERT_EQ( sizeof(dim_s0) , sizeof(size_t) );
+  ASSERT_EQ( sizeof(dim_s0_s3) , sizeof(size_t) );
+  ASSERT_EQ( sizeof(dim_s0_s3_s4) , sizeof(size_t) );
+
+  // Allow for padding
+  ASSERT_LE( sizeof(dim_s0_s0) , 2 * sizeof(size_t) );
+  ASSERT_LE( sizeof(dim_s0_s0_s4) , 2 * sizeof(size_t) );
+
+  ASSERT_LE( sizeof(dim_s0_s0_s0) , 4 * sizeof(size_t) );
+  ASSERT_EQ( sizeof(dim_s0_s0_s0_s0) , 4 * sizeof(unsigned) );
+  ASSERT_LE( sizeof(dim_s0_s0_s0_s0_s0) , 6 * sizeof(unsigned) );
+  ASSERT_EQ( sizeof(dim_s0_s0_s0_s0_s0_s0) , 6 * sizeof(unsigned) );
+  ASSERT_LE( sizeof(dim_s0_s0_s0_s0_s0_s0_s0) , 8 * sizeof(unsigned) );
+  ASSERT_EQ( sizeof(dim_s0_s0_s0_s0_s0_s0_s0_s0) , 8 * sizeof(unsigned) );
+
+  ASSERT_EQ( int(dim_0::rank) , int(0) );
+  ASSERT_EQ( int(dim_0::rank_dynamic) , int(0) );
+
+  ASSERT_EQ( int(dim_s2::rank) , int(1) );
+  ASSERT_EQ( int(dim_s2::rank_dynamic) , int(0) );
+
+  ASSERT_EQ( int(dim_s2_s3::rank) , int(2) );
+  ASSERT_EQ( int(dim_s2_s3::rank_dynamic) , int(0) );
+
+  ASSERT_EQ( int(dim_s2_s3_s4::rank) , int(3) );
+  ASSERT_EQ( int(dim_s2_s3_s4::rank_dynamic) , int(0) );
+
+  ASSERT_EQ( int(dim_s0::rank) , int(1) );
+  ASSERT_EQ( int(dim_s0::rank_dynamic) , int(1) );
+
+  ASSERT_EQ( int(dim_s0_s3::rank) , int(2) );
+  ASSERT_EQ( int(dim_s0_s3::rank_dynamic) , int(1) );
+
+  ASSERT_EQ( int(dim_s0_s3_s4::rank) , int(3) );
+  ASSERT_EQ( int(dim_s0_s3_s4::rank_dynamic) , int(1) );
+
+  ASSERT_EQ( int(dim_s0_s0_s4::rank) , int(3) );
+  ASSERT_EQ( int(dim_s0_s0_s4::rank_dynamic) , int(2) );
+
+  ASSERT_EQ( int(dim_s0_s0_s0::rank) , int(3) );
+  ASSERT_EQ( int(dim_s0_s0_s0::rank_dynamic) , int(3) );
+
+  ASSERT_EQ( int(dim_s0_s0_s0_s0::rank) , int(4) );
+  ASSERT_EQ( int(dim_s0_s0_s0_s0::rank_dynamic) , int(4) );
+
+  ASSERT_EQ( int(dim_s0_s0_s0_s0_s0::rank) , int(5) );
+  ASSERT_EQ( int(dim_s0_s0_s0_s0_s0::rank_dynamic) , int(5) );
+
+  ASSERT_EQ( int(dim_s0_s0_s0_s0_s0_s0::rank) , int(6) );
+  ASSERT_EQ( int(dim_s0_s0_s0_s0_s0_s0::rank_dynamic) , int(6) );
+
+  ASSERT_EQ( int(dim_s0_s0_s0_s0_s0_s0_s0::rank) , int(7) );
+  ASSERT_EQ( int(dim_s0_s0_s0_s0_s0_s0_s0::rank_dynamic) , int(7) );
+
+  ASSERT_EQ( int(dim_s0_s0_s0_s0_s0_s0_s0_s0::rank) , int(8) );
+  ASSERT_EQ( int(dim_s0_s0_s0_s0_s0_s0_s0_s0::rank_dynamic) , int(8) );
+
+  dim_s0          d1( 2, 3, 4, 5, 6, 7, 8, 9 ); 
+  dim_s0_s0       d2( 2, 3, 4, 5, 6, 7, 8, 9 );
+  dim_s0_s0_s0    d3( 2, 3, 4, 5, 6, 7, 8, 9 );
+  dim_s0_s0_s0_s0 d4( 2, 3, 4, 5, 6, 7, 8, 9 );
+
+  ASSERT_EQ( d1.N0 , 2 );
+  ASSERT_EQ( d2.N0 , 2 );
+  ASSERT_EQ( d3.N0 , 2 );
+  ASSERT_EQ( d4.N0 , 2 );
+
+  ASSERT_EQ( d1.N1 , 1 );
+  ASSERT_EQ( d2.N1 , 3 );
+  ASSERT_EQ( d3.N1 , 3 );
+  ASSERT_EQ( d4.N1 , 3 );
+
+  ASSERT_EQ( d1.N2 , 1 );
+  ASSERT_EQ( d2.N2 , 1 );
+  ASSERT_EQ( d3.N2 , 4 );
+  ASSERT_EQ( d4.N2 , 4 );
+
+  ASSERT_EQ( d1.N3 , 1 );
+  ASSERT_EQ( d2.N3 , 1 );
+  ASSERT_EQ( d3.N3 , 1 );
+  ASSERT_EQ( d4.N3 , 5 );
+
+  //----------------------------------------
+
+  typedef Kokkos::Experimental::Impl::ViewOffset< dim_s0_s0_s0 , Kokkos::LayoutStride >  stride_s0_s0_s0 ;
+
+  //----------------------------------------
+  // Static dimension
+  {
+    typedef Kokkos::Experimental::Impl::ViewOffset< dim_s2_s3_s4 , Kokkos::LayoutLeft > left_s2_s3_s4 ;
+
+    ASSERT_EQ( sizeof(left_s2_s3_s4) , sizeof(dim_s2_s3_s4) );
+
+    left_s2_s3_s4 off3 ;
+
+    stride_s0_s0_s0  stride3( off3 );
+
+    ASSERT_EQ( off3.stride_0() , 1 );
+    ASSERT_EQ( off3.stride_1() , 2 );
+    ASSERT_EQ( off3.stride_2() , 6 );
+    ASSERT_EQ( off3.span() , 24 );
+
+    ASSERT_EQ( off3.stride_0() , stride3.stride_0() );
+    ASSERT_EQ( off3.stride_1() , stride3.stride_1() );
+    ASSERT_EQ( off3.stride_2() , stride3.stride_2() );
+    ASSERT_EQ( off3.span() , stride3.span() );
+
+    int offset = 0 ;
+
+    for ( int k = 0 ; k < 4 ; ++k ){
+    for ( int j = 0 ; j < 3 ; ++j ){
+    for ( int i = 0 ; i < 2 ; ++i , ++offset ){
+      ASSERT_EQ( off3(i,j,k) , offset );
+      ASSERT_EQ( stride3(i,j,k) , off3(i,j,k) );
+    }}}
+  }
+
+  //----------------------------------------
+  // Small dimension is unpadded
+  {
+    typedef Kokkos::Experimental::Impl::ViewOffset< dim_s0_s0_s4 , Kokkos::LayoutLeft > left_s0_s0_s4 ;
+
+    left_s0_s0_s4 dyn_off3( std::integral_constant<unsigned,sizeof(int)>()
+                          , Kokkos::LayoutLeft( 2, 3, 0, 0, 0, 0, 0, 0 ) );
+
+    stride_s0_s0_s0  stride3( dyn_off3 );
+
+    ASSERT_EQ( dyn_off3.m_dim.rank , 3 );
+    ASSERT_EQ( dyn_off3.m_dim.N0 , 2 );
+    ASSERT_EQ( dyn_off3.m_dim.N1 , 3 );
+    ASSERT_EQ( dyn_off3.m_dim.N2 , 4 );
+    ASSERT_EQ( dyn_off3.m_dim.N3 , 1 );
+    ASSERT_EQ( dyn_off3.size() , 2 * 3 * 4 );
+
+    const Kokkos::LayoutLeft layout = dyn_off3.layout();
+
+    ASSERT_EQ( layout.dimension[0] , 2 );
+    ASSERT_EQ( layout.dimension[1] , 3 );
+    ASSERT_EQ( layout.dimension[2] , 4 );
+    ASSERT_EQ( layout.dimension[3] , 1 );
+    ASSERT_EQ( layout.dimension[4] , 1 );
+    ASSERT_EQ( layout.dimension[5] , 1 );
+    ASSERT_EQ( layout.dimension[6] , 1 );
+    ASSERT_EQ( layout.dimension[7] , 1 );
+
+    ASSERT_EQ( stride3.m_dim.rank , 3 );
+    ASSERT_EQ( stride3.m_dim.N0 , 2 );
+    ASSERT_EQ( stride3.m_dim.N1 , 3 );
+    ASSERT_EQ( stride3.m_dim.N2 , 4 );
+    ASSERT_EQ( stride3.m_dim.N3 , 1 );
+    ASSERT_EQ( stride3.size() , 2 * 3 * 4 );
+
+    int offset = 0 ;
+
+    for ( int k = 0 ; k < 4 ; ++k ){
+    for ( int j = 0 ; j < 3 ; ++j ){
+    for ( int i = 0 ; i < 2 ; ++i , ++offset ){
+      ASSERT_EQ( offset , dyn_off3(i,j,k) );
+      ASSERT_EQ( stride3(i,j,k) , dyn_off3(i,j,k) );
+    }}}
+
+    ASSERT_EQ( dyn_off3.span() , offset );
+    ASSERT_EQ( stride3.span() , dyn_off3.span() );
+  }
+
+  // Large dimension is likely padded
+  {
+    constexpr int N0 = 2000 ;
+    constexpr int N1 = 300 ;
+
+    typedef Kokkos::Experimental::Impl::ViewOffset< dim_s0_s0_s4 , Kokkos::LayoutLeft > left_s0_s0_s4 ;
+
+    left_s0_s0_s4 dyn_off3( std::integral_constant<unsigned,sizeof(int)>()
+                          , Kokkos::LayoutLeft( N0, N1, 0, 0, 0, 0, 0, 0 ) );
+
+    stride_s0_s0_s0  stride3( dyn_off3 );
+
+    ASSERT_EQ( dyn_off3.m_dim.rank , 3 );
+    ASSERT_EQ( dyn_off3.m_dim.N0 , N0 );
+    ASSERT_EQ( dyn_off3.m_dim.N1 , N1 );
+    ASSERT_EQ( dyn_off3.m_dim.N2 , 4 );
+    ASSERT_EQ( dyn_off3.m_dim.N3 , 1 );
+    ASSERT_EQ( dyn_off3.size() , N0 * N1 * 4 );
+
+    ASSERT_EQ( stride3.m_dim.rank , 3 );
+    ASSERT_EQ( stride3.m_dim.N0 , N0 );
+    ASSERT_EQ( stride3.m_dim.N1 , N1 );
+    ASSERT_EQ( stride3.m_dim.N2 , 4 );
+    ASSERT_EQ( stride3.m_dim.N3 , 1 );
+    ASSERT_EQ( stride3.size() , N0 * N1 * 4 );
+    ASSERT_EQ( stride3.span() , dyn_off3.span() );
+
+    int offset = 0 ;
+
+    for ( int k = 0 ; k < 4 ; ++k ){
+    for ( int j = 0 ; j < N1 ; ++j ){
+    for ( int i = 0 ; i < N0 ; ++i ){
+      ASSERT_LE( offset , dyn_off3(i,j,k) );
+      ASSERT_EQ( stride3(i,j,k) , dyn_off3(i,j,k) );
+      offset = dyn_off3(i,j,k) + 1 ;
+    }}}
+
+    ASSERT_LE( offset , dyn_off3.span() );
+  }
+
+  //----------------------------------------
+  // Static dimension
+  {
+    typedef Kokkos::Experimental::Impl::ViewOffset< dim_s2_s3_s4 , Kokkos::LayoutRight > right_s2_s3_s4 ;
+
+    ASSERT_EQ( sizeof(right_s2_s3_s4) , sizeof(dim_s2_s3_s4) );
+
+    right_s2_s3_s4 off3 ;
+
+    stride_s0_s0_s0  stride3( off3 );
+
+    ASSERT_EQ( off3.stride_0() , 12 );
+    ASSERT_EQ( off3.stride_1() , 4 );
+    ASSERT_EQ( off3.stride_2() , 1 );
+
+    ASSERT_EQ( off3.dimension_0() , stride3.dimension_0() );
+    ASSERT_EQ( off3.dimension_1() , stride3.dimension_1() );
+    ASSERT_EQ( off3.dimension_2() , stride3.dimension_2() );
+    ASSERT_EQ( off3.stride_0() , stride3.stride_0() );
+    ASSERT_EQ( off3.stride_1() , stride3.stride_1() );
+    ASSERT_EQ( off3.stride_2() , stride3.stride_2() );
+    ASSERT_EQ( off3.span() , stride3.span() );
+
+    int offset = 0 ;
+
+    for ( int i = 0 ; i < 2 ; ++i ){
+    for ( int j = 0 ; j < 3 ; ++j ){
+    for ( int k = 0 ; k < 4 ; ++k , ++offset ){
+      ASSERT_EQ( off3(i,j,k) , offset );
+      ASSERT_EQ( off3(i,j,k) , stride3(i,j,k) );
+    }}}
+
+    ASSERT_EQ( off3.span() , offset );
+  }
+
+  //----------------------------------------
+  // Small dimension is unpadded
+  {
+    typedef Kokkos::Experimental::Impl::ViewOffset< dim_s0_s0_s4 , Kokkos::LayoutRight > right_s0_s0_s4 ;
+
+    right_s0_s0_s4 dyn_off3( std::integral_constant<unsigned,sizeof(int)>()
+                           , Kokkos::LayoutRight( 2, 3, 0, 0, 0, 0, 0, 0 ) );
+
+    stride_s0_s0_s0  stride3( dyn_off3 );
+
+    ASSERT_EQ( dyn_off3.m_dim.rank , 3 );
+    ASSERT_EQ( dyn_off3.m_dim.N0 , 2 );
+    ASSERT_EQ( dyn_off3.m_dim.N1 , 3 );
+    ASSERT_EQ( dyn_off3.m_dim.N2 , 4 );
+    ASSERT_EQ( dyn_off3.m_dim.N3 , 1 );
+    ASSERT_EQ( dyn_off3.size() , 2 * 3 * 4 );
+
+    ASSERT_EQ( dyn_off3.dimension_0() , stride3.dimension_0() );
+    ASSERT_EQ( dyn_off3.dimension_1() , stride3.dimension_1() );
+    ASSERT_EQ( dyn_off3.dimension_2() , stride3.dimension_2() );
+    ASSERT_EQ( dyn_off3.stride_0() , stride3.stride_0() );
+    ASSERT_EQ( dyn_off3.stride_1() , stride3.stride_1() );
+    ASSERT_EQ( dyn_off3.stride_2() , stride3.stride_2() );
+    ASSERT_EQ( dyn_off3.span() , stride3.span() );
+
+    int offset = 0 ;
+
+    for ( int i = 0 ; i < 2 ; ++i ){
+    for ( int j = 0 ; j < 3 ; ++j ){
+    for ( int k = 0 ; k < 4 ; ++k , ++offset ){
+      ASSERT_EQ( offset , dyn_off3(i,j,k) );
+      ASSERT_EQ( dyn_off3(i,j,k) , stride3(i,j,k) );
+    }}}
+
+    ASSERT_EQ( dyn_off3.span() , offset );
+  }
+
+  // Large dimension is likely padded
+  {
+    constexpr int N0 = 2000 ;
+    constexpr int N1 = 300 ;
+
+    typedef Kokkos::Experimental::Impl::ViewOffset< dim_s0_s0_s4 , Kokkos::LayoutRight > right_s0_s0_s4 ;
+
+    right_s0_s0_s4 dyn_off3( std::integral_constant<unsigned,sizeof(int)>()
+                           , Kokkos::LayoutRight( N0, N1, 0, 0, 0, 0, 0, 0 ) );
+
+    stride_s0_s0_s0  stride3( dyn_off3 );
+
+    ASSERT_EQ( dyn_off3.m_dim.rank , 3 );
+    ASSERT_EQ( dyn_off3.m_dim.N0 , N0 );
+    ASSERT_EQ( dyn_off3.m_dim.N1 , N1 );
+    ASSERT_EQ( dyn_off3.m_dim.N2 , 4 );
+    ASSERT_EQ( dyn_off3.m_dim.N3 , 1 );
+    ASSERT_EQ( dyn_off3.size() , N0 * N1 * 4 );
+
+    ASSERT_EQ( dyn_off3.dimension_0() , stride3.dimension_0() );
+    ASSERT_EQ( dyn_off3.dimension_1() , stride3.dimension_1() );
+    ASSERT_EQ( dyn_off3.dimension_2() , stride3.dimension_2() );
+    ASSERT_EQ( dyn_off3.stride_0() , stride3.stride_0() );
+    ASSERT_EQ( dyn_off3.stride_1() , stride3.stride_1() );
+    ASSERT_EQ( dyn_off3.stride_2() , stride3.stride_2() );
+    ASSERT_EQ( dyn_off3.span() , stride3.span() );
+
+    int offset = 0 ;
+
+    for ( int i = 0 ; i < N0 ; ++i ){
+    for ( int j = 0 ; j < N1 ; ++j ){
+    for ( int k = 0 ; k < 4 ; ++k ){
+      ASSERT_LE( offset , dyn_off3(i,j,k) );
+      ASSERT_EQ( dyn_off3(i,j,k) , stride3(i,j,k) );
+      offset = dyn_off3(i,j,k) + 1 ;
+    }}}
+
+    ASSERT_LE( offset , dyn_off3.span() );
+  }
+
+  //----------------------------------------
+  // Subview
+  {
+    // Mapping rank 4 to rank 3
+    typedef Kokkos::Experimental::Impl::SubviewExtents<4,3> SubviewExtents ;
+
+    constexpr int N0 = 1000 ;
+    constexpr int N1 = 2000 ;
+    constexpr int N2 = 3000 ;
+    constexpr int N3 = 4000 ;
+
+    Kokkos::Experimental::Impl::ViewDimension<N0,N1,N2,N3> dim ;
+
+    SubviewExtents tmp( dim
+                      , N0 / 2
+                      , Kokkos::Experimental::ALL
+                      , std::pair<int,int>( N2 / 4 , 10 + N2 / 4 )
+                      , Kokkos::pair<int,int>( N3 / 4 , 20 + N3 / 4 )
+                      );
+
+    ASSERT_EQ( tmp.domain_offset(0) , N0 / 2 );
+    ASSERT_EQ( tmp.domain_offset(1) , 0 );
+    ASSERT_EQ( tmp.domain_offset(2) , N2 / 4 );
+    ASSERT_EQ( tmp.domain_offset(3) , N3 / 4 );
+
+    ASSERT_EQ( tmp.range_index(0) , 1 );
+    ASSERT_EQ( tmp.range_index(1) , 2 );
+    ASSERT_EQ( tmp.range_index(2) , 3 );
+
+    ASSERT_EQ( tmp.range_extent(0) , N1 );
+    ASSERT_EQ( tmp.range_extent(1) , 10 );
+    ASSERT_EQ( tmp.range_extent(2) , 20 );
+  }
+  //----------------------------------------
+  {
+    constexpr int N0 = 2000 ;
+    constexpr int N1 = 300 ;
+
+    constexpr int sub_N0 = 1000 ;
+    constexpr int sub_N1 = 200 ;
+    constexpr int sub_N2 = 4 ;
+
+    typedef Kokkos::Experimental::Impl::ViewOffset< dim_s0_s0_s4 , Kokkos::LayoutLeft > left_s0_s0_s4 ;
+
+    left_s0_s0_s4 dyn_off3( std::integral_constant<unsigned,sizeof(int)>()
+                          , Kokkos::LayoutLeft( N0, N1, 0, 0, 0, 0, 0, 0 ) );
+
+    Kokkos::Experimental::Impl::SubviewExtents< 3 , 3 >
+      sub( dyn_off3.m_dim
+         , Kokkos::pair<int,int>(0,sub_N0)
+         , Kokkos::pair<int,int>(0,sub_N1)
+         , Kokkos::pair<int,int>(0,sub_N2)
+         );
+
+    stride_s0_s0_s0  stride3( dyn_off3 , sub );
+
+    ASSERT_EQ( stride3.dimension_0() , sub_N0 );
+    ASSERT_EQ( stride3.dimension_1() , sub_N1 );
+    ASSERT_EQ( stride3.dimension_2() , sub_N2 );
+    ASSERT_EQ( stride3.size() , sub_N0 * sub_N1 * sub_N2 );
+
+    ASSERT_EQ( dyn_off3.stride_0() , stride3.stride_0() );
+    ASSERT_EQ( dyn_off3.stride_1() , stride3.stride_1() );
+    ASSERT_EQ( dyn_off3.stride_2() , stride3.stride_2() );
+    ASSERT_GE( dyn_off3.span()   , stride3.span() );
+
+    for ( int k = 0 ; k < sub_N2 ; ++k ){
+    for ( int j = 0 ; j < sub_N1 ; ++j ){
+    for ( int i = 0 ; i < sub_N0 ; ++i ){
+      ASSERT_EQ( stride3(i,j,k) , dyn_off3(i,j,k) );
+    }}}
+  }
+
+  {
+    constexpr int N0 = 2000 ;
+    constexpr int N1 = 300 ;
+
+    constexpr int sub_N0 = 1000 ;
+    constexpr int sub_N1 = 200 ;
+    constexpr int sub_N2 = 4 ;
+
+    typedef Kokkos::Experimental::Impl::ViewOffset< dim_s0_s0_s4 , Kokkos::LayoutRight > right_s0_s0_s4 ;
+
+    right_s0_s0_s4 dyn_off3( std::integral_constant<unsigned,sizeof(int)>()
+                           , Kokkos::LayoutRight( N0, N1, 0, 0, 0, 0, 0, 0 ) );
+
+    Kokkos::Experimental::Impl::SubviewExtents< 3 , 3 >
+      sub( dyn_off3.m_dim
+         , Kokkos::pair<int,int>(0,sub_N0)
+         , Kokkos::pair<int,int>(0,sub_N1)
+         , Kokkos::pair<int,int>(0,sub_N2)
+         );
+
+    stride_s0_s0_s0  stride3( dyn_off3 , sub );
+
+    ASSERT_EQ( stride3.dimension_0() , sub_N0 );
+    ASSERT_EQ( stride3.dimension_1() , sub_N1 );
+    ASSERT_EQ( stride3.dimension_2() , sub_N2 );
+    ASSERT_EQ( stride3.size() , sub_N0 * sub_N1 * sub_N2 );
+
+    ASSERT_EQ( dyn_off3.stride_0() , stride3.stride_0() );
+    ASSERT_EQ( dyn_off3.stride_1() , stride3.stride_1() );
+    ASSERT_EQ( dyn_off3.stride_2() , stride3.stride_2() );
+    ASSERT_GE( dyn_off3.span()   , stride3.span() );
+
+    for ( int i = 0 ; i < sub_N0 ; ++i ){
+    for ( int j = 0 ; j < sub_N1 ; ++j ){
+    for ( int k = 0 ; k < sub_N2 ; ++k ){
+      ASSERT_EQ( stride3(i,j,k) , dyn_off3(i,j,k) );
+    }}}
+  }
+
+  //----------------------------------------
+  // view data analysis
+  {
+    using namespace Kokkos::Experimental::Impl ;
+    static_assert( rank_dynamic<>::value == 0 , "" );
+    static_assert( rank_dynamic<1>::value == 0 , "" );
+    static_assert( rank_dynamic<0>::value == 1 , "" );
+    static_assert( rank_dynamic<0,1>::value == 1 , "" );
+    static_assert( rank_dynamic<0,0,1>::value == 2 , "" );
+  }
+
+  {
+    using namespace Kokkos::Experimental::Impl ;
+
+    typedef ViewArrayAnalysis< int[] >                 a_int_r1 ;
+    typedef ViewArrayAnalysis< int**[4][5][6] >        a_int_r5 ;
+    typedef ViewArrayAnalysis< const int[] >           a_const_int_r1 ;
+    typedef ViewArrayAnalysis< const int**[4][5][6] >  a_const_int_r5 ;
+
+    static_assert( a_int_r1::dimension::rank == 1 , "" );
+    static_assert( a_int_r1::dimension::rank_dynamic == 1 , "" );
+    static_assert( std::is_same< typename a_int_r1::dimension , ViewDimension<0> >::value , "" );
+    static_assert( std::is_same< typename a_int_r1::non_const_value_type , int >::value , "" );
+
+    static_assert( a_const_int_r1::dimension::rank == 1 , "" );
+    static_assert( a_const_int_r1::dimension::rank_dynamic == 1 , "" );
+    static_assert( std::is_same< typename a_const_int_r1::dimension , ViewDimension<0> >::value , "" );
+    static_assert( std::is_same< typename a_const_int_r1::non_const_value_type , int >::value , "" );
+
+    static_assert( a_const_int_r5::dimension::rank == 5 , "" );
+    static_assert( a_const_int_r5::dimension::rank_dynamic == 2 , "" );
+
+    static_assert( std::is_same< typename a_const_int_r5::dimension , ViewDimension<0,0,4,5,6> >::value , "" );
+
+    static_assert( std::is_same< typename a_const_int_r5::non_const_value_type , int >::value , "" );
+
+    static_assert( a_int_r5::dimension::rank == 5 , "" );
+    static_assert( a_int_r5::dimension::rank_dynamic == 2 , "" );
+    static_assert( std::is_same< typename a_int_r5::dimension , ViewDimension<0,0,4,5,6> >::value , "" );
+    static_assert( std::is_same< typename a_int_r5::non_const_value_type , int >::value , "" );
+  }
+
+  {
+    using namespace Kokkos::Experimental::Impl ;
+
+    typedef int t_i4[4] ;
+
+    // Dimensions of t_i4 are appended to the multidimensional array.
+    typedef ViewArrayAnalysis< t_i4 ***[3] > a_int_r5 ;
+
+    static_assert( a_int_r5::dimension::rank == 5 , "" );
+    static_assert( a_int_r5::dimension::rank_dynamic == 3 , "" );
+    static_assert( a_int_r5::dimension::ArgN0 == 0 , "" );
+    static_assert( a_int_r5::dimension::ArgN1 == 0 , "" );
+    static_assert( a_int_r5::dimension::ArgN2 == 0 , "" );
+    static_assert( a_int_r5::dimension::ArgN3 == 3 , "" );
+    static_assert( a_int_r5::dimension::ArgN4 == 4 , "" );
+    static_assert( std::is_same< typename a_int_r5::non_const_value_type , int >::value , "" );
+  }
+
+  {
+    using namespace Kokkos::Experimental::Impl ;
+
+    typedef ViewDataAnalysis< const int[] , void >  a_const_int_r1 ;
+
+    static_assert( std::is_same< typename a_const_int_r1::specialize , void >::value , "" );
+    static_assert( std::is_same< typename a_const_int_r1::dimension , Kokkos::Experimental::Impl::ViewDimension<0> >::value , "" );
+
+    static_assert( std::is_same< typename a_const_int_r1::type , const int * >::value , "" );
+    static_assert( std::is_same< typename a_const_int_r1::value_type , const int >::value , "" );
+
+    static_assert( std::is_same< typename a_const_int_r1::scalar_array_type , const int * >::value , "" );
+    static_assert( std::is_same< typename a_const_int_r1::const_type , const int * >::value , "" );
+    static_assert( std::is_same< typename a_const_int_r1::const_value_type , const int >::value , "" );
+    static_assert( std::is_same< typename a_const_int_r1::const_scalar_array_type , const int * >::value , "" );
+    static_assert( std::is_same< typename a_const_int_r1::non_const_type , int * >::value , "" );
+    static_assert( std::is_same< typename a_const_int_r1::non_const_value_type , int >::value , "" );
+
+    typedef ViewDataAnalysis< const int**[4] , void >  a_const_int_r3 ;
+
+    static_assert( std::is_same< typename a_const_int_r3::specialize , void >::value , "" );
+
+    static_assert( std::is_same< typename a_const_int_r3::dimension , Kokkos::Experimental::Impl::ViewDimension<0,0,4> >::value , "" );
+
+    static_assert( std::is_same< typename a_const_int_r3::type , const int**[4] >::value , "" );
+    static_assert( std::is_same< typename a_const_int_r3::value_type , const int >::value , "" );
+    static_assert( std::is_same< typename a_const_int_r3::scalar_array_type , const int**[4] >::value , "" );
+    static_assert( std::is_same< typename a_const_int_r3::const_type , const int**[4] >::value , "" );
+    static_assert( std::is_same< typename a_const_int_r3::const_value_type , const int >::value , "" );
+    static_assert( std::is_same< typename a_const_int_r3::const_scalar_array_type , const int**[4] >::value , "" );
+    static_assert( std::is_same< typename a_const_int_r3::non_const_type , int**[4] >::value , "" );
+    static_assert( std::is_same< typename a_const_int_r3::non_const_value_type , int >::value , "" );
+    static_assert( std::is_same< typename a_const_int_r3::non_const_scalar_array_type , int**[4] >::value , "" );
+
+
+    // std::cout << "typeid(const int**[4]).name() = " << typeid(const int**[4]).name() << std::endl ;
+  }
+
+  //----------------------------------------
+
+  {
+    constexpr int N = 10 ;
+
+    typedef Kokkos::Experimental::View<int*,Space>        T ;
+    typedef Kokkos::Experimental::View<const int*,Space>  C ;
+
+    int data[N] ;
+
+    T vr1(data,N); // view of non-const
+    C cr1(vr1);    // view of const from view of non-const
+    C cr2( (const int *) data , N );
+
+    // Generate static_assert error:
+    // T tmp( cr1 );
+
+    ASSERT_EQ( vr1.span() , N );
+    ASSERT_EQ( cr1.span() , N );
+    ASSERT_EQ( vr1.data() , & data[0] );
+    ASSERT_EQ( cr1.data() , & data[0] );
+
+    ASSERT_TRUE( ( std::is_same< typename T::data_type           , int* >::value ) );
+    ASSERT_TRUE( ( std::is_same< typename T::const_data_type     , const int* >::value ) );
+    ASSERT_TRUE( ( std::is_same< typename T::non_const_data_type , int* >::value ) );
+
+    ASSERT_TRUE( ( std::is_same< typename T::scalar_array_type           , int* >::value ) );
+    ASSERT_TRUE( ( std::is_same< typename T::const_scalar_array_type     , const int* >::value ) );
+    ASSERT_TRUE( ( std::is_same< typename T::non_const_scalar_array_type , int* >::value ) );
+
+    ASSERT_TRUE( ( std::is_same< typename T::value_type           , int >::value ) );
+    ASSERT_TRUE( ( std::is_same< typename T::const_value_type     , const int >::value ) );
+    ASSERT_TRUE( ( std::is_same< typename T::non_const_value_type , int >::value ) );
+
+    ASSERT_TRUE( ( std::is_same< typename T::memory_space , typename Space::memory_space >::value ) );
+    ASSERT_TRUE( ( std::is_same< typename T::reference_type , int & >::value ) );
+
+    ASSERT_EQ( T::Rank , 1 );
+
+    ASSERT_TRUE( ( std::is_same< typename C::data_type           , const int* >::value ) );
+    ASSERT_TRUE( ( std::is_same< typename C::const_data_type     , const int* >::value ) );
+    ASSERT_TRUE( ( std::is_same< typename C::non_const_data_type , int* >::value ) );
+
+    ASSERT_TRUE( ( std::is_same< typename C::scalar_array_type           , const int* >::value ) );
+    ASSERT_TRUE( ( std::is_same< typename C::const_scalar_array_type     , const int* >::value ) );
+    ASSERT_TRUE( ( std::is_same< typename C::non_const_scalar_array_type , int* >::value ) );
+
+    ASSERT_TRUE( ( std::is_same< typename C::value_type           , const int >::value ) );
+    ASSERT_TRUE( ( std::is_same< typename C::const_value_type     , const int >::value ) );
+    ASSERT_TRUE( ( std::is_same< typename C::non_const_value_type , int >::value ) );
+
+    ASSERT_TRUE( ( std::is_same< typename C::memory_space , typename Space::memory_space >::value ) );
+    ASSERT_TRUE( ( std::is_same< typename C::reference_type , const int & >::value ) );
+
+    ASSERT_EQ( C::Rank , 1 );
+
+    ASSERT_EQ( vr1.dimension_0() , N );
+
+    if ( Kokkos::Impl::VerifyExecutionCanAccessMemorySpace< typename Space::memory_space , Kokkos::HostSpace >::value ) {
+      for ( int i = 0 ; i < N ; ++i ) data[i] = i + 1 ;
+      for ( int i = 0 ; i < N ; ++i ) ASSERT_EQ( vr1[i] , i + 1 );
+      for ( int i = 0 ; i < N ; ++i ) ASSERT_EQ( cr1[i] , i + 1 );
+
+      {
+        T tmp( vr1 );
+        for ( int i = 0 ; i < N ; ++i ) ASSERT_EQ( tmp[i] , i + 1 );
+        for ( int i = 0 ; i < N ; ++i ) vr1(i) = i + 2 ;
+        for ( int i = 0 ; i < N ; ++i ) ASSERT_EQ( tmp[i] , i + 2 );
+      }
+
+      for ( int i = 0 ; i < N ; ++i ) ASSERT_EQ( vr1[i] , i + 2 );
+    }
+  }
+
+
+  {
+    constexpr int N = 10 ;
+    typedef Kokkos::Experimental::View<int*,Space>        T ;
+    typedef Kokkos::Experimental::View<const int*,Space>  C ;
+
+    T vr1("vr1",N);
+    C cr1(vr1);
+
+    ASSERT_TRUE( ( std::is_same< typename T::data_type           , int* >::value ) );
+    ASSERT_TRUE( ( std::is_same< typename T::const_data_type     , const int* >::value ) );
+    ASSERT_TRUE( ( std::is_same< typename T::non_const_data_type , int* >::value ) );
+
+    ASSERT_TRUE( ( std::is_same< typename T::scalar_array_type           , int* >::value ) );
+    ASSERT_TRUE( ( std::is_same< typename T::const_scalar_array_type     , const int* >::value ) );
+    ASSERT_TRUE( ( std::is_same< typename T::non_const_scalar_array_type , int* >::value ) );
+
+    ASSERT_TRUE( ( std::is_same< typename T::value_type           , int >::value ) );
+    ASSERT_TRUE( ( std::is_same< typename T::const_value_type     , const int >::value ) );
+    ASSERT_TRUE( ( std::is_same< typename T::non_const_value_type , int >::value ) );
+
+    ASSERT_TRUE( ( std::is_same< typename T::memory_space , typename Space::memory_space >::value ) );
+    ASSERT_TRUE( ( std::is_same< typename T::reference_type , int & >::value ) );
+    ASSERT_EQ( T::Rank , 1 );
+ 
+    ASSERT_EQ( vr1.dimension_0() , N );
+
+    if ( Kokkos::Impl::VerifyExecutionCanAccessMemorySpace< typename Space::memory_space , Kokkos::HostSpace >::value ) {
+      for ( int i = 0 ; i < N ; ++i ) vr1(i) = i + 1 ;
+      for ( int i = 0 ; i < N ; ++i ) ASSERT_EQ( vr1[i] , i + 1 );
+      for ( int i = 0 ; i < N ; ++i ) ASSERT_EQ( cr1[i] , i + 1 );
+
+      {
+        T tmp( vr1 );
+        for ( int i = 0 ; i < N ; ++i ) ASSERT_EQ( tmp[i] , i + 1 );
+        for ( int i = 0 ; i < N ; ++i ) vr1(i) = i + 2 ;
+        for ( int i = 0 ; i < N ; ++i ) ASSERT_EQ( tmp[i] , i + 2 );
+      }
+
+      for ( int i = 0 ; i < N ; ++i ) ASSERT_EQ( vr1[i] , i + 2 );
+    }
+  }
+
+  // Testing proper handling of zero-length allocations
+  {
+    constexpr int N = 0 ;
+    typedef Kokkos::Experimental::View<int*,Space>        T ;
+    typedef Kokkos::Experimental::View<const int*,Space>  C ;
+
+    T vr1("vr1",N);
+    C cr1(vr1);
+
+    ASSERT_EQ( vr1.dimension_0() , 0 );
+    ASSERT_EQ( cr1.dimension_0() , 0 );
+  }
+
+
+  // Testing using space instance for allocation.
+  // The execution space of the memory space must be available for view data initialization
+
+  if ( std::is_same< ExecSpace , typename ExecSpace::memory_space::execution_space >::value ) {
+
+    using namespace Kokkos::Experimental ;
+
+    typedef typename ExecSpace::memory_space  memory_space ;
+    typedef View<int*,memory_space>           V ;
+
+    constexpr int N = 10 ;
+
+    memory_space mem_space ;
+
+    V v( "v" , N );
+    V va( view_alloc() , N );
+    V vb( view_alloc( "vb" ) , N );
+    V vc( view_alloc( "vc" , AllowPadding ) , N );
+    V vd( view_alloc( "vd" , WithoutInitializing ) , N );
+    V ve( view_alloc( "ve" , WithoutInitializing , AllowPadding ) , N );
+    V vf( view_alloc( "vf" , mem_space , WithoutInitializing , AllowPadding ) , N );
+    V vg( view_alloc( mem_space , "vg" , WithoutInitializing , AllowPadding ) , N );
+    V vh( view_alloc( WithoutInitializing , AllowPadding ) , N );
+    V vi( view_alloc( WithoutInitializing ) , N );
+    V vj( view_alloc( std::string("vj") , AllowPadding ) , N );
+    V vk( view_alloc( mem_space , std::string("vk") , AllowPadding ) , N );
+  }
+
+  {
+    typedef Kokkos::Experimental::ViewTraits<int***,Kokkos::LayoutStride,ExecSpace>  traits_t ;
+    typedef Kokkos::Experimental::Impl::ViewDimension<0,0,0>                         dims_t ;
+    typedef Kokkos::Experimental::Impl::ViewOffset< dims_t , Kokkos::LayoutStride >  offset_t ;
+
+    Kokkos::LayoutStride stride ;
+
+    stride.dimension[0] = 3 ;
+    stride.dimension[1] = 4 ;
+    stride.dimension[2] = 5 ;
+    stride.stride[0] = 4 ;
+    stride.stride[1] = 1 ;
+    stride.stride[2] = 12 ;
+
+    const offset_t offset( std::integral_constant<unsigned,0>() , stride );
+
+    ASSERT_EQ( offset.dimension_0() , 3 );
+    ASSERT_EQ( offset.dimension_1() , 4 );
+    ASSERT_EQ( offset.dimension_2() , 5 );
+
+    ASSERT_EQ( offset.stride_0() , 4 );
+    ASSERT_EQ( offset.stride_1() , 1 );
+    ASSERT_EQ( offset.stride_2() , 12 );
+
+    ASSERT_EQ( offset.span() , 60 );
+    ASSERT_TRUE( offset.span_is_contiguous() );
+
+    Kokkos::Experimental::Impl::ViewMapping< traits_t , void >
+      v( Kokkos::Experimental::Impl::ViewCtorProp<int*>((int*)0), stride );
+  }
+
+  {
+    typedef Kokkos::Experimental::View<int**,Space>  V ;
+    typedef typename V::HostMirror  M ;
+
+    constexpr int N0 = 10 ;
+    constexpr int N1 = 11 ;
+
+    V a("a",N0,N1);
+    M b = Kokkos::Experimental::create_mirror(a);
+    M c = Kokkos::Experimental::create_mirror_view(a);
+    M d ;
+
+    for ( int i0 = 0 ; i0 < N0 ; ++i0 )
+    for ( int i1 = 0 ; i1 < N1 ; ++i1 )
+      b(i0,i1) = 1 + i0 + i1 * N0 ;
+
+    Kokkos::Experimental::deep_copy( a , b );
+    Kokkos::Experimental::deep_copy( c , a );
+
+    for ( int i0 = 0 ; i0 < N0 ; ++i0 )
+    for ( int i1 = 0 ; i1 < N1 ; ++i1 )
+      ASSERT_EQ( b(i0,i1) , c(i0,i1) );
+
+    Kokkos::Experimental::resize( b , 5 , 6 );
+    Kokkos::Experimental::realloc( c , 5 , 6 );
+    Kokkos::Experimental::realloc( d , 5 , 6 );
+
+    ASSERT_EQ( b.dimension_0() , 5 );
+    ASSERT_EQ( b.dimension_1() , 6 );
+    ASSERT_EQ( c.dimension_0() , 5 );
+    ASSERT_EQ( c.dimension_1() , 6 );
+    ASSERT_EQ( d.dimension_0() , 5 );
+    ASSERT_EQ( d.dimension_1() , 6 );
+  }
+
+  {
+    typedef Kokkos::Experimental::View<int*,Space> V ;
+    typedef Kokkos::Experimental::View<int*,Space,Kokkos::MemoryUnmanaged> U ;
+
+
+    V a("a",10);
+
+    ASSERT_EQ( a.use_count() , 1 );
+
+    V b = a ;
+
+    ASSERT_EQ( a.use_count() , 2 );
+    ASSERT_EQ( b.use_count() , 2 );
+
+    {
+      U c = b ; // 'c' is compile-time unmanaged
+
+      ASSERT_EQ( a.use_count() , 2 );
+      ASSERT_EQ( b.use_count() , 2 );
+      ASSERT_EQ( c.use_count() , 2 );
+
+      V d = c ; // 'd' is run-time unmanaged
+
+      ASSERT_EQ( a.use_count() , 2 );
+      ASSERT_EQ( b.use_count() , 2 );
+      ASSERT_EQ( c.use_count() , 2 );
+      ASSERT_EQ( d.use_count() , 2 );
+    }
+
+    ASSERT_EQ( a.use_count() , 2 );
+    ASSERT_EQ( b.use_count() , 2 );
+
+    b = V();
+
+    ASSERT_EQ( a.use_count() , 1 );
+    ASSERT_EQ( b.use_count() , 0 );
+
+#if KOKKOS_USING_EXP_VIEW && ! defined ( KOKKOS_CUDA_USE_LAMBDA )
+    /* Cannot launch host lambda when CUDA lambda is enabled */
+
+    typedef typename Kokkos::Impl::is_space< Space >::host_execution_space
+      host_exec_space ;
+
+    Kokkos::parallel_for(
+      Kokkos::RangePolicy< host_exec_space >(0,10) ,
+      KOKKOS_LAMBDA( int i ){
+        // 'a' is captured by copy and the capture mechanism
+        // converts 'a' to an unmanaged copy.
+        // When the parallel dispatch accepts a move for the lambda
+        // this count should become 1
+        ASSERT_EQ( a.use_count() , 2 );
+        V x = a ;
+        ASSERT_EQ( a.use_count() , 2 );
+        ASSERT_EQ( x.use_count() , 2 );
+      });
+#endif /* #if ! defined ( KOKKOS_CUDA_USE_LAMBDA ) */
+  }
+}
+
+template< class Space >
+struct TestViewMappingSubview
+{ // Functor plus driver: checks that subview elements alias the corresponding parent-view elements.
+  typedef typename Space::execution_space ExecSpace ;
+  typedef typename Space::memory_space    MemSpace ;
+  // Pair type used for the range arguments of the Subview typedefs below.
+  typedef Kokkos::pair<int,int> range ;
+  // A: rank-1 view, its const counterpart, and a range subview type.
+  enum { AN = 10 };
+  typedef Kokkos::Experimental::View<int*,ExecSpace>  AT ;
+  typedef Kokkos::Experimental::View<const int*,ExecSpace>  ACT ;
+  typedef Kokkos::Experimental::Subview< AT , range >  AS ;
+  // B: rank-3 view; the subview type takes a range in every dimension.
+  enum { BN0 = 10 , BN1 = 11 , BN2 = 12 };
+  typedef Kokkos::Experimental::View<int***,ExecSpace>  BT ;
+  typedef Kokkos::Experimental::Subview< BT , range , range , range >  BS ;
+  // C: rank-5 view with static extents 13 and 14; the subview type takes three ranges, then two integer indices.
+  enum { CN0 = 10 , CN1 = 11 , CN2 = 12 };
+  typedef Kokkos::Experimental::View<int***[13][14],ExecSpace>  CT ;
+  typedef Kokkos::Experimental::Subview< CT , range , range , range , int , int >  CS ;
+  // D: rank-5 view with static extents DN3 and DN4; the subview type takes an integer, three ranges, and an integer.
+  enum { DN0 = 10 , DN1 = 11 , DN2 = 12 , DN3 = 13 , DN4 = 14 };
+  typedef Kokkos::Experimental::View<int***[DN3][DN4],ExecSpace>  DT ;
+  typedef Kokkos::Experimental::Subview< DT , int , range , range , range , int >  DS ;
+
+  // Compile-time checks on the rank and layout of rank-1 subviews of LayoutLeft / LayoutRight views.
+  typedef Kokkos::Experimental::View<int***[13][14],Kokkos::LayoutLeft,ExecSpace>  DLT ;
+  typedef Kokkos::Experimental::Subview< DLT , range , int , int , int , int >  DLS1 ;
+
+  static_assert( DLS1::rank == 1 && std::is_same< typename DLS1::array_layout , Kokkos::LayoutLeft >::value
+               , "Subview layout error for rank 1 subview of left-most range of LayoutLeft" );
+
+  typedef Kokkos::Experimental::View<int***[13][14],Kokkos::LayoutRight,ExecSpace>  DRT ;
+  typedef Kokkos::Experimental::Subview< DRT , int , int , int , int , range >  DRS1 ;
+
+  static_assert( DRS1::rank == 1 && std::is_same< typename DRS1::array_layout , Kokkos::LayoutRight >::value
+               , "Subview layout error for rank 1 subview of right-most range of LayoutRight" );
+  // Parent views (Aa, Ba, Ca, Da) and the views derived from them (Ab, Ac, Bb, Cb, Db).
+  AT Aa ;
+  AS Ab ;
+  ACT Ac ;
+  BT Ba ;
+  BS Bb ;
+  CT Ca ;
+  CS Cb ;
+  DT Da ;
+  DS Db ;
+  // Allocates the parent views, then takes the derived views; every pair argument is (1, extent-1).
+  TestViewMappingSubview()
+    : Aa("Aa",AN)
+    , Ab( Kokkos::Experimental::subview( Aa , std::pair<int,int>(1,AN-1) ) )
+    , Ac( Aa , std::pair<int,int>(1,AN-1) )
+    , Ba("Ba",BN0,BN1,BN2)
+    , Bb( Kokkos::Experimental::subview( Ba
+                                        , std::pair<int,int>(1,BN0-1)
+                                        , std::pair<int,int>(1,BN1-1)
+                                        , std::pair<int,int>(1,BN2-1)
+                                        ) )
+    , Ca("Ca",CN0,CN1,CN2)
+    , Cb( Kokkos::Experimental::subview( Ca
+                                        , std::pair<int,int>(1,CN0-1)
+                                        , std::pair<int,int>(1,CN1-1)
+                                        , std::pair<int,int>(1,CN2-1)
+                                        , 1
+                                        , 2
+                                        ) )
+    , Da("Da",DN0,DN1,DN2)
+    , Db( Kokkos::Experimental::subview( Da
+                                        , 1
+                                        , std::pair<int,int>(1,DN1-1)
+                                        , std::pair<int,int>(1,DN2-1)
+                                        , std::pair<int,int>(1,DN3-1)
+                                        , 2
+                                        ) )
+    {
+    }
+
+  // Reduction body: increments error_count for each subview element whose address differs from the parent element it is compared against.
+  KOKKOS_INLINE_FUNCTION
+  void operator()( const int , long & error_count ) const
+    {
+      auto Ad = Kokkos::Experimental::subview< Kokkos::MemoryUnmanaged >( Aa , Kokkos::pair<int,int>(1,AN-1) );
+      // Ab, Ac, and Ad at index i-1 are each compared against Aa at index i.
+      for ( int i = 1 ; i < AN-1 ; ++i ) if( & Aa[i] != & Ab[i-1] ) ++error_count ;
+      for ( int i = 1 ; i < AN-1 ; ++i ) if( & Aa[i] != & Ac[i-1] ) ++error_count ;
+      for ( int i = 1 ; i < AN-1 ; ++i ) if( & Aa[i] != & Ad[i-1] ) ++error_count ;
+
+      for ( int i2 = 1 ; i2 < BN2-1 ; ++i2 ) {
+      for ( int i1 = 1 ; i1 < BN1-1 ; ++i1 ) {
+      for ( int i0 = 1 ; i0 < BN0-1 ; ++i0 ) {
+        if ( & Ba(i0,i1,i2) != & Bb(i0-1,i1-1,i2-1) ) ++error_count ;
+      }}}
+
+      for ( int i2 = 1 ; i2 < CN2-1 ; ++i2 ) {
+      for ( int i1 = 1 ; i1 < CN1-1 ; ++i1 ) {
+      for ( int i0 = 1 ; i0 < CN0-1 ; ++i0 ) {
+        if ( & Ca(i0,i1,i2,1,2) != & Cb(i0-1,i1-1,i2-1) ) ++error_count ;
+      }}}
+
+      for ( int i2 = 1 ; i2 < DN3-1 ; ++i2 ) {
+      for ( int i1 = 1 ; i1 < DN2-1 ; ++i1 ) {
+      for ( int i0 = 1 ; i0 < DN1-1 ; ++i0 ) {
+        if ( & Da(1,i0,i1,i2,2) != & Db(i0-1,i1-1,i2-1) ) ++error_count ;
+      }}}
+    }
+  // Driver: asserts extents and strides on the host, then runs the functor over a single-iteration range.
+  static void run()
+  {
+    TestViewMappingSubview self ;
+
+    ASSERT_EQ( self.Aa.dimension_0() , AN );
+    ASSERT_EQ( self.Ab.dimension_0() , AN - 2 );
+    ASSERT_EQ( self.Ac.dimension_0() , AN - 2 );
+    ASSERT_EQ( self.Ba.dimension_0() , BN0 );
+    ASSERT_EQ( self.Ba.dimension_1() , BN1 );
+    ASSERT_EQ( self.Ba.dimension_2() , BN2 );
+    ASSERT_EQ( self.Bb.dimension_0() , BN0 - 2 );
+    ASSERT_EQ( self.Bb.dimension_1() , BN1 - 2 );
+    ASSERT_EQ( self.Bb.dimension_2() , BN2 - 2 );
+
+    ASSERT_EQ( self.Ca.dimension_0() , CN0 );
+    ASSERT_EQ( self.Ca.dimension_1() , CN1 );
+    ASSERT_EQ( self.Ca.dimension_2() , CN2 );
+    ASSERT_EQ( self.Ca.dimension_3() , 13 );
+    ASSERT_EQ( self.Ca.dimension_4() , 14 );
+    ASSERT_EQ( self.Cb.dimension_0() , CN0 - 2 );
+    ASSERT_EQ( self.Cb.dimension_1() , CN1 - 2 );
+    ASSERT_EQ( self.Cb.dimension_2() , CN2 - 2 );
+
+    ASSERT_EQ( self.Da.dimension_0() , DN0 );
+    ASSERT_EQ( self.Da.dimension_1() , DN1 );
+    ASSERT_EQ( self.Da.dimension_2() , DN2 );
+    ASSERT_EQ( self.Da.dimension_3() , DN3 );
+    ASSERT_EQ( self.Da.dimension_4() , DN4 );
+
+    ASSERT_EQ( self.Db.dimension_0() , DN1 - 2 );
+    ASSERT_EQ( self.Db.dimension_1() , DN2 - 2 );
+    ASSERT_EQ( self.Db.dimension_2() , DN3 - 2 );
+    // Db's stride k is compared against Da's stride k+1.
+    ASSERT_EQ( self.Da.stride_1() , self.Db.stride_0() );
+    ASSERT_EQ( self.Da.stride_2() , self.Db.stride_1() );
+    ASSERT_EQ( self.Da.stride_3() , self.Db.stride_2() );
+
+    long error_count = -1 ; // asserted to be 0 after the reduction
+    Kokkos::parallel_reduce( Kokkos::RangePolicy< ExecSpace >(0,1) , self , error_count );
+    ASSERT_EQ( error_count , 0 );
+  }
+
+};
+
+// Entry point: runs the subview mapping test for Space.
+template< class Space > void test_view_mapping_subview()
+{
+  // The test struct is instantiated with Space's execution space, not with Space itself.
+  typedef TestViewMappingSubview< typename Space::execution_space >  test_type ;
+  test_type::run();
+}
+
+/*--------------------------------------------------------------------------*/
+
+template< class ViewType >
+struct TestViewMapOperator {
+  // Verifies that element offsets never decrease when walked in layout order and that they stay inside span().
+  static_assert( ViewType::reference_type_is_lvalue_reference
+               , "Test only valid for lvalue reference type" );
+
+  const ViewType v ;
+  // LayoutLeft walk for a fixed i0 (i1 innermost, i7 outermost); error_count is incremented per violation.
+  KOKKOS_INLINE_FUNCTION
+  void test_left( size_t i0 , long & error_count ) const
+    {
+      typename ViewType::value_type * const base_ptr = & v(0,0,0,0,0,0,0,0);
+      const size_t n1 = v.dimension_1();
+      const size_t n2 = v.dimension_2();
+      const size_t n3 = v.dimension_3();
+      const size_t n4 = v.dimension_4();
+      const size_t n5 = v.dimension_5();
+      const size_t n6 = v.dimension_6();
+      const size_t n7 = v.dimension_7();
+      // Offset (in elements from base_ptr) of the previously visited entry.
+      long offset = 0 ;
+
+      for ( size_t i7 = 0 ; i7 < n7 ; ++i7 )
+      for ( size_t i6 = 0 ; i6 < n6 ; ++i6 )
+      for ( size_t i5 = 0 ; i5 < n5 ; ++i5 )
+      for ( size_t i4 = 0 ; i4 < n4 ; ++i4 )
+      for ( size_t i3 = 0 ; i3 < n3 ; ++i3 )
+      for ( size_t i2 = 0 ; i2 < n2 ; ++i2 )
+      for ( size_t i1 = 0 ; i1 < n1 ; ++i1 )
+      {
+        const long d = & v(i0,i1,i2,i3,i4,i5,i6,i7) - base_ptr ;
+        if ( d < offset ) ++error_count ;
+        offset = d ;
+      }
+      // The last offset visited must be strictly less than the view's span.
+      if ( v.span() <= size_t(offset) ) ++error_count ;
+    }
+  // LayoutRight walk for a fixed i0 (i7 innermost, i1 outermost); same checks as test_left.
+  KOKKOS_INLINE_FUNCTION
+  void test_right( size_t i0 , long & error_count ) const
+    {
+      typename ViewType::value_type * const base_ptr = & v(0,0,0,0,0,0,0,0);
+      const size_t n1 = v.dimension_1();
+      const size_t n2 = v.dimension_2();
+      const size_t n3 = v.dimension_3();
+      const size_t n4 = v.dimension_4();
+      const size_t n5 = v.dimension_5();
+      const size_t n6 = v.dimension_6();
+      const size_t n7 = v.dimension_7();
+      // Offset (in elements from base_ptr) of the previously visited entry.
+      long offset = 0 ;
+
+      for ( size_t i1 = 0 ; i1 < n1 ; ++i1 )
+      for ( size_t i2 = 0 ; i2 < n2 ; ++i2 )
+      for ( size_t i3 = 0 ; i3 < n3 ; ++i3 )
+      for ( size_t i4 = 0 ; i4 < n4 ; ++i4 )
+      for ( size_t i5 = 0 ; i5 < n5 ; ++i5 )
+      for ( size_t i6 = 0 ; i6 < n6 ; ++i6 )
+      for ( size_t i7 = 0 ; i7 < n7 ; ++i7 )
+      {
+        const long d = & v(i0,i1,i2,i3,i4,i5,i6,i7) - base_ptr ;
+        if ( d < offset ) ++error_count ;
+        offset = d ;
+      }
+      // The last offset visited must be strictly less than the view's span.
+      if ( v.span() <= size_t(offset) ) ++error_count ;
+    }
+  // Reduction body for index i of dimension 0: picks the walk matching the view's layout; other layouts are not checked.
+  KOKKOS_INLINE_FUNCTION
+  void operator()( size_t i , long & error_count ) const
+    {
+      if ( std::is_same< typename ViewType::array_layout , Kokkos::LayoutLeft >::value )
+        test_left(i,error_count);
+      else if ( std::is_same< typename ViewType::array_layout , Kokkos::LayoutRight >::value )
+        test_right(i,error_count);
+    }
+  // Extents passed to the view constructor; run() asserts that dimensions at or beyond ViewType::rank report 1.
+  constexpr static size_t N0 = 10 ;
+  constexpr static size_t N1 =  9 ;
+  constexpr static size_t N2 =  8 ;
+  constexpr static size_t N3 =  7 ;
+  constexpr static size_t N4 =  6 ;
+  constexpr static size_t N5 =  5 ;
+  constexpr static size_t N6 =  4 ;
+  constexpr static size_t N7 =  3 ;
+
+  TestViewMapOperator() : v( "Test" , N0, N1, N2, N3, N4, N5, N6, N7 ) {}
+  // Driver: asserts extents and span on the host, then reduces the violation count over dimension 0.
+  static void run()
+    {
+      TestViewMapOperator self ;
+
+      ASSERT_EQ( self.v.dimension_0() , ( 0 < ViewType::rank ? N0 : 1 ) );
+      ASSERT_EQ( self.v.dimension_1() , ( 1 < ViewType::rank ? N1 : 1 ) );
+      ASSERT_EQ( self.v.dimension_2() , ( 2 < ViewType::rank ? N2 : 1 ) );
+      ASSERT_EQ( self.v.dimension_3() , ( 3 < ViewType::rank ? N3 : 1 ) );
+      ASSERT_EQ( self.v.dimension_4() , ( 4 < ViewType::rank ? N4 : 1 ) );
+      ASSERT_EQ( self.v.dimension_5() , ( 5 < ViewType::rank ? N5 : 1 ) );
+      ASSERT_EQ( self.v.dimension_6() , ( 6 < ViewType::rank ? N6 : 1 ) );
+      ASSERT_EQ( self.v.dimension_7() , ( 7 < ViewType::rank ? N7 : 1 ) );
+      // The span must be at least the product of all eight extents.
+      ASSERT_LE( self.v.dimension_0()*
+                 self.v.dimension_1()*
+                 self.v.dimension_2()*
+                 self.v.dimension_3()*
+                 self.v.dimension_4()*
+                 self.v.dimension_5()*
+                 self.v.dimension_6()*
+                 self.v.dimension_7()
+               , self.v.span() );
+      // Start from a sentinel (as the sibling tests do) so a reduction that never writes its result fails the assert.
+      long error_count = -1 ;
+      Kokkos::RangePolicy< typename ViewType::execution_space > range(0,self.v.dimension_0());
+      Kokkos::parallel_reduce( range , self , error_count );
+      ASSERT_EQ( 0 , error_count );
+    }
+};
+
+
+// Runs TestViewMapOperator on int views of rank 0 through 7: every LayoutLeft case first, then every LayoutRight case.
+template< class Space > void test_view_mapping_operator()
+{
+  typedef typename Space::execution_space  exec_t ;
+  typedef Kokkos::LayoutLeft               left_t ;
+  TestViewMapOperator< Kokkos::Experimental::View<int,left_t,exec_t> >::run();
+  TestViewMapOperator< Kokkos::Experimental::View<int*,left_t,exec_t> >::run();
+  TestViewMapOperator< Kokkos::Experimental::View<int**,left_t,exec_t> >::run();
+  TestViewMapOperator< Kokkos::Experimental::View<int***,left_t,exec_t> >::run();
+  TestViewMapOperator< Kokkos::Experimental::View<int****,left_t,exec_t> >::run();
+  TestViewMapOperator< Kokkos::Experimental::View<int*****,left_t,exec_t> >::run();
+  TestViewMapOperator< Kokkos::Experimental::View<int******,left_t,exec_t> >::run();
+  TestViewMapOperator< Kokkos::Experimental::View<int*******,left_t,exec_t> >::run();
+  typedef Kokkos::LayoutRight              right_t ;
+  TestViewMapOperator< Kokkos::Experimental::View<int,right_t,exec_t> >::run();
+  TestViewMapOperator< Kokkos::Experimental::View<int*,right_t,exec_t> >::run();
+  TestViewMapOperator< Kokkos::Experimental::View<int**,right_t,exec_t> >::run();
+  TestViewMapOperator< Kokkos::Experimental::View<int***,right_t,exec_t> >::run();
+  TestViewMapOperator< Kokkos::Experimental::View<int****,right_t,exec_t> >::run();
+  TestViewMapOperator< Kokkos::Experimental::View<int*****,right_t,exec_t> >::run();
+  TestViewMapOperator< Kokkos::Experimental::View<int******,right_t,exec_t> >::run();
+  TestViewMapOperator< Kokkos::Experimental::View<int*******,right_t,exec_t> >::run();
+}
+
+/*--------------------------------------------------------------------------*/
+
+// Atomic-trait test: increments issued through x_atom (constructed from x) are verified by reading x.
+template< class Space >
+struct TestViewMappingAtomic {
+  using ExecSpace = typename Space::execution_space ;
+  using MemSpace  = typename Space::memory_space ;
+  using mem_trait = Kokkos::MemoryTraits< Kokkos::Atomic > ;
+  using T         = Kokkos::Experimental::View< int * , ExecSpace > ;
+  using T_atom    = Kokkos::Experimental::View< int * , ExecSpace , mem_trait > ;
+
+  T      x ;       // allocated with N entries by the constructor
+  T_atom x_atom ;  // constructed from x, with the Atomic memory trait
+
+  constexpr static size_t N = 100000 ;
+
+  // Tags selecting which operator() overload a dispatch runs.
+  struct TagInit {};
+  struct TagUpdate {};
+  struct TagVerify {};
+
+  // Init phase: x(k) = k.
+  KOKKOS_INLINE_FUNCTION
+  void operator()( const TagInit & , const int k ) const
+    { x(k) = k ; }
+
+  // Update phase: each index adds 1 to entry k%2 through the atomic view.
+  KOKKOS_INLINE_FUNCTION
+  void operator()( const TagUpdate & , const int k ) const
+    { x_atom(k%2) += 1 ; }
+
+  // Verify phase: entries 0 and 1 must hold their initial value plus N/2; every other entry must be unchanged.
+  KOKKOS_INLINE_FUNCTION
+  void operator()( const TagVerify & , const int k , long & error_count ) const
+    {
+      const int expected = ( k < 2 ) ? int(k + N / 2) : int(k) ;
+      if ( x(k) != expected ) ++error_count ;
+    }
+
+  TestViewMappingAtomic() : x("x",N) , x_atom( x ) {}
+
+  // Driver: checks the reference kinds of both view types, then runs init, update, and verify over [0,N).
+  static void run()
+    {
+      ASSERT_TRUE( T::reference_type_is_lvalue_reference );
+      ASSERT_FALSE( T_atom::reference_type_is_lvalue_reference );
+      TestViewMappingAtomic self ;
+      long error_count = -1 ;
+      Kokkos::parallel_for( Kokkos::RangePolicy< ExecSpace , TagInit >(0,N) , self );
+      Kokkos::parallel_for( Kokkos::RangePolicy< ExecSpace , TagUpdate >(0,N) , self );
+      Kokkos::parallel_reduce( Kokkos::RangePolicy< ExecSpace , TagVerify >(0,N) , self , error_count );
+      ASSERT_EQ( 0 , error_count );
+    }
+};
+
+/*--------------------------------------------------------------------------*/
+
+template< class Space >
+struct TestViewMappingClassValue {
+  using ExecSpace = typename Space::execution_space ;
+  using MemSpace  = typename Space::memory_space ;
+  // Element type with a user-provided constructor and destructor; the tracing printf calls are compiled out.
+  struct ValueType {
+    KOKKOS_INLINE_FUNCTION ValueType()
+    {
+#if 0
+#if defined( KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_CUDA )
+      printf("TestViewMappingClassValue construct on Cuda\n");
+#elif defined( KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST )
+      printf("TestViewMappingClassValue construct on Host\n");
+#else
+      printf("TestViewMappingClassValue construct unknown\n");
+#endif
+#endif
+    }
+    KOKKOS_INLINE_FUNCTION ~ValueType()
+    {
+#if 0
+#if defined( KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_CUDA )
+      printf("TestViewMappingClassValue destruct on Cuda\n");
+#elif defined( KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST )
+      printf("TestViewMappingClassValue destruct on Host\n");
+#else
+      printf("TestViewMappingClassValue destruct unknown\n");
+#endif
+#endif
+    }
+  };
+  // Creates and destroys a View< ValueType , ExecSpace > labeled "a", fencing before, between, and after.
+  static void run()
+  {
+    using namespace Kokkos::Experimental ;
+    typedef View< ValueType , ExecSpace >  view_type ;
+    ExecSpace::fence();
+    {
+      // The view's lifetime is limited to this scope.
+      view_type scalar_view("a");
+      ExecSpace::fence();
+    }
+    ExecSpace::fence();
+  }
+};
+
+} /* namespace Test */
+
+/*--------------------------------------------------------------------------*/
+
diff --git a/lib/kokkos/core/unit_test/TestViewOfClass.hpp b/lib/kokkos/core/unit_test/TestViewOfClass.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..9b23a5d5597e2260e1a73b9f9b5b6b50a911567e
--- /dev/null
+++ b/lib/kokkos/core/unit_test/TestViewOfClass.hpp
@@ -0,0 +1,163 @@
+/*
+//@HEADER
+// ************************************************************************
+// 
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+// 
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+// 
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact  H. Carter Edwards (hcedwar@sandia.gov)
+// 
+// ************************************************************************
+//@HEADER
+*/
+
+#include <gtest/gtest.h>
+
+#include <Kokkos_Core.hpp>
+#include <stdexcept>
+#include <sstream>
+#include <iostream>
+
+/*--------------------------------------------------------------------------*/
+
+namespace Test {
+
+template< class Space >
+struct NestedView {
+  // Wraps a View and uses member(0) as a counter: assignment atomically adds 1, destruction atomically adds -1.
+  Kokkos::View<int*,Space> member ;
+
+public:
+  // Default-constructs 'member'; no counter is touched here.
+  KOKKOS_INLINE_FUNCTION
+  NestedView() : member()
+    {}
+  // Adopts the given View (named 'lhs' although it is the right-hand operand) and, if it is non-empty, atomically increments element 0.
+  KOKKOS_INLINE_FUNCTION
+  NestedView & operator = ( const Kokkos::View<int*,Space> & lhs )
+    {
+      member = lhs ;
+      if ( member.dimension_0() ) Kokkos::atomic_add( & member(0) , 1 );
+      return *this ;
+    }
+  // If 'member' is non-empty, atomically decrements element 0.
+  KOKKOS_INLINE_FUNCTION
+  ~NestedView()
+  { 
+    if ( member.dimension_0() ) {
+      Kokkos::atomic_add( & member(0) , -1 );
+    }
+  }
+};
+
+template< class Space >
+struct NestedViewFunctor {
+  // Functor for a parallel dispatch over indices i:
+  // each call assigns the shared 'array' View to element i of 'nested'.
+  Kokkos::View< NestedView<Space> * , Space > nested ;
+  Kokkos::View<int*,Space>                    array ;
+
+  // Stores handles to the destination View of NestedView objects and to the View assigned into them.
+  NestedViewFunctor( const Kokkos::View< NestedView<Space> * , Space > & nested_in ,
+                     const Kokkos::View<int*,Space>                    & array_in )
+    : nested( nested_in ) , array( array_in ) {}
+
+  // Invokes NestedView::operator= on element idx.
+  KOKKOS_INLINE_FUNCTION
+  void operator()( int idx ) const
+    { nested[idx] = array ; }
+};
+
+
+template< class Space >
+void view_nested_view()  // Checks that assignments into Views of NestedView elements are reflected in a shared one-element counter View.
+{
+  Kokkos::View<int*,Space> tracking("tracking",1);
+  // host_tracking is the host mirror used to read the counter back after each step.
+  typename Kokkos::View<int*,Space>::HostMirror
+     host_tracking = Kokkos::create_mirror( tracking );
+
+  {
+    Kokkos::View< NestedView<Space> * , Space > a("a_nested_view",2);
+    // Assign 'tracking' to both elements of 'a'; each assignment adds 1, so the counter must read 2.
+    Kokkos::parallel_for( Kokkos::RangePolicy<Space>(0,2) , NestedViewFunctor<Space>( a , tracking ) );
+    Kokkos::deep_copy( host_tracking , tracking );
+    ASSERT_EQ( 2 , host_tracking(0) );
+    // Repeat with a second View 'b'; the counter must then read 4.
+    Kokkos::View< NestedView<Space> * , Space > b("b_nested_view",2);
+    Kokkos::parallel_for( Kokkos::RangePolicy<Space>(0,2) , NestedViewFunctor<Space>( b , tracking ) );
+    Kokkos::deep_copy( host_tracking , tracking );
+    ASSERT_EQ( 4 , host_tracking(0) );
+    // 'a' and 'b' go out of scope at the closing brace below.
+  }
+  Kokkos::deep_copy( host_tracking , tracking );
+  // Only when KOKKOS_USING_EXP_VIEW is set is the counter required to be back at 0 here.
+#if KOKKOS_USING_EXP_VIEW
+  ASSERT_EQ( 0 , host_tracking(0) );
+#endif
+
+}
+
+}
+
+#if ! KOKKOS_USING_EXP_VIEW
+
+namespace Kokkos {
+namespace Impl {
+
+template< class ExecSpace , class S >
+struct ViewDefaultConstruct< ExecSpace , Test::NestedView<S> , true >  // Specialization for Test::NestedView elements; compiled only when KOKKOS_USING_EXP_VIEW is false (see enclosing #if).
+{
+  typedef Test::NestedView<S> type ;
+  type * const m_ptr ;
+  // Placement-new default-constructs the element at offset i from m_ptr.
+  KOKKOS_FORCEINLINE_FUNCTION
+  void operator()( const typename ExecSpace::size_type& i ) const
+    { new(m_ptr+i) type(); }
+  // Runs this functor over [0,capacity) with a RangePolicy on ExecSpace and fences before returning.
+  ViewDefaultConstruct( type * pointer , size_t capacity )
+    : m_ptr( pointer )
+    {
+      Kokkos::RangePolicy< ExecSpace > range( 0 , capacity );
+      parallel_for( range , *this );
+      ExecSpace::fence();
+    }
+};
+
+} // namespace Impl
+} // namespace Kokkos
+
+#endif
+
+/*--------------------------------------------------------------------------*/
+
diff --git a/lib/kokkos/core/unit_test/TestViewSubview.hpp b/lib/kokkos/core/unit_test/TestViewSubview.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..3846354b8c368f5c8505d84b4931a9105a6a14aa
--- /dev/null
+++ b/lib/kokkos/core/unit_test/TestViewSubview.hpp
@@ -0,0 +1,874 @@
+/*
+//@HEADER
+// ************************************************************************
+// 
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+// 
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+// 
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact  H. Carter Edwards (hcedwar@sandia.gov)
+// 
+// ************************************************************************
+//@HEADER
+*/
+
+#include <gtest/gtest.h>
+
+#include <Kokkos_Core.hpp>
+#include <stdexcept>
+#include <sstream>
+#include <iostream>
+
+/*--------------------------------------------------------------------------*/
+
+namespace TestViewSubview {
+
+template<class Layout, class Space>
+struct getView {  // Factory: returns a rank-2 double View labelled "G" with extents n x m in the given Layout.
+  static Kokkos::View<double**,Layout,Space> get(int n, int m) {
+    typedef Kokkos::View<double**,Layout,Space> view_type;
+    return view_type("G",n,m);
+  }
+};
+
+// LayoutStride specialization: the layout object is built explicitly from the extents with dimension order {0,1}.
+template<class Space>
+struct getView<Kokkos::LayoutStride,Space> {
+  static Kokkos::View<double**,Kokkos::LayoutStride,Space> get(int n, int m) {
+    const int num_dims = 2 ;
+    const int dim_order[] = { 0, 1 };
+    const unsigned extents[] = { static_cast<unsigned>(n), static_cast<unsigned>(m) };
+    Kokkos::LayoutStride layout = Kokkos::LayoutStride::order_dimensions( num_dims , dim_order , extents );
+    return Kokkos::View<double**,Kokkos::LayoutStride,Space>("G",layout);
+  }
+};
+
+// Functor that stores 'val' into element i of the rank-1 view 'a' for each dispatched index i.
+template<class ViewType, class Space>
+struct fill_1D {
+  typedef typename Space::execution_space execution_space;
+  typedef typename ViewType::size_type size_type;
+  ViewType a;
+  double val;
+
+  fill_1D(ViewType view, double value) : a(view), val(value) {}
+
+  KOKKOS_INLINE_FUNCTION
+  void operator() (const int idx) const { a(idx) = val; }
+};
+
+// Functor that stores 'val' into every element a(i,j) of row i of the rank-2 view 'a'.
+template<class ViewType, class Space>
+struct fill_2D {
+  typedef typename Space::execution_space execution_space;
+  typedef typename ViewType::size_type size_type;
+  ViewType a;
+  double val;
+  fill_2D(ViewType view, double value) : a(view), val(value) {}
+  KOKKOS_INLINE_FUNCTION
+  void operator() (const int row) const {
+    const int num_cols = static_cast<int>(a.dimension_1());
+    for (int col = 0; col < num_cols; ++col) { a(row,col) = val; }
+  }
+};
+
+template<class Layout, class Space>
+void test_auto_1d ()  // Fills a 10x3 view as a whole and column-by-column through 'auto' subviews, checking the host mirror after every fill.
+{
+  typedef Kokkos::View<double**, Layout, Space> mv_type;
+  typedef typename mv_type::size_type size_type;
+  const double ZERO = 0.0;
+  const double ONE = 1.0;
+  const double TWO = 2.0;
+
+  const size_type numRows = 10;
+  const size_type numCols = 3;
+
+  mv_type X = getView<Layout,Space>::get(numRows, numCols);
+  typename mv_type::HostMirror X_h = Kokkos::create_mirror_view (X);
+  // Fill all of X with ONE via parallel_for over the rows, then verify every entry through the host mirror.
+  fill_2D<mv_type,Space> f1(X, ONE);
+  Kokkos::parallel_for(X.dimension_0(),f1);
+  Kokkos::deep_copy (X_h, X);
+  for (size_type j = 0; j < numCols; ++j) {
+    for (size_type i = 0; i < numRows; ++i) {
+      ASSERT_TRUE(X_h(i,j) == ONE);
+    }
+  }
+  // Same check after overwriting with zero (passed as the literal 0.0, compared against ZERO).
+  fill_2D<mv_type,Space> f2(X, 0.0);
+  Kokkos::parallel_for(X.dimension_0(),f2);
+  Kokkos::deep_copy (X_h, X);
+  for (size_type j = 0; j < numCols; ++j) {
+    for (size_type i = 0; i < numRows; ++i) {
+      ASSERT_TRUE(X_h(i,j) == ZERO);
+    }
+  }
+  // Same check after overwriting with TWO.
+  fill_2D<mv_type,Space> f3(X, TWO);
+  Kokkos::parallel_for(X.dimension_0(),f3);
+  Kokkos::deep_copy (X_h, X);
+  for (size_type j = 0; j < numCols; ++j) {
+    for (size_type i = 0; i < numRows; ++i) {
+      ASSERT_TRUE(X_h(i,j) == TWO);
+    }
+  }
+  // For each column j: zero it through a rank-1 subview and verify, then refill every column with ONE through subviews and verify each.
+  for (size_type j = 0; j < numCols; ++j) {
+    auto X_j = Kokkos::subview (X, Kokkos::ALL(), j);
+
+    fill_1D<decltype(X_j),Space> f4(X_j, ZERO);
+    Kokkos::parallel_for(X_j.dimension_0(),f4);
+    Kokkos::deep_copy (X_h, X);
+    for (size_type i = 0; i < numRows; ++i) {
+      ASSERT_TRUE(X_h(i,j) == ZERO);
+    }
+
+    for (size_type jj = 0; jj < numCols; ++jj) {
+      auto X_jj = Kokkos::subview (X, Kokkos::ALL(), jj);
+      fill_1D<decltype(X_jj),Space> f5(X_jj, ONE);
+      Kokkos::parallel_for(X_jj.dimension_0(),f5);
+      Kokkos::deep_copy (X_h, X);
+      for (size_type i = 0; i < numRows; ++i) {
+        ASSERT_TRUE(X_h(i,jj) == ONE);
+      }
+    }
+  }
+}
+
+template<class LD, class LS, class Space>
+void test_1d_strided_assignment_impl(bool a, bool b, bool c, bool d, int n, int m) {  // Assigns rank-1 subviews of an n x m view (layout LS) to views of layout LD and checks element addresses; a..d enable the four cases.
+  Kokkos::View<double**,LS,Space> l2d("l2d",n,m);
+  // NOTE: 'col' is derived from n and used as a first-dimension index; 'row' is derived from m and used as a second-dimension index.
+  int col = n>2?2:0;
+  int row = m>2?2:0;
+  // The checks run only when VerifyExecutionCanAccessMemorySpace<HostSpace,Space>::value is true (presumably: host code may dereference Space memory).
+  if(Kokkos::Impl::VerifyExecutionCanAccessMemorySpace<Kokkos::HostSpace,Space>::value) {
+  if(a) {  // case a: full extent of dimension 0 at second index 'row'
+    Kokkos::View<double*,LD,Space> l1da = Kokkos::subview(l2d,Kokkos::ALL(),row);
+    ASSERT_TRUE( & l1da(0) == & l2d(0,row) );
+    if(n>1)
+      ASSERT_TRUE( & l1da(1) == & l2d(1,row) );
+  }
+  if(b && n>13) {  // case b: range [2,13) of dimension 0 at second index 'row'
+    Kokkos::View<double*,LD,Space> l1db = Kokkos::subview(l2d,std::pair<unsigned,unsigned>(2,13),row);
+    ASSERT_TRUE( & l1db(0) == & l2d(2,row) );
+    ASSERT_TRUE( & l1db(1) == & l2d(3,row) );
+  }
+  if(c) {  // case c: full extent of dimension 1 at first index 'col'
+    Kokkos::View<double*,LD,Space> l1dc = Kokkos::subview(l2d,col,Kokkos::ALL());
+    ASSERT_TRUE( & l1dc(0) == & l2d(col,0) );
+    if(m>1)
+      ASSERT_TRUE( & l1dc(1) == & l2d(col,1) );
+  }
+  if(d && m>13) {  // case d: range [2,13) of dimension 1 at first index 'col'
+    Kokkos::View<double*,LD,Space> l1dd = Kokkos::subview(l2d,col,std::pair<unsigned,unsigned>(2,13));
+    ASSERT_TRUE( & l1dd(0) == & l2d(col,2) );
+    ASSERT_TRUE( & l1dd(1) == & l2d(col,3) );
+  }
+  }
+
+}
+
+template<class Space >
+void test_1d_strided_assignment() {  // Driver: template args are <destination layout, source layout, Space>; call args are (a,b,c,d,n,m). First: LayoutStride destinations, all four cases, 17x3.
+  test_1d_strided_assignment_impl<Kokkos::LayoutStride,Kokkos::LayoutLeft,Space>(true,true,true,true,17,3);
+  test_1d_strided_assignment_impl<Kokkos::LayoutStride,Kokkos::LayoutRight,Space>(true,true,true,true,17,3);
+  // LayoutLeft/LayoutRight destinations, 17x3: LayoutLeft sources enable only cases a,b; LayoutRight sources only cases c,d.
+  test_1d_strided_assignment_impl<Kokkos::LayoutLeft,Kokkos::LayoutLeft,Space>(true,true,false,false,17,3);
+  test_1d_strided_assignment_impl<Kokkos::LayoutRight,Kokkos::LayoutLeft,Space>(true,true,false,false,17,3);
+  test_1d_strided_assignment_impl<Kokkos::LayoutLeft,Kokkos::LayoutRight,Space>(false,false,true,true,17,3);
+  test_1d_strided_assignment_impl<Kokkos::LayoutRight,Kokkos::LayoutRight,Space>(false,false,true,true,17,3);
+  // LayoutLeft sources where one extent is 1 (17x1 and 1x17).
+  test_1d_strided_assignment_impl<Kokkos::LayoutLeft,Kokkos::LayoutLeft,Space>(true,true,false,false,17,1);
+  test_1d_strided_assignment_impl<Kokkos::LayoutLeft,Kokkos::LayoutLeft,Space>(true,true,true,true,1,17);
+  test_1d_strided_assignment_impl<Kokkos::LayoutRight,Kokkos::LayoutLeft,Space>(true,true,true,true,1,17);
+  test_1d_strided_assignment_impl<Kokkos::LayoutRight,Kokkos::LayoutLeft,Space>(true,true,false,false,17,1);
+  // LayoutRight sources where one extent is 1 (17x1 and 1x17).
+  test_1d_strided_assignment_impl<Kokkos::LayoutLeft,Kokkos::LayoutRight,Space>(true,true,true,true,17,1);
+  test_1d_strided_assignment_impl<Kokkos::LayoutLeft,Kokkos::LayoutRight,Space>(false,false,true,true,1,17);
+  test_1d_strided_assignment_impl<Kokkos::LayoutRight,Kokkos::LayoutRight,Space>(false,false,true,true,1,17);
+  test_1d_strided_assignment_impl<Kokkos::LayoutRight,Kokkos::LayoutRight,Space>(true,true,true,true,17,1);
+}
+
+template< class Space >
+void test_left_0()  // Subviews of a fully static rank-8 LayoutLeft view: asserts contiguity and element addresses against the parent.
+{
+  typedef Kokkos::View< int [2][3][4][5][2][3][4][5] , Kokkos::LayoutLeft , Space >
+    view_static_8_type ;
+
+  if(Kokkos::Impl::VerifyExecutionCanAccessMemorySpace<Kokkos::HostSpace,Space>::value) {
+
+  view_static_8_type  x_static_8("x_static_left_8");
+
+  ASSERT_TRUE( x_static_8.is_contiguous() );
+  // All eight arguments are integers: rank-0 subview.
+  Kokkos::View<int,Kokkos::LayoutLeft,Space> x0 = Kokkos::subview( x_static_8 , 0, 0, 0, 0, 0, 0, 0, 0 );
+
+  ASSERT_TRUE( x0.is_contiguous() );
+  ASSERT_TRUE( & x0() == & x_static_8(0,0,0,0,0,0,0,0) );
+  // Range on dimension 0 only: rank-1 LayoutLeft subview, asserted contiguous.
+  Kokkos::View<int*,Kokkos::LayoutLeft,Space> x1 =
+    Kokkos::subview( x_static_8, Kokkos::pair<int,int>(0,2), 1, 2, 3, 0, 1, 2, 3 );
+
+  ASSERT_TRUE( x1.is_contiguous() );
+  ASSERT_TRUE( & x1(0) == & x_static_8(0,1,2,3,0,1,2,3) );
+  ASSERT_TRUE( & x1(1) == & x_static_8(1,1,2,3,0,1,2,3) );
+  // Ranges on dimensions 0 and 4: rank-2 LayoutLeft subview, asserted non-contiguous.
+  Kokkos::View<int**,Kokkos::LayoutLeft,Space> x2 =
+    Kokkos::subview( x_static_8, Kokkos::pair<int,int>(0,2), 1, 2, 3
+                               , Kokkos::pair<int,int>(0,2), 1, 2, 3 );
+
+  ASSERT_TRUE( ! x2.is_contiguous() );
+  ASSERT_TRUE( & x2(0,0) == & x_static_8(0,1,2,3,0,1,2,3) );
+  ASSERT_TRUE( & x2(1,0) == & x_static_8(1,1,2,3,0,1,2,3) );
+  ASSERT_TRUE( & x2(0,1) == & x_static_8(0,1,2,3,1,1,2,3) );
+  ASSERT_TRUE( & x2(1,1) == & x_static_8(1,1,2,3,1,1,2,3) );
+  // Ranges on dimensions 1 and 4 (dimension 0 fixed): assigned to a LayoutStride view.
+  // Kokkos::View<int**,Kokkos::LayoutLeft,Space> error_2 =
+  Kokkos::View<int**,Kokkos::LayoutStride,Space> sx2 =
+    Kokkos::subview( x_static_8, 1, Kokkos::pair<int,int>(0,2), 2, 3
+                               , Kokkos::pair<int,int>(0,2), 1, 2, 3 );
+
+  ASSERT_TRUE( ! sx2.is_contiguous() );
+  ASSERT_TRUE( & sx2(0,0) == & x_static_8(1,0,2,3,0,1,2,3) );
+  ASSERT_TRUE( & sx2(1,0) == & x_static_8(1,1,2,3,0,1,2,3) );
+  ASSERT_TRUE( & sx2(0,1) == & x_static_8(1,0,2,3,1,1,2,3) );
+  ASSERT_TRUE( & sx2(1,1) == & x_static_8(1,1,2,3,1,1,2,3) );
+  // Ranges on dimensions 1, 3, 5 and 7: rank-4 LayoutStride subview; every element address is compared with the parent.
+  Kokkos::View<int****,Kokkos::LayoutStride,Space> sx4 =
+    Kokkos::subview( x_static_8, 0, Kokkos::pair<int,int>(0,2) /* of [3] */
+                               , 1, Kokkos::pair<int,int>(1,3) /* of [5] */
+                               , 1, Kokkos::pair<int,int>(0,2) /* of [3] */
+                               , 2, Kokkos::pair<int,int>(2,4) /* of [5] */
+                   );
+
+  ASSERT_TRUE( ! sx4.is_contiguous() );
+
+  for ( int i0 = 0 ; i0 < (int) sx4.dimension_0() ; ++i0 )
+  for ( int i1 = 0 ; i1 < (int) sx4.dimension_1() ; ++i1 )
+  for ( int i2 = 0 ; i2 < (int) sx4.dimension_2() ; ++i2 )
+  for ( int i3 = 0 ; i3 < (int) sx4.dimension_3() ; ++i3 ) {
+    ASSERT_TRUE( & sx4(i0,i1,i2,i3) == & x_static_8(0,0+i0, 1,1+i1, 1,0+i2, 2,2+i3) );
+  }
+
+  }
+}
+
+template< class Space >
+void test_left_1()  // Same checks as the all-static rank-8 case, but the first four extents (2,3,4,5) are supplied at run time.
+{
+  typedef Kokkos::View< int ****[2][3][4][5] , Kokkos::LayoutLeft , Space >
+    view_type ;
+
+  if(Kokkos::Impl::VerifyExecutionCanAccessMemorySpace<Kokkos::HostSpace,Space>::value) {
+
+  view_type  x8("x_left_8",2,3,4,5);
+
+  ASSERT_TRUE( x8.is_contiguous() );
+  // All eight arguments are integers: rank-0 subview.
+  Kokkos::View<int,Kokkos::LayoutLeft,Space> x0 = Kokkos::subview( x8 , 0, 0, 0, 0, 0, 0, 0, 0 );
+
+  ASSERT_TRUE( x0.is_contiguous() );
+  ASSERT_TRUE( & x0() == & x8(0,0,0,0,0,0,0,0) );
+  // Range on dimension 0 only: rank-1 LayoutLeft subview, asserted contiguous.
+  Kokkos::View<int*,Kokkos::LayoutLeft,Space> x1 =
+    Kokkos::subview( x8, Kokkos::pair<int,int>(0,2), 1, 2, 3, 0, 1, 2, 3 );
+
+  ASSERT_TRUE( x1.is_contiguous() );
+  ASSERT_TRUE( & x1(0) == & x8(0,1,2,3,0,1,2,3) );
+  ASSERT_TRUE( & x1(1) == & x8(1,1,2,3,0,1,2,3) );
+  // Ranges on dimensions 0 and 4: rank-2 LayoutLeft subview, asserted non-contiguous.
+  Kokkos::View<int**,Kokkos::LayoutLeft,Space> x2 =
+    Kokkos::subview( x8, Kokkos::pair<int,int>(0,2), 1, 2, 3
+                               , Kokkos::pair<int,int>(0,2), 1, 2, 3 );
+
+  ASSERT_TRUE( ! x2.is_contiguous() );
+  ASSERT_TRUE( & x2(0,0) == & x8(0,1,2,3,0,1,2,3) );
+  ASSERT_TRUE( & x2(1,0) == & x8(1,1,2,3,0,1,2,3) );
+  ASSERT_TRUE( & x2(0,1) == & x8(0,1,2,3,1,1,2,3) );
+  ASSERT_TRUE( & x2(1,1) == & x8(1,1,2,3,1,1,2,3) );
+  // Ranges on dimensions 1 and 4 (dimension 0 fixed): assigned to a LayoutStride view.
+  // Kokkos::View<int**,Kokkos::LayoutLeft,Space> error_2 =
+  Kokkos::View<int**,Kokkos::LayoutStride,Space> sx2 =
+    Kokkos::subview( x8, 1, Kokkos::pair<int,int>(0,2), 2, 3
+                               , Kokkos::pair<int,int>(0,2), 1, 2, 3 );
+
+  ASSERT_TRUE( ! sx2.is_contiguous() );
+  ASSERT_TRUE( & sx2(0,0) == & x8(1,0,2,3,0,1,2,3) );
+  ASSERT_TRUE( & sx2(1,0) == & x8(1,1,2,3,0,1,2,3) );
+  ASSERT_TRUE( & sx2(0,1) == & x8(1,0,2,3,1,1,2,3) );
+  ASSERT_TRUE( & sx2(1,1) == & x8(1,1,2,3,1,1,2,3) );
+  // Ranges on dimensions 1, 3, 5 and 7: rank-4 LayoutStride subview; every element address is compared with the parent.
+  Kokkos::View<int****,Kokkos::LayoutStride,Space> sx4 =
+    Kokkos::subview( x8, 0, Kokkos::pair<int,int>(0,2) /* of [3] */
+                       , 1, Kokkos::pair<int,int>(1,3) /* of [5] */
+                       , 1, Kokkos::pair<int,int>(0,2) /* of [3] */
+                       , 2, Kokkos::pair<int,int>(2,4) /* of [5] */
+                   );
+
+  ASSERT_TRUE( ! sx4.is_contiguous() );
+
+  for ( int i0 = 0 ; i0 < (int) sx4.dimension_0() ; ++i0 )
+  for ( int i1 = 0 ; i1 < (int) sx4.dimension_1() ; ++i1 )
+  for ( int i2 = 0 ; i2 < (int) sx4.dimension_2() ; ++i2 )
+  for ( int i3 = 0 ; i3 < (int) sx4.dimension_3() ; ++i3 ) {
+    ASSERT_TRUE( & sx4(i0,i1,i2,i3) == & x8(0,0+i0, 1,1+i1, 1,0+i2, 2,2+i3) );
+  }
+
+  }
+}
+
+template< class Space >
+void test_left_2()  // Subviews of a run-time sized rank-4 (2x3x4x5) LayoutLeft view: asserts contiguity and element addresses.
+{
+  typedef Kokkos::View< int **** , Kokkos::LayoutLeft , Space > view_type ;
+
+  if(Kokkos::Impl::VerifyExecutionCanAccessMemorySpace<Kokkos::HostSpace,Space>::value) {
+
+  view_type  x4("x4",2,3,4,5);
+
+  ASSERT_TRUE( x4.is_contiguous() );
+  // All four arguments are integers: rank-0 subview.
+  Kokkos::View<int,Kokkos::LayoutLeft,Space> x0 = Kokkos::subview( x4 , 0, 0, 0, 0 );
+
+  ASSERT_TRUE( x0.is_contiguous() );
+  ASSERT_TRUE( & x0() == & x4(0,0,0,0) );
+  // Range on dimension 0 only: rank-1 LayoutLeft subview, asserted contiguous.
+  Kokkos::View<int*,Kokkos::LayoutLeft,Space> x1 =
+    Kokkos::subview( x4, Kokkos::pair<int,int>(0,2), 1, 2, 3 );
+
+  ASSERT_TRUE( x1.is_contiguous() );
+  ASSERT_TRUE( & x1(0) == & x4(0,1,2,3) );
+  ASSERT_TRUE( & x1(1) == & x4(1,1,2,3) );
+  // Ranges on dimensions 0 and 2: rank-2 LayoutLeft subview, asserted non-contiguous.
+  Kokkos::View<int**,Kokkos::LayoutLeft,Space> x2 =
+    Kokkos::subview( x4, Kokkos::pair<int,int>(0,2), 1, Kokkos::pair<int,int>(1,3), 2 );
+
+  ASSERT_TRUE( ! x2.is_contiguous() );
+  ASSERT_TRUE( & x2(0,0) == & x4(0,1,1,2) );
+  ASSERT_TRUE( & x2(1,0) == & x4(1,1,1,2) );
+  ASSERT_TRUE( & x2(0,1) == & x4(0,1,2,2) );
+  ASSERT_TRUE( & x2(1,1) == & x4(1,1,2,2) );
+  // Ranges on dimensions 1 and 3 (dimension 0 fixed): assigned to a LayoutStride view.
+  // Kokkos::View<int**,Kokkos::LayoutLeft,Space> error_2 =
+  Kokkos::View<int**,Kokkos::LayoutStride,Space> sx2 =
+    Kokkos::subview( x4, 1, Kokkos::pair<int,int>(0,2)
+                       , 2, Kokkos::pair<int,int>(1,4) );
+
+  ASSERT_TRUE( ! sx2.is_contiguous() );
+  ASSERT_TRUE( & sx2(0,0) == & x4(1,0,2,1) );
+  ASSERT_TRUE( & sx2(1,0) == & x4(1,1,2,1) );
+  ASSERT_TRUE( & sx2(0,1) == & x4(1,0,2,2) );
+  ASSERT_TRUE( & sx2(1,1) == & x4(1,1,2,2) );
+  ASSERT_TRUE( & sx2(0,2) == & x4(1,0,2,3) );
+  ASSERT_TRUE( & sx2(1,2) == & x4(1,1,2,3) );
+  // Ranges on all four dimensions: rank-4 LayoutStride subview; every element address is compared with the parent.
+  Kokkos::View<int****,Kokkos::LayoutStride,Space> sx4 =
+    Kokkos::subview( x4, Kokkos::pair<int,int>(1,2) /* of [2] */
+                       , Kokkos::pair<int,int>(1,3) /* of [3] */
+                       , Kokkos::pair<int,int>(0,4) /* of [4] */
+                       , Kokkos::pair<int,int>(2,4) /* of [5] */
+                   );
+
+  ASSERT_TRUE( ! sx4.is_contiguous() );
+
+  for ( int i0 = 0 ; i0 < (int) sx4.dimension_0() ; ++i0 )
+  for ( int i1 = 0 ; i1 < (int) sx4.dimension_1() ; ++i1 )
+  for ( int i2 = 0 ; i2 < (int) sx4.dimension_2() ; ++i2 )
+  for ( int i3 = 0 ; i3 < (int) sx4.dimension_3() ; ++i3 ) {
+    ASSERT_TRUE( & sx4(i0,i1,i2,i3) == & x4( 1+i0, 1+i1, 0+i2, 2+i3 ) );
+  }
+
+  }
+}
+
+template< class Space >
+void test_left_3()  // Subviews of a 10x5 rank-2 LayoutLeft view, including zero-length ranges.
+{
+  typedef Kokkos::View< int ** , Kokkos::LayoutLeft , Space > view_type ;
+
+  if(Kokkos::Impl::VerifyExecutionCanAccessMemorySpace<Kokkos::HostSpace,Space>::value) {
+
+  view_type  xm("x4",10,5);
+
+  ASSERT_TRUE( xm.is_contiguous() );
+  // Single element (5,3): rank-0 subview.
+  Kokkos::View<int,Kokkos::LayoutLeft,Space> x0 = Kokkos::subview( xm , 5, 3 );
+
+  ASSERT_TRUE( x0.is_contiguous() );
+  ASSERT_TRUE( & x0() == & xm(5,3) );
+  // All of dimension 0 at second index 3: rank-1 subview, asserted contiguous.
+  Kokkos::View<int*,Kokkos::LayoutLeft,Space> x1 =
+    Kokkos::subview( xm, Kokkos::ALL(), 3 );
+
+  ASSERT_TRUE( x1.is_contiguous() );
+  for ( int i = 0 ; i < int(xm.dimension_0()) ; ++i ) {
+    ASSERT_TRUE( & x1(i) == & xm(i,3) );
+  }
+  // Range [1,9) of dimension 0, all of dimension 1: asserted non-contiguous.
+  Kokkos::View<int**,Kokkos::LayoutLeft,Space> x2 =
+    Kokkos::subview( xm, Kokkos::pair<int,int>(1,9), Kokkos::ALL() );
+
+  ASSERT_TRUE( ! x2.is_contiguous() );
+  for ( int j = 0 ; j < int(x2.dimension_1()) ; ++j )
+  for ( int i = 0 ; i < int(x2.dimension_0()) ; ++i ) {
+    ASSERT_TRUE( & x2(i,j) == & xm(1+i,j) );
+  }
+  // All of dimension 0, range [2,4) of dimension 1: asserted contiguous.
+  Kokkos::View<int**,Kokkos::LayoutLeft,Space> x2c =
+    Kokkos::subview( xm, Kokkos::ALL(), std::pair<int,int>(2,4) );
+
+  ASSERT_TRUE( x2c.is_contiguous() );
+  for ( int j = 0 ; j < int(x2c.dimension_1()) ; ++j )
+  for ( int i = 0 ; i < int(x2c.dimension_0()) ; ++i ) {
+    ASSERT_TRUE( & x2c(i,j) == & xm(i,2+j) );
+  }
+  // Empty range [1,1) on dimension 0: extent 0 there, full extent on dimension 1.
+  Kokkos::View<int**,Kokkos::LayoutLeft,Space> x2_n1 =
+    Kokkos::subview( xm , std::pair<int,int>(1,1) , Kokkos::ALL() );
+
+  ASSERT_TRUE( x2_n1.dimension_0() == 0 );
+  ASSERT_TRUE( x2_n1.dimension_1() == xm.dimension_1() );
+  // Empty range [1,1) on dimension 1: full extent on dimension 0, extent 0 on dimension 1.
+  Kokkos::View<int**,Kokkos::LayoutLeft,Space> x2_n2 =
+    Kokkos::subview( xm , Kokkos::ALL() , std::pair<int,int>(1,1) );
+
+  ASSERT_TRUE( x2_n2.dimension_0() == xm.dimension_0() );
+  ASSERT_TRUE( x2_n2.dimension_1() == 0 );
+
+  }
+}
+
+//----------------------------------------------------------------------------
+
+template< class Space >
+void test_right_0()  // Subviews of a fully static rank-8 LayoutRight view: asserts extents and element addresses against the parent.
+{
+  typedef Kokkos::View< int [2][3][4][5][2][3][4][5] , Kokkos::LayoutRight , Space >
+    view_static_8_type ;
+
+  if(Kokkos::Impl::VerifyExecutionCanAccessMemorySpace<Kokkos::HostSpace,Space>::value) {
+
+  view_static_8_type  x_static_8("x_static_right_8");
+  // All eight arguments are integers: rank-0 subview.
+  Kokkos::View<int,Kokkos::LayoutRight,Space> x0 = Kokkos::subview( x_static_8 , 0, 0, 0, 0, 0, 0, 0, 0 );
+
+  ASSERT_TRUE( & x0() == & x_static_8(0,0,0,0,0,0,0,0) );
+  // Range [1,3) on the last dimension only: rank-1 LayoutRight subview.
+  Kokkos::View<int*,Kokkos::LayoutRight,Space> x1 =
+    Kokkos::subview( x_static_8, 0, 1, 2, 3, 0, 1, 2, Kokkos::pair<int,int>(1,3) );
+
+  ASSERT_TRUE( x1.dimension_0() == 2 );
+  ASSERT_TRUE( & x1(0) == & x_static_8(0,1,2,3,0,1,2,1) );
+  ASSERT_TRUE( & x1(1) == & x_static_8(0,1,2,3,0,1,2,2) );
+  // Ranges on dimensions 3 and 7: rank-2 LayoutRight subview.
+  Kokkos::View<int**,Kokkos::LayoutRight,Space> x2 =
+    Kokkos::subview( x_static_8, 0, 1, 2, Kokkos::pair<int,int>(1,3)
+                               , 0, 1, 2, Kokkos::pair<int,int>(1,3) );
+
+  ASSERT_TRUE( x2.dimension_0() == 2 );
+  ASSERT_TRUE( x2.dimension_1() == 2 );
+  ASSERT_TRUE( & x2(0,0) == & x_static_8(0,1,2,1,0,1,2,1) );
+  ASSERT_TRUE( & x2(1,0) == & x_static_8(0,1,2,2,0,1,2,1) );
+  ASSERT_TRUE( & x2(0,1) == & x_static_8(0,1,2,1,0,1,2,2) );
+  ASSERT_TRUE( & x2(1,1) == & x_static_8(0,1,2,2,0,1,2,2) );
+  // Ranges on dimensions 1 and 4: assigned to a LayoutStride view.
+  // Kokkos::View<int**,Kokkos::LayoutRight,Space> error_2 =
+  Kokkos::View<int**,Kokkos::LayoutStride,Space> sx2 =
+    Kokkos::subview( x_static_8, 1, Kokkos::pair<int,int>(0,2), 2, 3
+                               , Kokkos::pair<int,int>(0,2), 1, 2, 3 );
+
+  ASSERT_TRUE( sx2.dimension_0() == 2 );
+  ASSERT_TRUE( sx2.dimension_1() == 2 );
+  ASSERT_TRUE( & sx2(0,0) == & x_static_8(1,0,2,3,0,1,2,3) );
+  ASSERT_TRUE( & sx2(1,0) == & x_static_8(1,1,2,3,0,1,2,3) );
+  ASSERT_TRUE( & sx2(0,1) == & x_static_8(1,0,2,3,1,1,2,3) );
+  ASSERT_TRUE( & sx2(1,1) == & x_static_8(1,1,2,3,1,1,2,3) );
+  // Ranges on dimensions 1, 3, 5 and 7: rank-4 LayoutStride subview; extents and every element address are checked.
+  Kokkos::View<int****,Kokkos::LayoutStride,Space> sx4 =
+    Kokkos::subview( x_static_8, 0, Kokkos::pair<int,int>(0,2) /* of [3] */
+                               , 1, Kokkos::pair<int,int>(1,3) /* of [5] */
+                               , 1, Kokkos::pair<int,int>(0,2) /* of [3] */
+                               , 2, Kokkos::pair<int,int>(2,4) /* of [5] */
+                   );
+
+  ASSERT_TRUE( sx4.dimension_0() == 2 );
+  ASSERT_TRUE( sx4.dimension_1() == 2 );
+  ASSERT_TRUE( sx4.dimension_2() == 2 );
+  ASSERT_TRUE( sx4.dimension_3() == 2 );
+  for ( int i0 = 0 ; i0 < (int) sx4.dimension_0() ; ++i0 )
+  for ( int i1 = 0 ; i1 < (int) sx4.dimension_1() ; ++i1 )
+  for ( int i2 = 0 ; i2 < (int) sx4.dimension_2() ; ++i2 )
+  for ( int i3 = 0 ; i3 < (int) sx4.dimension_3() ; ++i3 ) {
+    ASSERT_TRUE( & sx4(i0,i1,i2,i3) == & x_static_8(0, 0+i0, 1, 1+i1, 1, 0+i2, 2, 2+i3) );
+  }
+
+  }
+}
+
+template< class Space >
+void test_right_1()  // Rank-8 LayoutRight view whose first four extents (2,3,4,5) are supplied at run time: checks subview element addresses.
+{
+  typedef Kokkos::View< int ****[2][3][4][5] , Kokkos::LayoutRight , Space >
+    view_type ;
+
+  if(Kokkos::Impl::VerifyExecutionCanAccessMemorySpace<Kokkos::HostSpace,Space>::value) {
+
+  view_type  x8("x_right_8",2,3,4,5);
+  // All eight arguments are integers: rank-0 subview.
+  Kokkos::View<int,Kokkos::LayoutRight,Space> x0 = Kokkos::subview( x8 , 0, 0, 0, 0, 0, 0, 0, 0 );
+
+  ASSERT_TRUE( & x0() == & x8(0,0,0,0,0,0,0,0) );
+  // Range [1,3) on the last dimension only: rank-1 LayoutRight subview.
+  Kokkos::View<int*,Kokkos::LayoutRight,Space> x1 =
+    Kokkos::subview( x8, 0, 1, 2, 3, 0, 1, 2, Kokkos::pair<int,int>(1,3) );
+
+  ASSERT_TRUE( & x1(0) == & x8(0,1,2,3,0,1,2,1) );
+  ASSERT_TRUE( & x1(1) == & x8(0,1,2,3,0,1,2,2) );
+  // Ranges on dimensions 3 and 7: rank-2 LayoutRight subview.
+  Kokkos::View<int**,Kokkos::LayoutRight,Space> x2 =
+    Kokkos::subview( x8, 0, 1, 2, Kokkos::pair<int,int>(1,3)
+                               , 0, 1, 2, Kokkos::pair<int,int>(1,3) );
+
+  ASSERT_TRUE( & x2(0,0) == & x8(0,1,2,1,0,1,2,1) );
+  ASSERT_TRUE( & x2(1,0) == & x8(0,1,2,2,0,1,2,1) );
+  ASSERT_TRUE( & x2(0,1) == & x8(0,1,2,1,0,1,2,2) );
+  ASSERT_TRUE( & x2(1,1) == & x8(0,1,2,2,0,1,2,2) );
+  // Ranges on dimensions 1 and 4: assigned to a LayoutStride view.
+  // Kokkos::View<int**,Kokkos::LayoutRight,Space> error_2 =
+  Kokkos::View<int**,Kokkos::LayoutStride,Space> sx2 =
+    Kokkos::subview( x8, 1, Kokkos::pair<int,int>(0,2), 2, 3
+                               , Kokkos::pair<int,int>(0,2), 1, 2, 3 );
+
+  ASSERT_TRUE( & sx2(0,0) == & x8(1,0,2,3,0,1,2,3) );
+  ASSERT_TRUE( & sx2(1,0) == & x8(1,1,2,3,0,1,2,3) );
+  ASSERT_TRUE( & sx2(0,1) == & x8(1,0,2,3,1,1,2,3) );
+  ASSERT_TRUE( & sx2(1,1) == & x8(1,1,2,3,1,1,2,3) );
+  // Ranges on dimensions 1, 3, 5 and 7: rank-4 LayoutStride subview; every element address is compared with the parent.
+  Kokkos::View<int****,Kokkos::LayoutStride,Space> sx4 =
+    Kokkos::subview( x8, 0, Kokkos::pair<int,int>(0,2) /* of [3] */
+                       , 1, Kokkos::pair<int,int>(1,3) /* of [5] */
+                       , 1, Kokkos::pair<int,int>(0,2) /* of [3] */
+                       , 2, Kokkos::pair<int,int>(2,4) /* of [5] */
+                   );
+
+  for ( int i0 = 0 ; i0 < (int) sx4.dimension_0() ; ++i0 )
+  for ( int i1 = 0 ; i1 < (int) sx4.dimension_1() ; ++i1 )
+  for ( int i2 = 0 ; i2 < (int) sx4.dimension_2() ; ++i2 )
+  for ( int i3 = 0 ; i3 < (int) sx4.dimension_3() ; ++i3 ) {
+    ASSERT_TRUE( & sx4(i0,i1,i2,i3) == & x8(0,0+i0, 1,1+i1, 1,0+i2, 2,2+i3) );
+  }
+
+  }
+}
+
+template< class Space >
+void test_right_3()  // Subviews of a 10x5 rank-2 LayoutRight view, including zero-length ranges.
+{
+  typedef Kokkos::View< int ** , Kokkos::LayoutRight , Space > view_type ;
+
+  if(Kokkos::Impl::VerifyExecutionCanAccessMemorySpace<Kokkos::HostSpace,Space>::value) {
+
+  view_type  xm("x4",10,5);
+
+  ASSERT_TRUE( xm.is_contiguous() );
+  // Single element (5,3): rank-0 subview.
+  Kokkos::View<int,Kokkos::LayoutRight,Space> x0 = Kokkos::subview( xm , 5, 3 );
+
+  ASSERT_TRUE( x0.is_contiguous() );
+  ASSERT_TRUE( & x0() == & xm(5,3) );
+  // First index 3, all of dimension 1: rank-1 subview, asserted contiguous.
+  Kokkos::View<int*,Kokkos::LayoutRight,Space> x1 =
+    Kokkos::subview( xm, 3, Kokkos::ALL() );
+
+  ASSERT_TRUE( x1.is_contiguous() );
+  for ( int i = 0 ; i < int(xm.dimension_1()) ; ++i ) {
+    ASSERT_TRUE( & x1(i) == & xm(3,i) );
+  }
+  // Range [1,9) of dimension 0, all of dimension 1: asserted contiguous.
+  Kokkos::View<int**,Kokkos::LayoutRight,Space> x2c =
+    Kokkos::subview( xm, Kokkos::pair<int,int>(1,9), Kokkos::ALL() );
+
+  ASSERT_TRUE( x2c.is_contiguous() );
+  for ( int j = 0 ; j < int(x2c.dimension_1()) ; ++j )
+  for ( int i = 0 ; i < int(x2c.dimension_0()) ; ++i ) {
+    ASSERT_TRUE( & x2c(i,j) == & xm(1+i,j) );
+  }
+  // All of dimension 0, range [2,4) of dimension 1: asserted non-contiguous.
+  Kokkos::View<int**,Kokkos::LayoutRight,Space> x2 =
+    Kokkos::subview( xm, Kokkos::ALL(), std::pair<int,int>(2,4) );
+
+  ASSERT_TRUE( ! x2.is_contiguous() );
+  for ( int j = 0 ; j < int(x2.dimension_1()) ; ++j )
+  for ( int i = 0 ; i < int(x2.dimension_0()) ; ++i ) {
+    ASSERT_TRUE( & x2(i,j) == & xm(i,2+j) );
+  }
+  // Empty range [1,1) on dimension 0: extent 0 there, full extent on dimension 1.
+  Kokkos::View<int**,Kokkos::LayoutRight,Space> x2_n1 =
+    Kokkos::subview( xm , std::pair<int,int>(1,1) , Kokkos::ALL() );
+
+  ASSERT_TRUE( x2_n1.dimension_0() == 0 );
+  ASSERT_TRUE( x2_n1.dimension_1() == xm.dimension_1() );
+  // Empty range [1,1) on dimension 1: full extent on dimension 0, extent 0 on dimension 1.
+  Kokkos::View<int**,Kokkos::LayoutRight,Space> x2_n2 =
+    Kokkos::subview( xm , Kokkos::ALL() , std::pair<int,int>(1,1) );
+
+  ASSERT_TRUE( x2_n2.dimension_0() == xm.dimension_0() );
+  ASSERT_TRUE( x2_n2.dimension_1() == 0 );
+
+  }
+}
+
+namespace Impl {
+
+constexpr int N0=113;  // extent of dimension 0 in the views built by the helpers in this namespace
+constexpr int N1=11;   // extent of dimension 1 in the rank-3 test views
+constexpr int N2=17;   // extent of dimension 2 in the rank-3 test views
+constexpr int N3=5;    // presumably the extent of dimension 3 for higher-rank tests -- not used in the nearby helpers, TODO confirm
+constexpr int N4=7;    // presumably the extent of dimension 4 for higher-rank tests -- not used in the nearby helpers, TODO confirm
+
+// Counts the indices i in [0, range.second-range.first) for which a(i) differs from b(i+range.first);
+// prints the count to std::cout when it is non-zero and then asserts that it is zero.
+template<class SubView,class View>
+void test_Check1D(SubView a, View b, std::pair<int,int> range) {
+  int errors = 0;
+  for(int i=0;i<range.second-range.first;i++) {
+    if(a(i)!=b(i+range.first)) errors++;
+  }
+  if(errors>0) std::cout << "Error Subviews test_Check1D: " << errors <<std::endl;
+  ASSERT_TRUE( errors == 0 );
+}
+
+// Compares the rank-1 view a against row i0 of the rank-2 view b: a(i1) must equal b(i0,i1+range.first) for every
+// i1 in [0, range.second-range.first). Prints the mismatch count when non-zero and asserts that it is zero.
+template<class SubView,class View>
+void test_Check1D2D(SubView a, View b, int i0, std::pair<int,int> range) {
+  int errors = 0;
+  for(int i1=0;i1<range.second-range.first;i1++) {
+    if(a(i1)!=b(i0,i1+range.first)) errors++;
+  }
+  if(errors>0) std::cout << "Error Subviews test_Check1D2D: " << errors <<std::endl;
+  ASSERT_TRUE( errors == 0 );
+}
+
+// Compares the rank-2 view a against the slice of the rank-3 view b at first index i0, offset by range1.first and
+// range2.first in the remaining dimensions. Prints the mismatch count when non-zero and asserts that it is zero.
+template<class SubView,class View>
+void test_Check2D3D(SubView a, View b, int i0, std::pair<int,int> range1, std::pair<int,int> range2) {
+  int errors = 0;
+  for(int i1=0;i1<range1.second-range1.first;i1++) {
+    for(int i2=0;i2<range2.second-range2.first;i2++) {
+      if(a(i1,i2)!=b(i0,i1+range1.first,i2+range2.first)) errors++;
+    }
+  }
+  if(errors>0) std::cout << "Error Subviews test_Check2D3D: " << errors <<std::endl;
+  ASSERT_TRUE( errors == 0 );
+}
+
+// Compares the rank-3 view a against the slice of the rank-5 view b at leading indices (i0,i1), offset by the .first
+// of range2/range3/range4 in the remaining dimensions. Prints the mismatch count when non-zero and asserts that it is zero.
+template<class SubView,class View>
+void test_Check3D5D(SubView a, View b, int i0, int i1, std::pair<int,int> range2, std::pair<int,int> range3, std::pair<int,int> range4) {
+  int errors = 0;
+  for(int i2=0;i2<range2.second-range2.first;i2++) {
+    for(int i3=0;i3<range3.second-range3.first;i3++) {
+      for(int i4=0;i4<range4.second-range4.first;i4++) {
+        if(a(i2,i3,i4)!=b(i0,i1,i2+range2.first,i3+range3.first,i4+range4.first)) errors++;
+      }
+    }
+  }
+  if(errors>0) std::cout << "Error Subviews test_Check3D5D: " << errors <<std::endl;
+  ASSERT_TRUE( errors == 0 );
+}
+
+template<class Space, class LayoutSub, class Layout, class LayoutOrg>
+void test_1d_assign_impl() {  // Constructs/assigns rank-1 views between run-time (int*) and static (int[N0]) extents and checks the contents with test_Check1D.
+  // Run-time-extent source 'a' (LayoutOrg) -> static-extent a1 (Layout) -> static-extent a2 (LayoutSub); the original author tagged this block "Breaks".
+  { //Breaks
+    Kokkos::View<int*,LayoutOrg,Space> a("A",N0);
+    Kokkos::fence();
+    for(int i=0; i<N0; i++)
+      a(i) = i;
+    // Construct a1 from a and compare all N0 entries.
+    Kokkos::View<int[N0],Layout,Space> a1(a);
+    Kokkos::fence();
+    test_Check1D(a1,a,std::pair<int,int>(0,N0));
+    // Construct a2 from a1 and compare; then re-assign a1 = a and compare again.
+    Kokkos::View<int[N0],LayoutSub,Space> a2(a1);
+    Kokkos::fence();
+    test_Check1D(a2,a,std::pair<int,int>(0,N0));
+    a1 = a;
+    test_Check1D(a1,a,std::pair<int,int>(0,N0));
+
+    //Runtime Fail expected
+    //Kokkos::View<int[N1]> afail1(a);
+
+    //Compile Time Fail expected
+    //Kokkos::View<int[N1]> afail2(a1);
+  }
+  // Static-extent source 'a' -> run-time-extent a1, first by construction and then by assignment ('a' is not filled here).
+  { // Works
+    Kokkos::View<int[N0],LayoutOrg,Space> a("A");
+    Kokkos::View<int*,Layout,Space> a1(a);
+    Kokkos::fence();
+    test_Check1D(a1,a,std::pair<int,int>(0,N0));
+    a1 = a;
+    Kokkos::fence();
+    test_Check1D(a1,a,std::pair<int,int>(0,N0));
+  }
+}
+
+// Takes the rank-2 slice at leading index 3 of a rank-3 view and verifies it
+// against the parent, once through Kokkos::subview plus assignment and once
+// through the subview constructor of View.
+//   Type      - data type of the parent view (mix of static/dynamic extents)
+//   TypeSub   - data type of the subview
+//   LayoutOrg - layout of the allocation, Layout - layout of the parent view,
+//   LayoutSub - layout of the subview
+template<class Space, class Type, class TypeSub,class LayoutSub, class Layout, class LayoutOrg>
+void test_2d_subview_3d_impl_type() {
+  typedef Kokkos::View<int***,LayoutOrg,Space>  origin_view_type ;
+  typedef Kokkos::View<Type,Layout,Space>       parent_view_type ;
+  typedef Kokkos::View<TypeSub,LayoutSub,Space> sub_view_type ;
+  typedef std::pair<int,int>                    index_range ;
+
+  origin_view_type origin("A",N0,N1,N2);
+  parent_view_type parent(origin);
+
+  // Encode the index triple in the value so that every entry is distinct.
+  for(int i=0; i<N0; i++) {
+    for(int j=0; j<N1; j++) {
+      for(int k=0; k<N2; k++) {
+        parent(i,j,k) = i*1000000+j*1000+k;
+      }
+    }
+  }
+
+  // Variant 1: default-construct the subview, then assign the slice to it.
+  sub_view_type assigned;
+  assigned = Kokkos::subview(parent,3,Kokkos::ALL(),Kokkos::ALL());
+  Kokkos::fence();
+  test_Check2D3D(assigned,parent,3,index_range(0,N1),index_range(0,N2));
+
+  // Variant 2: build the subview directly with the slicing constructor.
+  sub_view_type constructed(parent,3,Kokkos::ALL(),Kokkos::ALL());
+  Kokkos::fence();
+  test_Check2D3D(constructed,parent,3,index_range(0,N1),index_range(0,N2));
+}
+
+// Runs test_2d_subview_3d_impl_type for one layout triple over every
+// combination of parent data type (zero to three runtime extents, the rest
+// fixed to N0/N1/N2) and subview data type (zero to two runtime extents, the
+// rest fixed to N1/N2).
+template<class Space, class LayoutSub, class Layout, class LayoutOrg>
+void test_2d_subview_3d_impl_layout() {
+  test_2d_subview_3d_impl_type<Space,int[N0][N1][N2],int[N1][N2],LayoutSub, Layout, LayoutOrg>();
+  test_2d_subview_3d_impl_type<Space,int[N0][N1][N2],int*   [N2],LayoutSub, Layout, LayoutOrg>();
+  test_2d_subview_3d_impl_type<Space,int[N0][N1][N2],int**      ,LayoutSub, Layout, LayoutOrg>();
+
+  test_2d_subview_3d_impl_type<Space,int*   [N1][N2],int[N1][N2],LayoutSub, Layout, LayoutOrg>();
+  test_2d_subview_3d_impl_type<Space,int*   [N1][N2],int*   [N2],LayoutSub, Layout, LayoutOrg>();
+  test_2d_subview_3d_impl_type<Space,int*   [N1][N2],int**      ,LayoutSub, Layout, LayoutOrg>();
+
+  test_2d_subview_3d_impl_type<Space,int**      [N2],int[N1][N2],LayoutSub, Layout, LayoutOrg>();
+  test_2d_subview_3d_impl_type<Space,int**      [N2],int*   [N2],LayoutSub, Layout, LayoutOrg>();
+  test_2d_subview_3d_impl_type<Space,int**      [N2],int**      ,LayoutSub, Layout, LayoutOrg>();
+
+  test_2d_subview_3d_impl_type<Space,int***         ,int[N1][N2],LayoutSub, Layout, LayoutOrg>();
+  test_2d_subview_3d_impl_type<Space,int***         ,int*   [N2],LayoutSub, Layout, LayoutOrg>();
+  test_2d_subview_3d_impl_type<Space,int***         ,int**      ,LayoutSub, Layout, LayoutOrg>();
+}
+
+// Takes the rank-3 block at leading indices (3,5) of a rank-5 view and
+// verifies it against the parent, once through Kokkos::subview plus
+// assignment and once through the subview constructor of View.
+//   Type      - data type of the parent view (mix of static/dynamic extents)
+//   TypeSub   - data type of the subview
+//   LayoutOrg - layout of the allocation, Layout - layout of the parent view,
+//   LayoutSub - layout of the subview
+template<class Space, class Type, class TypeSub,class LayoutSub, class Layout, class LayoutOrg>
+void test_2d_subview_5d_impl_type() {
+  typedef Kokkos::View<int*****,LayoutOrg,Space> origin_view_type ;
+  typedef Kokkos::View<Type,Layout,Space>        parent_view_type ;
+  typedef Kokkos::View<TypeSub,LayoutSub,Space>  sub_view_type ;
+  typedef std::pair<int,int>                     index_range ;
+
+  origin_view_type origin("A",N0,N1,N2,N3,N4);
+  parent_view_type parent(origin);
+
+  // Fill the parent with a value derived from all five indices.
+  for(int i=0; i<N0; i++) {
+    for(int j=0; j<N1; j++) {
+      for(int k=0; k<N2; k++) {
+        for(int l=0; l<N3; l++) {
+          for(int m=0; m<N4; m++) {
+            parent(i,j,k,l,m) = i*1000000+j*10000+k*100+l*10+m;
+          }
+        }
+      }
+    }
+  }
+
+  // Variant 1: default-construct the subview, then assign the block to it.
+  sub_view_type assigned;
+  assigned = Kokkos::subview(parent,3,5,Kokkos::ALL(),Kokkos::ALL(),Kokkos::ALL());
+  Kokkos::fence();
+  test_Check3D5D(assigned,parent,3,5,index_range(0,N2),index_range(0,N3),index_range(0,N4));
+
+  // Variant 2: build the subview directly with the slicing constructor.
+  sub_view_type constructed(parent,3,5,Kokkos::ALL(),Kokkos::ALL(),Kokkos::ALL());
+  Kokkos::fence();
+  test_Check3D5D(constructed,parent,3,5,index_range(0,N2),index_range(0,N3),index_range(0,N4));
+}
+
+// Runs test_2d_subview_5d_impl_type for one layout triple over every
+// combination of parent data type (zero to five runtime extents, the rest
+// fixed to N0..N4) and subview data type (zero to three runtime extents, the
+// rest fixed to N2..N4).
+template<class Space, class LayoutSub, class Layout, class LayoutOrg>
+void test_2d_subview_5d_impl_layout() {
+  test_2d_subview_5d_impl_type<Space, int[N0][N1][N2][N3][N4],int[N2][N3][N4],LayoutSub, Layout, LayoutOrg>();
+  test_2d_subview_5d_impl_type<Space, int[N0][N1][N2][N3][N4],int*   [N3][N4],LayoutSub, Layout, LayoutOrg>();
+  test_2d_subview_5d_impl_type<Space, int[N0][N1][N2][N3][N4],int**      [N4],LayoutSub, Layout, LayoutOrg>();
+  test_2d_subview_5d_impl_type<Space, int[N0][N1][N2][N3][N4],int***         ,LayoutSub, Layout, LayoutOrg>();
+
+  test_2d_subview_5d_impl_type<Space, int*   [N1][N2][N3][N4],int[N2][N3][N4],LayoutSub, Layout, LayoutOrg>();
+  test_2d_subview_5d_impl_type<Space, int*   [N1][N2][N3][N4],int*   [N3][N4],LayoutSub, Layout, LayoutOrg>();
+  test_2d_subview_5d_impl_type<Space, int*   [N1][N2][N3][N4],int**      [N4],LayoutSub, Layout, LayoutOrg>();
+  test_2d_subview_5d_impl_type<Space, int*   [N1][N2][N3][N4],int***         ,LayoutSub, Layout, LayoutOrg>();
+
+  test_2d_subview_5d_impl_type<Space, int**      [N2][N3][N4],int[N2][N3][N4],LayoutSub, Layout, LayoutOrg>();
+  test_2d_subview_5d_impl_type<Space, int**      [N2][N3][N4],int*   [N3][N4],LayoutSub, Layout, LayoutOrg>();
+  test_2d_subview_5d_impl_type<Space, int**      [N2][N3][N4],int**      [N4],LayoutSub, Layout, LayoutOrg>();
+  test_2d_subview_5d_impl_type<Space, int**      [N2][N3][N4],int***         ,LayoutSub, Layout, LayoutOrg>();
+
+  test_2d_subview_5d_impl_type<Space, int***         [N3][N4],int[N2][N3][N4],LayoutSub, Layout, LayoutOrg>();
+  test_2d_subview_5d_impl_type<Space, int***         [N3][N4],int*   [N3][N4],LayoutSub, Layout, LayoutOrg>();
+  test_2d_subview_5d_impl_type<Space, int***         [N3][N4],int**      [N4],LayoutSub, Layout, LayoutOrg>();
+  test_2d_subview_5d_impl_type<Space, int***         [N3][N4],int***         ,LayoutSub, Layout, LayoutOrg>();
+
+  test_2d_subview_5d_impl_type<Space, int****            [N4],int[N2][N3][N4],LayoutSub, Layout, LayoutOrg>();
+  test_2d_subview_5d_impl_type<Space, int****            [N4],int*   [N3][N4],LayoutSub, Layout, LayoutOrg>();
+  test_2d_subview_5d_impl_type<Space, int****            [N4],int**      [N4],LayoutSub, Layout, LayoutOrg>();
+  test_2d_subview_5d_impl_type<Space, int****            [N4],int***         ,LayoutSub, Layout, LayoutOrg>();
+
+  test_2d_subview_5d_impl_type<Space, int*****               ,int[N2][N3][N4],LayoutSub, Layout, LayoutOrg>();
+  test_2d_subview_5d_impl_type<Space, int*****               ,int*   [N3][N4],LayoutSub, Layout, LayoutOrg>();
+  test_2d_subview_5d_impl_type<Space, int*****               ,int**      [N4],LayoutSub, Layout, LayoutOrg>();
+  test_2d_subview_5d_impl_type<Space, int*****               ,int***         ,LayoutSub, Layout, LayoutOrg>();
+}
+}
+
+// Public entry point of the rank-1 assignment test for one Space.
+// Each line runs Impl::test_1d_assign_impl with the template arguments
+// <Space, LayoutSub, Layout, LayoutOrg>.
+// NOTE(review): the commented-out lines are all combinations that convert
+// between LayoutLeft and LayoutRight or from LayoutStride to a fixed layout;
+// presumably those assignments are not supported - confirm before enabling.
+template< class Space >
+void test_1d_assign() {
+  Impl::test_1d_assign_impl<Space,Kokkos::LayoutLeft  ,Kokkos::LayoutLeft  ,Kokkos::LayoutLeft  >();
+  //Impl::test_1d_assign_impl<Space,Kokkos::LayoutRight ,Kokkos::LayoutLeft  ,Kokkos::LayoutLeft  >();
+  Impl::test_1d_assign_impl<Space,Kokkos::LayoutStride,Kokkos::LayoutLeft  ,Kokkos::LayoutLeft  >();
+  //Impl::test_1d_assign_impl<Space,Kokkos::LayoutLeft  ,Kokkos::LayoutRight ,Kokkos::LayoutLeft  >();
+  Impl::test_1d_assign_impl<Space,Kokkos::LayoutRight ,Kokkos::LayoutRight ,Kokkos::LayoutRight  >();
+  Impl::test_1d_assign_impl<Space,Kokkos::LayoutStride,Kokkos::LayoutRight ,Kokkos::LayoutRight  >();
+  //Impl::test_1d_assign_impl<Space,Kokkos::LayoutLeft  ,Kokkos::LayoutStride,Kokkos::LayoutLeft  >();
+  //Impl::test_1d_assign_impl<Space,Kokkos::LayoutRight ,Kokkos::LayoutStride,Kokkos::LayoutLeft  >();
+  Impl::test_1d_assign_impl<Space,Kokkos::LayoutStride,Kokkos::LayoutStride,Kokkos::LayoutLeft  >();
+}
+
+// Public entry point of the "rank-2 subview of a rank-3 view" test for one
+// Space.  Each line runs the full data-type matrix of
+// Impl::test_2d_subview_3d_impl_layout with the template arguments
+// <Space, LayoutSub, Layout, LayoutOrg>.
+template<class Space >
+void test_2d_subview_3d() {
+  Impl::test_2d_subview_3d_impl_layout<Space,Kokkos::LayoutRight ,Kokkos::LayoutRight, Kokkos::LayoutRight>();
+  Impl::test_2d_subview_3d_impl_layout<Space,Kokkos::LayoutStride,Kokkos::LayoutRight, Kokkos::LayoutRight>();
+  Impl::test_2d_subview_3d_impl_layout<Space,Kokkos::LayoutStride,Kokkos::LayoutStride,Kokkos::LayoutRight>();
+  Impl::test_2d_subview_3d_impl_layout<Space,Kokkos::LayoutStride,Kokkos::LayoutLeft,  Kokkos::LayoutLeft>();
+  Impl::test_2d_subview_3d_impl_layout<Space,Kokkos::LayoutStride,Kokkos::LayoutStride,Kokkos::LayoutLeft>();
+}
+
+// Public entry point of the subview-of-a-rank-5-view test for one Space.
+// Despite the "2d" in the name, the subviews exercised by
+// Impl::test_2d_subview_5d_impl_layout are rank 3 (two leading indices are
+// fixed).  Each line passes the template arguments
+// <Space, LayoutSub, Layout, LayoutOrg>; the subview layout is always
+// LayoutStride here.
+template<class Space >
+void test_2d_subview_5d() {
+  Impl::test_2d_subview_5d_impl_layout<Space,Kokkos::LayoutStride,Kokkos::LayoutRight, Kokkos::LayoutRight>();
+  Impl::test_2d_subview_5d_impl_layout<Space,Kokkos::LayoutStride,Kokkos::LayoutStride,Kokkos::LayoutRight>();
+  Impl::test_2d_subview_5d_impl_layout<Space,Kokkos::LayoutStride,Kokkos::LayoutLeft,  Kokkos::LayoutLeft>();
+  Impl::test_2d_subview_5d_impl_layout<Space,Kokkos::LayoutStride,Kokkos::LayoutStride,Kokkos::LayoutLeft>();
+}
+
+}
+//----------------------------------------------------------------------------
+
diff --git a/lib/kokkos/core/unit_test/UnitTestMain.cpp b/lib/kokkos/core/unit_test/UnitTestMain.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..f952ab3db51028aff0a0ebfe313b2639e353ab87
--- /dev/null
+++ b/lib/kokkos/core/unit_test/UnitTestMain.cpp
@@ -0,0 +1,50 @@
+/*
+//@HEADER
+// ************************************************************************
+// 
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+// 
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+// 
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact  H. Carter Edwards (hcedwar@sandia.gov)
+// 
+// ************************************************************************
+//@HEADER
+*/
+
+#include <gtest/gtest.h>
+
+// Entry point of the unit-test executable: passes the command line to
+// googletest, runs every registered test and returns the value reported by
+// RUN_ALL_TESTS() as the process exit status.
+int main(int argc, char *argv[])
+{
+  ::testing::InitGoogleTest( &argc , argv );
+
+  const int test_status = RUN_ALL_TESTS();
+  return test_status;
+}
+
diff --git a/lib/kokkos/doc/Doxyfile b/lib/kokkos/doc/Doxyfile
new file mode 100644
index 0000000000000000000000000000000000000000..bc5c7486b27fc55ede35359b969af0a8008f960b
--- /dev/null
+++ b/lib/kokkos/doc/Doxyfile
@@ -0,0 +1,127 @@
+#
+# Include the global look and feel options
+#
+@INCLUDE               = ../../common/Doxyfile
+#
+# Package options
+#
+PROJECT_NAME           = "Kokkos Core Kernels Package"
+PROJECT_NUMBER         = "Version of the Day"
+OUTPUT_DIRECTORY       = .
+OUTPUT_LANGUAGE        = English
+
+EXTRACT_ALL            = NO
+EXTRACT_PRIVATE        = NO
+EXTRACT_STATIC         = YES
+HIDE_UNDOC_MEMBERS     = YES
+HIDE_UNDOC_CLASSES     = YES
+BRIEF_MEMBER_DESC      = YES
+REPEAT_BRIEF           = YES
+ALWAYS_DETAILED_SEC    = YES
+FULL_PATH_NAMES        = NO
+STRIP_FROM_PATH        = 
+INTERNAL_DOCS          = NO
+CLASS_DIAGRAMS         = YES
+SOURCE_BROWSER         = YES
+INLINE_SOURCES         = NO
+STRIP_CODE_COMMENTS    = YES
+REFERENCED_BY_RELATION = NO
+REFERENCES_RELATION    = NO
+CASE_SENSE_NAMES       = YES
+HIDE_SCOPE_NAMES       = NO
+VERBATIM_HEADERS       = YES
+SHOW_INCLUDE_FILES     = YES
+#JAVADOC_AUTOBRIEF      = YES
+INHERIT_DOCS           = YES
+INLINE_INHERITED_MEMB  = YES
+INLINE_INFO            = YES
+SORT_MEMBER_DOCS       = NO
+TAB_SIZE               = 2
+ENABLED_SECTIONS       = 
+SORT_BRIEF_DOCS        = NO
+GENERATE_TODOLIST      = YES
+GENERATE_TESTLIST      = YES
+QUIET                  = NO
+WARNINGS               = YES
+WARN_IF_UNDOCUMENTED   = YES
+WARN_FORMAT            = "$file:$line: $text"
+
+#
+# INPUT: Where to find files that Doxygen should process.  ../classic
+# has a doc/ subdirectory with its own Doxyfile that points to its own
+# files.  The other Kokkos subpackages don't currently have their own
+# Doxyfile files, so we have to do it manually here.
+#
+# mfh 26 Sep 2013: I've only added those directories in the Core
+# subpackage that constitute the "public interface" of that
+# subpackage.  Please feel free to include additional subdirectories
+# of ../core if you want to generate their documentation as well.
+#
+# mfh 26 Sep 2013: I've only added the Kokkos subpackages here that I
+# think are ready for Doxygen documentation generation.  Please feel
+# free to amend this list as you see fit.
+#
+
+INPUT                  = index.doc ../classic ../core/src ../containers/src ../linalg/src
+FILE_PATTERNS          = *.hpp *.cpp *.cuh *.cu
+RECURSIVE              = NO
+EXCLUDE_PATTERNS       = *.x *.o *.out
+EXAMPLE_PATH           = 
+EXAMPLE_RECURSIVE       = YES
+EXAMPLE_PATTERNS       = *.cpp *.hpp
+IMAGE_PATH             = 
+INPUT_FILTER           = 
+ALPHABETICAL_INDEX     = YES
+COLS_IN_ALPHA_INDEX    = 4
+IGNORE_PREFIX          = 
+#
+# What diagrams are created
+#
+CLASS_GRAPH            = YES
+COLLABORATION_GRAPH    = NO
+INCLUDE_GRAPH          = NO
+INCLUDED_BY_GRAPH      = NO
+GRAPHICAL_HIERARCHY    = YES
+#
+# Preprocessing
+#
+ENABLE_PREPROCESSING   = YES
+MACRO_EXPANSION        = YES
+EXPAND_ONLY_PREDEF     = YES
+SEARCH_INCLUDES        = YES
+INCLUDE_FILE_PATTERNS  = 
+PREDEFINED             = DOXYGEN_SHOULD_SKIP_THIS DOXYGEN_USE_ONLY
+INCLUDE_PATH           = ../src
+EXPAND_AS_DEFINED      = 
+#
+# Links to other packages
+#
+# TAGFILES lists "<tag file>=<html location>" pairs of external documentation
+# to link against; GENERATE_TAGFILE names the tag file written by this run.
+# NOTE(review): this run writes tpetra.tag and lists kokkos.tag as an external
+# tag file, although PROJECT_NAME above is the Kokkos package - this looks
+# copied from another package's Doxyfile; confirm the intended file names.
+#
+TAGFILES               = ../../common/tag_files/teuchos.tag=../../../teuchos/doc/html ../../common/tag_files/epetra.tag=../../../epetra/doc/html \
+                         ../../common/tag_files/belos.tag=../../../belos/doc/html ../../common/tag_files/anasazi.tag=../../../anasazi/doc/html \
+                         ../../common/tag_files/kokkos.tag=../../../kokkos/doc/html 
+GENERATE_TAGFILE       = ../../common/tag_files/tpetra.tag
+ALLEXTERNALS           = NO
+EXTERNAL_GROUPS        = NO
+#
+# Environment
+#
+PERL_PATH              = /usr/bin/perl
+HAVE_DOT               = YES
+DOT_PATH               = 
+MAX_DOT_GRAPH_WIDTH    = 1024
+MAX_DOT_GRAPH_HEIGHT   = 1024
+#
+# What kind of documentation is generated
+#
+#GENERATE_HTML          = YES
+#HTML_OUTPUT            = html
+#HTML_HEADER            = includes/header.html
+#HTML_FOOTER            = includes/footer.html
+#HTML_STYLESHEET        = includes/stylesheet.css
+#HTML_ALIGN_MEMBERS     = YES
+GENERATE_HTMLHELP      = NO
+DISABLE_INDEX          = NO
+GENERATE_LATEX         = NO
+GENERATE_RTF           = NO
+GENERATE_MAN           = NO
+GENERATE_XML           = NO
diff --git a/lib/kokkos/doc/Kokkos_PG.pdf b/lib/kokkos/doc/Kokkos_PG.pdf
new file mode 100644
index 0000000000000000000000000000000000000000..3c415698c0d9fec315f317b71db19f2a019b6f6e
Binary files /dev/null and b/lib/kokkos/doc/Kokkos_PG.pdf differ
diff --git a/lib/kokkos/doc/README b/lib/kokkos/doc/README
new file mode 100644
index 0000000000000000000000000000000000000000..31e75f365c21a116a1fb736097f4f524e8d1e021
--- /dev/null
+++ b/lib/kokkos/doc/README
@@ -0,0 +1,32 @@
+Kokkos uses the Doxygen tool for providing three documentation
+sources:
+- man pages
+- LaTeX User Guide
+- HTML Online User Guide.
+
+Man Pages
+
+Man pages are available for all files and functions in the directory
+TRILINOS_HOME/doc/kokkos/man, where TRILINOS_HOME is the location of your
+copy of Trilinos.  To use these pages with the Unix man utility, add
+the directory to your man path as follows:
+
+setenv MANPATH `echo $MANPATH`:TRILINOS_HOME/doc/kokkos/man
+
+
+LaTeX User Guide
+
+A postscript version of this guide is in
+TRILINOS_HOME/doc/kokkos/latex/user_guide.ps.  The LaTeX source is in the
+directory TRILINOS_HOME/doc/kokkos/latex.
+
+HTML Online User Guide
+
+The online guide is initiated by pointing your browser to
+TRILINOS_HOME/doc/kokkos/html/index.html
+
+Any questions, comments, or suggestions are welcome.  Please send them to
+Mike Heroux at
+
+320-845-7695
+maherou@sandia.gov
diff --git a/lib/kokkos/doc/build_docs b/lib/kokkos/doc/build_docs
new file mode 100755
index 0000000000000000000000000000000000000000..da1d3e4f6e061804b1fb2fe21b356b691494df5d
--- /dev/null
+++ b/lib/kokkos/doc/build_docs
@@ -0,0 +1,15 @@
+#!/bin/sh
+#
+# Generates the Kokkos Doxygen documentation from the Doxyfile in the current
+# directory.  TRILINOS_HOME is exported for the doxygen run when the caller has
+# not set it; the default is three directories above the current one.
+
+# The expansion is quoted so that a value containing whitespace stays a single
+# argument to the test instead of making it fail with a syntax error.
+if [ -n "${TRILINOS_HOME}" ]; then
+  echo "TRILINOS_HOME has already been set!"
+else
+  echo "TRILINOS_HOME has not been set.  Setting it!"
+  # Assign first, then export: the quoted assignment keeps a working directory
+  # with spaces intact in every POSIX shell.
+  TRILINOS_HOME="$(pwd)/../../.."
+  export TRILINOS_HOME
+fi
+
+echo
+echo "Generating main Kokkos doxygen documentation ..."
+echo
+
+doxygen Doxyfile
+
diff --git a/lib/kokkos/doc/index.doc b/lib/kokkos/doc/index.doc
new file mode 100644
index 0000000000000000000000000000000000000000..27a9e4f2e7b90e11bbcde7309e9bf1544e3b386f
--- /dev/null
+++ b/lib/kokkos/doc/index.doc
@@ -0,0 +1,72 @@
+/*! 
+\mainpage Trilinos/Kokkos: Shared-memory programming interface and computational kernels
+
+\section Kokkos_Intro Introduction
+
+The %Kokkos package has two main components.  The first, sometimes
+called "%Kokkos Array" or just "%Kokkos," implements a
+performance-portable shared-memory parallel programming model and data
+containers.  The second, called "%Kokkos Classic," consists of
+computational kernels that support the %Tpetra package.
+
+\section Kokkos_Kokkos The %Kokkos programming model
+
+%Kokkos implements a performance-portable shared-memory parallel
+programming model and data containers.  It lets you write an algorithm
+once, and just change a template parameter to get the optimal data
+layout for your hardware.  %Kokkos has back-ends for the following
+parallel programming models:
+
+- Kokkos::Threads: POSIX Threads (Pthreads)
+- Kokkos::OpenMP: OpenMP
+- Kokkos::Cuda: NVIDIA's CUDA programming model for graphics
+  processing units (GPUs)
+- Kokkos::Serial: No thread parallelism
+
+%Kokkos also has optimizations for shared-memory parallel systems with
+nonuniform memory access (NUMA).  Its containers can hold data of any
+primitive ("plain old") data type (and some aggregate types).  %Kokkos
+Array may be used as a stand-alone programming model.
+
+%Kokkos' parallel operations include the following:
+
+- parallel_for: a thread-parallel "for loop"
+- parallel_reduce: a thread-parallel reduction
+- parallel_scan: a thread-parallel prefix scan operation
+
+as well as expert-level platform-independent interfaces to thread
+"teams," per-team "shared memory," synchronization, and atomic update
+operations.
+
+%Kokkos' data containers include the following:
+
+- Kokkos::View: A multidimensional array suitable for thread-parallel
+  operations.  Its layout (e.g., row-major or column-major) is
+  optimized by default for the particular thread-parallel device.
+- Kokkos::Vector: A drop-in replacement for std::vector that eases
+  porting from standard sequential C++ data structures to %Kokkos'
+  parallel data structures.
+- Kokkos::UnorderedMap: A parallel lookup table comparable in
+  functionality to std::unordered_map.
+
+%Kokkos also uses the above basic containers to implement higher-level
+data structures, like sparse graphs and matrices.
+
+A good place to start learning about %Kokkos would be <a href="http://trilinos.sandia.gov/events/trilinos_user_group_2013/presentations/2013-11-TUG-Kokkos-Tutorial.pdf">these tutorial slides</a> from the 2013 Trilinos Users' Group meeting.
+
+\section Kokkos_Classic %Kokkos Classic
+
+"%Kokkos Classic" consists of computational kernels that support the
+%Tpetra package.  These kernels include sparse matrix-vector multiply,
+sparse triangular solve, Gauss-Seidel, and dense vector operations.
+They are templated on the type of objects (\c Scalar) on which they
+operate.  This component was not meant to be visible to users; it is
+an implementation detail of the %Tpetra distributed linear algebra
+package.  
+
+%Kokkos Classic also implements a shared-memory parallel programming
+model.  This inspired and preceded the %Kokkos programming model
+described in the previous section.  Users should consider the %Kokkos
+Classic programming model deprecated, and prefer the new %Kokkos
+programming model.
+*/
diff --git a/lib/kokkos/example/CMakeLists.txt b/lib/kokkos/example/CMakeLists.txt
new file mode 100644
index 0000000000000000000000000000000000000000..3809cc2ea57a26ba1b3003a2e706fee912ccabc9
--- /dev/null
+++ b/lib/kokkos/example/CMakeLists.txt
@@ -0,0 +1,20 @@
+
+
+# Registers the Kokkos example directories with TriBITS.
+#
+# The subpackage name must match what appears in
+# kokkos/cmake/Dependencies.cmake.
+TRIBITS_SUBPACKAGE(Example)
+
+# Example directories that are added for every configuration, in this order.
+set(kokkos_example_dirs
+  query_device
+  fixture
+  feint
+  fenl
+  multi_fem
+  md_skeleton
+  global_2_local_ids
+  grow_array
+  sort_array
+)
+foreach(kokkos_example_dir IN LISTS kokkos_example_dirs)
+  TRIBITS_ADD_EXAMPLE_DIRECTORIES(${kokkos_example_dir})
+endforeach()
+
+# The tutorial is only added when the CUDA backend is not enabled.
+if(NOT Kokkos_ENABLE_Cuda)
+  TRIBITS_ADD_EXAMPLE_DIRECTORIES(tutorial)
+endif()
+
+TRIBITS_SUBPACKAGE_POSTPROCESS()
+
diff --git a/lib/kokkos/example/README b/lib/kokkos/example/README
new file mode 100644
index 0000000000000000000000000000000000000000..ec64004842b0f254de2f1d67a9cb5c272bf15607
--- /dev/null
+++ b/lib/kokkos/example/README
@@ -0,0 +1,16 @@
+This directory contains example application proxies that use different
+parts of Kokkos.  If you are looking for the FENL ("finite element
+nonlinear" solve) example, it has moved into the LinAlg subpackage of
+Tpetra.
+
+MANIFEST:
+
+  - common:  Header files used by different examples
+  - feint:   Unstructured finite-element method
+  - fixture: Some other finite-element method example
+  - global_2_local_ids: Example of global-to-local index lookup
+  - grow_array:   Parallel dynamic memory allocation
+  - md_skeleton:  Molecular dynamics
+  - query_device: Kokkos' HWLOC wrapper for querying device topology
+  - sort_array:   Parallel sort
+  - tutorial:     Kokkos tutorial (START HERE)
diff --git a/lib/kokkos/example/cmake/Dependencies.cmake b/lib/kokkos/example/cmake/Dependencies.cmake
new file mode 100644
index 0000000000000000000000000000000000000000..0d86e78712b85db319a17029e66e96292a410573
--- /dev/null
+++ b/lib/kokkos/example/cmake/Dependencies.cmake
@@ -0,0 +1,4 @@
+# Dependencies of the Kokkos example subpackage:
+#   - its libraries require KokkosCore, KokkosContainers and KokkosAlgorithms;
+#   - its tests optionally use the CUSPARSE and MKL third-party libraries.
+TRIBITS_PACKAGE_DEFINE_DEPENDENCIES(
+  LIB_REQUIRED_DEP_PACKAGES KokkosCore KokkosContainers KokkosAlgorithms
+  TEST_OPTIONAL_DEP_TPLS CUSPARSE MKL
+  )
diff --git a/lib/kokkos/example/common/VectorImport.hpp b/lib/kokkos/example/common/VectorImport.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..8ecd74d463c08f3624cf2be2d44b0ca1e4d008ad
--- /dev/null
+++ b/lib/kokkos/example/common/VectorImport.hpp
@@ -0,0 +1,294 @@
+/*
+//@HEADER
+// ************************************************************************
+// 
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+// 
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+// 
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact  H. Carter Edwards (hcedwar@sandia.gov)
+// 
+// ************************************************************************
+//@HEADER
+*/
+
+#ifndef KOKKOS_VECTORIMPORT_HPP
+#define KOKKOS_VECTORIMPORT_HPP
+
+#include <utility>
+#include <limits>
+#include <iostream>
+#include <sstream>
+#include <stdexcept>
+#include <vector>
+
+#include <Kokkos_Core.hpp>
+
+#include <WrapMPI.hpp>
+
+namespace Kokkos {
+namespace Example {
+
+template< class CommMessageType , class CommIdentType , class VectorType >
+struct VectorImport ;
+
+} // namespace Example
+} // namespace Kokkos
+
+#if ! defined( KOKKOS_HAVE_MPI )
+
+//----------------------------------------------------------------------------
+//----------------------------------------------------------------------------
+
+namespace Kokkos {
+namespace Example {
+
+// Stand-in used when KOKKOS_HAVE_MPI is not defined: it records the
+// communicator and the owned/receive counts, ignores the message and
+// node-id arguments, and its call operator does nothing.
+template< class CommMessageType , class CommIdentType , class VectorType >
+struct VectorImport {
+
+  const MPI_Comm comm ;
+  const unsigned count_owned ;
+  const unsigned count_receive ;
+
+  // The three unnamed arguments occupy the positions of the receive-message,
+  // send-message and send-node-id arguments of the MPI implementation.
+  VectorImport( MPI_Comm communicator ,
+                const CommMessageType & ,
+                const CommMessageType & ,
+                const CommIdentType   & ,
+                const unsigned owned ,
+                const unsigned receive )
+    : comm( communicator ) ,
+      count_owned( owned ) ,
+      count_receive( receive )
+  {
+  }
+
+  // Nothing to import: no-op.
+  inline void operator()( const VectorType & ) const
+  {
+  }
+};
+
+
+} // namespace Example
+} // namespace Kokkos
+
+//----------------------------------------------------------------------------
+//----------------------------------------------------------------------------
+
+#else /* defined( KOKKOS_HAVE_MPI ) */
+
+namespace Kokkos {
+namespace Example {
+
+// MPI implementation: fills the received part of a vector, i.e. the entries
+// [count_owned, count_owned + count_receive), with data sent by other ranks,
+// and sends the locally owned entries selected by the send node ids.
+//
+// Declared with the 'struct' key so that the definition agrees with the
+// forward declaration of this template; the leading 'private:' keeps member
+// access exactly as it was.
+template< class CommMessageType , class CommIdentType , class VectorType >
+struct VectorImport {
+private:
+
+  // rank == 1 or array_layout == LayoutRight
+  enum { OK = Kokkos::Impl::StaticAssert<
+           ( VectorType::rank == 1 ) ||
+           Kokkos::Impl::is_same< typename VectorType::array_layout , Kokkos::LayoutRight >::value
+         >::value };
+
+  typedef typename VectorType::HostMirror HostVectorType ;
+
+  // True when the vector's memory space is the host mirror's memory space,
+  // in which case messages are received directly into the vector.
+  enum { ReceiveInPlace =
+    Kokkos::Impl::is_same< typename VectorType::memory_space ,
+                           typename HostVectorType::memory_space >::value };
+
+  // Message tables: row i holds the peer rank in column 0 and the number of
+  // vector entries exchanged with that rank in column 1.
+  const CommMessageType  recv_msg ;
+  const CommMessageType  send_msg ;
+  // Indices of the vector entries that are packed into the send buffer.
+  const CommIdentType    send_nodeid ;
+  VectorType             send_buffer ;
+  HostVectorType         host_send_buffer ;
+  HostVectorType         host_recv_buffer ;
+  // Not read by any member function of this class; zero-initialized so that
+  // copies of this object never copy an indeterminate value.
+  unsigned               chunk ;
+
+public:
+
+  const MPI_Comm         comm ;
+  const unsigned         count_owned ;
+  const unsigned         count_receive ;
+
+  // Functor that gathers source( index(i) ) into buffer( i ).  Constructing
+  // it runs the gather over all of 'index' and fences the execution space.
+  struct Pack {
+    typedef typename VectorType::execution_space execution_space ;
+    const CommIdentType  index ;
+    const VectorType     source ;
+    const VectorType     buffer ;
+
+    KOKKOS_INLINE_FUNCTION
+    void operator()( const unsigned i ) const
+      { buffer( i ) = source( index(i) ); }
+
+    Pack( const CommIdentType  & arg_index ,
+          const VectorType     & arg_source ,
+          const VectorType     & arg_buffer )
+      : index( arg_index )
+      , source( arg_source )
+      , buffer( arg_buffer )
+    {
+      Kokkos::parallel_for( index.dimension_0() , *this );
+      execution_space::fence();
+    }
+  };
+
+  // Stores the communication plan and allocates the staging buffers: a host
+  // receive buffer of count_receive entries (only when messages cannot be
+  // received in place) and a send buffer sized by the sum of the per-rank
+  // send counts, together with its host mirror.
+  VectorImport( MPI_Comm arg_comm ,
+                const CommMessageType & arg_recv_msg ,
+                const CommMessageType & arg_send_msg ,
+                const CommIdentType   & arg_send_nodeid ,
+                const unsigned arg_count_owned ,
+                const unsigned arg_count_receive )
+    : recv_msg( arg_recv_msg )
+    , send_msg( arg_send_msg )
+    , send_nodeid( arg_send_nodeid )
+    , send_buffer()
+    , host_send_buffer()
+    , host_recv_buffer()
+    , chunk( 0 )
+    , comm( arg_comm )
+    , count_owned( arg_count_owned )
+    , count_receive( arg_count_receive )
+    {
+      if ( ! ReceiveInPlace ) {
+        host_recv_buffer = HostVectorType("recv_buffer",count_receive);
+      }
+
+      unsigned send_count = 0 ;
+      for ( unsigned i = 0 ; i < send_msg.dimension_0() ; ++i ) { send_count += send_msg(i,1); }
+      send_buffer      = VectorType("send_buffer",send_count);
+      host_send_buffer = Kokkos::create_mirror_view( send_buffer );
+    }
+
+  // Performs one import on 'v': posts the receives, packs and sends the
+  // requested owned entries, waits for every receive and verifies its source
+  // rank and size.
+  // Throws std::runtime_error when a received message does not come from the
+  // expected rank or does not have the expected size.
+  inline
+  void operator()( const VectorType & v ) const
+  {
+    typedef typename VectorType::value_type  scalar_type ;
+
+    const int mpi_tag = 42 ;
+    // Number of scalars per vector entry.  Named differently from the member
+    // 'chunk' so that the member is no longer shadowed.
+    const unsigned vector_chunk = v.dimension_1();
+
+    // Subvector for receives
+    const std::pair<unsigned,unsigned> recv_range( count_owned , count_owned + count_receive );
+    const VectorType recv_vector = Kokkos::subview( v , recv_range );
+
+    std::vector< MPI_Request > recv_request( recv_msg.dimension_0() , MPI_REQUEST_NULL );
+
+    { // Post receives
+      scalar_type * ptr =
+        ReceiveInPlace ? recv_vector.ptr_on_device() : host_recv_buffer.ptr_on_device();
+
+      for ( size_t i = 0 ; i < recv_msg.dimension_0() ; ++i ) {
+        const int proc  = recv_msg(i,0);
+        const int count = recv_msg(i,1) * vector_chunk ;
+
+        MPI_Irecv( ptr , count * sizeof(scalar_type) , MPI_BYTE ,
+                   proc , mpi_tag , comm , & recv_request[i] );
+
+        ptr += count ;
+      }
+    }
+
+    // NOTE(review): presumably ensures that every rank has posted its
+    // receives before any synchronous send below starts - confirm.
+    MPI_Barrier( comm );
+
+    { // Pack and send 
+      const Pack pack( send_nodeid , v , send_buffer );
+
+      Kokkos::deep_copy( host_send_buffer , send_buffer );
+
+      scalar_type * ptr = host_send_buffer.ptr_on_device();
+
+      for ( size_t i = 0 ; i < send_msg.dimension_0() ; ++i ) {
+        const int proc  = send_msg(i,0);
+        const int count = send_msg(i,1) * vector_chunk ;
+
+        // MPI_Ssend blocks until
+        // (1) a receive is matched for the message and
+        // (2) the send buffer can be re-used.
+        //
+        // It is suggested that MPI_Ssend will have the best performance:
+        // http://www.mcs.anl.gov/research/projects/mpi/sendmode.html .
+
+        MPI_Ssend( ptr ,
+                   count * sizeof(scalar_type) , MPI_BYTE ,
+                   proc , mpi_tag , comm );
+
+        ptr += count ;
+      }
+    }
+
+    // Wait for receives and verify:
+
+    for ( size_t i = 0 ; i < recv_msg.dimension_0() ; ++i ) {
+      MPI_Status recv_status ;
+      int recv_which = 0 ;
+      int recv_size  = 0 ;
+
+      MPI_Waitany( recv_msg.dimension_0() , & recv_request[0] , & recv_which , & recv_status );
+
+      const int recv_proc = recv_status.MPI_SOURCE ;
+
+      MPI_Get_count( & recv_status , MPI_BYTE , & recv_size );
+
+      // Verify message properly received:
+
+      const int  expected_proc = recv_msg(recv_which,0);
+      const int  expected_size = recv_msg(recv_which,1) * vector_chunk * sizeof(scalar_type);
+
+      if ( ( expected_proc != recv_proc ) ||
+           ( expected_size != recv_size ) ) {
+
+        int local_rank  = 0 ;
+
+        MPI_Comm_rank( comm , & local_rank );
+
+        std::ostringstream msg ;
+        msg << "VectorImport error:"
+            << " P" << local_rank
+            << " received from P" << recv_proc
+            << " size "     << recv_size
+            << " expected " << expected_size
+            << " from P"    << expected_proc ;
+        throw std::runtime_error( msg.str() );
+      }
+    }
+
+    // Copy received data to device memory.
+
+    if ( ! ReceiveInPlace ) { Kokkos::deep_copy( recv_vector , host_recv_buffer ); }
+  }
+};
+
+} // namespace Example
+} // namespace Kokkos
+
+#endif
+
+//----------------------------------------------------------------------------
+
+#endif /* #ifndef KOKKOS_VECTORIMPORT_HPP */
+
+
diff --git a/lib/kokkos/example/common/WrapMPI.hpp b/lib/kokkos/example/common/WrapMPI.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..c75e4bc5c7e2a41753d32f7d6967f65ffbf86581
--- /dev/null
+++ b/lib/kokkos/example/common/WrapMPI.hpp
@@ -0,0 +1,103 @@
+/*
+//@HEADER
+// ************************************************************************
+// 
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+// 
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+// 
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact  H. Carter Edwards (hcedwar@sandia.gov)
+// 
+// ************************************************************************
+//@HEADER
+*/
+
+#ifndef KOKKOS_EXAMPLE_WRAP_MPI
+#define KOKKOS_EXAMPLE_WRAP_MPI
+
+#include <Kokkos_Macros.hpp>
+#include <string>
+
+#if defined( KOKKOS_HAVE_MPI )
+
+#include <mpi.h>
+
+namespace Kokkos {
+namespace Example {
+
+/// Sum 'value' across all ranks of 'comm' and return the total.
+inline double all_reduce( double value , MPI_Comm comm )
+{
+  double total = value ; // receive buffer, kept separate from the input
+  MPI_Allreduce( & value , & total , 1 , MPI_DOUBLE , MPI_SUM , comm );
+  return total ;
+}
+
+/// Maximum of 'value' across all ranks of 'comm'.
+inline double all_reduce_max( double value , MPI_Comm comm )
+{
+  double largest = value ; // receive buffer, kept separate from the input
+  MPI_Allreduce( & value , & largest , 1 , MPI_DOUBLE , MPI_MAX , comm );
+  return largest ;
+}
+
+} // namespace Example
+} // namespace Kokkos
+
+#elif ! defined( KOKKOS_HAVE_MPI )
+
+/* Wrap the MPI_Comm type and heavily used MPI functions
+ * to reduce the number of '#if defined( KOKKOS_HAVE_MPI )'
+ * blocks which have to be sprinkled throughout the examples.
+ */
+
+typedef int MPI_Comm ; // placeholder communicator type when MPI is absent
+// Serial stand-ins: report one process with rank 0; the barrier does nothing.
+inline int MPI_Comm_size( MPI_Comm , int * size ) { *size = 1 ; return 0 ; }
+inline int MPI_Comm_rank( MPI_Comm , int * rank ) { *rank = 0 ; return 0 ; }
+inline int MPI_Barrier( MPI_Comm ) { return 0; }
+
+namespace Kokkos {
+namespace Example {
+
+// Without MPI there is a single process, so each reduction is the identity.
+inline double all_reduce( double value , MPI_Comm /* unused */ )
+{ return value ; }
+inline double all_reduce_max( double value , MPI_Comm /* unused */ )
+{ return value ; }
+
+} // namespace Example
+} // namespace Kokkos
+
+#endif /* ! defined( KOKKOS_HAVE_MPI ) */
+#endif /* #ifndef KOKKOS_EXAMPLE_WRAP_MPI */
+
diff --git a/lib/kokkos/example/feint/CMakeLists.txt b/lib/kokkos/example/feint/CMakeLists.txt
new file mode 100644
index 0000000000000000000000000000000000000000..0018b9f9f538de77ce776daaa267a037714387ad
--- /dev/null
+++ b/lib/kokkos/example/feint/CMakeLists.txt
@@ -0,0 +1,18 @@
+
+INCLUDE_DIRECTORIES(
+  ${CMAKE_CURRENT_BINARY_DIR}
+  ${CMAKE_CURRENT_SOURCE_DIR}
+  ${CMAKE_CURRENT_SOURCE_DIR}/../common
+  ${CMAKE_CURRENT_SOURCE_DIR}/../fixture
+  )
+
+# Build every .cpp in this directory plus the fixture's BoxElemPart.cpp.
+FILE(GLOB SOURCES *.cpp)
+LIST(APPEND SOURCES ../fixture/BoxElemPart.cpp)
+
+TRIBITS_ADD_EXECUTABLE(
+  feint
+  SOURCES ${SOURCES}
+  COMM serial mpi
+  )
+
diff --git a/lib/kokkos/example/feint/ElemFunctor.hpp b/lib/kokkos/example/feint/ElemFunctor.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..651e34c2eed247f37986886c86f04ce24d76c551
--- /dev/null
+++ b/lib/kokkos/example/feint/ElemFunctor.hpp
@@ -0,0 +1,489 @@
+/*
+//@HEADER
+// ************************************************************************
+// 
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+// 
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+// 
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact  H. Carter Edwards (hcedwar@sandia.gov)
+// 
+// ************************************************************************
+//@HEADER
+*/
+
+#ifndef KOKKOS_EXAMPLE_FEINT_FUNCTORS_HPP
+#define KOKKOS_EXAMPLE_FEINT_FUNCTORS_HPP
+
+#include <stdio.h>
+#include <Kokkos_Core.hpp>
+#include <BoxElemFixture.hpp>
+
+namespace Kokkos {
+namespace Example {
+
+/** \brief  Numerically integrate a function on a finite element mesh and
+ *          project the integrated values to nodes.
+ */
+template< class FixtureType ,
+          class FunctionType ,
+          bool PerformScatterAddWithAtomic >
+struct FiniteElementIntegration ;
+
+// Specialized for an 'Example::BoxElemFixture' finite element mesh
+template< class Device , BoxElemPart::ElemOrder ElemOrder , class GridMap ,
+          class FunctionType ,
+          bool PerformScatterAddWithAtomic >
+struct FiniteElementIntegration<
+  Kokkos::Example::BoxElemFixture< Device , ElemOrder , GridMap > ,
+  FunctionType ,
+  PerformScatterAddWithAtomic >
+{
+  // Element mesh types (BoxFixtureType keeps GridMap: the fixture matched above):
+  typedef Kokkos::Example::BoxElemFixture< Device , ElemOrder , GridMap >
+    BoxFixtureType ;
+
+  typedef Kokkos::Example::HexElement_Data< BoxFixtureType::ElemNode >
+    HexElemDataType ;
+
+  enum { ElemNodeCount    = HexElemDataType::element_node_count  };
+  enum { IntegrationCount = HexElemDataType::integration_count };
+  enum { ValueCount       = FunctionType::value_count };
+
+  // Dictionary of view types:
+  typedef View<int*,                              Device> ElemErrorType ;
+  typedef View<double*[ElemNodeCount][ValueCount],Device> ElemValueType ;
+  typedef View<double*[ValueCount],               Device> NodeValueType ;
+
+  // Data members for this Functor:
+  const HexElemDataType  m_hex_elem_data ; ///< Master element
+  const BoxFixtureType   m_box_fixture ;   ///< Unstructured mesh data
+  const FunctionType     m_function ;      ///< Function to integrate
+  const ElemErrorType    m_elem_error ;    ///< Flags for element errors
+  const ElemValueType    m_elem_integral ; ///< Per-element quantities
+  const NodeValueType    m_node_lumped ;   ///< Quantities lumped to nodes
+
+  //----------------------------------------
+
+  FiniteElementIntegration(
+    const BoxFixtureType & box_fixture ,
+    const FunctionType   & function )
+    : m_hex_elem_data()
+    , m_box_fixture( box_fixture ) // Shallow copy of the mesh fixture
+    , m_function( function )
+    , m_elem_error(    "elem_error"    , box_fixture.elem_count() )
+    , m_elem_integral( "elem_integral" , box_fixture.elem_count() )
+    , m_node_lumped(   "node_lumped"   , box_fixture.node_count() )
+    {}
+
+  //----------------------------------------
+  // Device for parallel dispatch.
+  typedef typename Device::execution_space execution_space;
+
+  // Value type for global parallel reduction.
+  struct value_type {
+    double value[ ValueCount ]; ///< Integrated quantities
+    int    error ;              ///< Element inversion flag
+  };
+
+  //----------------------------------------
+  // Transform the master-element basis gradients 'grad' by the inverse of the
+  // spatial jacobian built from 'coord'; returns the jacobian determinant.
+  KOKKOS_INLINE_FUNCTION
+  float transform_gradients(
+    const float  grad[][  ElemNodeCount ] , // Gradient of bases master element
+    const double coord[][ ElemNodeCount ] , // Nodal coordinates, one row per axis
+          float  dpsi[][  ElemNodeCount ] ) const // Output: transformed gradients
+  {
+    enum { TensorDim = 9 };
+    enum { j11 = 0 , j12 = 1 , j13 = 2 ,  // Row-major positions of the 3x3
+           j21 = 3 , j22 = 4 , j23 = 5 ,  // tensor entries within the flat
+           j31 = 6 , j32 = 7 , j33 = 8 }; // arrays J and invJ.
+
+    // Temporary for jacobian accumulation is double for summation accuracy.
+    double J[ TensorDim ] = { 0, 0, 0,  0, 0, 0,  0, 0, 0 };
+
+    for( int i = 0; i < ElemNodeCount ; ++i ) {
+      J[j11] += grad[0][i] * coord[0][i] ;
+      J[j12] += grad[0][i] * coord[1][i] ;
+      J[j13] += grad[0][i] * coord[2][i] ;
+
+      J[j21] += grad[1][i] * coord[0][i] ;
+      J[j22] += grad[1][i] * coord[1][i] ;
+      J[j23] += grad[1][i] * coord[2][i] ;
+
+      J[j31] += grad[2][i] * coord[0][i] ;
+      J[j32] += grad[2][i] * coord[1][i] ;
+      J[j33] += grad[2][i] * coord[2][i] ;
+    }
+
+    // Adjugate of J (double arithmetic, float storage); scaled to the inverse below.
+    float invJ[ TensorDim ] = {
+      float( J[j22] * J[j33] - J[j23] * J[j32] ) ,
+      float( J[j13] * J[j32] - J[j12] * J[j33] ) ,
+      float( J[j12] * J[j23] - J[j13] * J[j22] ) ,
+
+      float( J[j23] * J[j31] - J[j21] * J[j33] ) ,
+      float( J[j11] * J[j33] - J[j13] * J[j31] ) ,
+      float( J[j13] * J[j21] - J[j11] * J[j23] ) ,
+
+      float( J[j21] * J[j32] - J[j22] * J[j31] ) ,
+      float( J[j12] * J[j31] - J[j11] * J[j32] ) ,
+      float( J[j11] * J[j22] - J[j12] * J[j21] ) };
+    // Determinant via cofactor expansion along the first column of J.
+    const float detJ = J[j11] * invJ[j11] +
+                       J[j21] * invJ[j12] +
+                       J[j31] * invJ[j13] ;
+
+    {
+      const float detJinv = 1.0 / detJ ; // not guarded against detJ == 0
+      for ( int i = 0 ; i < TensorDim ; ++i ) { invJ[i] *= detJinv ; }
+    }
+
+    // Transform gradients:
+    for ( int i = 0; i < ElemNodeCount ; ++i ) {
+      dpsi[0][i] = grad[0][i] * invJ[j11] +
+                   grad[1][i] * invJ[j12] +
+                   grad[2][i] * invJ[j13];
+      dpsi[1][i] = grad[0][i] * invJ[j21] +
+                   grad[1][i] * invJ[j22] +
+                   grad[2][i] * invJ[j23];
+      dpsi[2][i] = grad[0][i] * invJ[j31] +
+                   grad[1][i] * invJ[j32] +
+                   grad[2][i] * invJ[j33];
+    }
+
+    return detJ ;
+  }
+
+  // Functor's function called for each element in the mesh to numerically
+  // integrate the function, store the per-element results, and add the
+  // element quantities to the global integral 'update'.
+  KOKKOS_INLINE_FUNCTION
+  void operator()( const int ielem , value_type & update ) const
+  {
+    // Local temporaries for gathering nodal data.
+    double node_coord[3][ ElemNodeCount ];
+
+    int inode[ ElemNodeCount ] ;
+
+    // Gather indices of element's node from global memory to local memory.
+    for ( int i = 0 ; i < ElemNodeCount ; ++i ) {
+      inode[i] = m_box_fixture.elem_node( ielem , i );
+    }
+
+    // Gather coordinates of element's nodes from global memory to local memory.
+    for ( int i = 0 ; i < ElemNodeCount ; ++i ) {
+      node_coord[0][i] = m_box_fixture.node_coord( inode[i] , 0 );
+      node_coord[1][i] = m_box_fixture.node_coord( inode[i] , 1 );
+      node_coord[2][i] = m_box_fixture.node_coord( inode[i] , 2 );
+    }
+
+    // Local temporary to accumulate numerical integration
+    // of vector valued function.
+    double accum[ ValueCount ];
+
+    for ( int j = 0 ; j < ValueCount ; ++j ) { accum[j] = 0 ; }
+
+    int error = 0 ;
+
+    // Numerical integration loop for this element:
+    for ( int k = 0 ; k < IntegrationCount ; ++k ) {
+
+      // Integration point in space as interpolated from nodal coordinates:
+      double point[3] = { 0 , 0 , 0 };
+      for ( int i = 0 ; i < ElemNodeCount ; ++i ) {
+        point[0] += node_coord[0][i] * m_hex_elem_data.values[k][i] ;
+        point[1] += node_coord[1][i] * m_hex_elem_data.values[k][i] ;
+        point[2] += node_coord[2][i] * m_hex_elem_data.values[k][i] ;
+      }
+
+      // Example function vector value at cubature point:
+      double val_at_pt[ ValueCount ];
+      m_function( point , val_at_pt );
+
+      // Temporary array for transformed element basis functions' gradient.
+      // Not used in this example, but computed anyway by the more general
+      // deformation function.
+      float dpsi[3][ ElemNodeCount ];
+
+      // Compute deformation jacobian, transform basis function gradient,
+      // and return determinant of deformation jacobian.
+      float detJ = transform_gradients( m_hex_elem_data.gradients[k] ,
+                                        node_coord , dpsi );
+
+      // Check for inverted spatial jacobian
+      if ( detJ <= 0 ) { error = 1 ; detJ = 0 ; } // flagged point gets zero weight
+
+      // Integration weight.
+      const float w = m_hex_elem_data.weights[k] * detJ ;
+
+      // Cubature of function.
+      for ( int j = 0 ; j < ValueCount ; ++j ) {
+        accum[j] += val_at_pt[j] * w ;
+      }
+    }
+    // Record whether any integration point of this element was flagged.
+    m_elem_error(ielem) = error ;
+
+
+    // Element contribution to global integral:
+
+    if ( error ) { update.error = 1 ; }
+
+    for ( int j = 0 ; j < ValueCount ; ++j ) { update.value[j] += accum[j] ; }
+
+    // Element-node quantity for lumping to nodes:
+    for ( int i = 0 ; i < ElemNodeCount ; ++i ) {
+      for ( int j = 0 ; j < ValueCount ; ++j ) {
+        // Save element's integral apportionment to nodes to global memory
+        m_elem_integral( ielem , i , j ) = accum[j] / ElemNodeCount ;
+      }
+    }
+
+    if ( PerformScatterAddWithAtomic ) {
+      // Option to immediately scatter-add the integrated quantities to nodes.
+      // Two or more threads could attempt a concurrent update of the same
+      // nodal value, which would be a race with a plain '+='.  The
+      // atomic_fetch_add function guarantees that the summation will occur
+      // correctly; however, there can be no guarantee for the order of summation.
+      // Due to non-associativity of floating point arithmetic the result
+      // is non-deterministic within bounds of floating point round-off.
+
+      for ( int i = 0 ; i < ElemNodeCount ; ++i ) {
+        for ( int j = 0 ; j < ValueCount ; ++j ) {
+          Kokkos::atomic_fetch_add( & m_node_lumped( inode[i] , j ) ,
+                                    m_elem_integral( ielem , i , j ) );
+        }
+      }
+    }
+  }
+  //--------------------------------------------------------------------------
+
+  // Initialization of the global reduction value.
+  KOKKOS_INLINE_FUNCTION
+  void init( value_type & update ) const
+  {
+    for ( int j = 0 ; j < ValueCount ; ++j ) update.value[j] = 0 ;
+    update.error = 0 ;
+  }
+
+  // Join two contributions to global reduction value.
+  KOKKOS_INLINE_FUNCTION
+  void join( volatile       value_type & update ,
+             volatile const value_type & input ) const
+  {
+    for ( int j = 0 ; j < ValueCount ; ++j ) update.value[j] += input.value[j] ;
+    if ( input.error ) update.error = 1 ;
+  }
+};
+
+} /* namespace Example */
+} /* namespace Kokkos */
+
+//----------------------------------------------------------------------------
+//----------------------------------------------------------------------------
+
+namespace Kokkos {
+namespace Example {
+
+template< class ViewElemNode ,
+          class ViewNodeScan ,
+          class ViewNodeElem >
+void map_node_to_elem( const ViewElemNode & elem_node ,
+                       const ViewNodeScan & node_scan ,
+                       const ViewNodeElem & node_elem );
+
+/** \brief  Functor to gather-sum elements' per-node quantities
+ *          to element nodes.  Gather-sum is thread safe and
+ *          does not require atomic updates.
+ */
+template< class ViewNodeValue ,
+          class ViewElemValue ,
+          bool  AlreadyUsedAtomic >
+struct LumpElemToNode {
+
+  typedef typename ViewElemValue::execution_space execution_space ;
+
+  // In this example we know that the ViewElemValue
+  // array specification is < double*[nNode][nValue] >
+
+#if KOKKOS_USING_EXP_VIEW
+  enum { value_count = ViewElemValue::dimension::N2 };
+#else
+  enum { value_count = ViewElemValue::shape_type::N2 };
+#endif
+
+  ViewNodeValue             m_node_value ; ///< Integrated values at nodes
+  ViewElemValue             m_elem_value ; ///< Values apportioned to nodes
+  View<int*,   execution_space> m_node_scan ;  ///< Offsets for nodes->element
+  View<int*[2],execution_space> m_node_elem ;  ///< Node->element connectivity
+
+  // Only allocate and fill the node->element connectivity if atomic
+  // updates have not already been used for the nodes.
+  template< class ViewElemNode >
+  LumpElemToNode( const ViewNodeValue & node_value ,
+                  const ViewElemValue & elem_value ,
+                  const ViewElemNode  & elem_node )
+    : m_node_value( node_value )
+    , m_elem_value( elem_value )
+    , m_node_scan( "node_scan" ,
+                   AlreadyUsedAtomic ? 0 : node_value.dimension_0() + 1 ) // nodes + 1 offsets
+    , m_node_elem( "node_elem" ,
+                   AlreadyUsedAtomic ? 0 : elem_node.dimension_0() *
+                                           elem_node.dimension_1() ) // one entry per (element,node)
+    {
+      if ( ! AlreadyUsedAtomic ) {
+        map_node_to_elem( elem_node , m_node_scan , m_node_elem );
+      }
+    }
+
+  //----------------------------------------
+
+  struct value_type { double value[ value_count ]; };
+
+  /// Reduction body for node 'inode': adds the node's lumped values to
+  /// 'update', first gather-summing them unless atomics already did so.
+  KOKKOS_INLINE_FUNCTION
+  void operator()( const int inode , value_type & update ) const
+  {
+    if ( AlreadyUsedAtomic ) {
+      // The nodal values were already accumulated with atomic updates,
+      // so only query them and add them to the reduction value.
+      for ( int k = 0 ; k < value_count ; ++k ) {
+        update.value[k] += m_node_value( inode , k );
+      }
+      return ;
+    }
+
+    // Sum the element quantities into a local total first.
+    value_type node_total ;
+    for ( int k = 0 ; k < value_count ; ++k ) { node_total.value[k] = 0 ; }
+
+    // This node's entries of m_node_elem span [begin,end).
+    const int begin = m_node_scan(inode);
+    const int end   = m_node_scan(inode+1);
+
+    for ( int entry = begin ; entry < end ; ++entry ) {
+      // Element #ielem holds this node as its local node #ielem_node.
+      const int ielem      = m_node_elem(entry,0);
+      const int ielem_node = m_node_elem(entry,1);
+      // Sum the vector-valued quantity.
+      for ( int k = 0 ; k < value_count ; ++k ) {
+        node_total.value[k] += m_elem_value( ielem , ielem_node , k );
+      }
+    }
+
+    // Assign the nodal quantity (no race condition) and
+    // add the same total to the global sum.
+    for ( int k = 0 ; k < value_count ; ++k ) {
+      m_node_value( inode , k ) = node_total.value[k] ;
+      update.value[k] += node_total.value[k] ;
+    }
+  }
+
+  KOKKOS_INLINE_FUNCTION // Zero every component of a reduction value.
+  void init( value_type & update ) const
+    { for ( int k = 0 ; k != value_count ; ++k ) update.value[k] = 0 ; }
+
+  // Combine two partial reductions by adding 'input' into 'update'.
+  KOKKOS_INLINE_FUNCTION
+  void join( volatile       value_type & update ,
+             volatile const value_type & input ) const
+  {
+    int k = 0 ;
+    while ( k != value_count ) { update.value[k] += input.value[k] ; ++k ; }
+  }
+};
+
+//----------------------------------------------------------------------------
+//----------------------------------------------------------------------------
+
+/** \brief  Build node->element connectivity on the host and copy it back:
+ *          node i owns entries [ node_scan(i) , node_scan(i+1) ) of node_elem,
+ *          each holding ( element index , local node index ).  */
+template< class ViewElemNode ,
+          class ViewNodeScan ,
+          class ViewNodeElem >
+void map_node_to_elem( const ViewElemNode & elem_node ,
+                       const ViewNodeScan & node_scan ,
+                       const ViewNodeElem & node_elem )
+{
+  typedef typename ViewElemNode::host_mirror_space host_mirror_space ;
+
+  const typename ViewElemNode::HostMirror host_elem_node =
+    Kokkos::create_mirror_view(elem_node);
+  const typename ViewNodeScan::HostMirror host_node_scan =
+    Kokkos::create_mirror_view(node_scan);
+  const typename ViewNodeElem::HostMirror host_node_elem =
+    Kokkos::create_mirror_view(node_elem);
+
+  const int elem_count      = host_elem_node.dimension_0();
+  const int elem_node_count = host_elem_node.dimension_1();
+  const int node_count      = host_node_scan.dimension_0() - 1 ;
+
+  const View<int*, host_mirror_space >
+    node_elem_count( "node_elem_count" , node_count );
+  Kokkos::deep_copy( host_elem_node , elem_node );
+  // Pass 1: count the elements attached to each node.
+  for ( int i = 0 ; i < elem_count ; ++i ) {
+    for ( int j = 0 ; j < elem_node_count ; ++j ) {
+      ++node_elem_count( host_elem_node(i,j) );
+    }
+  }
+
+  // Pass 2: exclusive prefix sum, assigned outright (no reliance on prior contents).
+  host_node_scan(0) = 0 ;
+  for ( int i = 0 ; i < node_count ; ++i ) {
+    host_node_scan(i+1) = host_node_scan(i) + node_elem_count(i);
+    node_elem_count(i) = 0 ; // reused below as the per-node fill cursor
+  }
+  // Pass 3: place each (element, local node) pair in its node's range.
+  for ( int i = 0 ; i < elem_count ; ++i ) {
+    for ( int j = 0 ; j < elem_node_count ; ++j ) {
+      const int inode  = host_elem_node(i,j);
+      const int offset = host_node_scan(inode) + node_elem_count(inode);
+      host_node_elem( offset , 0 ) = i ;
+      host_node_elem( offset , 1 ) = j ;
+      ++node_elem_count(inode);
+    }
+  }
+
+  Kokkos::deep_copy( node_scan , host_node_scan );
+  Kokkos::deep_copy( node_elem , host_node_elem );
+}
+
+} /* namespace Example */
+} /* namespace Kokkos */
+
+#endif /* #ifndef KOKKOS_EXAMPLE_FEINT_FUNCTORS_HPP */
+
diff --git a/lib/kokkos/example/feint/Makefile b/lib/kokkos/example/feint/Makefile
new file mode 100644
index 0000000000000000000000000000000000000000..f198a974c1e34d4014323eb34d03e7aa1f7445ba
--- /dev/null
+++ b/lib/kokkos/example/feint/Makefile
@@ -0,0 +1,61 @@
+KOKKOS_PATH = ../..
+
+vpath %.cpp ${KOKKOS_PATH}/example/fixture ${KOKKOS_PATH}/example/feint
+
+EXAMPLE_HEADERS = $(wildcard $(KOKKOS_PATH)/example/common/*.hpp ${KOKKOS_PATH}/example/fixture/*.hpp ${KOKKOS_PATH}/example/feint/*.hpp)
+
+default: build_all
+	echo "End Build"
+        
+include $(KOKKOS_PATH)/Makefile.kokkos
+
+ifeq ($(KOKKOS_INTERNAL_USE_CUDA), 1)
+	CXX = $(NVCC_WRAPPER)
+	CXXFLAGS ?= -O3
+	LINK = $(CXX)
+	LDFLAGS ?= -lpthread
+else
+	CXX ?= g++
+	CXXFLAGS ?= -O3
+	LINK ?= $(CXX)
+	LDFLAGS ?= -lpthread
+endif
+
+KOKKOS_CXXFLAGS +=	\
+	-I${KOKKOS_PATH}/example/common	\
+	-I${KOKKOS_PATH}/example/fixture	\
+	-I${KOKKOS_PATH}/example/feint
+
+EXE_EXAMPLE_FEINT = KokkosExample_Feint
+OBJ_EXAMPLE_FEINT = BoxElemPart.o main.o
+
+ifeq ($(KOKKOS_INTERNAL_USE_CUDA), 1)
+  OBJ_EXAMPLE_FEINT += feint_cuda.o
+endif
+
+ifeq ($(KOKKOS_INTERNAL_USE_PTHREADS), 1)
+  OBJ_EXAMPLE_FEINT += feint_threads.o
+endif
+
+ifeq ($(KOKKOS_INTERNAL_USE_OPENMP), 1)
+  OBJ_EXAMPLE_FEINT += feint_openmp.o
+endif
+
+TARGETS = $(EXE_EXAMPLE_FEINT)
+
+#TEST_TARGETS =
+
+$(EXE_EXAMPLE_FEINT) : $(OBJ_EXAMPLE_FEINT) $(KOKKOS_LINK_DEPENDS)
+	$(LINK) $(KOKKOS_LDFLAGS) $(LDFLAGS) $(EXTRA_PATH) $(OBJ_EXAMPLE_FEINT) $(KOKKOS_LIBS) $(LIB) -o $(EXE_EXAMPLE_FEINT)
+
+# These names are commands, not files; keep stray files from masking them.
+.PHONY : default build_all test clean
+build_all : $(TARGETS)
+test : build_all
+clean: kokkos-clean
+	rm -f *.o $(TARGETS)
+
+# Compilation rules
+
+%.o:%.cpp $(KOKKOS_CPP_DEPENDS) $(EXAMPLE_HEADERS)
+	$(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) $(EXTRA_INC) -c $<
diff --git a/lib/kokkos/example/feint/feint.hpp b/lib/kokkos/example/feint/feint.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..4b7196c4c713f1bd1c1f077818393edfa5ff506f
--- /dev/null
+++ b/lib/kokkos/example/feint/feint.hpp
@@ -0,0 +1,165 @@
+/*
+//@HEADER
+// ************************************************************************
+// 
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+// 
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+// 
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact  H. Carter Edwards (hcedwar@sandia.gov)
+// 
+// ************************************************************************
+//@HEADER
+*/
+
+#ifndef KOKKOS_EXAMPLE_FEINT_HPP
+#define KOKKOS_EXAMPLE_FEINT_HPP
+
+#include <iostream>
+#include <BoxElemFixture.hpp>
+#include <ElemFunctor.hpp>
+#include <feint_fwd.hpp>
+
+namespace Kokkos {
+namespace Example {
+
+/** \brief  Vector valued function to numerically integrate.
+ *
+ *  F(X) = { 1 , x , y , z , x*y , y*z , z*x , x*y*z }  with X = (x,y,z)
+ *
+ *  Integrates on a unit cube to:
+ *    { 1 , 1/2 , 1/2 , 1/2 , 1/4 , 1/4 , 1/4 , 1/8 }
+ */
+struct MyFunctionType {
+  // Number of components of F; FiniteElementIntegration reads it as FunctionType::value_count.
+  enum { value_count = 8 };
+
+  // Evaluate F at 'point' (reads point[0..2]) and write value[0..7].
+  template< typename CoordType , typename ValueType >
+  KOKKOS_INLINE_FUNCTION
+  void operator()( const CoordType point[] , ValueType value[] ) const
+    {
+      value[0] = 1 ;
+      value[1] = point[0] ;
+      value[2] = point[1] ;
+      value[3] = point[2] ;
+      value[4] = point[0] * point[1] ;
+      value[5] = point[1] * point[2] ;
+      value[6] = point[2] * point[0] ;
+      value[7] = point[0] * point[1] * point[2] ;
+    }
+};
+
+template < class Device , bool UseAtomic > // UseAtomic: scatter-add to nodes atomically
+void feint(
+  const unsigned global_elem_nx ,
+  const unsigned global_elem_ny ,
+  const unsigned global_elem_nz )
+{
+  //----------------------------------------
+  // Create the unstructured finite element mesh box fixture on the device:
+
+  typedef Kokkos::Example::
+    BoxElemFixture< Device , Kokkos::Example::BoxElemPart::ElemLinear >
+    // BoxElemFixture< Device , Kokkos::Example::BoxElemPart::ElemQuadratic >
+      BoxFixtureType ;
+
+  // MPI distributed parallel domain decomposition of the fixture.
+  // Either by element (DecomposeElem) or by node (DecomposeNode)
+  // with ghosted elements.
+
+  static const Kokkos::Example::BoxElemPart::Decompose
+    decompose = Kokkos::Example::BoxElemPart:: DecomposeElem ;
+    // decompose = Kokkos::Example::BoxElemPart:: DecomposeNode ;
+
+  // Not using MPI in this example: a single partition, rank 0 of 1.
+  const unsigned mpi_rank = 0 ;
+  const unsigned mpi_size = 1 ;
+
+  const BoxFixtureType fixture( decompose , mpi_size , mpi_rank ,
+                                global_elem_nx ,
+                                global_elem_ny ,
+                                global_elem_nz );
+
+  //----------------------------------------
+  // Create and execute the numerical integration functor on the device:
+
+  typedef Kokkos::Example::
+    FiniteElementIntegration< BoxFixtureType , MyFunctionType , UseAtomic >
+      FeintType ;
+
+  const FeintType feint( fixture , MyFunctionType() );
+
+  typename FeintType::value_type elem_integral ;
+
+  // A reduction for the global integral:
+  Kokkos::parallel_reduce( fixture.elem_count() , feint , elem_integral );
+
+  // Stop before the nodal projection if any element was flagged.
+  if ( elem_integral.error ) {
+    std::cout << "An element had a spatial jacobian error" << std::endl ;
+    return ;
+  }
+
+  std::cout << "Elem integral =" ;
+  for ( int i = 0 ; i < MyFunctionType::value_count ; ++i ) {
+    std::cout << " " << elem_integral.value[i] ;
+  }
+  std::cout << std::endl ;
+ 
+  //----------------------------------------
+  // Create and execute the nodal lumped value projection and reduction functor:
+
+  typedef Kokkos::Example::
+    LumpElemToNode< typename FeintType::NodeValueType ,
+                    typename FeintType::ElemValueType ,
+                    UseAtomic > LumpType ;
+
+  const LumpType lump( feint.m_node_lumped ,
+                       feint.m_elem_integral ,
+                       fixture.elem_node() );
+
+  typename LumpType ::value_type node_sum ;
+
+  // A reduction over the nodes for the sum of the lumped values:
+  Kokkos::parallel_reduce( fixture.node_count() , lump , node_sum );
+
+  std::cout << "Node lumped sum =" ;
+  for ( int i = 0 ; i < MyFunctionType::value_count ; ++i ) {
+    std::cout << " " << node_sum.value[i] ;
+  }
+  std::cout << std::endl ;
+}
+
+} /* namespace Example */
+} /* namespace Kokkos */
+
+#endif /* #ifndef KOKKOS_EXAMPLE_FEINT_HPP */
+
diff --git a/lib/kokkos/example/feint/feint_cuda.cpp b/lib/kokkos/example/feint/feint_cuda.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..1370822febbc4e2099c68ed691ddfb425d47772a
--- /dev/null
+++ b/lib/kokkos/example/feint/feint_cuda.cpp
@@ -0,0 +1,67 @@
+/*
+//@HEADER
+// ************************************************************************
+// 
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+// 
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+// 
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact  H. Carter Edwards (hcedwar@sandia.gov)
+// 
+// ************************************************************************
+//@HEADER
+*/
+
+#include <Kokkos_Core.hpp>
+
+#if defined( KOKKOS_HAVE_CUDA )
+
+#include <feint.hpp>
+
+namespace Kokkos {
+namespace Example {
+
+// Explicit instantiations of the feint< Device , UseAtomic > driver for the
+// Kokkos::Cuda device.  The template definition comes from the feint.hpp
+// header included above.
+
+// UseAtomic == false
+template void feint< Kokkos::Cuda , false >( const unsigned , const unsigned , const unsigned );
+
+// UseAtomic == true
+template void feint< Kokkos::Cuda , true >( const unsigned , const unsigned , const unsigned );
+
+} // namespace Example
+} // namespace Kokkos
+
+#endif
+
diff --git a/lib/kokkos/example/feint/feint_fwd.hpp b/lib/kokkos/example/feint/feint_fwd.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..f02e547cfccbb6b89c7bca192a87bcad95a266f0
--- /dev/null
+++ b/lib/kokkos/example/feint/feint_fwd.hpp
@@ -0,0 +1,60 @@
+/*
+//@HEADER
+// ************************************************************************
+// 
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+// 
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+// 
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact  H. Carter Edwards (hcedwar@sandia.gov)
+// 
+// ************************************************************************
+//@HEADER
+*/
+
+#ifndef KOKKOS_EXAMPLE_FEINT_FWD_HPP
+#define KOKKOS_EXAMPLE_FEINT_FWD_HPP
+
+namespace Kokkos {
+namespace Example {
+// Declaration of the feint example driver, parameterized on the Kokkos device
+// type and on UseAtomic.  Omitted element counts default to 100 x 115 x 130.
+template< class Device , bool UseAtomic >
+void feint( const unsigned global_elem_nx = 100 ,
+            const unsigned global_elem_ny = 115 ,
+            const unsigned global_elem_nz = 130 );
+
+} // namespace Example
+} // namespace Kokkos
+
+#endif /* #ifndef KOKKOS_EXAMPLE_FEINT_FWD_HPP */
+
diff --git a/lib/kokkos/example/feint/feint_openmp.cpp b/lib/kokkos/example/feint/feint_openmp.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..cf2fdca5ba4b7d0e2a60d74e433fc9b849e79108
--- /dev/null
+++ b/lib/kokkos/example/feint/feint_openmp.cpp
@@ -0,0 +1,67 @@
+/*
+//@HEADER
+// ************************************************************************
+// 
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+// 
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+// 
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact  H. Carter Edwards (hcedwar@sandia.gov)
+// 
+// ************************************************************************
+//@HEADER
+*/
+
+#include <Kokkos_Core.hpp>
+
+#ifdef KOKKOS_HAVE_OPENMP
+
+#include <feint.hpp>
+
+namespace Kokkos {
+namespace Example {
+
+// Explicit instantiations of the feint< Device , UseAtomic > driver for the
+// Kokkos::OpenMP device.  The template definition comes from the feint.hpp
+// header included above.
+
+// UseAtomic == false
+template void feint< Kokkos::OpenMP , false >( const unsigned , const unsigned , const unsigned );
+
+// UseAtomic == true
+template void feint< Kokkos::OpenMP , true >( const unsigned , const unsigned , const unsigned );
+
+} // namespace Example
+} // namespace Kokkos
+
+#endif
+
diff --git a/lib/kokkos/example/feint/feint_threads.cpp b/lib/kokkos/example/feint/feint_threads.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..5dcf5654f1e7b5cdc24e8110a866eabfdbcb3350
--- /dev/null
+++ b/lib/kokkos/example/feint/feint_threads.cpp
@@ -0,0 +1,66 @@
+/*
+//@HEADER
+// ************************************************************************
+// 
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+// 
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+// 
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact  H. Carter Edwards (hcedwar@sandia.gov)
+// 
+// ************************************************************************
+//@HEADER
+*/
+
+#include <Kokkos_Core.hpp>
+
+#if defined( KOKKOS_HAVE_PTHREAD )
+
+#include <feint.hpp>
+
+namespace Kokkos {
+namespace Example {
+
+// Explicit instantiations of the feint< Device , UseAtomic > driver for the
+// Kokkos::Threads device.  The template definition comes from the feint.hpp
+// header included above.
+
+// UseAtomic == false
+template void feint< Kokkos::Threads , false >( const unsigned , const unsigned , const unsigned );
+
+// UseAtomic == true
+template void feint< Kokkos::Threads , true >( const unsigned , const unsigned , const unsigned );
+
+} // namespace Example
+} // namespace Kokkos
+
+#endif /* #if defined( KOKKOS_HAVE_PTHREAD ) */
diff --git a/lib/kokkos/example/feint/main.cpp b/lib/kokkos/example/feint/main.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..c45d483e76da818a8f5a3cb6dd7cb76522504467
--- /dev/null
+++ b/lib/kokkos/example/feint/main.cpp
@@ -0,0 +1,110 @@
+//@HEADER
+// ************************************************************************
+// 
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+// 
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+// 
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact  H. Carter Edwards (hcedwar@sandia.gov)
+// 
+// ************************************************************************
+//@HEADER
+
+#include <algorithm>
+#include <iostream>
+#include <utility>
+
+#include <Kokkos_Core.hpp>
+#include <feint_fwd.hpp>
+
+int main()
+{
+  // Run the feint example on every backend Kokkos was built with,
+  // first with UseAtomic = false and then with UseAtomic = true.
+#if defined( KOKKOS_HAVE_PTHREAD )
+  {
+    // Use 4 cores per NUMA region, unless fewer available
+    const unsigned use_numa_count     = Kokkos::hwloc::get_available_numa_count();
+    const unsigned use_cores_per_numa = std::min( 4u , Kokkos::hwloc::get_available_cores_per_numa() );
+
+    Kokkos::Threads::initialize( use_numa_count * use_cores_per_numa );
+
+    std::cout << "feint< Threads , NotUsingAtomic >" << std::endl ;
+    Kokkos::Example::feint< Kokkos::Threads , false >();
+
+    std::cout << "feint< Threads , UsingAtomic >" << std::endl ;
+    Kokkos::Example::feint< Kokkos::Threads , true  >();
+
+    Kokkos::Threads::finalize();
+  }
+#endif
+
+#if defined( KOKKOS_HAVE_OPENMP )
+  {
+    // Use 4 cores per NUMA region, unless fewer available
+    const unsigned use_numa_count     = Kokkos::hwloc::get_available_numa_count();
+    const unsigned use_cores_per_numa = std::min( 4u , Kokkos::hwloc::get_available_cores_per_numa() );
+
+    Kokkos::OpenMP::initialize( use_numa_count * use_cores_per_numa );
+
+    std::cout << "feint< OpenMP , NotUsingAtomic >" << std::endl ;
+    Kokkos::Example::feint< Kokkos::OpenMP , false >();
+
+    std::cout << "feint< OpenMP , UsingAtomic >" << std::endl ;
+    Kokkos::Example::feint< Kokkos::OpenMP , true  >();
+
+    Kokkos::OpenMP::finalize();
+  }
+#endif
+
+#if defined( KOKKOS_HAVE_CUDA )
+  {
+    // Initialize Host mirror device
+    Kokkos::HostSpace::execution_space::initialize(1);
+    const unsigned device_count = Kokkos::Cuda::detect_device_count();
+
+    // Use the last device:
+    Kokkos::Cuda::initialize( Kokkos::Cuda::SelectDevice(device_count-1) );
+
+    std::cout << "feint< Cuda , NotUsingAtomic >" << std::endl ;
+    Kokkos::Example::feint< Kokkos::Cuda , false >();
+
+    std::cout << "feint< Cuda , UsingAtomic >" << std::endl ;
+    Kokkos::Example::feint< Kokkos::Cuda , true  >();
+
+    // Finalize in the reverse order of initialization.
+    Kokkos::Cuda::finalize();
+    Kokkos::HostSpace::execution_space::finalize();
+  }
+#endif
+}
+
diff --git a/lib/kokkos/example/fenl/CGSolve.hpp b/lib/kokkos/example/fenl/CGSolve.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..06a0030e09cadb0f9ab82080d8ab244563ae54b0
--- /dev/null
+++ b/lib/kokkos/example/fenl/CGSolve.hpp
@@ -0,0 +1,296 @@
+/*
+//@HEADER
+// ************************************************************************
+// 
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+// 
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+// 
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact  H. Carter Edwards (hcedwar@sandia.gov)
+// 
+// ************************************************************************
+//@HEADER
+*/
+
+#ifndef KOKKOS_EXAMPLE_CG_SOLVE
+#define KOKKOS_EXAMPLE_CG_SOLVE
+
+#include <cmath>
+#include <limits>
+#include <Kokkos_Core.hpp>
+#include <impl/Kokkos_Timer.hpp>
+
+#include <WrapMPI.hpp>
+
+//----------------------------------------------------------------------------
+//----------------------------------------------------------------------------
+
+namespace Kokkos {
+namespace Example {
+
+// Compressed-row sparse matrix: a StaticCrsGraph plus a view of coefficients.
+template< typename ValueType , class Space >
+struct CrsMatrix {
+  typedef Kokkos::StaticCrsGraph< unsigned , Space , void , unsigned >  StaticCrsGraphType ;
+  typedef View< ValueType * , Space >                                  coeff_type ;
+
+  StaticCrsGraphType  graph ;  // sparsity pattern ( row_map / entries )
+  coeff_type          coeff ;  // coefficient values, indexed like graph.entries
+
+  CrsMatrix() : graph() , coeff() {}
+
+  // Keep the given graph and allocate "crs_matrix_coeff" with one slot per graph entry.
+  CrsMatrix( const StaticCrsGraphType & sparsity )
+    : graph( sparsity ) , coeff( "crs_matrix_coeff" , sparsity.entries.dimension_0() ) {}
+};
+
+// Functor computing one row of the sparse matrix-vector product y = A * x per call.
+template< typename MScalar , typename VScalar , class Space >
+struct Multiply {
+
+  const Example::CrsMatrix< MScalar , Space >    m_A ;
+  const Kokkos::View< const VScalar * , Space > m_x ;
+  const Kokkos::View<       VScalar * , Space > m_y ;
+
+  // Store the output vector, the matrix and the input vector.
+  Multiply( const View<       VScalar * , Space > & y
+          , const CrsMatrix< MScalar , Space >    & A
+          , const View< const VScalar * , Space > & x
+          )
+    : m_A( A ) , m_x( x ) , m_y( y )
+    {}
+
+  // m_y(iRow) = sum over the entries of row iRow of coeff(entry) * m_x( entries(entry) ).
+  KOKKOS_INLINE_FUNCTION
+  void operator()( const int iRow ) const
+    {
+      // The entries of row iRow occupy [ row_map[iRow] , row_map[iRow+1] ).
+      const int rowEnd = m_A.graph.row_map[iRow+1];
+      double rowSum = 0 ;  // accumulated in double whatever VScalar is
+
+      for ( int k = m_A.graph.row_map[iRow] ; k < rowEnd ; ++k ) {
+        rowSum += m_A.coeff(k) * m_x( m_A.graph.entries(k) );
+      }
+
+      m_y(iRow) = rowSum ;
+    }
+};
+
+// Launch y = A * x over rows [0,nrow) with a RangePolicy on Space, using the Multiply functor.
+template< typename MScalar , typename VScalar , class Space >
+inline
+void multiply( const int nrow
+             , const Kokkos::View< VScalar * , Space >     & y
+             , const Example::CrsMatrix< MScalar , Space > & A
+             , const Kokkos::View< VScalar * , Space >     & x
+             )
+{
+  const Multiply< MScalar , VScalar , Space > row_product( y , A , x );
+  Kokkos::parallel_for( Kokkos::RangePolicy< Space >( 0 , nrow ) , row_product );
+}
+
+// Functor for the element-wise update w(i) = alpha * x(i) + beta * y(i).
+template< typename ValueType , class Space >
+struct WAXPBY {
+  const Kokkos::View< const ValueType * , Space >  m_x ;      // first input vector
+  const Kokkos::View< const ValueType * , Space >  m_y ;      // second input vector
+  const Kokkos::View<       ValueType * , Space >  m_w ;      // output vector
+  const double                                     m_alpha ;  // scale applied to x
+  const double                                     m_beta ;   // scale applied to y
+
+  // Arguments arrive in call order ( w , alpha , x , beta , y ); the
+  // initializers below follow the member declaration order instead.
+  WAXPBY( const View< ValueType * , Space > & arg_w
+        , const double                        arg_alpha
+        , const View< ValueType * , Space > & arg_x
+        , const double                        arg_beta
+        , const View< ValueType * , Space > & arg_y
+        )
+    : m_x( arg_x ) , m_y( arg_y ) , m_w( arg_w )
+    , m_alpha( arg_alpha ) , m_beta( arg_beta )
+    {}
+
+  KOKKOS_INLINE_FUNCTION
+  void operator()( const int i ) const
+    { m_w(i) = m_alpha * m_x(i) + m_beta * m_y(i); }
+};
+
+// Launch w = alpha * x + beta * y over indices [0,n) with a RangePolicy on Space.
+template< typename VScalar , class Space >
+void waxpby( const int n
+           , const Kokkos::View< VScalar * , Space > & arg_w , const double arg_alpha
+           , const Kokkos::View< VScalar * , Space > & arg_x , const double arg_beta
+           , const Kokkos::View< VScalar * , Space > & arg_y
+           )
+{
+  const WAXPBY< VScalar , Space > scaled_sum( arg_w , arg_alpha , arg_x , arg_beta , arg_y );
+  Kokkos::parallel_for( Kokkos::RangePolicy< Space >( 0 , n ) , scaled_sum );
+}
+
+// Reduction functor that accumulates the products x(i) * y(i) into a double.
+template< typename VScalar , class Space >
+struct Dot {
+  typedef double value_type ;  // type of the accumulated value
+
+  const Kokkos::View< const VScalar * , Space >  m_x ;
+  const Kokkos::View< const VScalar * , Space >  m_y ;
+
+  Dot( const Kokkos::View< VScalar * , Space > & arg_x
+     , const Kokkos::View< VScalar * , Space > & arg_y )
+    : m_x( arg_x ) , m_y( arg_y ) {}
+
+  // Add the i-th product to the running partial sum.
+  KOKKOS_INLINE_FUNCTION
+  void operator()( const int i , value_type & update ) const { update += m_x(i) * m_y(i); }
+};
+
+// Return the sum of x(i) * y(i) for i in [0,n), computed by parallel_reduce on Space.
+template< typename VScalar , class Space >
+double dot( const int n
+          , const Kokkos::View< VScalar * , Space > & arg_x
+          , const Kokkos::View< VScalar * , Space > & arg_y )
+{
+  double local_sum = 0 ;
+  Kokkos::parallel_reduce( Kokkos::RangePolicy< Space >( 0 , n ) , Dot< VScalar , Space >( arg_x , arg_y ) , local_sum );
+  return local_sum ;
+}
+
+} // namespace Example
+} // namespace Kokkos
+
+//----------------------------------------------------------------------------
+//----------------------------------------------------------------------------
+
+namespace Kokkos {
+namespace Example {
+
+struct CGSolveResult {   // statistics written by cgsolve() when a result pointer is supplied
+  size_t  iteration ;    // number of loop iterations executed
+  double  iter_time ;    // wall-clock seconds measured around the iteration loop
+  double  matvec_time ;  // seconds accumulated around import + multiply inside the loop
+  double  norm_res ;     // last residual norm, sqrt of the reduced dot( r , r )
+};
+// Conjugate-gradient iteration for A * x = b ; x supplies the initial guess and is updated in place.
+template< class ImportType
+        , typename MScalar
+        , typename VScalar
+        , class Space
+        >
+inline
+void cgsolve( const ImportType & import   // invoked as import( pAll ); supplies count_owned, count_receive, comm
+            , const CrsMatrix< MScalar , Space >      & A
+            , const Kokkos::View< VScalar * , Space > & b
+            , const Kokkos::View< VScalar * , Space > & x
+            , const size_t  maximum_iteration = 200
+            , const double  tolerance = std::numeric_limits<double>::epsilon()
+            , CGSolveResult * result = 0   // optional ; filled on return when non-null
+            )
+{
+  typedef View< VScalar * , Space >  VectorType ;
+  // pAll is sized for the owned entries plus count_receive additional entries.
+  const size_t count_owned = import.count_owned ;
+  const size_t count_total = import.count_owned + import.count_receive;
+
+  size_t  iteration = 0 ;
+  double  iter_time = 0 ;
+  double  matvec_time = 0 ;
+  double  norm_res = 0 ;
+
+  // Need input vector to matvec to be owned + received
+  VectorType pAll ( "cg::p" , count_total );
+  // p is the subview of pAll covering the owned entries [0,count_owned).
+  VectorType p = Kokkos::subview( pAll , std::pair<size_t,size_t>(0,count_owned) );
+  VectorType r ( "cg::r" , count_owned );
+  VectorType Ap( "cg::Ap", count_owned );
+
+  /* r = b - A * x ; */
+
+  /* p  = x       */  Kokkos::deep_copy( p , x );
+  /* import p     */  import( pAll );
+  /* Ap = A * p   */  multiply( count_owned , Ap , A , pAll );
+  /* r = b - Ap   */  waxpby( count_owned , r , 1.0 , b , -1.0 , Ap );
+  /* p  = r       */  Kokkos::deep_copy( p , r );
+
+  double old_rdot = Kokkos::Example::all_reduce( dot( count_owned , r , r ) , import.comm );
+
+  norm_res  = sqrt( old_rdot );
+  iteration = 0 ;
+  // wall_clock spans the whole loop ; timer is reset every pass to time import + multiply.
+  Kokkos::Timer wall_clock ;
+  Kokkos::Timer timer;
+
+  while ( tolerance < norm_res && iteration < maximum_iteration ) {
+
+    /* pAp_dot = dot( p , Ap = A * p ) */
+
+    timer.reset();
+    /* import p    */  import( pAll );
+    /* Ap = A * p  */  multiply( count_owned , Ap , A , pAll );
+    Space::fence();
+    matvec_time += timer.seconds();
+
+    const double pAp_dot = Kokkos::Example::all_reduce( dot( count_owned , p , Ap ) , import.comm );
+    const double alpha   = old_rdot / pAp_dot ;
+
+    /* x +=  alpha * p ;  */ waxpby( count_owned , x ,  alpha, p  , 1.0 , x );
+    /* r += -alpha * Ap ; */ waxpby( count_owned , r , -alpha, Ap , 1.0 , r );
+
+    const double r_dot = Kokkos::Example::all_reduce( dot( count_owned , r , r ) , import.comm );
+    const double beta  = r_dot / old_rdot ;
+
+    /* p = r + beta * p ; */ waxpby( count_owned , p , 1.0 , r , beta , p );
+
+    norm_res = sqrt( old_rdot = r_dot );   // also carries r_dot into the next pass as old_rdot
+
+    ++iteration ;
+  }
+
+  Space::fence();
+  iter_time = wall_clock.seconds();
+
+  if ( 0 != result ) {
+    result->iteration   = iteration ;
+    result->iter_time   = iter_time ;
+    result->matvec_time = matvec_time ;
+    result->norm_res    = norm_res ;
+  }
+}
+
+} // namespace Example
+} // namespace Kokkos
+
+//----------------------------------------------------------------------------
+//----------------------------------------------------------------------------
+
+#endif /* #ifndef KOKKOS_EXAMPLE_CG_SOLVE */
+
+
diff --git a/lib/kokkos/example/fenl/CMakeLists.txt b/lib/kokkos/example/fenl/CMakeLists.txt
new file mode 100644
index 0000000000000000000000000000000000000000..150656b16e13d4977c6ea975b87a785103cc7d48
--- /dev/null
+++ b/lib/kokkos/example/fenl/CMakeLists.txt
@@ -0,0 +1,17 @@
+
+# Header search paths: the build directory, this example, and the shared example code.
+INCLUDE_DIRECTORIES(${CMAKE_CURRENT_BINARY_DIR})
+INCLUDE_DIRECTORIES(${CMAKE_CURRENT_SOURCE_DIR})
+INCLUDE_DIRECTORIES(${CMAKE_CURRENT_SOURCE_DIR}/../common)
+INCLUDE_DIRECTORIES(${CMAKE_CURRENT_SOURCE_DIR}/../fixture)
+
+# Sources are listed explicitly rather than globbed so that adding or removing
+# a file is visible to CMake; the list matches OBJ_EXAMPLE_FENL in the Makefile.
+SET(SOURCES fenl.cpp main.cpp ../fixture/BoxElemPart.cpp)
+
+# Build the fenl example executable for both serial and MPI configurations.
+TRIBITS_ADD_EXECUTABLE(
+  fenl
+  SOURCES ${SOURCES}
+  COMM serial mpi
+  )
diff --git a/lib/kokkos/example/fenl/Makefile b/lib/kokkos/example/fenl/Makefile
new file mode 100644
index 0000000000000000000000000000000000000000..5d8e6fd3034ec7c20044552a5688fc6751e374fb
--- /dev/null
+++ b/lib/kokkos/example/fenl/Makefile
@@ -0,0 +1,54 @@
+KOKKOS_PATH ?= ../..
+# Resolve this Makefile's own directory; sources and headers are located relative to it.
+MAKEFILE_PATH := $(abspath $(lastword $(MAKEFILE_LIST)))
+SRC_DIR := $(dir $(MAKEFILE_PATH))
+
+vpath %.cpp ${SRC_DIR}/../fixture ${SRC_DIR}
+EXAMPLE_HEADERS = $(wildcard $(SRC_DIR)/../common/*.hpp ${SRC_DIR}/../fixture/*.hpp ${SRC_DIR}/*.hpp)
+
+.PHONY: default build_all test clean
+default: build_all
+	echo "End Build"
+
+include $(KOKKOS_PATH)/Makefile.kokkos
+
+# KOKKOS_INTERNAL_USE_CUDA is not exported to installed Makefile.kokkos
+# use KOKKOS_DEVICES here
+ifneq (,$(findstring Cuda,$(KOKKOS_DEVICES)))
+	CXX = $(NVCC_WRAPPER)
+	CXXFLAGS ?= -O3
+	LINK = $(CXX)
+	LDFLAGS ?= -lpthread
+else
+	CXX ?= g++
+	CXXFLAGS ?= -O3
+	LINK ?= $(CXX)
+	LDFLAGS ?= -lpthread
+endif
+
+KOKKOS_CXXFLAGS +=	\
+	-I${SRC_DIR}/../common	\
+	-I${SRC_DIR}/../fixture	\
+	-I${SRC_DIR}
+
+EXE_EXAMPLE_FENL = KokkosExample_Fenl
+OBJ_EXAMPLE_FENL = BoxElemPart.o main.o fenl.o
+
+TARGETS = $(EXE_EXAMPLE_FENL)
+
+#TEST_TARGETS =
+
+$(EXE_EXAMPLE_FENL) : $(OBJ_EXAMPLE_FENL) $(KOKKOS_LINK_DEPENDS)
+	$(LINK) $(KOKKOS_LDFLAGS) $(LDFLAGS) $(EXTRA_PATH) $(OBJ_EXAMPLE_FENL) $(KOKKOS_LIBS) $(LIB) -o $(EXE_EXAMPLE_FENL)
+
+build_all : $(TARGETS)
+
+test : build_all
+
+clean: kokkos-clean
+	rm -f *.o $(TARGETS)
+
+# Compilation rules
+
+%.o:%.cpp $(KOKKOS_CPP_DEPENDS) $(EXAMPLE_HEADERS)
+	$(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) $(EXTRA_INC) -c $<
diff --git a/lib/kokkos/example/fenl/fenl.cpp b/lib/kokkos/example/fenl/fenl.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..a5cba608f4454d38ebea3904caba3b8f6b6078dc
--- /dev/null
+++ b/lib/kokkos/example/fenl/fenl.cpp
@@ -0,0 +1,117 @@
+/*
+// ************************************************************************
+//
+//   Kokkos: Manycore Performance-Portable Multidimensional Arrays
+//              Copyright (2012) Sandia Corporation
+//
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact  H. Carter Edwards (hcedwar@sandia.gov)
+//
+// ************************************************************************
+*/
+
+#include <HexElement.hpp>
+#include <fenl_impl.hpp>
+
+namespace Kokkos {
+namespace Example {
+namespace FENL {
+// Explicit instantiations of the fenl template: one per enabled backend for ElemLinear and ElemQuadratic.
+#if defined( KOKKOS_HAVE_PTHREAD )
+// Kokkos::Threads , BoxElemPart::ElemLinear
+template
+Perf fenl< Kokkos::Threads , Kokkos::Example::BoxElemPart::ElemLinear >(
+  MPI_Comm comm ,
+  const int use_print ,
+  const int use_trials ,
+  const int use_atomic ,
+  const int global_elems[] );
+
+// Kokkos::Threads , BoxElemPart::ElemQuadratic
+template
+Perf fenl< Kokkos::Threads , Kokkos::Example::BoxElemPart::ElemQuadratic >(
+  MPI_Comm comm ,
+  const int use_print ,
+  const int use_trials ,
+  const int use_atomic ,
+  const int global_elems[] );
+
+#endif /* #if defined( KOKKOS_HAVE_PTHREAD ) */
+
+
+#if defined (KOKKOS_HAVE_OPENMP)
+// Kokkos::OpenMP , BoxElemPart::ElemLinear
+template
+Perf fenl< Kokkos::OpenMP , Kokkos::Example::BoxElemPart::ElemLinear >(
+  MPI_Comm comm ,
+  const int use_print ,
+  const int use_trials ,
+  const int use_atomic ,
+  const int global_elems[] );
+
+// Kokkos::OpenMP , BoxElemPart::ElemQuadratic
+template
+Perf fenl< Kokkos::OpenMP , Kokkos::Example::BoxElemPart::ElemQuadratic >(
+  MPI_Comm comm ,
+  const int use_print ,
+  const int use_trials ,
+  const int use_atomic ,
+  const int global_elems[] );
+
+#endif /* #if defined (KOKKOS_HAVE_OPENMP) */
+
+#if defined( KOKKOS_HAVE_CUDA )
+// Kokkos::Cuda , BoxElemPart::ElemLinear
+template
+Perf fenl< Kokkos::Cuda , Kokkos::Example::BoxElemPart::ElemLinear >(
+  MPI_Comm comm ,
+  const int use_print ,
+  const int use_trials ,
+  const int use_atomic ,
+  const int global_elems[] );
+
+// Kokkos::Cuda , BoxElemPart::ElemQuadratic
+template
+Perf fenl< Kokkos::Cuda , Kokkos::Example::BoxElemPart::ElemQuadratic >(
+  MPI_Comm comm ,
+  const int use_print ,
+  const int use_trials ,
+  const int use_atomic ,
+  const int global_elems[] );
+
+#endif /* #if defined( KOKKOS_HAVE_CUDA ) */
+
+
+} /* namespace FENL */
+} /* namespace Example */
+} /* namespace Kokkos */
+
diff --git a/lib/kokkos/example/fenl/fenl.hpp b/lib/kokkos/example/fenl/fenl.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..e524a378c0c2eb75223c933c51267ff55852d08b
--- /dev/null
+++ b/lib/kokkos/example/fenl/fenl.hpp
@@ -0,0 +1,89 @@
+/*
+//@HEADER
+// ************************************************************************
+// 
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+// 
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+// 
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact  H. Carter Edwards (hcedwar@sandia.gov)
+// 
+// ************************************************************************
+//@HEADER
+*/
+
+#ifndef KOKKOS_EXAMPLE_FENL_HPP
+#define KOKKOS_EXAMPLE_FENL_HPP
+
+#include <stdlib.h>
+#include <BoxElemPart.hpp>
+#include <WrapMPI.hpp>
+
+namespace Kokkos {
+namespace Example {
+namespace FENL {
+// Result record returned by fenl(): four size_t counters followed by double-valued measurements.
+struct Perf {
+  size_t global_elem_count ;
+  size_t global_node_count ;
+  size_t newton_iter_count ;
+  size_t cg_iter_count ;
+  double map_ratio ;
+  double fill_node_set ;
+  double scan_node_count ;
+  double fill_graph_entries ;
+  double sort_graph_entries ;
+  double fill_element_graph ;
+  double create_sparse_matrix ;
+  double fill_time ;
+  double bc_time ;
+  double matvec_time ;
+  double cg_time ;
+  double newton_residual ;
+  double error_max ;
+  // NOTE(review): units of the timing fields are not stated here (presumably seconds) - confirm at the point where they are filled.
+};
+// Declaration only; this header does not define fenl(). NOTE(review): argument meanings are not documented here - confirm against the definition.
+template < class Device , BoxElemPart::ElemOrder ElemOrder >
+Perf fenl(
+  MPI_Comm comm ,
+  const int use_print ,
+  const int use_trials ,
+  const int use_atomic ,
+  const int global_elems[] );
+
+} /* namespace FENL */
+} /* namespace Example */
+} /* namespace Kokkos */
+
+#endif /* #ifndef KOKKOS_EXAMPLE_FENL_HPP */
+
diff --git a/lib/kokkos/example/fenl/fenl_functors.hpp b/lib/kokkos/example/fenl/fenl_functors.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..3020c99a2f58637c64377a9ae933d0e3549d3c12
--- /dev/null
+++ b/lib/kokkos/example/fenl/fenl_functors.hpp
@@ -0,0 +1,1173 @@
+/*
+//@HEADER
+// ************************************************************************
+// 
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+// 
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+// 
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact  H. Carter Edwards (hcedwar@sandia.gov)
+// 
+// ************************************************************************
+//@HEADER
+*/
+
+#ifndef KOKKOS_EXAMPLE_FENLFUNCTORS_HPP
+#define KOKKOS_EXAMPLE_FENLFUNCTORS_HPP
+
+#include <stdio.h>
+
+#include <iostream>
+#include <fstream>
+#include <iomanip>
+#include <cstdlib>
+#include <cmath>
+#include <limits>
+
+#include <Kokkos_Pair.hpp>
+#include <Kokkos_UnorderedMap.hpp>
+
+#include <impl/Kokkos_Timer.hpp>
+
+#include <BoxElemFixture.hpp>
+#include <HexElement.hpp>
+#include <CGSolve.hpp>
+
+//----------------------------------------------------------------------------
+//----------------------------------------------------------------------------
+
+namespace Kokkos {
+namespace Example {
+namespace FENL {
+
+template< class ElemNodeIdView , class CrsGraphType , unsigned ElemNode >
+class NodeNodeGraph {
+public:
+  // Builds a node-node CRS graph and an element-to-graph-entry map; all phases run in the constructor.
+  typedef typename ElemNodeIdView::execution_space  execution_space ;
+  typedef pair<unsigned,unsigned> key_type ;
+  // Key-only unordered map (mapped type void) holding the set of (node,node) pairs.
+  typedef Kokkos::UnorderedMap< key_type, void , execution_space >  SetType ;
+  typedef typename CrsGraphType::row_map_type::non_const_type       RowMapType ;
+  typedef Kokkos::View< unsigned ,  execution_space >               UnsignedValue ;
+  // Per element: one graph-entry offset for each (row local node , column local node) pair.
+  // Static dimensions of 0 generate compiler warnings or errors.
+  typedef Kokkos::View< unsigned*[ElemNode][ElemNode] , execution_space >
+    ElemGraphType ;
+  // Phase tags; within this class only TagFillNodeSet is used to select an overload.
+  struct TagFillNodeSet {};
+  struct TagScanNodeCount {};
+  struct TagFillGraphEntries {};
+  struct TagSortGraphEntries {};
+  struct TagFillElementGraph {};
+
+private:
+  // Phase selector read by the untagged operator()( iwork ).
+  enum PhaseType { FILL_NODE_SET ,
+                   SCAN_NODE_COUNT ,
+                   FILL_GRAPH_ENTRIES ,
+                   SORT_GRAPH_ENTRIES ,
+                   FILL_ELEMENT_GRAPH };
+  // Working state; row_total, row_count, row_map and node_node_set are reset by the constructor after FILL_GRAPH_ENTRIES.
+  const unsigned        node_count ;
+  const ElemNodeIdView  elem_node_id ;
+  UnsignedValue         row_total ;
+  RowMapType            row_count ;
+  RowMapType            row_map ;
+  SetType               node_node_set ;
+  PhaseType             phase ;
+
+public:
+  // Results: the CRS graph and, per element, the offsets of its node pairs in graph.entries.
+  CrsGraphType          graph ;
+  ElemGraphType         elem_graph ;
+  // Wall-clock seconds measured for each phase, plus 'ratio' = set size / set capacity.
+  struct Times
+  {
+    double ratio;
+    double fill_node_set;
+    double scan_node_count;
+    double fill_graph_entries;
+    double sort_graph_entries;
+    double fill_element_graph;
+  };
+  // Runs all five phases in order and writes the per-phase measurements into 'results'.
+  NodeNodeGraph( const ElemNodeIdView & arg_elem_node_id ,
+                 const unsigned         arg_node_count,
+                 Times & results
+               )
+    : node_count(arg_node_count)
+    , elem_node_id( arg_elem_node_id )
+    , row_total( "row_total" )
+    , row_count(Kokkos::ViewAllocateWithoutInitializing("row_count") , node_count ) // will deep_copy to 0 inside loop
+    , row_map( "graph_row_map" , node_count + 1 )
+    , node_node_set()
+    , phase( FILL_NODE_SET )
+    , graph()
+    , elem_graph()
+   {
+      //--------------------------------
+      // Guess at capacity required for the map:
+
+      Kokkos::Timer wall_clock ;
+
+      wall_clock.reset();
+      phase = FILL_NODE_SET ;
+
+      // Initial capacity: 14 keys per node; enlarged by the failed-insert count on each retry.
+      size_t set_capacity = (28ull * node_count) / 2;
+      unsigned failed_insert_count = 0 ;
+
+      do {
+        // Zero the row count to restart the fill
+        Kokkos::deep_copy( row_count , 0u );
+
+        node_node_set = SetType( ( set_capacity += failed_insert_count ) );
+
+        // May be larger than requested:
+        set_capacity = node_node_set.capacity();
+
+        Kokkos::parallel_reduce( Kokkos::RangePolicy<execution_space,TagFillNodeSet>(0,elem_node_id.dimension_0())
+                               , *this
+                               , failed_insert_count );
+        // failed_insert_count is the reduction result: the number of failed inserts; nonzero forces a retry.
+      } while ( failed_insert_count );
+
+      execution_space::fence();
+      results.ratio = (double)node_node_set.size() / (double)node_node_set.capacity();
+      results.fill_node_set = wall_clock.seconds();
+      //--------------------------------
+
+      wall_clock.reset();
+      phase = SCAN_NODE_COUNT ;
+
+      // Exclusive scan of row_count into row_map,
+      // including the final total in the last (index node_count) position;
+      // the same total is also stored in row_total.
+      Kokkos::parallel_scan( node_count , *this );
+
+      // Zero the row count for the fill:
+      Kokkos::deep_copy( row_count , 0u );
+
+      unsigned graph_entry_count = 0 ;
+      // Copy the scan total out of row_total into the local graph_entry_count.
+      Kokkos::deep_copy( graph_entry_count , row_total );
+
+      // Assign graph's row_map and allocate graph's entries
+      graph.row_map = row_map ;
+      graph.entries = typename CrsGraphType::entries_type( "graph_entries" , graph_entry_count );
+
+      //--------------------------------
+      // Fill graph's entries from the (node,node) set.
+
+      execution_space::fence();
+      results.scan_node_count = wall_clock.seconds();
+
+      wall_clock.reset();
+      phase = FILL_GRAPH_ENTRIES ;
+      Kokkos::parallel_for( node_node_set.capacity() , *this );
+
+      execution_space::fence();
+      results.fill_graph_entries = wall_clock.seconds();
+
+      //--------------------------------
+      // Done with the temporary sets and arrays
+      wall_clock.reset();
+      phase = SORT_GRAPH_ENTRIES ;
+
+      row_total = UnsignedValue();
+      row_count = RowMapType();
+      row_map   = RowMapType();
+      node_node_set.clear();
+
+      //--------------------------------
+      // Sort the entries within each row of the graph.
+      Kokkos::parallel_for( node_count , *this );
+
+      execution_space::fence();
+      results.sort_graph_entries = wall_clock.seconds();
+
+      //--------------------------------
+      // Element-to-graph mapping:
+      wall_clock.reset();
+      phase = FILL_ELEMENT_GRAPH ;
+      elem_graph = ElemGraphType("elem_graph", elem_node_id.dimension_0() );
+      Kokkos::parallel_for( elem_node_id.dimension_0() , *this );
+
+      execution_space::fence();
+      results.fill_element_graph = wall_clock.seconds();
+    }
+
+  //------------------------------------
+  // parallel_reduce: fill the (node,node) set, count row lengths, and sum the failed inserts
+
+  KOKKOS_INLINE_FUNCTION
+  void operator()( const TagFillNodeSet & , unsigned ielem , unsigned & count ) const
+  {
+    // Loop over element's (row_local_node,col_local_node) pairs:
+    for ( unsigned row_local_node = 0 ; row_local_node < elem_node_id.dimension_1() ; ++row_local_node ) {
+
+      const unsigned row_node = elem_node_id( ielem , row_local_node );
+
+      for ( unsigned col_local_node = row_local_node ; col_local_node < elem_node_id.dimension_1() ; ++col_local_node ) {
+
+        const unsigned col_node = elem_node_id( ielem , col_local_node );
+
+        // If either node is locally owned then insert the pair into the unordered map:
+
+        if ( row_node < row_count.dimension_0() || col_node < row_count.dimension_0() ) {
+
+          const key_type key = (row_node < col_node) ? make_pair( row_node, col_node ) : make_pair( col_node, row_node ) ;
+          // The key is stored with the smaller node id first.
+          const typename SetType::insert_result result = node_node_set.insert( key );
+
+          // A successful insert: the first time this pair was added
+          if ( result.success() ) {
+
+            // If row node is owned then increment count
+            if ( row_node < row_count.dimension_0() ) { atomic_fetch_add( & row_count( row_node ) , 1 ); }
+
+            // If column node is owned and not equal to row node then increment count
+            if ( col_node < row_count.dimension_0() && col_node != row_node ) { atomic_fetch_add( & row_count( col_node ) , 1 ); }
+          }
+          else if ( result.failed() ) {
+            ++count ;
+          }
+        }
+      }
+    }
+  }
+  // FILL_GRAPH_ENTRIES: write the key stored at set slot 'iset' into the row of each of its nodes below row_count.dimension_0().
+  KOKKOS_INLINE_FUNCTION
+  void fill_graph_entries( const unsigned iset ) const
+  {
+    if ( node_node_set.valid_at(iset) ) {
+      // Add each entry to the graph entries.
+
+      const key_type key = node_node_set.key_at(iset) ;
+      const unsigned row_node = key.first ;
+      const unsigned col_node = key.second ;
+
+      if ( row_node < row_count.dimension_0() ) {
+        const unsigned offset = graph.row_map( row_node ) + atomic_fetch_add( & row_count( row_node ) , 1 );
+        graph.entries( offset ) = col_node ;
+      }
+
+      if ( col_node < row_count.dimension_0() && col_node != row_node ) {
+        const unsigned offset = graph.row_map( col_node ) + atomic_fetch_add( & row_count( col_node ) , 1 );
+        graph.entries( offset ) = row_node ;
+      }
+    }
+  }
+  // SORT_GRAPH_ENTRIES: insertion sort of the entries of row 'irow' into ascending order.
+  KOKKOS_INLINE_FUNCTION
+  void sort_graph_entries( const unsigned irow ) const
+  {
+    const unsigned row_beg = graph.row_map( irow );
+    const unsigned row_end = graph.row_map( irow + 1 );
+    for ( unsigned i = row_beg + 1 ; i < row_end ; ++i ) {
+      const unsigned col = graph.entries(i);
+      unsigned j = i ;
+      for ( ; row_beg < j && col < graph.entries(j-1) ; --j ) {
+        graph.entries(j) = graph.entries(j-1);
+      }
+      graph.entries(j) = col ;
+    }
+  }
+  // FILL_ELEMENT_GRAPH: for each local (row,col) node pair store its offset in graph.entries, or ~0u when there is none.
+  KOKKOS_INLINE_FUNCTION
+  void fill_elem_graph_map( const unsigned ielem ) const
+  {
+    for ( unsigned row_local_node = 0 ; row_local_node < elem_node_id.dimension_1() ; ++row_local_node ) {
+
+      const unsigned row_node = elem_node_id( ielem , row_local_node );
+
+      for ( unsigned col_local_node = 0 ; col_local_node < elem_node_id.dimension_1() ; ++col_local_node ) {
+
+        const unsigned col_node = elem_node_id( ielem , col_local_node );
+
+        unsigned entry = ~0u ;
+
+        if ( row_node + 1 < graph.row_map.dimension_0() ) {
+
+          const unsigned entry_end = graph.row_map( row_node + 1 );
+
+          entry = graph.row_map( row_node );
+          // Linear search of the row for col_node.
+          for ( ; entry < entry_end && graph.entries(entry) != col_node ; ++entry );
+
+          if ( entry == entry_end ) entry = ~0u ;
+        }
+
+        elem_graph( ielem , row_local_node , col_local_node ) = entry ;
+      }
+    }
+  }
+  // Untagged parallel_for entry point: dispatch on the current phase.
+  KOKKOS_INLINE_FUNCTION
+  void operator()( const unsigned iwork ) const
+  {
+    // FILL_NODE_SET is not handled here: the constructor launches it as a
+    // parallel_reduce over RangePolicy<execution_space,TagFillNodeSet>,
+    // which selects the TagFillNodeSet overload of operator().
+    // SCAN_NODE_COUNT is likewise served by the three-argument scan overload.
+    // Only the three parallel_for phases are dispatched below.
+    if ( phase == FILL_GRAPH_ENTRIES ) {
+      fill_graph_entries( iwork );
+    }
+    else if ( phase == SORT_GRAPH_ENTRIES ) {
+      sort_graph_entries( iwork );
+    }
+    else if ( phase == FILL_ELEMENT_GRAPH ) {
+      fill_elem_graph_map( iwork );
+    }
+  }
+
+  //------------------------------------
+  // parallel_scan: row offsets
+  // value_type is the accumulator type shared by the reduce and scan overloads.
+  typedef unsigned value_type ;
+
+  KOKKOS_INLINE_FUNCTION
+  void operator()( const unsigned irow , unsigned & update , const bool final ) const
+  {
+    // exclusive scan
+    if ( final ) { row_map( irow ) = update ; }
+
+    update += row_count( irow );
+
+    if ( final ) {
+      if ( irow + 1 == row_count.dimension_0() ) {
+        row_map( irow + 1 ) = update ;
+        row_total()         = update ;
+      }
+    }
+  }
+
+  // For the reduce phase:
+  KOKKOS_INLINE_FUNCTION
+  void init( const TagFillNodeSet & , unsigned & update ) const { update = 0 ; }
+
+  KOKKOS_INLINE_FUNCTION
+  void join( const TagFillNodeSet & 
+           , volatile       unsigned & update
+           , volatile const unsigned & input ) const { update += input ; }
+
+  // For the scan phase:
+  KOKKOS_INLINE_FUNCTION
+  void init( unsigned & update ) const { update = 0 ; }
+
+  KOKKOS_INLINE_FUNCTION
+  void join( volatile       unsigned & update
+           , volatile const unsigned & input ) const { update += input ; }
+
+  //------------------------------------
+};
+
+} /* namespace FENL */
+} /* namespace Example */
+} /* namespace Kokkos  */
+
+//----------------------------------------------------------------------------
+
+namespace Kokkos {
+namespace Example {
+namespace FENL {
+
+template< class ElemCompType >
+class NodeElemGatherFill {
+public:
+  // Builds a node->element CRS graph in the constructor; apply() then gathers element contributions row by row.
+  typedef typename ElemCompType::execution_space         execution_space ;
+  typedef typename ElemCompType::vector_type         vector_type ;
+  typedef typename ElemCompType::sparse_matrix_type  sparse_matrix_type ;
+  typedef typename ElemCompType::elem_node_type      elem_node_type ;
+  typedef typename ElemCompType::elem_vectors_type   elem_vectors_type ;
+  typedef typename ElemCompType::elem_matrices_type  elem_matrices_type ;
+  typedef typename ElemCompType::elem_graph_type     elem_graph_type ;
+
+  static const unsigned ElemNodeCount = ElemCompType::ElemNodeCount ;
+
+  //------------------------------------
+
+private:
+  // Each graph entry is a pair { element index , local node index within that element }.
+  typedef Kokkos::StaticCrsGraph< unsigned[2] , execution_space >  CrsGraphType ;
+  typedef typename CrsGraphType::row_map_type::non_const_type  RowMapType ;
+  typedef Kokkos::View< unsigned ,  execution_space >              UnsignedValue ;
+  // Phase selector read by the untagged operator()( iwork ).
+  enum PhaseType { FILL_NODE_COUNT ,
+                   SCAN_NODE_COUNT ,
+                   FILL_GRAPH_ENTRIES ,
+                   SORT_GRAPH_ENTRIES ,
+                   GATHER_FILL };
+
+  const elem_node_type  elem_node_id ;
+  const elem_graph_type elem_graph ;
+  UnsignedValue         row_total ;
+  RowMapType            row_count ;
+  RowMapType            row_map ;
+  CrsGraphType          graph ;
+  vector_type           residual ;
+  sparse_matrix_type    jacobian ;
+  elem_vectors_type     elem_residual ;
+  elem_matrices_type    elem_jacobian ;
+  PhaseType             phase ;
+
+public:
+  // Default constructor: every member default-constructed, phase set to FILL_NODE_COUNT.
+  NodeElemGatherFill()
+    : elem_node_id()
+    , elem_graph()
+    , row_total()
+    , row_count()
+    , row_map()
+    , graph()
+    , residual()
+    , jacobian()
+    , elem_residual()
+    , elem_jacobian()
+    , phase( FILL_NODE_COUNT )
+    {}
+  // Member-wise copy, including the current phase.
+  NodeElemGatherFill( const NodeElemGatherFill & rhs )
+    : elem_node_id(  rhs.elem_node_id )
+    , elem_graph(    rhs.elem_graph )
+    , row_total(     rhs.row_total )
+    , row_count(     rhs.row_count )
+    , row_map(       rhs.row_map )
+    , graph(         rhs.graph )
+    , residual(      rhs.residual )
+    , jacobian(      rhs.jacobian )
+    , elem_residual( rhs.elem_residual )
+    , elem_jacobian( rhs.elem_jacobian )
+    , phase(         rhs.phase )
+    {}
+  // Builds the node->element graph from arg_elem_node_id; the row count is arg_residual.dimension_0().
+  NodeElemGatherFill( const elem_node_type     & arg_elem_node_id ,
+                      const elem_graph_type    & arg_elem_graph ,
+                      const vector_type        & arg_residual ,
+                      const sparse_matrix_type & arg_jacobian ,
+                      const elem_vectors_type  & arg_elem_residual ,
+                      const elem_matrices_type & arg_elem_jacobian )
+    : elem_node_id( arg_elem_node_id )
+    , elem_graph( arg_elem_graph )
+    , row_total( "row_total" )
+    , row_count( "row_count" , arg_residual.dimension_0() )
+    , row_map( "graph_row_map" , arg_residual.dimension_0() + 1 )
+    , graph()
+    , residual( arg_residual )
+    , jacobian( arg_jacobian )
+    , elem_residual( arg_elem_residual )
+    , elem_jacobian( arg_elem_jacobian )
+    , phase( FILL_NODE_COUNT )
+    {
+      //--------------------------------
+      // Count node->element relations
+
+      phase = FILL_NODE_COUNT ;
+
+      Kokkos::parallel_for( elem_node_id.dimension_0() , *this );
+
+      //--------------------------------
+
+      phase = SCAN_NODE_COUNT ;
+
+      // Exclusive scan of row_count into row_map,
+      // including the final total in the last position of row_map;
+      // the same total is also stored in row_total.
+      Kokkos::parallel_scan( residual.dimension_0() , *this );
+
+      // Zero the row count for the fill:
+      Kokkos::deep_copy( row_count , typename RowMapType::value_type(0) );
+
+      unsigned graph_entry_count = 0 ;
+      // Copy the scan total out of row_total into the local graph_entry_count.
+      Kokkos::deep_copy( graph_entry_count , row_total );
+
+      // Assign graph's row_map and allocate graph's entries
+      graph.row_map = row_map ;
+
+      typedef typename CrsGraphType::entries_type graph_entries_type ;
+
+      graph.entries = graph_entries_type( "graph_entries" , graph_entry_count );
+
+      //--------------------------------
+      // Fill graph's entries from the element->node relations.
+
+      phase = FILL_GRAPH_ENTRIES ;
+      // NOTE(review): row_count was already zeroed above; this second deep_copy looks redundant.
+      Kokkos::deep_copy( row_count , 0u );
+      Kokkos::parallel_for( elem_node_id.dimension_0() , *this );
+
+      execution_space::fence();
+
+      //--------------------------------
+      // Done with the temporary sets and arrays
+
+      row_total = UnsignedValue();
+      row_count = RowMapType();
+      row_map   = RowMapType();
+
+      //--------------------------------
+      // Sort each row's entries by element index.
+      phase = SORT_GRAPH_ENTRIES ;
+      Kokkos::parallel_for( residual.dimension_0() , *this );
+
+      execution_space::fence();
+      // Leave the functor in GATHER_FILL so that apply() performs the gather.
+      phase = GATHER_FILL ;
+    }
+  // Launches operator() over residual.dimension_0() work items; after construction the phase is GATHER_FILL.
+  void apply() const
+  {
+    Kokkos::parallel_for( residual.dimension_0() , *this );
+  }
+
+  //------------------------------------
+  //------------------------------------
+  // parallel_for: Count node->element pairs
+
+  KOKKOS_INLINE_FUNCTION
+  void fill_node_count( const unsigned ielem ) const
+  {
+    for ( unsigned row_local_node = 0 ; row_local_node < elem_node_id.dimension_1() ; ++row_local_node ) {
+
+      const unsigned row_node = elem_node_id( ielem , row_local_node );
+
+      if ( row_node < row_count.dimension_0() ) {
+        atomic_fetch_add( & row_count( row_node ) , 1 );
+      }
+    }
+  }
+  // parallel_for: record { element , local node } in the row of each node below row_count.dimension_0()
+  KOKKOS_INLINE_FUNCTION
+  void fill_graph_entries( const unsigned ielem ) const
+  {
+    for ( unsigned row_local_node = 0 ; row_local_node < elem_node_id.dimension_1() ; ++row_local_node ) {
+
+      const unsigned row_node = elem_node_id( ielem , row_local_node );
+
+      if ( row_node < row_count.dimension_0() ) {
+
+        const unsigned offset = graph.row_map( row_node ) + atomic_fetch_add( & row_count( row_node ) , 1 );
+
+        graph.entries( offset , 0 ) = ielem ;
+        graph.entries( offset , 1 ) = row_local_node ;
+      }
+    }
+  }
+  // parallel_for: insertion sort of row 'irow' by element index (column 0); column 1 moves with it
+  KOKKOS_INLINE_FUNCTION
+  void sort_graph_entries( const unsigned irow ) const
+  {
+    const unsigned row_beg = graph.row_map( irow );
+    const unsigned row_end = graph.row_map( irow + 1 );
+    for ( unsigned i = row_beg + 1 ; i < row_end ; ++i ) {
+      const unsigned elem  = graph.entries(i,0);
+      const unsigned local = graph.entries(i,1);
+      unsigned j = i ;
+      for ( ; row_beg < j && elem < graph.entries(j-1,0) ; --j ) {
+        graph.entries(j,0) = graph.entries(j-1,0);
+        graph.entries(j,1) = graph.entries(j-1,1);
+      }
+      graph.entries(j,0) = elem ;
+      graph.entries(j,1) = local ;
+    }
+  }
+
+  //------------------------------------
+  // parallel_for: add element residuals and jacobians into row 'irow' using plain (non-atomic) +=
+  KOKKOS_INLINE_FUNCTION
+  void gather_fill( const unsigned irow ) const
+  {
+    const unsigned node_elem_begin = graph.row_map(irow);
+    const unsigned node_elem_end   = graph.row_map(irow+1);
+
+    //  for each element that a node belongs to
+
+    for ( unsigned i = node_elem_begin ; i < node_elem_end ; i++ ) {
+
+      const unsigned elem_id   = graph.entries( i, 0);
+      const unsigned row_index = graph.entries( i, 1);
+
+      residual(irow) += elem_residual(elem_id, row_index);
+
+      //  for each node in a particular related element
+      //  gather the contents of the element stiffness
+      //  matrix that belong in irow
+
+      for ( unsigned j = 0 ; j < ElemNodeCount ; ++j ) {
+        const unsigned A_index = elem_graph( elem_id , row_index , j );
+
+        jacobian.coeff( A_index ) += elem_jacobian( elem_id, row_index, j );
+      }
+    }
+  }
+
+  //------------------------------------
+  // Untagged parallel_for entry point: dispatch on the current phase.
+  KOKKOS_INLINE_FUNCTION
+  void operator()( const unsigned iwork ) const
+  {
+    if ( phase == FILL_NODE_COUNT ) {
+      fill_node_count( iwork );
+    }
+    else if ( phase == FILL_GRAPH_ENTRIES ) {
+      fill_graph_entries( iwork );
+    }
+    else if ( phase == SORT_GRAPH_ENTRIES ) {
+      sort_graph_entries( iwork );
+    }
+    else if ( phase == GATHER_FILL ) {
+      gather_fill( iwork );
+    }
+  }
+
+  //------------------------------------
+  // parallel_scan: row offsets
+  // value_type is the accumulator type of the scan overload below.
+  typedef unsigned value_type ;
+
+  KOKKOS_INLINE_FUNCTION
+  void operator()( const unsigned irow , unsigned & update , const bool final ) const
+  {
+    // exclusive scan
+    if ( final ) { row_map( irow ) = update ; }
+
+    update += row_count( irow );
+
+    if ( final ) {
+      if ( irow + 1 == row_count.dimension_0() ) {
+        row_map( irow + 1 ) = update ;
+        row_total()         = update ;
+      }
+    }
+  }
+  // init/join for the scan: start from zero and combine partial sums by addition.
+  KOKKOS_INLINE_FUNCTION
+  void init( unsigned & update ) const { update = 0 ; }
+
+  KOKKOS_INLINE_FUNCTION
+  void join( volatile unsigned & update , const volatile unsigned & input ) const { update += input ; }
+};
+
+} /* namespace FENL */
+} /* namespace Example */
+} /* namespace Kokkos  */
+
+//----------------------------------------------------------------------------
+
+namespace Kokkos {
+namespace Example {
+namespace FENL {
+
+template< class FiniteElementMeshType , class SparseMatrixType >
+class ElementComputation ;
+
+
+template< class ExecSpace , BoxElemPart::ElemOrder Order , class CoordinateMap , typename ScalarType >
+class ElementComputation<
+  Kokkos::Example::BoxElemFixture< ExecSpace , Order , CoordinateMap > ,
+  Kokkos::Example::CrsMatrix< ScalarType , ExecSpace > >
+{
+public:
+
+  typedef Kokkos::Example::BoxElemFixture< ExecSpace, Order, CoordinateMap >  mesh_type ;
+  typedef Kokkos::Example::HexElement_Data< mesh_type::ElemNode >             element_data_type ;
+
+  typedef Kokkos::Example::CrsMatrix< ScalarType , ExecSpace >  sparse_matrix_type ;
+  typedef typename sparse_matrix_type::StaticCrsGraphType       sparse_graph_type ;
+
+  typedef ExecSpace   execution_space ;
+  typedef ScalarType  scalar_type ;
+
+  static const unsigned SpatialDim       = element_data_type::spatial_dimension ;
+  static const unsigned TensorDim        = SpatialDim * SpatialDim ;
+  static const unsigned ElemNodeCount    = element_data_type::element_node_count ;
+  static const unsigned FunctionCount    = element_data_type::function_count ;
+  static const unsigned IntegrationCount = element_data_type::integration_count ;
+
+  //------------------------------------
+
+  typedef typename mesh_type::node_coord_type                                      node_coord_type ;
+  typedef typename mesh_type::elem_node_type                                       elem_node_type ;
+  typedef Kokkos::View< scalar_type*[FunctionCount][FunctionCount] , execution_space > elem_matrices_type ;
+  typedef Kokkos::View< scalar_type*[FunctionCount] ,                execution_space > elem_vectors_type ;
+  typedef Kokkos::View< scalar_type* ,                               execution_space > vector_type ;
+
+  typedef typename NodeNodeGraph< elem_node_type , sparse_graph_type , ElemNodeCount >::ElemGraphType elem_graph_type ;
+
+  //------------------------------------
+
+
+  //------------------------------------
+  // Computational data:
+
+  const element_data_type   elem_data ;
+  const elem_node_type      elem_node_ids ;
+  const node_coord_type     node_coords ;
+  const elem_graph_type     elem_graph ;
+  const elem_matrices_type  elem_jacobians ;
+  const elem_vectors_type   elem_residuals ;
+  const vector_type         solution ;
+  const vector_type         residual ;
+  const sparse_matrix_type  jacobian ;
+  const scalar_type         coeff_K ;
+
+  ElementComputation( const ElementComputation & rhs ) // copy constructor
+    : elem_data() // default-constructed rather than copied from rhs
+    , elem_node_ids( rhs.elem_node_ids )
+    , node_coords(   rhs.node_coords )
+    , elem_graph(    rhs.elem_graph )
+    , elem_jacobians( rhs.elem_jacobians )
+    , elem_residuals( rhs.elem_residuals )
+    , solution( rhs.solution )
+    , residual( rhs.residual )
+    , jacobian( rhs.jacobian )
+    , coeff_K( rhs.coeff_K )
+    {}
+
+  // If the element->sparse_matrix graph is provided then perform atomic updates
+  // Otherwise fill per-element contributions for subsequent gather-add into a residual and jacobian.
+  ElementComputation( const mesh_type          & arg_mesh ,
+	              const scalar_type          arg_coeff_K ,
+                      const vector_type        & arg_solution ,
+                      const elem_graph_type    & arg_elem_graph ,
+                      const sparse_matrix_type & arg_jacobian ,
+                      const vector_type        & arg_residual )
+    : elem_data()
+    , elem_node_ids( arg_mesh.elem_node() )
+    , node_coords(   arg_mesh.node_coord() )
+    , elem_graph(    arg_elem_graph )
+    , elem_jacobians() // per-element storage left empty by this constructor
+    , elem_residuals() // per-element storage left empty by this constructor
+    , solution( arg_solution )
+    , residual( arg_residual )
+    , jacobian( arg_jacobian )
+    , coeff_K( arg_coeff_K )
+    {}
+
+  ElementComputation( const mesh_type    & arg_mesh , // overload without graph/jacobian/residual arguments
+	              const scalar_type    arg_coeff_K ,
+                      const vector_type  & arg_solution )
+    : elem_data()
+    , elem_node_ids( arg_mesh.elem_node() )
+    , node_coords(   arg_mesh.node_coord() )
+    , elem_graph() // left empty by this constructor
+    , elem_jacobians( "elem_jacobians" , arg_mesh.elem_count() ) // leading extent = arg_mesh.elem_count()
+    , elem_residuals( "elem_residuals" , arg_mesh.elem_count() ) // leading extent = arg_mesh.elem_count()
+    , solution( arg_solution )
+    , residual() // left empty by this constructor
+    , jacobian() // left empty by this constructor
+    , coeff_K( arg_coeff_K )
+    {}
+
+  //------------------------------------
+
+  void apply() const // launches this functor with parallel_for
+  {
+    parallel_for( elem_node_ids.dimension_0() , *this ); // one work item per index of elem_node_ids' first dimension
+  }
+
+  //------------------------------------
+
+  static const unsigned FLOPS_transform_gradients =
+     /* Jacobian */           FunctionCount * TensorDim * 2 +
+     /* Inverse jacobian */   TensorDim * 6 + 6 +
+     /* Gradient transform */ FunctionCount * 15 ;
+  // Accumulates the 3x3 Jacobian from 'grad' and the coordinates x,y,z, inverts it, writes dpsidx/dpsidy/dpsidz, and returns det(J).
+  KOKKOS_INLINE_FUNCTION
+  float transform_gradients(
+    const float grad[][ FunctionCount ] , // Gradient of bases master element
+    const double x[] ,
+    const double y[] ,
+    const double z[] ,
+    float dpsidx[] ,
+    float dpsidy[] ,
+    float dpsidz[] ) const
+  {
+    enum { j11 = 0 , j12 = 1 , j13 = 2 ,
+           j21 = 3 , j22 = 4 , j23 = 5 ,
+           j31 = 6 , j32 = 7 , j33 = 8 };
+
+    // Jacobian accumulation:
+    // J is accumulated in double; row r holds grad[r-1][i] times (x,y,z) summed over the basis functions i.
+    double J[ TensorDim ] = { 0, 0, 0,  0, 0, 0,  0, 0, 0 };
+
+    for( unsigned i = 0; i < FunctionCount ; ++i ) {
+      const double x1 = x[i] ;
+      const double x2 = y[i] ;
+      const double x3 = z[i] ;
+
+      const float g1 = grad[0][i] ;
+      const float g2 = grad[1][i] ;
+      const float g3 = grad[2][i] ;
+
+      J[j11] += g1 * x1 ;
+      J[j12] += g1 * x2 ;
+      J[j13] += g1 * x3 ;
+
+      J[j21] += g2 * x1 ;
+      J[j22] += g2 * x2 ;
+      J[j23] += g2 * x3 ;
+
+      J[j31] += g3 * x1 ;
+      J[j32] += g3 * x2 ;
+      J[j33] += g3 * x3 ;
+    }
+
+    // Inverse jacobian:
+    // invJ first holds the adjugate terms of J, narrowed to float; it is scaled by 1/detJ below.
+    float invJ[ TensorDim ] = {
+      static_cast<float>( J[j22] * J[j33] - J[j23] * J[j32] ) ,
+      static_cast<float>( J[j13] * J[j32] - J[j12] * J[j33] ) ,
+      static_cast<float>( J[j12] * J[j23] - J[j13] * J[j22] ) ,
+
+      static_cast<float>( J[j23] * J[j31] - J[j21] * J[j33] ) ,
+      static_cast<float>( J[j11] * J[j33] - J[j13] * J[j31] ) ,
+      static_cast<float>( J[j13] * J[j21] - J[j11] * J[j23] ) ,
+
+      static_cast<float>( J[j21] * J[j32] - J[j22] * J[j31] ) ,
+      static_cast<float>( J[j12] * J[j31] - J[j11] * J[j32] ) ,
+      static_cast<float>( J[j11] * J[j22] - J[j12] * J[j21] ) };
+    // Determinant: first column of J times the first row of the adjugate; stored as float.
+    const float detJ = J[j11] * invJ[j11] +
+                       J[j21] * invJ[j12] +
+                       J[j31] * invJ[j13] ;
+    // No guard against detJ == 0 here.
+    const float detJinv = 1.0 / detJ ;
+
+    for ( unsigned i = 0 ; i < TensorDim ; ++i ) { invJ[i] *= detJinv ; }
+
+    // Transform gradients:
+
+    for( unsigned i = 0; i < FunctionCount ; ++i ) {
+      const float g0 = grad[0][i];
+      const float g1 = grad[1][i];
+      const float g2 = grad[2][i];
+
+      dpsidx[i] = g0 * invJ[j11] + g1 * invJ[j12] + g2 * invJ[j13];
+      dpsidy[i] = g0 * invJ[j21] + g1 * invJ[j22] + g2 * invJ[j23];
+      dpsidz[i] = g0 * invJ[j31] + g1 * invJ[j32] + g2 * invJ[j33];
+    }
+
+    return detJ ;
+  }
+
+  KOKKOS_INLINE_FUNCTION
+  void contributeResidualJacobian(
+    const float coeff_k ,
+    const double dof_values[] ,
+    const float dpsidx[] ,
+    const float dpsidy[] ,
+    const float dpsidz[] ,
+    const float detJ ,
+    const float integ_weight ,
+    const float bases_vals[] ,
+    double elem_res[] ,
+    double elem_mat[][ FunctionCount ] ) const
+  {
+    // Adds the contribution of one integration point to the element residual
+    // 'elem_res' and the element Jacobian 'elem_mat'; both are accumulated into.
+    // Solution value and gradient interpolated at the integration point:
+    double sol = 0 , sol_dx = 0 , sol_dy = 0 , sol_dz = 0 ;
+
+    for ( unsigned j = 0 ; j < FunctionCount ; j++ ) {
+      const double dof = dof_values[j] ;
+      sol    += dof * bases_vals[j] ;
+      sol_dx += dof * dpsidx[j] ;
+      sol_dy += dof * dpsidy[j] ;
+      sol_dz += dof * dpsidz[j] ;
+    }
+
+    const scalar_type diffusion_scale = coeff_k     * detJ * integ_weight ;
+    const double      reaction_res    = sol * sol   * detJ * integ_weight ;
+    const double      reaction_jac    = 2.0 * sol   * detJ * integ_weight ;
+
+    // $$ R_i = \int_{\Omega} \nabla \phi_i \cdot (k \nabla T) + \phi_i T^2 d \Omega $$
+    // $$ J_{i,j} = \frac{\partial R_i}{\partial T_j} = \int_{\Omega} k \nabla \phi_i \cdot \nabla \phi_j + 2 \phi_i \phi_j T d \Omega $$
+
+    for ( unsigned i = 0; i < FunctionCount; ++i ) {
+      double * const jac_row = elem_mat[i] ;
+      const float phi_i = bases_vals[i] ;
+      const float gx_i  = dpsidx[i] ;
+      const float gy_i  = dpsidy[i] ;
+      const float gz_i  = dpsidz[i] ;
+
+      elem_res[i] += diffusion_scale * ( gx_i * sol_dx +
+                                         gy_i * sol_dy +
+                                         gz_i * sol_dz ) +
+                     reaction_res * phi_i ;
+
+      for( unsigned j = 0; j < FunctionCount; j++ ) {
+        jac_row[j] += diffusion_scale * ( gx_i * dpsidx[j] +
+                                          gy_i * dpsidy[j] +
+                                          gz_i * dpsidz[j] ) +
+                      reaction_jac * phi_i * bases_vals[j] ;
+      }
+    }
+  }
+
+  KOKKOS_INLINE_FUNCTION
+  void operator()( const unsigned ielem ) const
+  {
+    // Element kernel: gather the nodal data of element 'ielem', integrate its
+    // residual and Jacobian, then either store them per element or add them
+    // atomically into the global residual and sparse Jacobian.
+
+    unsigned node_index[ ElemNodeCount ];
+    double   x[ FunctionCount ] ;
+    double   y[ FunctionCount ] ;
+    double   z[ FunctionCount ] ;
+    double   val[ FunctionCount ] ;
+
+    for ( unsigned k = 0 ; k < ElemNodeCount ; ++k ) {
+      const unsigned node = elem_node_ids( ielem , k );
+
+      node_index[k] = node ;
+      val[k] = solution( node );
+      x[k]   = node_coords( node , 0 );
+      y[k]   = node_coords( node , 1 );
+      z[k]   = node_coords( node , 2 );
+    }
+
+    // Element-local accumulators, cleared before the integration loop.
+    double elem_vec[ FunctionCount ] ;
+    double elem_mat[ FunctionCount ][ FunctionCount ] ;
+
+    for( unsigned r = 0; r < FunctionCount ; r++ ) {
+      elem_vec[r] = 0 ;
+      for( unsigned c = 0; c < FunctionCount ; c++ ) { elem_mat[r][c] = 0 ; }
+    }
+
+    // Accumulate the contribution of every integration point.
+    for ( unsigned q = 0 ; q < IntegrationCount ; ++q ) {
+      float dpsidx[ FunctionCount ] ;
+      float dpsidy[ FunctionCount ] ;
+      float dpsidz[ FunctionCount ] ;
+
+      const float detJ =
+        transform_gradients( elem_data.gradients[q] , x , y , z ,
+                             dpsidx , dpsidy , dpsidz );
+
+      contributeResidualJacobian( coeff_K , val ,
+                                  dpsidx , dpsidy , dpsidz ,
+                                  detJ ,
+                                  elem_data.weights[q] ,
+                                  elem_data.values[q] ,
+                                  elem_vec , elem_mat );
+    }
+
+#if 0
+
+if ( 1 == ielem ) {
+  printf("ElemResidual { %f %f %f %f %f %f %f %f }\n",
+         elem_vec[0], elem_vec[1], elem_vec[2], elem_vec[3],
+         elem_vec[4], elem_vec[5], elem_vec[6], elem_vec[7]);
+
+  printf("ElemJacobian {\n");
+
+  for ( unsigned j = 0 ; j < FunctionCount ; ++j ) {
+  printf("  { %f %f %f %f %f %f %f %f }\n",
+         elem_mat[j][0], elem_mat[j][1], elem_mat[j][2], elem_mat[j][3],
+         elem_mat[j][4], elem_mat[j][5], elem_mat[j][6], elem_mat[j][7]);
+  }
+  printf("}\n");
+}
+
+#endif
+
+    if ( residual.dimension_0() ) {
+      // A global residual vector is attached: add this element's rows atomically,
+      // skipping rows outside the residual and graph entries marked with ~0u.
+      for( unsigned r = 0 ; r < FunctionCount ; r++ ) {
+        const unsigned row = node_index[r] ;
+        if ( ! ( row < residual.dimension_0() ) ) { continue ; }
+        atomic_fetch_add( & residual( row ) , elem_vec[r] );
+
+        for( unsigned c = 0 ; c < FunctionCount ; c++ ) {
+          const unsigned entry = elem_graph( ielem , r , c );
+          if ( entry != ~0u ) {
+            atomic_fetch_add( & jacobian.coeff( entry ) , elem_mat[r][c] );
+          }
+        }
+      }
+    }
+    else {
+      // No global residual: keep the dense per-element results.
+      for( unsigned r = 0; r < FunctionCount ; r++ ) {
+        elem_residuals(ielem, r) = elem_vec[r] ;
+        for( unsigned c = 0; c < FunctionCount ; c++ ) {
+          elem_jacobians(ielem, r, c) = elem_mat[r][c] ;
+        }
+      }
+    }
+  }
+}; /* ElementComputation */
+
+//----------------------------------------------------------------------------
+
+template< class FixtureType , class SparseMatrixType >
+class DirichletComputation ;
+
+// Dirichlet boundary-condition functor, specialized for a BoxElemFixture mesh
+// and a CrsMatrix Jacobian.  A node is a boundary node when its 'bc_plane'
+// coordinate is <= bc_lower_limit or >= bc_upper_limit.
+template< class ExecSpace , BoxElemPart::ElemOrder Order , class CoordinateMap , typename ScalarType >
+class DirichletComputation<
+  Kokkos::Example::BoxElemFixture< ExecSpace , Order , CoordinateMap > ,
+  Kokkos::Example::CrsMatrix< ScalarType , ExecSpace > >
+{
+public:
+
+  typedef ExecSpace   execution_space ;
+  typedef ScalarType  scalar_type ;
+
+  typedef Kokkos::Example::BoxElemFixture< ExecSpace, Order, CoordinateMap >  mesh_type ;
+  typedef typename mesh_type::node_coord_type                                 node_coord_type ;
+  typedef typename node_coord_type::value_type                                scalar_coord_type ;
+
+  typedef Kokkos::Example::CrsMatrix< ScalarType , ExecSpace >  sparse_matrix_type ;
+  typedef typename sparse_matrix_type::StaticCrsGraphType       sparse_graph_type ;
+
+  typedef Kokkos::View< scalar_type* , execution_space > vector_type ;
+
+  //------------------------------------
+  // Computational data:
+
+  const node_coord_type     node_coords ;     // nodal coordinates taken from the mesh
+  const vector_type         solution ;        // written only by the initialization pass
+  const sparse_matrix_type  jacobian ;        // coefficients are modified in place by apply()
+  const vector_type         residual ;        // zeroed at boundary nodes by apply()
+  const scalar_type         bc_lower_value ;  // value imposed where coordinate <= bc_lower_limit
+  const scalar_type         bc_upper_value ;  // value imposed where coordinate >= bc_upper_limit
+  const scalar_coord_type   bc_lower_limit ;  // epsilon of the coordinate type
+  const scalar_coord_type   bc_upper_limit ;  // one minus that epsilon
+  const unsigned            bc_plane ;        // coordinate index compared against the limits
+  const unsigned            node_count ;      // owned-node count = number of work items
+        bool                init ;            // false only while the constructor's pass is dispatched
+
+
+  // Stores its arguments, then runs the initialization pass (init == false),
+  // which sets 'arg_solution' to the boundary values and to 0 at every other
+  // owned node.
+  DirichletComputation( const mesh_type          & arg_mesh ,
+                        const vector_type        & arg_solution ,
+                        const sparse_matrix_type & arg_jacobian ,
+                        const vector_type        & arg_residual ,
+                        const unsigned             arg_bc_plane ,
+                        const scalar_type          arg_bc_lower_value ,
+                        const scalar_type          arg_bc_upper_value )
+    : node_coords( arg_mesh.node_coord() )
+    , solution(    arg_solution )
+    , jacobian(    arg_jacobian )
+    , residual(    arg_residual )
+    , bc_lower_value( arg_bc_lower_value )
+    , bc_upper_value( arg_bc_upper_value )
+    , bc_lower_limit( std::numeric_limits<scalar_coord_type>::epsilon() )
+    , bc_upper_limit( scalar_coord_type(1) - std::numeric_limits<scalar_coord_type>::epsilon() )
+    , bc_plane(       arg_bc_plane )
+    , node_count( arg_mesh.node_count_owned() )
+    , init( false )
+    {
+      parallel_for( node_count , *this );
+      init = true ;
+    }
+
+  // Boundary-condition pass over the owned nodes; modifies the residual and
+  // the Jacobian coefficients, never the solution.
+  void apply() const
+  {
+    parallel_for( node_count , *this );
+  }
+
+  //------------------------------------
+
+  KOKKOS_INLINE_FUNCTION
+  void operator()( const unsigned inode ) const
+  {
+    const unsigned row_begin = jacobian.graph.row_map[inode];
+    const unsigned row_end   = jacobian.graph.row_map[inode+1];
+
+    const scalar_coord_type coord = node_coords(inode,bc_plane);
+    const bool on_lower = coord <= bc_lower_limit ;
+    const bool on_upper = bc_upper_limit <= coord ;
+
+    // Initialization pass: seed the solution with the boundary values.
+    if ( ! init ) {
+      solution(inode) = on_lower ? bc_lower_value : (
+                        on_upper ? bc_upper_value : 0 );
+      return ;
+    }
+
+    // Boundary row: zero the residual and turn the matrix row into an
+    // identity row (one on the diagonal, zero elsewhere).
+    if ( on_lower || on_upper ) {
+      residual(inode) = 0 ;
+
+      for( unsigned k = row_begin ; k < row_end ; ++k ) {
+        const bool diagonal = int(inode) == int(jacobian.graph.entries(k)) ;
+        jacobian.coeff(k) = diagonal ? 1 : 0 ;
+      }
+      return ;
+    }
+
+    // Interior row: zero every coefficient whose column is a boundary node,
+    // intended to keep the matrix symmetric with the identity rows above.
+    // Only the matrix is modified here; the residual entry is left unchanged.
+    for( unsigned k = row_begin ; k < row_end ; ++k ) {
+      const unsigned          col       = jacobian.graph.entries(k) ;
+      const scalar_coord_type col_coord = node_coords(col,bc_plane);
+
+      if ( ( col_coord <= bc_lower_limit ) || ( bc_upper_limit <= col_coord ) ) {
+        jacobian.coeff(k) = 0 ;
+      }
+    }
+  }
+};
+
+} /* namespace FENL */
+} /* namespace Example */
+} /* namespace Kokkos  */
+
+//----------------------------------------------------------------------------
+
+/* A Cuda-specific specialization for the element computation functor. */
+#if defined( __CUDACC__ )
+// #include <NonlinearElement_Cuda.hpp>
+#endif
+
+//----------------------------------------------------------------------------
+
+#endif /* #ifndef KOKKOS_EXAMPLE_FENLFUNCTORS_HPP */
+
diff --git a/lib/kokkos/example/fenl/fenl_impl.hpp b/lib/kokkos/example/fenl/fenl_impl.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..64070ce55fdc1cf7b94d631a0f29b32eecfab357
--- /dev/null
+++ b/lib/kokkos/example/fenl/fenl_impl.hpp
@@ -0,0 +1,598 @@
+/*
+//@HEADER
+// ************************************************************************
+// 
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+// 
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+// 
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact  H. Carter Edwards (hcedwar@sandia.gov)
+// 
+// ************************************************************************
+//@HEADER
+*/
+
+#ifndef KOKKOS_EXAMPLE_FENL_IMPL_HPP
+#define KOKKOS_EXAMPLE_FENL_IMPL_HPP
+
+#include <math.h>
+
+// Kokkos libraries' headers:
+
+#include <Kokkos_UnorderedMap.hpp>
+#include <Kokkos_StaticCrsGraph.hpp>
+#include <impl/Kokkos_Timer.hpp>
+
+// Examples headers:
+
+#include <BoxElemFixture.hpp>
+#include <VectorImport.hpp>
+#include <CGSolve.hpp>
+
+#include <fenl.hpp>
+#include <fenl_functors.hpp>
+
+//----------------------------------------------------------------------------
+
+namespace Kokkos {
+namespace Example {
+namespace FENL {
+
+// Maximum of 'local' over all ranks of 'comm'; when KOKKOS_HAVE_MPI is not defined, 'local' is returned as is.
+inline double maximum( MPI_Comm comm , double local )
+{
+  double reduced = local ;
+#ifdef KOKKOS_HAVE_MPI
+  MPI_Allreduce( & local , & reduced , 1 , MPI_DOUBLE , MPI_MAX , comm );
+#endif
+  return reduced ;
+}
+
+} /* namespace FENL */
+} /* namespace Example */
+} /* namespace Kokkos */
+
+//----------------------------------------------------------------------------
+
+namespace Kokkos {
+namespace Example {
+namespace FENL {
+
+class ManufacturedSolution {
+public:
+
+  // Analytic ("manufactured") solution of the one-dimensional nonlinear problem
+  //
+  //    -K T_zz + T^2 = 0 ;  T(zmin) = T_zmin ;  T(zmax) = T_zmax
+  //
+  //  which admits solutions of the form
+  //
+  //    T(z) = ( a ( z - zmin ) + b )^(-2)  where  K = 1 / ( 6 a^2 )
+  //
+  //  The two boundary values determine a and b, and K follows from a.
+  //  Two (a,b) pairs match the boundary values:
+  //
+  //    Solution with singularity (not the one constructed below):
+  //      a = ( 1.0 / sqrt(T_zmax) + 1.0 / sqrt(T_zmin) ) / ( zmax - zmin )
+  //      b = -1.0 / sqrt(T_zmin)
+  //
+  //    Solution without singularity (the one constructed below):
+  //      a = ( 1.0 / sqrt(T_zmax) - 1.0 / sqrt(T_zmin) ) / ( zmax - zmin )
+  //      b = 1.0 / sqrt(T_zmin)
+
+  const double zmin ;    // lower end of the z interval
+  const double zmax ;    // upper end of the z interval
+  const double T_zmin ;  // prescribed T(zmin)
+  const double T_zmax ;  // prescribed T(zmax)
+  const double a ;       // slope inside T(z)
+  const double b ;       // offset inside T(z)
+  const double K ;       // coefficient for which T(z) solves the equation
+
+  // Derives a, b and K from the interval ends and boundary values.
+  // Inputs are not validated (sqrt of the boundary values, division by zmax - zmin).
+  ManufacturedSolution( const double arg_zmin ,
+                        const double arg_zmax ,
+                        const double arg_T_zmin ,
+                        const double arg_T_zmax )
+    : zmin( arg_zmin ) , zmax( arg_zmax )
+    , T_zmin( arg_T_zmin ) , T_zmax( arg_T_zmax )
+    , a( ( 1.0 / sqrt(arg_T_zmax) - 1.0 / sqrt(arg_T_zmin) ) / ( arg_zmax - arg_zmin ) )
+    , b( 1.0 / sqrt(arg_T_zmin) )
+    , K( 1.0 / ( 6.0 * a * a ) )
+    {}
+
+  // Evaluates T(z).
+  double operator()( const double z ) const
+  {
+    const double inv_sqrt_T = a * ( z - zmin ) + b ;
+    return 1.0 / ( inv_sqrt_T * inv_sqrt_T );
+  }
+};
+
+} /* namespace FENL */
+} /* namespace Example */
+} /* namespace Kokkos */
+
+//----------------------------------------------------------------------------
+
+namespace Kokkos {
+namespace Example {
+namespace FENL {
+
+template < class Space , BoxElemPart::ElemOrder ElemOrder >
+Perf fenl(                 // Runs 'use_trials' solves of the nonlinear problem on a box fixture and returns the collected statistics.
+  MPI_Comm comm ,          // communicator used for rank/size queries, reductions and barriers
+  const int use_print ,    // nonzero: print fixture and iteration diagnostics
+  const int use_trials ,   // number of repeated solves
+  const int use_atomic ,   // nonzero: element functor scatters atomically; zero: a separate gather-fill pass is used
+  const int use_elems[] )  // use_elems[0..2] are forwarded to the fixture constructor
+{
+  typedef Kokkos::Example::BoxElemFixture< Space , ElemOrder > FixtureType ;
+
+  typedef Kokkos::Example::CrsMatrix< double , Space >
+    SparseMatrixType ;
+
+  typedef typename SparseMatrixType::StaticCrsGraphType
+    SparseGraphType ;
+
+  typedef Kokkos::Example::FENL::NodeNodeGraph< typename FixtureType::elem_node_type , SparseGraphType , FixtureType::ElemNode >
+     NodeNodeGraphType ;
+
+  typedef Kokkos::Example::FENL::ElementComputation< FixtureType , SparseMatrixType >
+    ElementComputationType ;
+
+  typedef Kokkos::Example::FENL::DirichletComputation< FixtureType , SparseMatrixType >
+    DirichletComputationType ;
+
+  typedef NodeElemGatherFill< ElementComputationType >
+    NodeElemGatherFillType ;
+
+  typedef typename ElementComputationType::vector_type VectorType ;
+
+  typedef Kokkos::Example::VectorImport<
+     typename FixtureType::comm_list_type ,
+     typename FixtureType::send_nodeid_type ,
+     VectorType > ImportType ;
+
+  //------------------------------------
+
+  const unsigned newton_iteration_limit     = 10 ;
+  const double   newton_iteration_tolerance = 1e-7 ;
+  const unsigned cg_iteration_limit         = 200 ;
+  const double   cg_iteration_tolerance     = 1e-7 ;
+
+  //------------------------------------
+  // Printing guarded by print_flag reads views directly on the host, so it is enabled only for HostSpace memory.
+  const int print_flag = use_print && Kokkos::Impl::is_same< Kokkos::HostSpace , typename Space::memory_space >::value ;
+
+  int comm_rank ;
+  int comm_size ;
+
+  MPI_Comm_rank( comm , & comm_rank );
+  MPI_Comm_size( comm , & comm_size );
+
+  // Decompose by node to avoid mpi-communication for assembly
+
+  const float bubble_x = 1.0 ;
+  const float bubble_y = 1.0 ;
+  const float bubble_z = 1.0 ;
+
+  const FixtureType fixture( BoxElemPart::DecomposeNode , comm_size , comm_rank ,
+                             use_elems[0] , use_elems[1] , use_elems[2] ,
+                             bubble_x , bubble_y , bubble_z );
+
+  // With MPI the error flags are summed, so every rank throws if any rank's fixture is not ok.
+  {
+    int global_error = ! fixture.ok();
+
+#if defined( KOKKOS_HAVE_MPI )
+    int local_error = global_error ;
+    global_error = 0 ;
+    MPI_Allreduce( & local_error , & global_error , 1 , MPI_INT , MPI_SUM , comm );
+#endif
+
+    if ( global_error ) {
+      throw std::runtime_error(std::string("Error generating finite element fixture"));
+    }
+  }
+
+  //------------------------------------
+  // Importer built from the fixture's send/receive lists; applied to nodal_solution at the start of each Newton iteration.
+  const ImportType comm_nodal_import(
+    comm ,
+    fixture.recv_node() ,
+    fixture.send_node() ,
+    fixture.send_nodeid() ,
+    fixture.node_count_owned() ,
+    fixture.node_count() - fixture.node_count_owned() );
+
+  //------------------------------------
+  // Boundary values; they define the manufactured solution on [0,1], which in turn supplies K and the Dirichlet values.
+  const double bc_lower_value = 1 ;
+  const double bc_upper_value = 2 ;
+
+  const Kokkos::Example::FENL::ManufacturedSolution
+    manufactured_solution( 0 , 1 , bc_lower_value , bc_upper_value  );
+
+  //------------------------------------
+  // Print the fixture one rank at a time through host mirrors; every rank takes part in each barrier.
+  for ( int k = 0 ; k < comm_size && use_print ; ++k ) {
+    if ( k == comm_rank ) {
+      typename FixtureType::node_grid_type::HostMirror
+        h_node_grid = Kokkos::create_mirror_view( fixture.node_grid() );
+
+      typename FixtureType::node_coord_type::HostMirror
+        h_node_coord = Kokkos::create_mirror_view( fixture.node_coord() );
+
+      typename FixtureType::elem_node_type::HostMirror
+        h_elem_node = Kokkos::create_mirror_view( fixture.elem_node() );
+
+      Kokkos::deep_copy( h_node_grid , fixture.node_grid() );
+      Kokkos::deep_copy( h_node_coord , fixture.node_coord() );
+      Kokkos::deep_copy( h_elem_node , fixture.elem_node() );
+
+      std::cout << "MPI[" << comm_rank << "]" << std::endl ;
+      std::cout << "Node grid {" ;
+      for ( unsigned inode = 0 ; inode < fixture.node_count() ; ++inode ) {
+        std::cout << " (" << h_node_grid(inode,0)
+                  << "," << h_node_grid(inode,1)
+                  << "," << h_node_grid(inode,2)
+                  << ")" ;
+      }
+      std::cout << " }" << std::endl ;
+  
+      std::cout << "Node coord {" ;
+      for ( unsigned inode = 0 ; inode < fixture.node_count() ; ++inode ) {
+        std::cout << " (" << h_node_coord(inode,0)
+                  << "," << h_node_coord(inode,1)
+                  << "," << h_node_coord(inode,2)
+                  << ")" ;
+      }
+      std::cout << " }" << std::endl ;
+
+      std::cout << "Manufactured solution"
+                << " a[" << manufactured_solution.a << "]"
+                << " b[" << manufactured_solution.b << "]"
+                << " K[" << manufactured_solution.K << "]"
+                << " {" ;
+      for ( unsigned inode = 0 ; inode < fixture.node_count() ; ++inode ) {
+        std::cout << " " << manufactured_solution( h_node_coord( inode , 2 ) );
+      }
+      std::cout << " }" << std::endl ;
+
+      std::cout << "ElemNode {" << std::endl ;
+      for ( unsigned ielem = 0 ; ielem < fixture.elem_count() ; ++ielem ) {
+        std::cout << "  elem[" << ielem << "]{" ;
+        for ( unsigned inode = 0 ; inode < FixtureType::ElemNode ; ++inode ) {
+          std::cout << " " << h_elem_node(ielem,inode);
+        }
+        std::cout << " }{" ;
+        for ( unsigned inode = 0 ; inode < FixtureType::ElemNode ; ++inode ) {
+          std::cout << " (" << h_node_grid(h_elem_node(ielem,inode),0)
+                    << "," << h_node_grid(h_elem_node(ielem,inode),1)
+                    << "," << h_node_grid(h_elem_node(ielem,inode),2)
+                    << ")" ;
+        }
+        std::cout << " }" << std::endl ;
+      }
+      std::cout << "}" << std::endl ;
+    }
+    std::cout.flush();
+    MPI_Barrier( comm );
+  }
+
+  //------------------------------------
+
+  Kokkos::Timer wall_clock ;
+
+  Perf perf_stats = Perf() ;
+  // perf_stats takes trial 0's results; later trials only lower the timing fields listed at the end of the loop.
+  for ( int itrial = 0 ; itrial < use_trials ; ++itrial ) {
+
+    Perf perf = Perf() ;
+
+    perf.global_elem_count = fixture.elem_count_global();
+    perf.global_node_count = fixture.node_count_global();
+
+    //----------------------------------
+    // Create the sparse matrix graph and element-to-graph map
+    // from the element->to->node identifier array.
+    // The graph only has rows for the owned nodes.
+
+    typename NodeNodeGraphType::Times graph_times;
+
+    const NodeNodeGraphType
+      mesh_to_graph( fixture.elem_node() , fixture.node_count_owned(), graph_times );
+
+    perf.map_ratio          = maximum(comm, graph_times.ratio);
+    perf.fill_node_set      = maximum(comm, graph_times.fill_node_set);
+    perf.scan_node_count    = maximum(comm, graph_times.scan_node_count);
+    perf.fill_graph_entries = maximum(comm, graph_times.fill_graph_entries);
+    perf.sort_graph_entries = maximum(comm, graph_times.sort_graph_entries);
+    perf.fill_element_graph = maximum(comm, graph_times.fill_element_graph);
+
+    wall_clock.reset();
+    // Create the sparse matrix from the graph:
+
+    SparseMatrixType jacobian( mesh_to_graph.graph );
+
+    Space::fence();
+
+    perf.create_sparse_matrix = maximum( comm , wall_clock.seconds() );
+
+    //----------------------------------
+
+    for ( int k = 0 ; k < comm_size && print_flag ; ++k ) {
+      if ( k == comm_rank ) {
+        const unsigned nrow = jacobian.graph.numRows();
+        std::cout << "MPI[" << comm_rank << "]" << std::endl ;
+        std::cout << "JacobianGraph {" << std::endl ;
+        for ( unsigned irow = 0 ; irow < nrow ; ++irow ) {
+          std::cout << "  row[" << irow << "]{" ;
+          const unsigned entry_end = jacobian.graph.row_map(irow+1);
+          for ( unsigned entry = jacobian.graph.row_map(irow) ; entry < entry_end ; ++entry ) {
+            std::cout << " " << jacobian.graph.entries(entry);
+          }
+          std::cout << " }" << std::endl ;
+        }
+        std::cout << "}" << std::endl ;
+
+        std::cout << "ElemGraph {" << std::endl ;
+        for ( unsigned ielem = 0 ; ielem < mesh_to_graph.elem_graph.dimension_0() ; ++ielem ) {
+          std::cout << "  elem[" << ielem << "]{" ;
+          for ( unsigned irow = 0 ; irow < mesh_to_graph.elem_graph.dimension_1() ; ++irow ) {
+            std::cout << " {" ;
+            for ( unsigned icol = 0 ; icol < mesh_to_graph.elem_graph.dimension_2() ; ++icol ) {
+              std::cout << " " << mesh_to_graph.elem_graph(ielem,irow,icol);
+            }
+            std::cout << " }" ;
+          }
+          std::cout << " }" << std::endl ;
+        }
+        std::cout << "}" << std::endl ;
+      }
+      std::cout.flush();
+      MPI_Barrier( comm );
+    }
+
+    //----------------------------------
+
+    // Allocate solution vector for each node in the mesh and residual vector for each owned node
+    const VectorType nodal_solution( "nodal_solution" , fixture.node_count() );
+    const VectorType nodal_residual( "nodal_residual" , fixture.node_count_owned() );
+    const VectorType nodal_delta(    "nodal_delta" ,    fixture.node_count_owned() );
+
+    // Create element computation functor
+    const ElementComputationType elemcomp(
+      use_atomic ? ElementComputationType( fixture , manufactured_solution.K , nodal_solution ,
+                                           mesh_to_graph.elem_graph , jacobian , nodal_residual )
+                 : ElementComputationType( fixture , manufactured_solution.K , nodal_solution ) );
+
+    const NodeElemGatherFillType gatherfill(
+      use_atomic ? NodeElemGatherFillType()
+                 : NodeElemGatherFillType( fixture.elem_node() ,
+                                           mesh_to_graph.elem_graph ,
+                                           nodal_residual ,
+                                           jacobian ,
+                                           elemcomp.elem_residuals ,
+                                           elemcomp.elem_jacobians ) );
+
+    // Create boundary condition functor
+    const DirichletComputationType dirichlet(
+      fixture , nodal_solution , jacobian , nodal_residual ,
+      2 /* apply at 'z' ends */ ,
+      manufactured_solution.T_zmin ,
+      manufactured_solution.T_zmax );
+
+    //----------------------------------
+    // Nonlinear Newton iteration:
+
+    double residual_norm_init = 0 ;
+    // residual_norm_init is set to the residual norm of the first iteration and used as the convergence reference.
+    for ( perf.newton_iter_count = 0 ;
+          perf.newton_iter_count < newton_iteration_limit ;
+          ++perf.newton_iter_count ) {
+
+      //--------------------------------
+
+      comm_nodal_import( nodal_solution );
+
+      //--------------------------------
+      // Element contributions to residual and jacobian
+
+      wall_clock.reset();
+
+      Kokkos::deep_copy( nodal_residual , double(0) );
+      Kokkos::deep_copy( jacobian.coeff , double(0) );
+
+      elemcomp.apply();
+
+      if ( ! use_atomic ) {
+        gatherfill.apply();
+      }
+
+      Space::fence();
+      perf.fill_time = maximum( comm , wall_clock.seconds() );
+      // NOTE(review): fill_time (and bc_time below) is overwritten each Newton iteration, while cg_time accumulates - confirm intended.
+      //--------------------------------
+      // Apply boundary conditions
+
+      wall_clock.reset();
+
+      dirichlet.apply();
+
+      Space::fence();
+      perf.bc_time = maximum( comm , wall_clock.seconds() );
+
+      //--------------------------------
+      // Evaluate convergence
+
+      const double residual_norm =
+        std::sqrt(
+          Kokkos::Example::all_reduce(
+            Kokkos::Example::dot( fixture.node_count_owned() , nodal_residual, nodal_residual ) , comm ) );
+
+      perf.newton_residual = residual_norm ;
+
+      if ( 0 == perf.newton_iter_count ) { residual_norm_init = residual_norm ; }
+      // Converged once the residual norm is below newton_iteration_tolerance times the first iteration's norm.
+      if ( residual_norm < residual_norm_init * newton_iteration_tolerance ) { break ; }
+
+      //--------------------------------
+      // Solve for nonlinear update
+
+      CGSolveResult cg_result ;
+
+      Kokkos::Example::cgsolve( comm_nodal_import
+                              , jacobian
+                              , nodal_residual
+                              , nodal_delta
+                              , cg_iteration_limit
+                              , cg_iteration_tolerance
+                              , & cg_result
+                              );
+
+      // Update solution vector (presumably solution <- solution - delta; confirm against waxpby's argument order)
+
+      Kokkos::Example::waxpby( fixture.node_count_owned() , nodal_solution , -1.0 , nodal_delta , 1.0 , nodal_solution );
+
+      perf.cg_iter_count += cg_result.iteration ;
+      perf.matvec_time   += cg_result.matvec_time ;
+      perf.cg_time       += cg_result.iter_time ;
+
+      //--------------------------------
+
+      if ( print_flag ) {
+        const double delta_norm =
+          std::sqrt(
+            Kokkos::Example::all_reduce(
+              Kokkos::Example::dot( fixture.node_count_owned() , nodal_delta, nodal_delta ) , comm ) );
+
+        if ( 0 == comm_rank ) {
+          std::cout << "Newton iteration[" << perf.newton_iter_count << "]"
+                    << " residual[" << perf.newton_residual << "]"
+                    << " update[" << delta_norm << "]"
+                    << " cg_iteration[" << cg_result.iteration << "]"
+                    << " cg_residual[" << cg_result.norm_res << "]"
+                    << std::endl ;
+        }
+
+        for ( int k = 0 ; k < comm_size ; ++k ) {
+          if ( k == comm_rank ) {
+            const unsigned nrow = jacobian.graph.numRows();
+
+            std::cout << "MPI[" << comm_rank << "]" << std::endl ;
+            std::cout << "Residual {" ;
+            for ( unsigned irow = 0 ; irow < nrow ; ++irow ) {
+              std::cout << " " << nodal_residual(irow);
+            }
+            std::cout << " }" << std::endl ;
+
+            std::cout << "Delta {" ;
+            for ( unsigned irow = 0 ; irow < nrow ; ++irow ) {
+              std::cout << " " << nodal_delta(irow);
+            }
+            std::cout << " }" << std::endl ;
+
+            std::cout << "Solution {" ;
+            for ( unsigned irow = 0 ; irow < nrow ; ++irow ) {
+              std::cout << " " << nodal_solution(irow);
+            }
+            std::cout << " }" << std::endl ;
+
+            std::cout << "Jacobian[ "
+                      << jacobian.graph.numRows() << " x " << Kokkos::maximum_entry( jacobian.graph )
+                      << " ] {" << std::endl ;
+            for ( unsigned irow = 0 ; irow < nrow ; ++irow ) {
+              std::cout << "  {" ;
+              const unsigned entry_end = jacobian.graph.row_map(irow+1);
+              for ( unsigned entry = jacobian.graph.row_map(irow) ; entry < entry_end ; ++entry ) {
+                std::cout << " (" << jacobian.graph.entries(entry)
+                          << "," << jacobian.coeff(entry)
+                          << ")" ;
+              }
+              std::cout << " }" << std::endl ;
+            }
+            std::cout << "}" << std::endl ;
+          }
+          std::cout.flush();
+          MPI_Barrier( comm );
+        }
+      }
+      //--------------------------------
+    }
+
+    // Evaluate solution error
+
+    if ( 0 == itrial ) {
+      const typename FixtureType::node_coord_type::HostMirror
+        h_node_coord = Kokkos::create_mirror_view( fixture.node_coord() );
+
+      const typename VectorType::HostMirror
+        h_nodal_solution = Kokkos::create_mirror_view( nodal_solution );
+
+      Kokkos::deep_copy( h_node_coord , fixture.node_coord() );
+      Kokkos::deep_copy( h_nodal_solution , nodal_solution );
+
+      double error_max = 0 ;
+      for ( unsigned inode = 0 ; inode < fixture.node_count_owned() ; ++inode ) {
+        const double answer = manufactured_solution( h_node_coord( inode , 2 ) );
+        const double error = ( h_nodal_solution(inode) - answer ) / answer ;
+        if ( error_max < fabs( error ) ) { error_max = fabs( error ); }
+      }
+      // NOTE(review): the square root of the global maximum relative error is what gets stored - confirm intended.
+      perf.error_max = std::sqrt( Kokkos::Example::all_reduce_max( error_max , comm ) );
+
+      perf_stats = perf ;
+    }
+    else { // later trials: keep the minimum of these timings; every other field keeps trial 0's value
+      perf_stats.fill_node_set = std::min( perf_stats.fill_node_set , perf.fill_node_set );
+      perf_stats.scan_node_count = std::min( perf_stats.scan_node_count , perf.scan_node_count );
+      perf_stats.fill_graph_entries = std::min( perf_stats.fill_graph_entries , perf.fill_graph_entries );
+      perf_stats.sort_graph_entries = std::min( perf_stats.sort_graph_entries , perf.sort_graph_entries );
+      perf_stats.fill_element_graph = std::min( perf_stats.fill_element_graph , perf.fill_element_graph );
+      perf_stats.create_sparse_matrix = std::min( perf_stats.create_sparse_matrix , perf.create_sparse_matrix );
+      perf_stats.fill_time = std::min( perf_stats.fill_time , perf.fill_time );
+      perf_stats.bc_time = std::min( perf_stats.bc_time , perf.bc_time );
+      perf_stats.cg_time = std::min( perf_stats.cg_time , perf.cg_time );
+    }
+  }
+  // If use_trials <= 0 the loop never runs and the value-initialized Perf is returned.
+  return perf_stats ;
+}
+
+} /* namespace FENL */
+} /* namespace Example */
+} /* namespace Kokkos */
+
+#endif /* #ifndef KOKKOS_EXAMPLE_FENL_IMPL_HPP */
+
diff --git a/lib/kokkos/example/fenl/main.cpp b/lib/kokkos/example/fenl/main.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..06005d97cb17d20fcf8759f08f76823a936ee558
--- /dev/null
+++ b/lib/kokkos/example/fenl/main.cpp
@@ -0,0 +1,422 @@
+//@HEADER
+// ************************************************************************
+// 
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+// 
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+// 
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact  H. Carter Edwards (hcedwar@sandia.gov)
+// 
+// ************************************************************************
+//@HEADER
+
+#include <math.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <strings.h>
+
+#include <utility>
+#include <string>
+#include <vector>
+#include <sstream>
+#include <iostream>
+#include <iomanip>
+
+#include <Kokkos_Core.hpp>
+
+#include <WrapMPI.hpp>
+#include <fenl.hpp>
+
+// For vtune
+#include <sys/types.h>
+#include <unistd.h>
+
+//----------------------------------------------------------------------------
+// Slot indices into the int cmdline[ CMD_COUNT ] array that main() fills from argv.
+enum { CMD_USE_THREADS = 0         // "threads N": N ; non-zero selects the Threads back end
+     , CMD_USE_NUMA                // "cores AxB": A
+     , CMD_USE_CORE_PER_NUMA       // "cores AxB": B
+     , CMD_USE_CUDA                // 1 after "cuda" or "cuda-dev N"
+     , CMD_USE_OPENMP              // "openmp N": N ; non-zero selects the OpenMP back end
+     , CMD_USE_CUDA_DEV            // "cuda-dev N": N , given to Kokkos::Cuda::SelectDevice
+     , CMD_USE_FIXTURE_X           // "fixture XxYxZ": X
+     , CMD_USE_FIXTURE_Y           // "fixture XxYxZ": Y
+     , CMD_USE_FIXTURE_Z           // "fixture XxYxZ": Z
+     , CMD_USE_FIXTURE_BEGIN       // "fixture-range B..E": B
+     , CMD_USE_FIXTURE_END         // "fixture-range B..E": E
+     , CMD_USE_FIXTURE_QUADRATIC   // 1 after "fixture-quadratic"
+     , CMD_USE_ATOMIC              // 1 after "atomic"
+     , CMD_USE_TRIALS              // "trials N": N ; main() replaces 0 with 1 before running
+     , CMD_VTUNE                   // 1 after "vtune"
+     , CMD_PRINT                   // 1 after "print"
+     , CMD_ECHO                    // 1 after "echo": settings are printed, nothing is run
+     , CMD_ERROR                   // 1 after a rejected argument ; main() then returns -1
+     , CMD_COUNT };                // number of slots
+// Write every non-zero setting held in cmd[] to s as a single line of text.
+// cmd is indexed by the CMD_* slots; the line always ends with std::endl,
+// so an all-zero cmd[] still produces an empty line.
+void print_cmdline( std::ostream & s , const int cmd[] )
+{
+  const int numa_count    = cmd[ CMD_USE_NUMA ] ;
+  const int core_per_numa = cmd[ CMD_USE_CORE_PER_NUMA ] ;
+
+  // Host back ends: thread count followed by the shared NUMA settings.
+  if ( 0 != cmd[ CMD_USE_THREADS ] ) {
+    s << " Threads(" << cmd[ CMD_USE_THREADS ] << ") NUMA(" << numa_count
+      << ") CORE_PER_NUMA(" << core_per_numa << ")" ;
+  }
+  if ( 0 != cmd[ CMD_USE_OPENMP ] ) {
+    s << " OpenMP(" << cmd[ CMD_USE_OPENMP ] << ") NUMA(" << numa_count
+      << ") CORE_PER_NUMA(" << core_per_numa << ")" ;
+  }
+
+  // Problem size: an explicit box, a range of sizes, and the element order.
+  if ( 0 != cmd[ CMD_USE_FIXTURE_X ] ) {
+    s << " Fixture(" << cmd[ CMD_USE_FIXTURE_X ] << "x" << cmd[ CMD_USE_FIXTURE_Y ]
+      << "x" << cmd[ CMD_USE_FIXTURE_Z ] << ")" ;
+  }
+  if ( 0 != cmd[ CMD_USE_FIXTURE_BEGIN ] ) {
+    s << " Fixture( " << cmd[ CMD_USE_FIXTURE_BEGIN ]
+      << " .. " << cmd[ CMD_USE_FIXTURE_END ] << " )" ;
+  }
+  if ( 0 != cmd[ CMD_USE_FIXTURE_QUADRATIC ] ) {
+    s << " Quadratic-Element" ;
+  }
+
+  // The CUDA entry carries the requested device id.
+  if ( 0 != cmd[ CMD_USE_CUDA ] ) {
+    s << " CUDA(" << cmd[ CMD_USE_CUDA_DEV ] << ")" ;
+  }
+
+  // Remaining switches and the trial count.
+  if ( 0 != cmd[ CMD_USE_ATOMIC ] ) { s << " ATOMIC" ; }
+  if ( 0 != cmd[ CMD_USE_TRIALS ] ) {
+    s << " TRIALS(" << cmd[ CMD_USE_TRIALS ] << ")" ;
+  }
+  if ( 0 != cmd[ CMD_VTUNE ] ) { s << " VTUNE" ; }
+  if ( 0 != cmd[ CMD_PRINT ] ) { s << " PRINT" ; }
+
+  s << std::endl ;
+}
+// Print one comma-separated table row for perf ; column k is padded to widths[k] (16 columns).
+void print_perf_value( std::ostream & s , const std::vector<size_t> & widths,  const Kokkos::Example::FENL::Perf & perf )
+{
+  // Counts and the map ratio are printed as stored.
+  s << std::setw( widths[0] ) << perf.global_elem_count << " ,";
+  s << std::setw( widths[1] ) << perf.global_node_count << " ,";
+  s << std::setw( widths[2] ) << perf.newton_iter_count << " ,";
+  s << std::setw( widths[3] ) << perf.cg_iter_count << " ,";
+  s << std::setw( widths[4] ) << perf.map_ratio << " ,";
+  // Times are scaled by 1000 and divided by the global node count (and by CG iterations where shown).
+  s << std::setw( widths[5] ) << ( perf.fill_node_set * 1000.0 ) / perf.global_node_count << " ,";
+  s << std::setw( widths[6] ) << ( perf.scan_node_count * 1000.0 ) / perf.global_node_count << " ,";
+  s << std::setw( widths[7] ) << ( perf.fill_graph_entries * 1000.0 ) / perf.global_node_count << " ,";
+  s << std::setw( widths[8] ) << ( perf.sort_graph_entries * 1000.0 ) / perf.global_node_count << " ,";
+  s << std::setw( widths[9] ) << ( perf.fill_element_graph * 1000.0 ) / perf.global_node_count << " ,";
+  s << std::setw( widths[10] ) << ( perf.create_sparse_matrix * 1000.0 ) / perf.global_node_count << " ,";
+  s << std::setw( widths[11] ) << ( perf.fill_time * 1000.0 ) / perf.global_node_count << " ,";
+  s << std::setw( widths[12] ) << ( perf.bc_time * 1000.0 ) / perf.global_node_count << " ,";
+  s << std::setw( widths[13] ) << ( ( perf.matvec_time * 1000.0 ) / perf.cg_iter_count ) / perf.global_node_count << " ,";
+  s << std::setw( widths[14] ) << ( ( perf.cg_time * 1000.0 ) / perf.cg_iter_count ) / perf.global_node_count << " ,";
+  s << std::setw( widths[15] ) << perf.error_max << std::endl ;
+}
+// Print the result-table header on rank 0, then run the FENL example on Device
+// for one fixture or for a doubling range of sizes, one table row per run.
+// ElemOrder is unused: cmd[ CMD_USE_FIXTURE_QUADRATIC ] picks the element order.
+template< class Device , Kokkos::Example::BoxElemPart::ElemOrder ElemOrder >
+void run( MPI_Comm comm , const int cmd[] )
+{
+  int comm_rank = 0 ;
+
+#if defined( KOKKOS_HAVE_MPI )
+  MPI_Comm_rank( comm , & comm_rank );
+#else
+  comm = 0 ;
+#endif
+
+  if ( 0 == comm_rank ) {
+    if ( cmd[ CMD_USE_THREADS ] ) { std::cout << "THREADS , " << cmd[ CMD_USE_THREADS ] ; }
+    else if ( cmd[ CMD_USE_OPENMP ] ) { std::cout << "OPENMP , " << cmd[ CMD_USE_OPENMP ] ; }
+    else if ( cmd[ CMD_USE_CUDA ] ) { std::cout << "CUDA" ; }
+
+    if ( cmd[ CMD_USE_FIXTURE_QUADRATIC ] ) { std::cout << " , QUADRATIC-ELEMENT" ; }
+    else { std::cout << " , LINEAR-ELEMENT" ; }
+
+    if ( cmd[ CMD_USE_ATOMIC ] ) { std::cout << " , USING ATOMICS" ; }
+  }
+
+  // Column titles (first) and units (second), in the order print_perf_value() prints.
+  std::vector< std::pair<std::string,std::string> > headers;
+  headers.push_back(std::make_pair("ELEMS","count"));
+  headers.push_back(std::make_pair("NODES","count"));
+  headers.push_back(std::make_pair("NEWTON","iter"));
+  headers.push_back(std::make_pair("CG","iter"));
+  headers.push_back(std::make_pair("MAP_RATIO","ratio"));
+  headers.push_back(std::make_pair("SET_FILL/NODE","millisec"));
+  headers.push_back(std::make_pair("SCAN/NODE","millisec"));
+  headers.push_back(std::make_pair("GRAPH_FILL/NODE","millisec"));
+  headers.push_back(std::make_pair("SORT/NODE","millisec"));
+  headers.push_back(std::make_pair("ELEM_GRAPH_FILL/NODE","millisec"));
+  headers.push_back(std::make_pair("MATRIX_CREATE/NODE","millisec"));
+  headers.push_back(std::make_pair("MATRIX_FILL/NODE","millisec"));
+  headers.push_back(std::make_pair("BOUNDARY/NODE","millisec"));
+  headers.push_back(std::make_pair("MAT_VEC/ITER/ROW","millisec"));
+  headers.push_back(std::make_pair("CG/ITER/ROW","millisec"));
+  headers.push_back(std::make_pair("ERROR","ratio"));
+
+  // find print widths
+  size_t min_width = 10;
+  std::vector< size_t > widths(headers.size());
+  for (size_t i=0, ie=headers.size(); i<ie; ++i)
+    widths[i] = std::max(min_width, headers[i].first.size()+1);
+
+  // print column headers
+  if ( 0 == comm_rank ) {
+    std::cout << std::endl ;
+    for (size_t i=0; i<headers.size(); ++i)
+      std::cout << std::setw(widths[i]) << headers[i].first << " ,";
+    std::cout << "\b\b  " << std::endl;
+    for (size_t i=0; i<headers.size(); ++i)
+      std::cout << std::setw(widths[i]) << headers[i].second << " ,";
+    std::cout << "\b\b  " << std::endl;
+
+    std::cout << std::scientific;
+    std::cout.precision(3);
+  }
+
+  if ( cmd[ CMD_USE_FIXTURE_BEGIN ] ) {
+    // The size doubles each pass.  Requiring 0 < i keeps a negative range start
+    // from doubling away from the bound until the int overflows.
+    for ( int i = cmd[CMD_USE_FIXTURE_BEGIN] ; 0 < i && i < cmd[CMD_USE_FIXTURE_END] * 2 ; i *= 2 ) {
+      int nelem[3] ;
+      nelem[0] = std::max( 1 , (int) cbrt( ((double) i) / 2.0 ) );
+      nelem[1] = 1 + nelem[0] ;
+      nelem[2] = 2 * nelem[0] ;
+      const Kokkos::Example::FENL::Perf perf =
+        cmd[ CMD_USE_FIXTURE_QUADRATIC ]
+        ? Kokkos::Example::FENL::fenl< Device , Kokkos::Example::BoxElemPart::ElemQuadratic >
+            ( comm , cmd[CMD_PRINT], cmd[CMD_USE_TRIALS], cmd[CMD_USE_ATOMIC], nelem )
+        : Kokkos::Example::FENL::fenl< Device , Kokkos::Example::BoxElemPart::ElemLinear >
+            ( comm , cmd[CMD_PRINT], cmd[CMD_USE_TRIALS], cmd[CMD_USE_ATOMIC], nelem )
+        ;
+
+      if ( 0 == comm_rank ) print_perf_value( std::cout , widths, perf );
+    }
+  }
+  else {
+    int nelem[3] = { cmd[ CMD_USE_FIXTURE_X ] ,
+                     cmd[ CMD_USE_FIXTURE_Y ] ,
+                     cmd[ CMD_USE_FIXTURE_Z ] };
+    const Kokkos::Example::FENL::Perf perf =
+      cmd[ CMD_USE_FIXTURE_QUADRATIC ]
+      ? Kokkos::Example::FENL::fenl< Device , Kokkos::Example::BoxElemPart::ElemQuadratic >
+          ( comm , cmd[CMD_PRINT], cmd[CMD_USE_TRIALS], cmd[CMD_USE_ATOMIC], nelem )
+      : Kokkos::Example::FENL::fenl< Device , Kokkos::Example::BoxElemPart::ElemLinear >
+          ( comm , cmd[CMD_PRINT], cmd[CMD_USE_TRIALS], cmd[CMD_USE_ATOMIC], nelem )
+      ;
+
+    if ( 0 == comm_rank ) print_perf_value( std::cout , widths, perf );
+  }
+}
+
+//----------------------------------------------------------------------------
+// Entry point.  Rank 0 parses the command line into cmdline[], which is then
+// broadcast when MPI is enabled; every enabled back end that was requested is
+// initialized, run and finalized.  Returns -1 after a rejected argument, else 0.
+int main( int argc , char ** argv )
+{
+  int comm_rank = 0 ;
+
+#if defined( KOKKOS_HAVE_MPI )
+  MPI_Init( & argc , & argv );
+  MPI_Comm comm = MPI_COMM_WORLD ;
+  MPI_Comm_rank( comm , & comm_rank );
+#else
+  MPI_Comm comm = 0 ;
+  (void) comm ; // suppress warning
+#endif
+
+  int cmdline[ CMD_COUNT ] ;
+
+  for ( int i = 0 ; i < CMD_COUNT ; ++i ) cmdline[i] = 0 ;
+
+  if ( 0 == comm_rank ) {
+    for ( int i = 1 ; i < argc ; ++i ) {
+      // Options that take a value only match when one follows; otherwise they
+      // reach the final else and are reported instead of reading argv[argc].
+      const bool has_value = ( i + 1 < argc );
+      if ( has_value && 0 == strcasecmp( argv[i] , "threads" ) ) {
+        cmdline[ CMD_USE_THREADS ] = atoi( argv[++i] );
+      }
+      else if ( has_value && 0 == strcasecmp( argv[i] , "openmp" ) ) {
+        cmdline[ CMD_USE_OPENMP ] = atoi( argv[++i] );
+      }
+      else if ( has_value && 0 == strcasecmp( argv[i] , "cores" ) ) {
+        sscanf( argv[++i] , "%dx%d" ,
+                cmdline + CMD_USE_NUMA ,
+                cmdline + CMD_USE_CORE_PER_NUMA );
+      }
+      else if ( 0 == strcasecmp( argv[i] , "cuda" ) ) {
+        cmdline[ CMD_USE_CUDA ] = 1 ;
+      }
+      else if ( has_value && 0 == strcasecmp( argv[i] , "cuda-dev" ) ) {
+        cmdline[ CMD_USE_CUDA ] = 1 ;
+        cmdline[ CMD_USE_CUDA_DEV ] = atoi( argv[++i] ) ;
+      }
+      else if ( has_value && 0 == strcasecmp( argv[i] , "fixture" ) ) {
+        sscanf( argv[++i] , "%dx%dx%d" ,
+                cmdline + CMD_USE_FIXTURE_X ,
+                cmdline + CMD_USE_FIXTURE_Y ,
+                cmdline + CMD_USE_FIXTURE_Z );
+      }
+      else if ( has_value && 0 == strcasecmp( argv[i] , "fixture-range" ) ) {
+        sscanf( argv[++i] , "%d..%d" ,
+                cmdline + CMD_USE_FIXTURE_BEGIN ,
+                cmdline + CMD_USE_FIXTURE_END );
+      }
+      else if ( 0 == strcasecmp( argv[i] , "fixture-quadratic" ) ) {
+        cmdline[ CMD_USE_FIXTURE_QUADRATIC ] = 1 ;
+      }
+      else if ( 0 == strcasecmp( argv[i] , "atomic" ) ) {
+        cmdline[ CMD_USE_ATOMIC ] = 1 ;
+      }
+      else if ( has_value && 0 == strcasecmp( argv[i] , "trials" ) ) {
+        cmdline[ CMD_USE_TRIALS ] = atoi( argv[++i] ) ;
+      }
+      else if ( 0 == strcasecmp( argv[i] , "vtune" ) ) {
+        cmdline[ CMD_VTUNE ] = 1 ;
+      }
+      else if ( 0 == strcasecmp( argv[i] , "print" ) ) {
+        cmdline[ CMD_PRINT ] = 1 ;
+      }
+      else if ( 0 == strcasecmp( argv[i] , "echo" ) ) {
+        cmdline[ CMD_ECHO ] = 1 ;
+      }
+      else {
+        cmdline[ CMD_ERROR ] = 1 ;
+        std::cerr << "Unrecognized or incomplete command line argument #" << i << ": " << argv[i] << std::endl ;
+      }
+    }
+
+    if ( cmdline[ CMD_ECHO ] ) { print_cmdline( std::cout , cmdline ); }
+  }
+  // Every rank needs the settings that only rank 0 parsed.
+#if defined( KOKKOS_HAVE_MPI )
+  MPI_Bcast( cmdline , CMD_COUNT , MPI_INT , 0 , comm );
+#endif
+  // Optionally start a background VTune collection targeting this process id.
+  if ( cmdline[ CMD_VTUNE ] ) {
+    std::stringstream cmd;
+    pid_t my_os_pid=getpid();
+    const std::string vtune_loc =
+      "/usr/local/intel/vtune_amplifier_xe_2013/bin64/amplxe-cl";
+    const std::string output_dir = "./vtune/vtune.";
+    const int p_rank = comm_rank;
+    cmd << vtune_loc
+        << " -collect hotspots -result-dir " << output_dir << p_rank
+        << " -target-pid " << my_os_pid << " &";
+    if (p_rank == 0)
+      std::cout << cmd.str() << std::endl;
+    system(cmd.str().c_str());
+    system("sleep 10");
+  }
+
+  if ( ! cmdline[ CMD_ERROR ] && ! cmdline[ CMD_ECHO ] ) {
+    // Defaults: one trial, and a 2x2x2 fixture when no size or range was given.
+    if ( ! cmdline[ CMD_USE_TRIALS ] ) { cmdline[ CMD_USE_TRIALS ] = 1 ; }
+
+    if ( ! cmdline[ CMD_USE_FIXTURE_X ] && ! cmdline[ CMD_USE_FIXTURE_BEGIN ] ) {
+      cmdline[ CMD_USE_FIXTURE_X ] = 2 ;
+      cmdline[ CMD_USE_FIXTURE_Y ] = 2 ;
+      cmdline[ CMD_USE_FIXTURE_Z ] = 2 ;
+    }
+
+#if defined( KOKKOS_HAVE_PTHREAD )
+    if ( cmdline[ CMD_USE_THREADS ] ) {
+
+      if ( cmdline[ CMD_USE_NUMA ] && cmdline[ CMD_USE_CORE_PER_NUMA ] ) {
+        Kokkos::Threads::initialize( cmdline[ CMD_USE_THREADS ] ,
+                                     cmdline[ CMD_USE_NUMA ] ,
+                                     cmdline[ CMD_USE_CORE_PER_NUMA ] );
+      }
+      else {
+        Kokkos::Threads::initialize( cmdline[ CMD_USE_THREADS ] );
+      }
+
+      run< Kokkos::Threads , Kokkos::Example::BoxElemPart::ElemLinear >( comm , cmdline );
+
+      Kokkos::Threads::finalize();
+    }
+
+#endif
+
+#if defined( KOKKOS_HAVE_OPENMP )
+    if ( cmdline[ CMD_USE_OPENMP ] ) {
+
+      if ( cmdline[ CMD_USE_NUMA ] && cmdline[ CMD_USE_CORE_PER_NUMA ] ) {
+        Kokkos::OpenMP::initialize( cmdline[ CMD_USE_OPENMP ] ,
+                                     cmdline[ CMD_USE_NUMA ] ,
+                                     cmdline[ CMD_USE_CORE_PER_NUMA ] );
+      }
+      else {
+        Kokkos::OpenMP::initialize( cmdline[ CMD_USE_OPENMP ] );
+      }
+
+      run< Kokkos::OpenMP , Kokkos::Example::BoxElemPart::ElemLinear >( comm , cmdline );
+
+      Kokkos::OpenMP::finalize();
+    }
+
+#endif
+
+#if defined( KOKKOS_HAVE_CUDA )
+    if ( cmdline[ CMD_USE_CUDA ] ) {
+      // Use the device chosen with "cuda-dev" (device 0 when it was not given):
+      Kokkos::HostSpace::execution_space::initialize();
+      Kokkos::Cuda::initialize( Kokkos::Cuda::SelectDevice( cmdline[ CMD_USE_CUDA_DEV ] ) );
+
+      run< Kokkos::Cuda , Kokkos::Example::BoxElemPart::ElemLinear >( comm , cmdline );
+
+      Kokkos::Cuda::finalize();
+      Kokkos::HostSpace::execution_space::finalize();
+    }
+
+#endif
+  }
+
+#if defined( KOKKOS_HAVE_MPI )
+  MPI_Finalize();
+#endif
+
+  return cmdline[ CMD_ERROR ] ? -1 : 0 ;
+}
+
diff --git a/lib/kokkos/example/fixture/BoxElemFixture.hpp b/lib/kokkos/example/fixture/BoxElemFixture.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..66d6e741afcc40d6e7b838bb0712ab5c1652ffe8
--- /dev/null
+++ b/lib/kokkos/example/fixture/BoxElemFixture.hpp
@@ -0,0 +1,355 @@
+/*
+//@HEADER
+// ************************************************************************
+// 
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+// 
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+// 
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact  H. Carter Edwards (hcedwar@sandia.gov)
+// 
+// ************************************************************************
+//@HEADER
+*/
+
+#ifndef KOKKOS_EXAMPLE_BOXELEMFIXTURE_HPP
+#define KOKKOS_EXAMPLE_BOXELEMFIXTURE_HPP
+
+#include <stdio.h>
+#include <utility>
+
+#include <Kokkos_Core.hpp>
+
+#include <HexElement.hpp>
+#include <BoxElemPart.hpp>
+
+//----------------------------------------------------------------------------
+
+namespace Kokkos {
+namespace Example {
+
+/** \brief  Map a grid onto a unit cube with smooth nonlinear grading
+ *          of the map.  The data members are intentionally not const, so the
+ *          map stays copy-assignable as BoxElemFixture::operator= requires. */
+struct MapGridUnitCube {
+
+  float m_a ;        // weight of the grading term along x (bubble_x)
+  float m_b ;        // weight of the grading term along y (bubble_y)
+  float m_c ;        // weight of the grading term along z (bubble_z)
+  size_t m_max_x ;   // grid index along x that maps to coordinate 1
+  size_t m_max_y ;   // grid index along y that maps to coordinate 1
+  size_t m_max_z ;   // grid index along z that maps to coordinate 1
+
+  MapGridUnitCube( const size_t grid_max_x ,
+                   const size_t grid_max_y ,
+                   const size_t grid_max_z ,
+                   const float bubble_x ,
+                   const float bubble_y ,
+                   const float bubble_z )
+    : m_a( bubble_x )
+    , m_b( bubble_y )
+    , m_c( bubble_z )
+    , m_max_x( grid_max_x )
+    , m_max_y( grid_max_y )
+    , m_max_z( grid_max_z )
+    {}
+
+  template< typename Scalar >
+  KOKKOS_INLINE_FUNCTION
+  void operator()( int grid_x ,
+                   int grid_y ,
+                   int grid_z ,
+                   Scalar & coord_x ,
+                   Scalar & coord_y ,
+                   Scalar & coord_z ) const
+    {
+      // Map to a unit cube [0,1]^3
+
+      const double x = double(grid_x) / double(m_max_x);
+      const double y = double(grid_y) / double(m_max_y);
+      const double z = double(grid_z) / double(m_max_z);
+      // t + t^2 (t-1)^2 * weight ; the added term vanishes at t = 0 and t = 1.
+      coord_x = x + x * x * ( x - 1 ) * ( x - 1 ) * m_a ;
+      coord_y = y + y * y * ( y - 1 ) * ( y - 1 ) * m_b ;
+      coord_z = z + z * z * ( z - 1 ) * ( z - 1 ) * m_c ;
+    }
+};
+
+} // namespace Example
+} // namespace Kokkos
+
+//----------------------------------------------------------------------------
+
+namespace Kokkos {
+namespace Example {
+
+/** \brief  Generate a distributed unstructured finite element mesh
+ *          from a partitioned NX*NY*NZ box of elements.
+ *
+ *  Order owned nodes first followed by off-process nodes
+ *  grouped by owning process.  The fixture is also the functor that
+ *  fills its own views: the sizing constructor runs it via parallel_for. */
+template< class Device ,
+          BoxElemPart::ElemOrder Order ,
+          class CoordinateMap = MapGridUnitCube >
+class BoxElemFixture {
+public:
+
+  typedef Device execution_space ;
+  // Spatial dimension, and nodes per element: 8 linear, 27 quadratic, else 0.
+  enum { SpaceDim = 3 };
+  enum { ElemNode = Order == BoxElemPart::ElemLinear ? 8 :
+                    Order == BoxElemPart::ElemQuadratic ? 27 : 0 };
+
+private:
+  // Reference data for one element; only its eval_map table is read here.
+  typedef Kokkos::Example::HexElement_TensorData< ElemNode > hex_data ;
+
+  Kokkos::Example::BoxElemPart m_box_part ;
+  CoordinateMap                m_coord_map ;
+  // Mesh data; the leading extent of each view is set by the sizing constructor.
+  Kokkos::View< double *[SpaceDim] , Device > m_node_coord ;   // mapped coordinates per node
+  Kokkos::View< size_t *[SpaceDim] , Device > m_node_grid ;    // integer grid coordinates per node
+  Kokkos::View< size_t *[ElemNode] , Device > m_elem_node ;    // local node id per element node
+  Kokkos::View< size_t *[2] ,        Device > m_recv_node ;    // { rank , count } per receive entry
+  Kokkos::View< size_t *[2] ,        Device > m_send_node ;    // { rank , count } per send entry
+  Kokkos::View< size_t * ,           Device > m_send_node_id ; // node ids listed for sending
+  // [i][0..2] is copied from hex_data::eval_map[i] ; [i][3] is always set to 0.
+  unsigned char m_elem_node_local[ ElemNode ][4] ;
+
+public:
+  // Read-only (const element) view types handed out by the view accessors.
+  typedef Kokkos::View< const size_t  * [ElemNode], Device > elem_node_type ;
+  typedef Kokkos::View< const double  * [SpaceDim], Device > node_coord_type ;
+  typedef Kokkos::View< const size_t  * [SpaceDim], Device > node_grid_type ;
+  typedef Kokkos::View< const size_t  * [2] , Device > comm_list_type ;
+  typedef Kokkos::View< const size_t  *     , Device > send_nodeid_type ;
+
+  inline bool ok() const { return m_box_part.ok(); }
+
+  KOKKOS_INLINE_FUNCTION
+  size_t node_count() const { return m_node_grid.dimension_0(); }
+
+  KOKKOS_INLINE_FUNCTION
+  size_t node_count_owned() const { return m_box_part.owns_node_count(); }
+
+  KOKKOS_INLINE_FUNCTION
+  size_t node_count_global() const { return m_box_part.global_node_count(); }
+
+  KOKKOS_INLINE_FUNCTION
+  size_t elem_count() const { return m_elem_node.dimension_0(); }
+
+  KOKKOS_INLINE_FUNCTION
+  size_t elem_count_global() const { return m_box_part.global_elem_count(); }
+
+  KOKKOS_INLINE_FUNCTION
+  size_t elem_node_local( size_t inode , int k ) const
+    { return m_elem_node_local[inode][k] ; }
+
+  KOKKOS_INLINE_FUNCTION
+  size_t node_grid( size_t inode , int iaxis ) const
+    { return m_node_grid(inode,iaxis); }
+
+  KOKKOS_INLINE_FUNCTION
+  size_t node_global_index( size_t local ) const
+    {
+      const size_t tmp_node_grid[SpaceDim] =
+        { m_node_grid(local,0) , m_node_grid(local,1) , m_node_grid(local,2) };
+      return m_box_part.global_node_id( tmp_node_grid );
+    }
+
+  KOKKOS_INLINE_FUNCTION
+  double node_coord( size_t inode , int iaxis ) const
+    { return m_node_coord(inode,iaxis); }
+
+  KOKKOS_INLINE_FUNCTION
+  size_t node_grid_max( int iaxis ) const
+    { return m_box_part.global_coord_max(iaxis); }
+
+  KOKKOS_INLINE_FUNCTION
+  size_t elem_node( size_t ielem , size_t inode ) const
+    { return m_elem_node(ielem,inode); }
+  // Whole-view accessors; each returns the member converted to its const view type.
+  elem_node_type   elem_node()   const { return m_elem_node ; }
+  node_coord_type  node_coord()  const { return m_node_coord ; }
+  node_grid_type   node_grid()   const { return m_node_grid ; }
+  comm_list_type   recv_node()   const { return m_recv_node ; }
+  comm_list_type   send_node()   const { return m_send_node ; }
+  send_nodeid_type send_nodeid() const { return m_send_node_id ; }
+  // Copy constructor: copies partition, map and views, then the local node table.
+  KOKKOS_INLINE_FUNCTION
+  BoxElemFixture( const BoxElemFixture & rhs )
+    : m_box_part(   rhs.m_box_part )
+    , m_coord_map(  rhs.m_coord_map )
+    , m_node_coord( rhs.m_node_coord )
+    , m_node_grid(  rhs.m_node_grid )
+    , m_elem_node(  rhs.m_elem_node )
+    , m_recv_node(  rhs.m_recv_node )
+    , m_send_node(  rhs.m_send_node )
+    , m_send_node_id( rhs.m_send_node_id )
+    {
+      for ( int i = 0 ; i < ElemNode ; ++i ) {
+        m_elem_node_local[i][0] = rhs.m_elem_node_local[i][0] ;
+        m_elem_node_local[i][1] = rhs.m_elem_node_local[i][1] ;
+        m_elem_node_local[i][2] = rhs.m_elem_node_local[i][2] ;
+        m_elem_node_local[i][3] = 0 ;
+      }
+    }
+
+  BoxElemFixture & operator = ( const BoxElemFixture & rhs )
+    {
+      m_box_part      = rhs.m_box_part ;
+      m_coord_map     = rhs.m_coord_map ;
+      m_node_coord    = rhs.m_node_coord ;
+      m_node_grid     = rhs.m_node_grid ;
+      m_elem_node     = rhs.m_elem_node ;
+      m_recv_node     = rhs.m_recv_node ;
+      m_send_node     = rhs.m_send_node ;
+      m_send_node_id  = rhs.m_send_node_id ;
+      // Same row-by-row table copy as in the copy constructor.
+      for ( int i = 0 ; i < ElemNode ; ++i ) {
+        m_elem_node_local[i][0] = rhs.m_elem_node_local[i][0] ;
+        m_elem_node_local[i][1] = rhs.m_elem_node_local[i][1] ;
+        m_elem_node_local[i][2] = rhs.m_elem_node_local[i][2] ;
+        m_elem_node_local[i][3] = 0 ;
+      }
+      return *this ;
+    }
+  // Sizing constructor: partition the box, allocate every view, then fill them.
+  BoxElemFixture( const BoxElemPart::Decompose decompose ,
+                  const size_t global_size ,
+                  const size_t global_rank ,
+                  const size_t elem_nx ,
+                  const size_t elem_ny ,
+                  const size_t elem_nz ,
+                  const float bubble_x = 1.1f ,
+                  const float bubble_y = 1.2f ,
+                  const float bubble_z = 1.3f )
+  : m_box_part( Order , decompose , global_size , global_rank , elem_nx , elem_ny , elem_nz )
+  , m_coord_map( m_box_part.global_coord_max(0) ,
+                 m_box_part.global_coord_max(1) ,
+                 m_box_part.global_coord_max(2) ,
+                 bubble_x ,
+                 bubble_y ,
+                 bubble_z )
+  , m_node_coord( "fixture_node_coord" , m_box_part.uses_node_count() )
+  , m_node_grid(  "fixture_node_grid" , m_box_part.uses_node_count() )
+  , m_elem_node(  "fixture_elem_node" , m_box_part.uses_elem_count() )
+  , m_recv_node(  "fixture_recv_node" , m_box_part.recv_node_msg_count() )
+  , m_send_node(  "fixture_send_node" , m_box_part.send_node_msg_count() )
+  , m_send_node_id( "fixture_send_node_id" , m_box_part.send_node_id_count() )
+  {
+    {
+      const hex_data elem_data ;
+
+      for ( int i = 0 ; i < ElemNode ; ++i ) {
+        m_elem_node_local[i][0] = elem_data.eval_map[i][0] ;
+        m_elem_node_local[i][1] = elem_data.eval_map[i][1] ;
+        m_elem_node_local[i][2] = elem_data.eval_map[i][2] ;
+        m_elem_node_local[i][3] = 0 ;
+      }
+    }
+    // One index range serves every view: the largest number of entries to fill.
+    const size_t nwork = 
+      std::max( m_recv_node.dimension_0() ,
+      std::max( m_send_node.dimension_0() ,
+      std::max( m_send_node_id.dimension_0() ,
+      std::max( m_node_grid.dimension_0() ,
+                m_elem_node.dimension_0() * m_elem_node.dimension_1() ))));
+
+    Kokkos::parallel_for( nwork , *this );
+  }
+
+  // Index i fills its entry of every view for which i is within range.
+  // Initialization:
+
+  KOKKOS_INLINE_FUNCTION
+  void operator()( size_t i ) const
+  {
+    if ( i < m_elem_node.dimension_0() * m_elem_node.dimension_1() ) {
+      // Here i is a flattened ( element , node-of-element ) index.
+      const size_t ielem = i / ElemNode ;
+      const size_t inode = i % ElemNode ;
+
+      size_t elem_grid[SpaceDim] ;
+      size_t tmp_node_grid[SpaceDim] ;
+
+      m_box_part.uses_elem_coord( ielem , elem_grid );
+
+      enum { elem_node_scale = Order == BoxElemPart::ElemLinear ? 1 :
+                               Order == BoxElemPart::ElemQuadratic ? 2 : 0 };
+      // Node grid position = scaled element position + the node's local offset.
+      tmp_node_grid[0] = elem_node_scale * elem_grid[0] + m_elem_node_local[inode][0] ;
+      tmp_node_grid[1] = elem_node_scale * elem_grid[1] + m_elem_node_local[inode][1] ;
+      tmp_node_grid[2] = elem_node_scale * elem_grid[2] + m_elem_node_local[inode][2] ;
+
+      m_elem_node(ielem,inode) = m_box_part.local_node_id( tmp_node_grid );
+    }
+
+    if ( i < m_node_grid.dimension_0() ) {
+      size_t tmp_node_grid[SpaceDim] ;
+      m_box_part.local_node_coord( i , tmp_node_grid );
+      m_node_grid(i,0) = tmp_node_grid[0] ;
+      m_node_grid(i,1) = tmp_node_grid[1] ;
+      m_node_grid(i,2) = tmp_node_grid[2] ;
+      // The coordinate map writes the three mapped coordinates in place.
+      m_coord_map( tmp_node_grid[0] ,
+                   tmp_node_grid[1] ,
+                   tmp_node_grid[2] ,
+                   m_node_coord(i,0) ,
+                   m_node_coord(i,1) ,
+                   m_node_coord(i,2) );
+    }
+
+    if ( i < m_recv_node.dimension_0() ) {
+      m_recv_node(i,0) = m_box_part.recv_node_rank(i);
+      m_recv_node(i,1) = m_box_part.recv_node_count(i);
+    }
+
+    if ( i < m_send_node.dimension_0() ) {
+      m_send_node(i,0) = m_box_part.send_node_rank(i);
+      m_send_node(i,1) = m_box_part.send_node_count(i);
+    }
+
+    if ( i < m_send_node_id.dimension_0() ) {
+      m_send_node_id(i) = m_box_part.send_node_id(i);
+    }
+  }
+};
+
+} // namespace Example
+} // namespace Kokkos
+
+//----------------------------------------------------------------------------
+
+#endif /* #ifndef KOKKOS_EXAMPLE_BOXELEMFIXTURE_HPP */
+
diff --git a/lib/kokkos/example/fixture/BoxElemPart.cpp b/lib/kokkos/example/fixture/BoxElemPart.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..fe89246689ea41c1157035231e34c9f3a94dfceb
--- /dev/null
+++ b/lib/kokkos/example/fixture/BoxElemPart.cpp
@@ -0,0 +1,413 @@
+/*
+//@HEADER
+// ************************************************************************
+// 
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+// 
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+// 
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact  H. Carter Edwards (hcedwar@sandia.gov)
+// 
+// ************************************************************************
+//@HEADER
+*/
+
+#include <utility>
+#include <iostream>
+#include <sstream>
+#include <stdexcept>
+#include <limits>
+#include <BoxElemPart.hpp>
+
+//----------------------------------------------------------------------------
+
+namespace Kokkos {
+namespace Example {
+
+void box_partition( const size_t global_size ,
+                    const size_t global_rank ,
+                    const size_t global_box[][2] ,
+                          size_t box[][2] )
+{
+  // Carve 'global_box' down to the sub-box of 'global_rank' by repeatedly
+  // splitting the current group of ranks by 5, 3, or 2 along the longest axis.
+  for ( int d = 0 ; d < 3 ; ++d ) {
+    box[d][0] = global_box[d][0] ;
+    box[d][1] = global_box[d][1] ;
+  }
+
+  size_t group_begin = 0 ;           // first rank of the current group
+  size_t group_size  = global_size ; // number of ranks in the current group
+
+  while ( group_size > 1 ) {
+    // Split factor: 5 if divisible by 5, else 3 if divisible by 3, else 2.
+    size_t split = 2 ;
+    if      ( 0 == group_size % 5 ) { split = 5 ; }
+    else if ( 0 == group_size % 3 ) { split = 3 ; }
+
+    const size_t chunk = group_size / split ;
+
+    // [ lo , hi ) is the sub-range of ranks, relative to the current group,
+    // that contains 'global_rank'.
+    size_t lo = chunk ;
+    size_t hi = group_size ;
+
+    if ( 2 < split || global_rank < group_begin + chunk ) {
+      lo = chunk * size_t( double( global_rank - group_begin ) / double(chunk) );
+      hi = lo + chunk ;
+    }
+
+    // Extent of the current box along each axis.
+    const size_t extent[3] = { box[0][1] - box[0][0] ,
+                               box[1][1] - box[1][0] ,
+                               box[2][1] - box[2][0] };
+
+    // Longest axis; ties resolve toward the lower axis index.
+    int axis = 0 ;
+    if ( extent[2] > extent[1] ) { if ( extent[2] > extent[0] ) { axis = 2 ; } }
+    else if ( extent[1] > extent[0] ) { axis = 1 ; }
+
+    const size_t base = box[ axis ][0] ;
+    box[ axis ][1] = base + size_t( double(extent[axis]) * ( double(hi) / double(group_size) ));
+    box[ axis ][0] = base + size_t( double(extent[axis]) * ( double(lo) / double(group_size) ));
+
+    group_size  = hi - lo ;
+    group_begin = group_begin + lo ;
+  }
+}
+
+} /* namespace Example */
+} /* namespace Kokkos */
+
+//----------------------------------------------------------------------------
+
+namespace Kokkos {
+namespace Example {
+
+void BoxElemPart::local( const size_t  rank ,
+                               size_t  uses_elem[][2] ,
+                               size_t  owns_node[][2] ,
+                               size_t  uses_node[][2] ) const
+{
+  // Compute the half-open boxes of process 'rank': elements used, nodes owned, nodes used.
+  if ( BoxElemPart::DecomposeElem == m_decompose ) {
+    // Partition the elements; a partition on the global upper face also owns the last node layer.
+    Kokkos::Example::box_partition( m_global_size , rank , m_global_elem_box , uses_elem );
+    for ( int i = 0 ; i < 3 ; ++i ) {
+      owns_node[i][0] = uses_elem[i][0] ;
+      owns_node[i][1] = uses_elem[i][1] + ( m_global_elem_box[i][1] == uses_elem[i][1] ? 1 : 0 );
+    }
+  }
+  else {
+    // Partition the vertex grid [0,nelem+1), then take every element touching an owned vertex.
+    const size_t global_vert[3][2] =
+      { { 0 , m_global_elem_box[0][1] + 1 },
+        { 0 , m_global_elem_box[1][1] + 1 },
+        { 0 , m_global_elem_box[2][1] + 1 } };
+    Kokkos::Example::box_partition( m_global_size , rank , global_vert , owns_node );
+    for ( int i = 0 ; i < 3 ; ++i ) {
+      uses_elem[i][0] = global_vert[i][0] == owns_node[i][0] ? owns_node[i][0] : owns_node[i][0] - 1 ;
+      uses_elem[i][1] = global_vert[i][1] == owns_node[i][1] ? owns_node[i][1] - 1 : owns_node[i][1] ;
+    }
+  }
+
+  // Used nodes: one more node than used elements along each axis.
+  for ( int i = 0 ; i < 3 ; ++i ) {
+    uses_node[i][0] = uses_elem[i][0] ;
+    uses_node[i][1] = uses_elem[i][1] + 1 ;
+  }
+
+  if ( BoxElemPart::ElemQuadratic == m_elem_order ) {
+    for ( int i = 0 ; i < 3 ; ++i ) {
+      // Only an owned box reaching the global end stops at 2*end-1; an interior box keeps mid-node 2*end-1.
+      owns_node[i][0] = 2 * owns_node[i][0] ;
+      uses_node[i][0] = 2 * uses_node[i][0] ;
+      owns_node[i][1] = 2 * owns_node[i][1] - ( m_global_elem_box[i][1] + 1 == owns_node[i][1] ? 1 : 0 );
+      uses_node[i][1] = 2 * uses_node[i][1] - 1 ;
+    }
+  }
+}
+
+BoxElemPart::BoxElemPart(
+  const BoxElemPart::ElemOrder elem_order ,
+  const BoxElemPart::Decompose decompose ,
+  const size_t global_size ,
+  const size_t global_rank ,
+  const size_t elem_nx ,
+  const size_t elem_ny ,
+  const size_t elem_nz )
+{
+  // Partition an elem_nx * elem_ny * elem_nz element box among 'global_size'
+  // processes and record the boxes and neighbor receive/send lists of process
+  // 'global_rank'.  m_ok is set to false if any consistency check fails.
+  m_global_size = global_size ;
+  m_global_rank = global_rank ;
+  m_decompose   = decompose ;
+  m_elem_order  = elem_order ;
+
+  m_global_elem_box[0][0] = 0 ; m_global_elem_box[0][1] = elem_nx ;
+  m_global_elem_box[1][0] = 0 ; m_global_elem_box[1][1] = elem_ny ;
+  m_global_elem_box[2][0] = 0 ; m_global_elem_box[2][1] = elem_nz ;
+
+  m_global_node_box[0][0] = 0 ; m_global_node_box[0][1] = 0 ;
+  m_global_node_box[1][0] = 0 ; m_global_node_box[1][1] = 0 ;
+  m_global_node_box[2][0] = 0 ; m_global_node_box[2][1] = 0 ;
+
+  m_owns_node_count = 0 ;
+  m_send_node_count = 0 ;
+  m_ok = true ;
+
+  // Global node box: n+1 nodes per axis for linear elements,
+  // 2*n+1 nodes per axis for quadratic elements.
+  if ( ElemLinear == elem_order ) {
+    m_global_node_box[0][1] = elem_nx + 1 ;
+    m_global_node_box[1][1] = elem_ny + 1 ;
+    m_global_node_box[2][1] = elem_nz + 1 ;
+  }
+  else if ( ElemQuadratic == elem_order ) {
+    m_global_node_box[0][1] = 2 * elem_nx + 1 ;
+    m_global_node_box[1][1] = 2 * elem_ny + 1 ;
+    m_global_node_box[2][1] = 2 * elem_nz + 1 ;
+  }
+
+  // Boxes of this process; its own 'owns' box is entry 0 of the owns list.
+  local( m_global_rank , m_uses_elem_box , m_owns_node_box[0] , m_uses_node_box );
+
+  const size_t global_node_count_ = Kokkos::Example::box_count( m_global_node_box );
+  const size_t global_elem_count_ = Kokkos::Example::box_count( m_global_elem_box );
+
+  // Totals accumulated over all processes for the checks at the end.
+  size_t elem_count = Kokkos::Example::box_count( m_uses_elem_box );
+  size_t node_count = Kokkos::Example::box_count( m_owns_node_box[0] );
+
+  m_owns_node[0][0] = global_rank ;
+  m_owns_node[0][1] = node_count ;
+  m_owns_node_count = 1 ;
+  m_send_node_count = 0 ;
+
+  for ( size_t rr = 1 ; rr < m_global_size && m_ok ; ++rr ) {
+
+    const size_t rank = ( m_global_rank + rr ) % m_global_size ;
+
+    size_t elem_box[3][2] , o_node_box[3][2] , u_node_box[3][2] ;
+
+    // Boxes for process 'rank'
+    local( rank , elem_box , o_node_box , u_node_box );
+
+    // Box that this process uses but is owned by process 'rank'
+    Kokkos::Example::box_intersect( m_owns_node_box[ m_owns_node_count ] , m_uses_node_box , o_node_box );
+
+    m_owns_node[ m_owns_node_count ][1] = Kokkos::Example::box_count( m_owns_node_box[ m_owns_node_count ] );
+
+    if ( m_owns_node[ m_owns_node_count ][1] ) {
+      // The final slot is reserved for the sentinel entry written after the loop.
+      if ( ( PROC_NEIGH_MAX - 1 ) <= m_owns_node_count ) {
+        std::cout << "BoxElemPart exceeded maximum neighbor count" << std::endl ;
+        m_ok = false ;
+        break ;
+      }
+
+      m_owns_node[ m_owns_node_count ][0] = rank ;
+
+      ++m_owns_node_count ;
+    }
+
+    // Box that this process owns and is used by process 'rank'
+    Kokkos::Example::box_intersect( m_send_node_box[ m_send_node_count ] , m_owns_node_box[0] , u_node_box );
+
+    m_send_node[ m_send_node_count ][1] = Kokkos::Example::box_count( m_send_node_box[ m_send_node_count ] );
+
+    if ( m_send_node[ m_send_node_count ][1] ) {
+
+      if ( ( PROC_NEIGH_MAX - 1 ) <= m_send_node_count ) {
+        std::cout << "BoxElemPart exceeded maximum neighbor count" << std::endl ;
+        m_ok = false ;
+        break ;
+      }
+
+      m_send_node[ m_send_node_count ][0] = rank ;
+      ++m_send_node_count ;
+    }
+
+    // Error checking:
+    // owned node boxes (and element boxes, when decomposing by element) must be disjoint.
+    size_t test_box[3][2] ;
+
+    elem_count += Kokkos::Example::box_count( elem_box );
+    node_count += Kokkos::Example::box_count( o_node_box );
+
+    {
+      Kokkos::Example::box_intersect( test_box , m_owns_node_box[0] , o_node_box );
+
+      if ( Kokkos::Example::box_count( test_box ) ) {
+        std::cout << "Box partitioning error" << std::endl ;
+        std::cout << "owns_node[" << m_global_rank << "]{"
+                  << " [" << m_owns_node_box[0][0][0] << "," << m_owns_node_box[0][0][1] << ")"
+                  << " [" << m_owns_node_box[0][1][0] << "," << m_owns_node_box[0][1][1] << ")"
+                  << " [" << m_owns_node_box[0][2][0] << "," << m_owns_node_box[0][2][1] << ")"
+                  << "} intersects"
+                  << " owns_node[" << rank << "]{"
+                  << " [" << o_node_box[0][0] << "," << o_node_box[0][1] << ")"
+                  << " [" << o_node_box[1][0] << "," << o_node_box[1][1] << ")"
+                  << " [" << o_node_box[2][0] << "," << o_node_box[2][1] << ")"
+                  << "}" << std::endl ;
+        m_ok = false ;
+        break ;
+      }
+    }
+
+    if ( DecomposeElem == decompose ) {
+
+      Kokkos::Example::box_intersect( test_box , m_uses_elem_box , elem_box );
+
+      if ( Kokkos::Example::box_count( test_box ) ) {
+        std::cout << "Box partitioning error" << std::endl ;
+        std::cout << "ElemBox[" << m_global_rank << "]{"
+                  << " [" << m_uses_elem_box[0][0] << "," << m_uses_elem_box[0][1] << ")"
+                  << " [" << m_uses_elem_box[1][0] << "," << m_uses_elem_box[1][1] << ")"
+                  << " [" << m_uses_elem_box[2][0] << "," << m_uses_elem_box[2][1] << ")"
+                  << "} intersects"
+                  << " ElemBox[" << rank << "]{"
+                  << " [" << elem_box[0][0] << "," << elem_box[0][1] << ")"
+                  << " [" << elem_box[1][0] << "," << elem_box[1][1] << ")"
+                  << " [" << elem_box[2][0] << "," << elem_box[2][1] << ")"
+                  << "}" << std::endl ;
+        m_ok = false ;
+        break ;
+      }
+    }
+  }
+
+  // Sentinel values at the end of the owns and send lists:
+  // rank and count are ~0u and the box is [0,~0u) on every axis.
+  m_owns_node[ m_owns_node_count ][0] = ~0u ;
+  m_owns_node[ m_owns_node_count ][1] = ~0u ;
+  m_owns_node_box[ m_owns_node_count ][0][0] = 0u ; m_owns_node_box[ m_owns_node_count ][0][1] = ~0u ;
+  m_owns_node_box[ m_owns_node_count ][1][0] = 0u ; m_owns_node_box[ m_owns_node_count ][1][1] = ~0u ;
+  m_owns_node_box[ m_owns_node_count ][2][0] = 0u ; m_owns_node_box[ m_owns_node_count ][2][1] = ~0u ;
+
+  m_send_node[ m_send_node_count ][0] = ~0u ;
+  m_send_node[ m_send_node_count ][1] = ~0u ;
+  m_send_node_box[ m_send_node_count ][0][0] = 0u ; m_send_node_box[ m_send_node_count ][0][1] = ~0u ;
+  m_send_node_box[ m_send_node_count ][1][0] = 0u ; m_send_node_box[ m_send_node_count ][1][1] = ~0u ;
+  m_send_node_box[ m_send_node_count ][2][0] = 0u ; m_send_node_box[ m_send_node_count ][2][1] = ~0u ;
+
+  {
+    size_t count = 0 ;
+    for ( size_t i = 0 ; i < m_owns_node_count ; ++i ) {
+      count += m_owns_node[i][1] ;
+    }
+    if ( count != Kokkos::Example::box_count( m_uses_node_box ) ) {
+      std::cout << "Node uses count = " << Kokkos::Example::box_count( m_uses_node_box )
+                << " error count = " << count << std::endl ;
+      m_ok = false ;
+    }
+  }
+
+  if ( global_node_count_ != node_count ) {
+    std::cout << "Node count = " << global_node_count_ << " overlap error count = " << node_count << std::endl ;
+    m_ok = false ;
+  }
+
+  if ( DecomposeElem == decompose && global_elem_count_ != elem_count ) {
+    std::cout << "Elem count = " << global_elem_count_ << " overlap error count = " << elem_count << std::endl ;
+    m_ok = false ;
+  }
+
+  if ( ! m_ok ) {
+    for ( int i = 0 ; i < 3 ; ++i ) { for ( int j = 0 ; j < 2 ; ++j ) {
+      m_global_elem_box[i][j] = 0 ;
+      m_global_node_box[i][j] = 0 ;
+      m_uses_elem_box[i][j] = 0 ;
+      m_uses_node_box[i][j] = 0 ;
+    }}
+    m_owns_node_count = 0 ;
+    m_send_node_count = 0 ;
+  }
+}
+
+void BoxElemPart::print( std::ostream & s ) const
+{
+  // Write a summary of this partition to 's'.  Every box is printed as
+  // three half-open "[begin,end)" ranges, one per axis, and every output
+  // line is terminated with std::endl.
+
+  s << "BoxElemPart P[" << m_global_rank << ":" << m_global_size << "]"
+    << std::endl ;
+
+  // Elements: used box / global box.
+  s << "  elem_box {" ;
+  for ( int d = 0 ; d < 3 ; ++d ) { s << " [" << m_uses_elem_box[d][0] << "," << m_uses_elem_box[d][1] << ")" ; }
+  s << " } / {" ;
+  for ( int d = 0 ; d < 3 ; ++d ) { s << " [" << m_global_elem_box[d][0] << "," << m_global_elem_box[d][1] << ")" ; }
+  s << " }"
+    << std::endl ;
+
+  // Nodes: owned box / used box / global box.
+  s << "  node_box {" ;
+  for ( int d = 0 ; d < 3 ; ++d ) { s << " [" << m_owns_node_box[0][d][0] << "," << m_owns_node_box[0][d][1] << ")" ; }
+  s << " } / {" ;
+  for ( int d = 0 ; d < 3 ; ++d ) { s << " [" << m_uses_node_box[d][0] << "," << m_uses_node_box[d][1] << ")" ; }
+  s << " } / {" ;
+  for ( int d = 0 ; d < 3 ; ++d ) { s << " [" << m_global_node_box[d][0] << "," << m_global_node_box[d][1] << ")" ; }
+  s << " }"
+    << std::endl ;
+
+  // Receive neighbors: owns entries after entry 0 (this process' own box).
+  for ( size_t n = 1 ; n < m_owns_node_count ; ++n ) {
+    s << "  P[" << m_owns_node[n][0] << "]"
+      << " recv node_box {" ;
+    for ( int d = 0 ; d < 3 ; ++d ) {
+      s << " [" << m_owns_node_box[n][d][0] << "," << m_owns_node_box[n][d][1] << ")" ;
+    }
+    s << " }"
+      << std::endl ;
+  }
+
+  // Send neighbors.
+  for ( size_t n = 0 ; n < m_send_node_count ; ++n ) {
+    s << "  P[" << m_send_node[n][0] << "]"
+      << " send node_box {" ;
+    for ( int d = 0 ; d < 3 ; ++d ) {
+      s << " [" << m_send_node_box[n][d][0] << "," << m_send_node_box[n][d][1] << ")" ;
+    }
+    s << " }"
+      << std::endl ;
+  }
+}
+
+} /* namespace Example */
+} /* namespace Kokkos */
+
+//----------------------------------------------------------------------------
+
+
diff --git a/lib/kokkos/example/fixture/BoxElemPart.hpp b/lib/kokkos/example/fixture/BoxElemPart.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..98f44e7d9da92a9b3c04a2df936cc850d9b1d632
--- /dev/null
+++ b/lib/kokkos/example/fixture/BoxElemPart.hpp
@@ -0,0 +1,320 @@
+/*
+//@HEADER
+// ************************************************************************
+// 
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+// 
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+// 
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact  H. Carter Edwards (hcedwar@sandia.gov)
+// 
+// ************************************************************************
+//@HEADER
+*/
+
+#ifndef KOKKOS_BOXELEMPART_HPP
+#define KOKKOS_BOXELEMPART_HPP
+
+#include <utility>
+#include <ostream>
+#include <Kokkos_Macros.hpp>
+
+//----------------------------------------------------------------------------
+
+namespace Kokkos {
+namespace Example {
+
+KOKKOS_INLINE_FUNCTION
+void box_intersect( size_t box[][2] ,
+                    const size_t boxA[][2] ,
+                    const size_t boxB[][2] )
+{ // box = overlap of boxA and boxB per axis; an empty overlap collapses to end == begin.
+  for ( int axis = 0 ; axis < 3 ; ++axis ) {
+    const size_t lo = boxB[axis][0] < boxA[axis][0] ? boxA[axis][0] : boxB[axis][0] ;
+    const size_t hi = boxA[axis][1] < boxB[axis][1] ? boxA[axis][1] : boxB[axis][1] ;
+    box[axis][0] = lo ; box[axis][1] = ( hi < lo ) ? lo : hi ;
+  }
+}
+
+KOKKOS_INLINE_FUNCTION
+size_t box_count( const size_t box[][2] )
+{ // Number of points in a half-open 3D box: the product of its three axis extents.
+  size_t total = 1 ;
+  for ( int axis = 0 ; axis < 3 ; ++axis ) { total *= size_t( box[axis][1] - box[axis][0] ); }
+  return total ;
+}
+
+KOKKOS_INLINE_FUNCTION
+void box_ghost_layer( const size_t global_box[][2] ,
+                      const size_t local_box[][2] ,
+                      const size_t ghost_layer ,
+                            size_t ghost_box[][2] )
+{ // ghost_box = local_box grown by ghost_layer on both ends of each axis, clamped to global_box.
+  for ( int axis = 0 ; axis < 3 ; ++axis ) {
+    ghost_box[axis][0] = ( local_box[axis][0] < global_box[axis][0] + ghost_layer ) ? global_box[axis][0] : local_box[axis][0] - ghost_layer ;
+    ghost_box[axis][1] = ( global_box[axis][1] < local_box[axis][1] + ghost_layer ) ? global_box[axis][1] : local_box[axis][1] + ghost_layer ;
+  }
+}
+
+void box_partition( const size_t global_size ,      // number of ranks sharing global_box
+                    const size_t global_rank ,      // rank whose sub-box is computed
+                    const size_t global_box[][2] ,  // box to divide: [begin,end) for each of 3 axes
+                          size_t box[][2] );        // output: sub-box assigned to global_rank
+
+} // namespace Example
+} // namespace Kokkos
+
+//----------------------------------------------------------------------------
+
+namespace Kokkos {
+namespace Example {
+
+/** \brief Partition a box of hexahedral elements among subdomains.
+ *
+ *  Nodes are ordered locally as follows:
+ *    { owned_by[ this_process ] ,
+ *      owned_by[ neighbor_process[0] ] ,
+ *      owned_by[ neighbor_process[1] ] ,
+ *      owned_by[ neighbor_process[2] ] ,
+ *      ... };
+ */
+class BoxElemPart {
+public:
+
+  enum Decompose { DecomposeNode , DecomposeElem };
+  enum ElemOrder { ElemLinear , ElemQuadratic };
+  // True when the constructor's partition consistency checks all passed.
+  bool ok() const { return m_ok ; }
+
+  BoxElemPart( const ElemOrder elem_order ,
+               const Decompose decompose ,
+               const size_t global_size ,
+               const size_t global_rank ,
+               const size_t elem_nx ,
+               const size_t elem_ny ,
+               const size_t elem_nz );
+  // Point counts of the global / used / owned element and node boxes.
+  KOKKOS_INLINE_FUNCTION
+  size_t global_elem_count() const
+    { return Kokkos::Example::box_count( m_global_elem_box ); }
+
+  KOKKOS_INLINE_FUNCTION
+  size_t global_node_count() const
+    { return Kokkos::Example::box_count( m_global_node_box ); }
+
+  KOKKOS_INLINE_FUNCTION
+  size_t uses_elem_count() const
+    { return Kokkos::Example::box_count( m_uses_elem_box ); }
+
+  KOKKOS_INLINE_FUNCTION
+  size_t owns_node_count() const
+    { return Kokkos::Example::box_count( m_owns_node_box[0] ); }
+
+  KOKKOS_INLINE_FUNCTION
+  size_t uses_node_count() const
+    { return Kokkos::Example::box_count( m_uses_node_box ); }
+
+  // Conversions between element grid coordinates and local element ids.
+
+  KOKKOS_INLINE_FUNCTION
+  size_t uses_elem_offset( const size_t ix ,
+                           const size_t iy ,
+                           const size_t iz ) const
+  {
+    return size_t( ix - m_uses_elem_box[0][0] ) + size_t( m_uses_elem_box[0][1] - m_uses_elem_box[0][0] ) * (
+           size_t( iy - m_uses_elem_box[1][0] ) + size_t( m_uses_elem_box[1][1] - m_uses_elem_box[1][0] ) * (
+           size_t( iz - m_uses_elem_box[2][0] ) ) );
+  }
+
+  KOKKOS_INLINE_FUNCTION
+  void uses_elem_coord( size_t lid , size_t c[] ) const
+  {
+    const size_t nx = m_uses_elem_box[0][1] - m_uses_elem_box[0][0] ;
+    const size_t ny = m_uses_elem_box[1][1] - m_uses_elem_box[1][0] ;
+
+    c[0] = m_uses_elem_box[0][0] + lid % nx ; lid /= nx ;
+    c[1] = m_uses_elem_box[1][0] + lid % ny ; lid /= ny ;
+    c[2] = m_uses_elem_box[2][0] + lid ;
+  }
+
+  // Largest node coordinate along 'axis' in the global node box.
+
+  KOKKOS_INLINE_FUNCTION
+  size_t global_coord_max( size_t axis ) const
+  { return m_global_node_box[axis][1] - 1 ; }
+
+  // Conversions between node grid coordinates and local / global node ids.
+
+  KOKKOS_INLINE_FUNCTION
+  void local_node_coord( size_t lid , size_t coord[] ) const
+  {
+    // Local id within an 'owns' block (has sentinel)
+    size_t j = 0 ;
+    while ( m_owns_node[j][1] <= lid ) { lid -= m_owns_node[j][1] ; ++j ; }
+
+    // Map to global coordinates:
+    const size_t nx = m_owns_node_box[j][0][1] - m_owns_node_box[j][0][0] ;
+    const size_t ny = m_owns_node_box[j][1][1] - m_owns_node_box[j][1][0] ;
+
+    coord[0] = m_owns_node_box[j][0][0] + lid % nx ; lid /= nx ;
+    coord[1] = m_owns_node_box[j][1][0] + lid % ny ; lid /= ny ;
+    coord[2] = m_owns_node_box[j][2][0] + lid ;
+  }
+
+  KOKKOS_INLINE_FUNCTION
+  size_t local_node_id( const size_t c[] ) const
+  {
+    // Find which 'owns' block and accumulate the offset of this block:
+    size_t lid = 0 ;
+    size_t j = 0 ;
+    while ( ! ( m_owns_node_box[j][0][0] <= c[0] && c[0] < m_owns_node_box[j][0][1] &&
+                m_owns_node_box[j][1][0] <= c[1] && c[1] < m_owns_node_box[j][1][1] &&
+                m_owns_node_box[j][2][0] <= c[2] && c[2] < m_owns_node_box[j][2][1] ) ) {
+
+      lid += m_owns_node[j][1] ;
+      ++j ;
+    }
+
+    // Map offset to the block plus offset within the block:
+    return lid +
+           size_t( c[0] - m_owns_node_box[j][0][0] ) + size_t( m_owns_node_box[j][0][1] - m_owns_node_box[j][0][0] ) * (
+           size_t( c[1] - m_owns_node_box[j][1][0] ) + size_t( m_owns_node_box[j][1][1] - m_owns_node_box[j][1][0] ) * (
+           size_t( c[2] - m_owns_node_box[j][2][0] ) ) );
+  }
+
+  KOKKOS_INLINE_FUNCTION
+  size_t global_node_id( const size_t c[] ) const
+  {
+    return size_t( c[0] - m_global_node_box[0][0] ) + size_t( m_global_node_box[0][1] - m_global_node_box[0][0] ) * (
+           size_t( c[1] - m_global_node_box[1][0] ) + size_t( m_global_node_box[1][1] - m_global_node_box[1][0] ) * (
+           size_t( c[2] - m_global_node_box[2][0] ) ) );
+  }
+
+  // Receive messages: owns entry 0 is this process, so message 'msg' is owns entry msg+1.
+  // The message count is 0, not a wrapped-around size_t, when the owns list is empty.
+  KOKKOS_INLINE_FUNCTION
+  size_t recv_node_msg_count() const { return m_owns_node_count ? m_owns_node_count - 1 : 0 ; }
+
+  KOKKOS_INLINE_FUNCTION
+  size_t recv_node_rank(  size_t msg ) const { return m_owns_node[msg+1][0] ; }
+
+  KOKKOS_INLINE_FUNCTION
+  size_t recv_node_count( size_t msg ) const { return m_owns_node[msg+1][1] ; }
+
+  // Send messages: message 'msg' is entry 'msg' of the send list.
+
+  KOKKOS_INLINE_FUNCTION
+  size_t send_node_msg_count() const { return m_send_node_count ; }
+
+  KOKKOS_INLINE_FUNCTION
+  size_t send_node_rank(  size_t msg ) const { return m_send_node[msg][0] ; }
+
+  KOKKOS_INLINE_FUNCTION
+  size_t send_node_count( size_t msg ) const { return m_send_node[msg][1] ; }
+
+  KOKKOS_INLINE_FUNCTION
+  size_t send_node_id_count() const
+  {
+    size_t count = 0 ;
+    for ( size_t i = 0 ; i < m_send_node_count ; ++i ) {
+      count += m_send_node[i][1] ;
+    }
+    return count ;
+  }
+
+  KOKKOS_INLINE_FUNCTION
+  size_t send_node_id( size_t item ) const
+  {
+    // Find which send list this send item is in:
+    size_t j = 0 ;
+    while ( m_send_node[j][1] <= item ) { item -= m_send_node[j][1] ; ++j ; }
+
+    // Map to global coordinate:
+    const size_t nx = m_send_node_box[j][0][1] - m_send_node_box[j][0][0] ;
+    const size_t ny = m_send_node_box[j][1][1] - m_send_node_box[j][1][0] ;
+
+    size_t c[3] ;
+
+    c[0] = m_send_node_box[j][0][0] + item % nx ; item /= nx ;
+    c[1] = m_send_node_box[j][1][0] + item % ny ; item /= ny ;
+    c[2] = m_send_node_box[j][2][0] + item ;
+
+    // Map to local id:
+    return size_t( c[0] - m_owns_node_box[0][0][0] ) + size_t( m_owns_node_box[0][0][1] - m_owns_node_box[0][0][0] ) * (
+           size_t( c[1] - m_owns_node_box[0][1][0] ) + size_t( m_owns_node_box[0][1][1] - m_owns_node_box[0][1][0] ) * (
+           size_t( c[2] - m_owns_node_box[0][2][0] ) ) );
+  }
+
+  // Write a human-readable summary of the partition to 's'.
+
+  void print( std::ostream & s ) const ;
+
+private:
+
+  // Maximum number of processes in a neighborhood, including this process
+  enum { PROC_NEIGH_MAX = 64 };
+
+  void local( const size_t  rank ,
+                    size_t  uses_elem[][2] ,
+                    size_t  owns_node[][2] ,
+                    size_t  uses_node[][2] ) const ;
+
+  size_t  m_global_size ;
+  size_t  m_global_rank ;
+
+  Decompose m_decompose ;
+  ElemOrder m_elem_order ;
+
+  size_t m_global_elem_box[3][2] ;
+  size_t m_global_node_box[3][2] ;
+  size_t m_uses_elem_box[3][2] ;
+  size_t m_uses_node_box[3][2] ;
+
+  // Per list entry: a node box and [ processor rank , count ]; owns entry 0 is this process.
+  size_t m_owns_node_box[ PROC_NEIGH_MAX ][3][2] ;
+  size_t m_owns_node[     PROC_NEIGH_MAX ][2] ;
+  size_t m_owns_node_count ;
+
+  size_t m_send_node_box[ PROC_NEIGH_MAX ][3][2] ;
+  size_t m_send_node[     PROC_NEIGH_MAX ][2] ;
+  size_t m_send_node_count ;
+
+  bool   m_ok ;
+};
+
+} // namespace Example
+} // namespace Kokkos
+
+//----------------------------------------------------------------------------
+
+#endif /* #ifndef KOKKOS_BOXELEMPART_HPP */
+
diff --git a/lib/kokkos/example/fixture/CMakeLists.txt b/lib/kokkos/example/fixture/CMakeLists.txt
new file mode 100644
index 0000000000000000000000000000000000000000..298c54c5bb3e00bf5ecaf5ad18e53de2ba405272
--- /dev/null
+++ b/lib/kokkos/example/fixture/CMakeLists.txt
@@ -0,0 +1,13 @@
+
+INCLUDE_DIRECTORIES(${CMAKE_CURRENT_BINARY_DIR})
+INCLUDE_DIRECTORIES(${CMAKE_CURRENT_SOURCE_DIR})
+INCLUDE_DIRECTORIES(${CMAKE_CURRENT_SOURCE_DIR}/../common)
+# Source files passed to the TestFixture executable below.
+SET(SOURCES_TEST Main.cpp TestFixture.cpp BoxElemPart.cpp )
+# Adds the TestFixture executable and test from SOURCES_TEST (TriBITS macro; exact naming and linking not shown here).
+# Automatically picks up 'kokkosexample_fixture'
+TRIBITS_ADD_EXECUTABLE_AND_TEST(
+  TestFixture
+  SOURCES ${SOURCES_TEST}
+  )
+
diff --git a/lib/kokkos/example/fixture/HexElement.hpp b/lib/kokkos/example/fixture/HexElement.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..cb39358f9d23a6e01a45f9ca6f277613bb321301
--- /dev/null
+++ b/lib/kokkos/example/fixture/HexElement.hpp
@@ -0,0 +1,270 @@
+/*
+//@HEADER
+// ************************************************************************
+// 
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+// 
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+// 
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact  H. Carter Edwards (hcedwar@sandia.gov)
+// 
+// ************************************************************************
+//@HEADER
+*/
+
+#ifndef KOKKOS_HEXELEMENT_HPP
+#define KOKKOS_HEXELEMENT_HPP
+
+namespace Kokkos {
+namespace Example {
+
+template< unsigned NodeCount >
+class HexElement_TensorData ; // specialized below for NodeCount = 8 and 27
+
+template< unsigned NodeCount , class Device >
+class HexElement_TensorEval ; // declaration only; not defined in this header
+
+//----------------------------------------------------------------------------
+/** \brief  Evaluate Hex element on interval [-1,1]^3 */
+template<>
+class HexElement_TensorData< 8 > {
+public:
+
+  static const unsigned element_node_count    = 8 ;
+  static const unsigned spatial_dimension     = 3 ;
+  static const unsigned integration_count_1d  = 2 ;
+  static const unsigned function_count_1d     = 2 ;
+  // 1D tables, filled and read as [ integration point ][ basis function ].
+  float values_1d [ integration_count_1d ][ function_count_1d ];
+  float derivs_1d [ integration_count_1d ][ function_count_1d ];
+  float weights_1d[ integration_count_1d ];
+  // Per node: 1D index along x, y, z; the fourth entry is zeroed padding.
+  unsigned char eval_map[ element_node_count ][4] ;
+  /** \brief  Linear 1D basis function 'jf' evaluated at 'x'; zero for jf > 1 */
+  static float eval_value_1d( const unsigned jf , const float x )
+  {
+    return 0 == jf ? 0.5 * ( 1.0 - x ) : (
+           1 == jf ? 0.5 * ( 1.0 + x ) : 0 );
+  }
+  /** \brief  Derivative of 1D basis function 'jf'; the point is not used */
+  static float eval_deriv_1d( const unsigned jf , const float )
+  {
+    return 0 == jf ? -0.5 : (
+           1 == jf ?  0.5 : 0 );
+  }
+
+  HexElement_TensorData()
+  {
+    const unsigned char tmp_map[ element_node_count ][ spatial_dimension ] =
+      { { 0 , 0 , 0 },
+        { 1 , 0 , 0 },
+        { 1 , 1 , 0 },
+        { 0 , 1 , 0 },
+        { 0 , 0 , 1 },
+        { 1 , 0 , 1 },
+        { 1 , 1 , 1 },
+        { 0 , 1 , 1 } };
+
+    weights_1d[0] = 1 ;
+    weights_1d[1] = 1 ;
+
+    const float points_1d[ integration_count_1d ] = { -0.577350269 , 0.577350269 };
+    // Copy the node map; zero the unused fourth entry so no byte is left indeterminate.
+    for ( unsigned i = 0 ; i < element_node_count ; ++i ) {
+      eval_map[i][0] = tmp_map[i][0];
+      eval_map[i][1] = tmp_map[i][1];
+      eval_map[i][2] = tmp_map[i][2];
+      eval_map[i][3] = 0 ;
+    }
+
+    for ( unsigned xp = 0 ; xp < integration_count_1d ; ++xp ) {
+    for ( unsigned xf = 0 ; xf < function_count_1d ; ++xf ) {
+      values_1d[xp][xf] = eval_value_1d( xf , points_1d[xp] );
+      derivs_1d[xp][xf] = eval_deriv_1d( xf , points_1d[xp] );
+    }}
+  }
+};
+
+//----------------------------------------------------------------------------
+
+template<>
+class HexElement_TensorData< 27 > {
+public:
+
+  static const unsigned element_node_count    = 27 ;
+  static const unsigned spatial_dimension     = 3 ;
+  static const unsigned integration_count_1d  = 3 ;
+  static const unsigned function_count_1d     = 3 ;
+
+  // 1D tables, filled and read as [ integration point ][ basis function ].
+  float values_1d [ integration_count_1d ][ function_count_1d ];
+  float derivs_1d [ integration_count_1d ][ function_count_1d ];
+  float weights_1d[ integration_count_1d ];
+
+  // Per node: 1D index along x, y, z; the fourth entry is zeroed padding.
+  unsigned char eval_map[ element_node_count ][4] ;
+
+  // Quadratic 1D basis function 'jf' (equal to one at -1, 0, +1 for jf = 0, 1, 2) at 'p'.
+  static float eval_value_1d( const unsigned jf , const float p )
+  {
+    return 0 == jf ? 0.5 * p * ( p - 1 ) : (
+           1 == jf ? 1.0 - p * p : (
+           2 == jf ? 0.5 * p * ( p + 1 ) : 0 ));
+  }
+
+  // Derivative with respect to 'p' of the quadratic 1D basis function 'jf'.
+  static float eval_deriv_1d( const unsigned jf , const float p )
+  {
+    return 0 == jf ? p - 0.5 : (
+           1 == jf ? -2.0 * p : (
+           2 == jf ? p + 0.5 : 0 ));
+  }
+
+  HexElement_TensorData()
+  {
+    const unsigned char tmp_map[ element_node_count ][ spatial_dimension ] =
+      { { 0 , 0 , 0 },
+        { 2 , 0 , 0 },
+        { 2 , 2 , 0 },
+        { 0 , 2 , 0 },
+        { 0 , 0 , 2 },
+        { 2 , 0 , 2 },
+        { 2 , 2 , 2 },
+        { 0 , 2 , 2 },
+        { 1 , 0 , 0 },
+        { 2 , 1 , 0 },
+        { 1 , 2 , 0 },
+        { 0 , 1 , 0 },
+        { 0 , 0 , 1 },
+        { 2 , 0 , 1 },
+        { 2 , 2 , 1 },
+        { 0 , 2 , 1 },
+        { 1 , 0 , 2 },
+        { 2 , 1 , 2 },
+        { 1 , 2 , 2 },
+        { 0 , 1 , 2 },
+        { 1 , 1 , 1 },
+        { 1 , 1 , 0 },
+        { 1 , 1 , 2 },
+        { 0 , 1 , 1 },
+        { 2 , 1 , 1 },
+        { 1 , 0 , 1 },
+        { 1 , 2 , 1 } };
+
+    // Interval [-1,1]
+
+    weights_1d[0] = 0.555555556 ;
+    weights_1d[1] = 0.888888889 ;
+    weights_1d[2] = 0.555555556 ;
+
+    const float points_1d[3] = { -0.774596669 ,
+                                  0.000000000 ,
+                                  0.774596669 };
+
+    // Copy the node map; zero the unused fourth entry so no byte is left indeterminate.
+    for ( unsigned i = 0 ; i < element_node_count ; ++i ) {
+      eval_map[i][0] = tmp_map[i][0];
+      eval_map[i][1] = tmp_map[i][1];
+      eval_map[i][2] = tmp_map[i][2];
+      eval_map[i][3] = 0 ;
+    }
+
+    for ( unsigned xp = 0 ; xp < integration_count_1d ; ++xp ) {
+    for ( unsigned xf = 0 ; xf < function_count_1d ; ++xf ) {
+      values_1d[xp][xf] = eval_value_1d( xf , points_1d[xp] );
+      derivs_1d[xp][xf] = eval_deriv_1d( xf , points_1d[xp] );
+    }}
+  }
+};
+
+//----------------------------------------------------------------------------
+
+// Full 3D integration data for a hex element, expanded from the 1D
+// tensor-product tables of HexElement_TensorData< NodeCount >.
+template< unsigned NodeCount >
+class HexElement_Data {
+public:
+  static const unsigned spatial_dimension   = 3 ;
+  static const unsigned element_node_count  = NodeCount ;
+  static const unsigned integration_count   = NodeCount ;
+  static const unsigned function_count      = NodeCount ;
+
+  // weights[ point ] , values[ point ][ function ] , gradients[ point ][ axis ][ function ]
+  float weights[   integration_count ] ;
+  float values[    integration_count ][ function_count ];
+  float gradients[ integration_count ][ spatial_dimension ][ function_count ];
+
+  // Fill all three tables once, at construction.
+  HexElement_Data()
+  {
+    HexElement_TensorData< NodeCount > td ;
+
+    for ( unsigned ip = 0 ; ip < integration_count ; ++ip ) {
+
+      // 1D integration-point indices (x,y,z) of 3D point 'ip'
+      const unsigned char * const pt = td.eval_map[ip] ;
+
+      weights[ip] = td.weights_1d[ pt[0] ] *
+                    td.weights_1d[ pt[1] ] *
+                    td.weights_1d[ pt[2] ] ;
+
+      for ( unsigned jf = 0 ; jf < function_count ; ++jf ) {
+
+        // 1D basis-function indices (x,y,z) of 3D function 'jf'
+        const unsigned char * const fn = td.eval_map[jf] ;
+
+        // 1D values and derivatives along each axis
+        const float vx = td.values_1d[ pt[0] ][ fn[0] ] ;
+        const float vy = td.values_1d[ pt[1] ][ fn[1] ] ;
+        const float vz = td.values_1d[ pt[2] ][ fn[2] ] ;
+        const float dx = td.derivs_1d[ pt[0] ][ fn[0] ] ;
+        const float dy = td.derivs_1d[ pt[1] ][ fn[1] ] ;
+        const float dz = td.derivs_1d[ pt[2] ][ fn[2] ] ;
+
+        // Tensor products: each gradient component differentiates along exactly one axis.
+        values[ip][jf]       = vx * vy * vz ;
+        gradients[ip][0][jf] = dx * vy * vz ;
+        gradients[ip][1][jf] = vx * dy * vz ;
+        gradients[ip][2][jf] = vx * vy * dz ;
+      }
+    }
+  }
+};
+
+//----------------------------------------------------------------------------
+
+} /* namespace Example */
+} /* namespace Kokkos */
+
+#endif /* #ifndef KOKKOS_HEXELEMENT_HPP */
+
+
diff --git a/lib/kokkos/example/fixture/Main.cpp b/lib/kokkos/example/fixture/Main.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..d80828ca521c40cebba1d951e46a04ef067c2745
--- /dev/null
+++ b/lib/kokkos/example/fixture/Main.cpp
@@ -0,0 +1,304 @@
+//@HEADER
+// ************************************************************************
+// 
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+// 
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+// 
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact  H. Carter Edwards (hcedwar@sandia.gov)
+// 
+// ************************************************************************
+//@HEADER
+
+#include <algorithm>
+#include <iostream>
+#include <utility>
+#include <Kokkos_Core.hpp>
+
+#include <BoxElemPart.hpp>
+
+namespace Kokkos {
+namespace Example {
+template< class > void test_fixture(); // defined in TestFixture.hpp, explicitly instantiated in TestFixture.cpp
+}
+}
+
+int test_box( const size_t global_size
+            , const size_t global_box[][2]
+            , const bool print_verbose )
+{
+  size_t global_count = 0 ;
+  size_t global_max = 0 ;
+  size_t global_min = Kokkos::Example::box_count( global_box );
+  size_t global_box_max[3][2] = { { 0 , 0 } , { 0 , 0 } , { 0 , 0 } };
+  size_t global_box_min[3][2] = { { 0 , global_box[0][1] } , { 0 , global_box[1][1] } , { 0 , global_box[2][1] } };
+  size_t intersect_error = 0 ;
+  size_t neighbor_max = 0 ;
+  // Partition the box for every rank in turn and accumulate statistics.
+  for ( size_t global_rank = 0 ; global_rank < global_size ; ++global_rank ) {
+    size_t box[3][2] = { { 0 , global_box[0][1] } , { 0 , global_box[1][1] } , { 0 , global_box[2][1] } };
+    size_t ghost_box[3][2] ;
+    size_t neighbor_count = 0 ;
+
+    Kokkos::Example::box_partition( global_size , global_rank , global_box , box );
+
+    Kokkos::Example::box_ghost_layer( global_box , box , 1 , ghost_box );
+    // Track the narrowest and widest extent per axis, and the min/max/sum of box_count.
+    {
+      const size_t n = Kokkos::Example::box_count( box );
+
+      for ( int i = 0 ; i < 3 ; ++i ) {
+        if ( ( box[i][1] - box[i][0] ) < ( global_box_min[i][1] - global_box_min[i][0] ) ) {
+          global_box_min[i][0] = box[i][0] ;
+          global_box_min[i][1] = box[i][1] ;
+        }
+        if ( ( box[i][1] - box[i][0] ) > ( global_box_max[i][1] - global_box_max[i][0] ) ) {
+          global_box_max[i][0] = box[i][0] ;
+          global_box_max[i][1] = box[i][1] ;
+        }
+      }
+
+      global_max = std::max( global_max , n );
+      global_min = std::min( global_min , n );
+      global_count += n ;
+    }
+    // Compare with every other rank's box; each one is recomputed from scratch here.
+    for ( size_t other_rank = 0 ; other_rank  < global_size ; ++other_rank ) {
+
+      if ( other_rank == global_rank ) continue ;
+
+      size_t other_box[3][2] = { { 0 , global_box[0][1] } , { 0 , global_box[1][1] } , { 0 , global_box[2][1] } };
+      size_t intersect_box[3][2] ;
+
+      Kokkos::Example::box_partition( global_size , other_rank , global_box , other_box );
+
+      Kokkos::Example::box_intersect( intersect_box , box , other_box );
+
+      const size_t n = Kokkos::Example::box_count( intersect_box );
+
+      intersect_error += n ;
+      // A non-zero count for ghost_box intersected with other_box marks a neighbor.
+      Kokkos::Example::box_intersect( intersect_box , ghost_box , other_box );
+
+      neighbor_count += Kokkos::Example::box_count( intersect_box ) ? 1 : 0 ;
+      // Two ranks' own boxes overlap: print both and abandon the test, returning 0.
+      if ( n ) {
+        std::cout << "box partition intersection error" << std::endl ;
+        std::cout << "box = {"
+                  << " [ " << box[0][0] << " , " << box[0][1] << " )"
+                  << " [ " << box[1][0] << " , " << box[1][1] << " )"
+                  << " [ " << box[2][0] << " , " << box[2][1] << " )"
+                  << " }" << std::endl ;
+        std::cout << "other_box = {"
+                  << " [ " << other_box[0][0] << " , " << other_box[0][1] << " )"
+                  << " [ " << other_box[1][0] << " , " << other_box[1][1] << " )"
+                  << " [ " << other_box[2][0] << " , " << other_box[2][1] << " )"
+                  << " }" << std::endl ;
+        return 0 ;
+      }
+    }
+
+    neighbor_max = std::max( neighbor_max , neighbor_count );
+  }
+
+  if ( print_verbose ) {
+
+    std::cout << "global_part = " << global_size << std::endl ;
+    std::cout << "global_box  = { "
+              << " [ " << global_box[0][0] << " .. " << global_box[0][1] << " ) X"
+              << " [ " << global_box[1][0] << " .. " << global_box[1][1] << " ) X"
+              << " [ " << global_box[2][0] << " .. " << global_box[2][1] << " )"
+              << " }" << std::endl ;
+    std::cout << "count( global_box ) = " << Kokkos::Example::box_count( global_box ) << std::endl ;
+    std::cout << "sum partition( global_box ) = " << global_count << std::endl ;
+    std::cout << "avg partition( global_box ) = " << size_t( double(global_count) / double(global_size)) << std::endl ;
+    std::cout << "min partition( global_box ) = " << global_min << std::endl ;
+    std::cout << "min part X   ( global_box ) = [ " << global_box_min[0][0] << " .. " << global_box_min[0][1] << " )" << std::endl ;
+    std::cout << "min part Y   ( global_box ) = [ " << global_box_min[1][0] << " .. " << global_box_min[1][1] << " )" << std::endl ;
+    std::cout << "min part Z   ( global_box ) = [ " << global_box_min[2][0] << " .. " << global_box_min[2][1] << " )" << std::endl ;
+    std::cout << "max partition( global_box ) = " << global_max << std::endl ;
+    std::cout << "max part X   ( global_box ) = [ " << global_box_max[0][0] << " .. " << global_box_max[0][1] << " )" << std::endl ;
+    std::cout << "max part Y   ( global_box ) = [ " << global_box_max[1][0] << " .. " << global_box_max[1][1] << " )" << std::endl ;
+    std::cout << "max part Z   ( global_box ) = [ " << global_box_max[2][0] << " .. " << global_box_max[2][1] << " )" << std::endl ;
+    std::cout << "sum intersect( global_box ) = " << intersect_error << std::endl ;
+    std::cout << "max neighbor = " << neighbor_max << std::endl ;
+  }
+  // Largest neighbor count over all ranks (size_t narrowed to the int return type).
+  return neighbor_max ;
+}
+
+void test_elem()
+{
+  const Kokkos::Example::BoxElemPart::Decompose
+    decompose = Kokkos::Example::BoxElemPart:: DecomposeElem ; // DecomposeElem | DecomposeNode ;
+  const size_t global_size = 256 ;
+  const size_t global_nx = 100 ;
+  const size_t global_ny = 120 ;
+  const size_t global_nz = 140 ;
+  // Running min / avg / max over all ranks; minima start at an upper bound.
+  double node_count_avg = 0 ;
+  size_t node_count_max = 0 ;
+  size_t node_count_min = ( global_nx + 1 ) * ( global_ny + 1 ) * ( global_nz + 1 );
+  double elem_count_avg = 0 ;
+  size_t elem_count_max = 0 ;
+  size_t elem_count_min = global_nx * global_ny * global_nz ;
+  double recv_count_avg = 0 ;
+  size_t recv_count_max = 0 ;
+  size_t recv_count_min = global_size ;
+  double send_count_avg = 0 ;
+  size_t send_count_max = 0 ;
+  size_t send_count_min = global_size ;
+  // Build each rank's partition and cross-check its receives against the senders.
+  for ( size_t r = 0 ; r < global_size ; ++r ) {
+    const Kokkos::Example::BoxElemPart
+       fixture( Kokkos::Example::BoxElemPart::ElemLinear ,
+                decompose , global_size , r , global_nx , global_ny , global_nz );
+
+    // Print a sample:
+
+    // if ( r == global_size * 2 / 3 ) fixture.print( std::cout );
+
+    // Verify recv/send alignment:
+
+    {
+      size_t recv_lid = fixture.owns_node_count();
+
+      for ( size_t i = 0 ; i < fixture.recv_node_msg_count() ; ++i ) {
+        const size_t recv_rank  = fixture.recv_node_rank( i );
+        const size_t recv_count = fixture.recv_node_count( i );
+
+        const Kokkos::Example::BoxElemPart other_fixture(
+           Kokkos::Example::BoxElemPart::ElemLinear ,
+           decompose , global_size , recv_rank , global_nx , global_ny , global_nz );
+
+        size_t send_item = 0 ;
+        size_t j = 0 ;
+        while ( j < other_fixture.send_node_msg_count() && other_fixture.send_node_rank(j) != r ) {
+          send_item += other_fixture.send_node_count( j );
+          ++j ;
+        }
+        // No message addressed to 'r' counts as zero items; never index past the send list.
+        const size_t send_count = j < other_fixture.send_node_msg_count() ? other_fixture.send_node_count( j ) : 0 ;
+        if ( recv_count != send_count ) {
+          std::cout << "Error P[" << r << "].recv(" << recv_count << ") != "
+                    << "P[" << recv_rank << "].send(" << send_count << ")"
+                    << std::endl ;
+        }
+        else {
+
+          for ( size_t k = 0 ; k < recv_count ; ++k , ++send_item , ++recv_lid ) {
+
+            const size_t send_lid = other_fixture.send_node_id( send_item );
+
+            size_t recv_coord[3] , send_coord[3] ;
+
+            fixture.local_node_coord( recv_lid , recv_coord );
+
+            other_fixture.local_node_coord( send_lid , send_coord );
+
+            if ( recv_coord[0] != send_coord[0] ||
+                 recv_coord[1] != send_coord[1] ||
+                 recv_coord[2] != send_coord[2] ) {
+              std::cout << "Error P[" << r << "].recv[" << recv_lid << "]{ "
+                        << recv_coord[0] << " , "
+                        << recv_coord[1] << " , "
+                        << recv_coord[2] << " } != "
+                        << "P[" << recv_rank << "].send[" << send_lid << "]{ "
+                        << send_coord[0] << " , "
+                        << send_coord[1] << " , "
+                        << send_coord[2] << " }"
+                        << std::endl ;
+            }
+          }
+        }
+      }
+    }
+
+    node_count_avg += fixture.owns_node_count();
+    elem_count_avg += fixture.uses_elem_count();
+    recv_count_avg += fixture.recv_node_msg_count();
+    send_count_avg += fixture.send_node_msg_count();
+
+    elem_count_min = std::min( (size_t) fixture.uses_elem_count() , elem_count_min );
+    elem_count_max = std::max( (size_t) fixture.uses_elem_count() , elem_count_max );
+    node_count_min = std::min( (size_t) fixture.owns_node_count() , node_count_min );
+    node_count_max = std::max( (size_t) fixture.owns_node_count() , node_count_max );
+
+    recv_count_max = std::max( (size_t) fixture.recv_node_msg_count() , recv_count_max );
+    recv_count_min = std::min( (size_t) fixture.recv_node_msg_count() , recv_count_min );
+    send_count_max = std::max( (size_t) fixture.send_node_msg_count() , send_count_max );
+    send_count_min = std::min( (size_t) fixture.send_node_msg_count() , send_count_min );
+  }
+
+  node_count_avg /= double(global_size);
+  elem_count_avg /= double(global_size);
+  recv_count_avg /= double(global_size);
+  send_count_avg /= double(global_size);
+
+  std::cout << "Elem min(" << elem_count_min << ") avg(" << elem_count_avg << ") max(" << elem_count_max << ") " << std::endl
+            << "Node min(" << node_count_min << ") avg(" << node_count_avg << ") max(" << node_count_max << ") " << std::endl
+            << "Recv min(" << recv_count_min << ") avg(" << recv_count_avg << ") max(" << recv_count_max << ") " << std::endl
+            << "Send min(" << send_count_min << ") avg(" << send_count_avg << ") max(" << send_count_max << ") " << std::endl
+            ;
+}
+
+int main()
+{
+  typedef Kokkos::HostSpace::execution_space HostExec ;
+
+  // Partition a fixed box into 16, 32, ..., 512 parts; print the details only
+  // when test_box reports a maximum neighbor count above 30.
+  const size_t global_box[3][2] = { { 0 , 65 } , { 0 , 65 } , { 0 , 65 } };
+  for ( size_t global_size = 16 ; global_size <= 16 * 32 ; global_size += 16 ) {
+    const int neighbor_max = test_box( global_size , global_box , false );
+    if ( 30 < neighbor_max ) test_box( global_size , global_box , true );
+  }
+
+//  test_elem();
+
+  std::cout << "test_fixture< Host >" << std::endl ;
+  HostExec::initialize( 1 );
+  Kokkos::Example::test_fixture< HostExec >();
+  HostExec::finalize();
+
+#if defined( KOKKOS_HAVE_CUDA )
+  std::cout << "test_fixture< Cuda >" << std::endl ;
+  HostExec::initialize();
+  Kokkos::Cuda::initialize( Kokkos::Cuda::SelectDevice(0) );
+  Kokkos::Example::test_fixture< Kokkos::Cuda >();
+  Kokkos::Cuda::finalize();
+  HostExec::finalize();
+#endif
+
+  return 0 ;
+}
+
diff --git a/lib/kokkos/example/fixture/Makefile b/lib/kokkos/example/fixture/Makefile
new file mode 100644
index 0000000000000000000000000000000000000000..990f4f18e7d420f2cb7c991ba2d9732f50ef1c56
--- /dev/null
+++ b/lib/kokkos/example/fixture/Makefile
@@ -0,0 +1,48 @@
+KOKKOS_PATH = ../..
+# Search the fixture example directory for .cpp prerequisites.
+vpath %.cpp ${KOKKOS_PATH}/example/fixture
+# Headers listed here are prerequisites of every object file (see the %.o rule).
+EXAMPLE_HEADERS = $(wildcard $(KOKKOS_PATH)/example/common/*.hpp ${KOKKOS_PATH}/example/fixture/*.hpp )
+
+default: build_all
+	echo "End Build"
+        
+include $(KOKKOS_PATH)/Makefile.kokkos
+# With CUDA enabled, CXX and LINK are forced to the nvcc wrapper; otherwise g++ is only a default.
+ifeq ($(KOKKOS_INTERNAL_USE_CUDA), 1)
+	CXX = $(NVCC_WRAPPER)
+	CXXFLAGS ?= -O3
+	LINK = $(CXX)
+	LDFLAGS ?= -lpthread
+else
+	CXX ?= g++
+	CXXFLAGS ?= -O3
+	LINK ?= $(CXX)
+	LDFLAGS ?= -lpthread
+endif
+# Add the common and fixture example directories to the include path.
+KOKKOS_CXXFLAGS +=	\
+	-I${KOKKOS_PATH}/example/common	\
+	-I${KOKKOS_PATH}/example/fixture
+
+EXE_EXAMPLE_FIXTURE = KokkosExample_Fixture
+OBJ_EXAMPLE_FIXTURE = Main.o TestFixture.o BoxElemPart.o
+
+TARGETS = $(EXE_EXAMPLE_FIXTURE)
+
+#TEST_TARGETS =
+
+$(EXE_EXAMPLE_FIXTURE) : $(OBJ_EXAMPLE_FIXTURE) $(KOKKOS_LINK_DEPENDS)
+	$(LINK) $(KOKKOS_LDFLAGS) $(LDFLAGS) $(EXTRA_PATH) $(OBJ_EXAMPLE_FIXTURE) $(KOKKOS_LIBS) $(LIB) -o $(EXE_EXAMPLE_FIXTURE)
+
+build_all : $(TARGETS)
+# 'test' has no recipe: it only builds the example and does not run it.
+test : build_all
+
+clean: kokkos-clean
+	rm -f *.o $(TARGETS)
+
+# Compilation rules
+# Each object also depends on every example header, so editing a header rebuilds all objects.
+%.o:%.cpp $(KOKKOS_CPP_DEPENDS) $(EXAMPLE_HEADERS)
+	$(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) $(EXTRA_INC) -c $<
diff --git a/lib/kokkos/example/fixture/TestFixture.cpp b/lib/kokkos/example/fixture/TestFixture.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..9cf2f07322c9f8df004ac1ae6f13eb8ad066181f
--- /dev/null
+++ b/lib/kokkos/example/fixture/TestFixture.cpp
@@ -0,0 +1,58 @@
+/*
+//@HEADER
+// ************************************************************************
+// 
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+// 
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+// 
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact  H. Carter Edwards (hcedwar@sandia.gov)
+// 
+// ************************************************************************
+//@HEADER
+*/
+
+#include <Kokkos_Core.hpp>
+#include <TestFixture.hpp>
+
+namespace Kokkos {
+namespace Example {
+// Explicit instantiations of test_fixture: host always, Cuda only when KOKKOS_HAVE_CUDA is defined.
+template void test_fixture< Kokkos::HostSpace::execution_space >();
+
+#if defined( KOKKOS_HAVE_CUDA )
+template void test_fixture<Kokkos::Cuda>();
+#endif
+
+} /* namespace Example */
+} /* namespace Kokkos */
+
diff --git a/lib/kokkos/example/fixture/TestFixture.hpp b/lib/kokkos/example/fixture/TestFixture.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..dbf5ca56f512a0a3757b1084e294c016fb038154
--- /dev/null
+++ b/lib/kokkos/example/fixture/TestFixture.hpp
@@ -0,0 +1,156 @@
+/*
+//@HEADER
+// ************************************************************************
+// 
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+// 
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+// 
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact  H. Carter Edwards (hcedwar@sandia.gov)
+// 
+// ************************************************************************
+//@HEADER
+*/
+
+#ifndef KOKKOS_EXAMPLE_TESTFIXTURE_HPP
+#define KOKKOS_EXAMPLE_TESTFIXTURE_HPP
+
+#include <utility>
+#include <iostream>
+
+#include <Kokkos_Core.hpp>
+
+#include <BoxElemPart.hpp>
+#include <BoxElemFixture.hpp>
+
+namespace Kokkos {
+namespace Example {
+
+// Reduction functor: tallies elements whose node grid coordinates match
+// (success) or do not match (error) the fixture's per-node local offsets.
+template< class Device >
+struct FixtureVerifyElemNodeCoord
+{
+  typedef Device execution_space ;
+
+  typedef struct { size_t success , error ; } value_type ;
+
+  typedef Kokkos::Example::BoxElemFixture< Device , Kokkos::Example::BoxElemPart::ElemLinear > FixtureType ;
+
+  FixtureType m_fixture ;
+
+  FixtureVerifyElemNodeCoord( const FixtureType & f ) : m_fixture(f) {}
+
+  // Start both tallies at zero.
+  KOKKOS_INLINE_FUNCTION
+  void init( value_type & update ) const
+  {
+    update.error   = 0 ;
+    update.success = 0 ;
+  }
+
+  // Merge the tallies of 'input' into 'update'.
+  KOKKOS_INLINE_FUNCTION
+  void join( volatile       value_type & update ,
+             volatile const value_type & input ) const
+  {
+    update.success += input.success ;
+    update.error += input.error ;
+  }
+
+  // Check one element: every node must sit at the first node's grid
+  // coordinate plus that node's local offset.
+  KOKKOS_INLINE_FUNCTION
+  void operator()( size_t ielem , value_type & update ) const
+  {
+    const unsigned first_id = m_fixture.elem_node(ielem,0);
+    unsigned origin[3] ;
+    for ( unsigned k = 0 ; k < 3 ; ++k ) { origin[k] = m_fixture.node_grid(first_id,k); }
+
+    bool mismatch = false ;
+    for ( unsigned i = 1 ; i < FixtureType::ElemNode ; ++i ) {
+      const unsigned node_id = m_fixture.elem_node(ielem,i);
+      for ( unsigned k = 0 ; k < 3 ; ++k ) {
+        const unsigned coord = m_fixture.node_grid(node_id,k);
+        if ( origin[k] + m_fixture.elem_node_local(i,k) != coord ) { mismatch = true ; }
+      }
+    }
+
+    if ( mismatch ) { ++update.error ; }
+    else            { ++update.success ; }
+  }
+};
+
+
+template< class Device >
+void test_fixture()
+{
+  typedef Kokkos::Example::BoxElemFixture< Device , Kokkos::Example::BoxElemPart::ElemLinear > FixtureType ;
+  typedef FixtureVerifyElemNodeCoord< Device > VerifyFunctor ;
+
+  // Rank count and per-axis sizes handed to every fixture.
+  const unsigned global_size = 256 ;
+  const unsigned global_nx = 400 ;
+  const unsigned global_ny = 400 ;
+  const unsigned global_nz = 400 ;
+
+  const Kokkos::Example::BoxElemPart::Decompose
+    decompose = Kokkos::Example::BoxElemPart:: DecomposeElem ; // DecomposeElem | DecomposeNode ;
+
+  // Construct each rank's fixture in turn and verify it.
+  for ( unsigned my_rank = 0 ; my_rank < global_size ; ++my_rank ) {
+    const FixtureType fixture( decompose , global_size , my_rank , global_nx , global_ny , global_nz );
+
+    // Verify grid coordinates of element's nodes
+    const VerifyFunctor verify( fixture );
+    typename VerifyFunctor::value_type result = { 0 , 0 };
+
+    Kokkos::parallel_reduce( fixture.elem_node().dimension_0() , verify , result );
+
+    if ( result.error ) {
+      std::cout << "P[" << my_rank << ":" << global_size
+                << "] Fixture elem_node_coord"
+                << " success(" << result.success << ")"
+                << " error(" << result.error << ")"
+                << std::endl ;
+    }
+
+    // Check send/recv alignment
+  }
+}
+
+
+} /* namespace Example */
+} /* namespace Kokkos */
+
+#endif /* #ifndef KOKKOS_EXAMPLE_TESTFIXTURE_HPP */
+
diff --git a/lib/kokkos/example/global_2_local_ids/CMakeLists.txt b/lib/kokkos/example/global_2_local_ids/CMakeLists.txt
new file mode 100644
index 0000000000000000000000000000000000000000..9f32fe580246233f0a5358b5d505abfdeebd0d14
--- /dev/null
+++ b/lib/kokkos/example/global_2_local_ids/CMakeLists.txt
@@ -0,0 +1,17 @@
+
+# Example executable: global_2_local_ids.
+# Headers are searched in this example's build directory first, then in
+# its source directory (where G2L.hpp lives).
+INCLUDE_DIRECTORIES(
+  ${CMAKE_CURRENT_BINARY_DIR}
+  ${CMAKE_CURRENT_SOURCE_DIR}
+  )
+
+# The example consists of a single translation unit.
+SET(SOURCES G2L_Main.cpp)
+
+TRIBITS_ADD_EXECUTABLE(
+  global_2_local_ids
+  SOURCES ${SOURCES}
+  COMM serial mpi
+  )
+
+
diff --git a/lib/kokkos/example/global_2_local_ids/G2L.hpp b/lib/kokkos/example/global_2_local_ids/G2L.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..9023ae04267835ec38f0fd20b9dcd3caf798ad04
--- /dev/null
+++ b/lib/kokkos/example/global_2_local_ids/G2L.hpp
@@ -0,0 +1,266 @@
+//@HEADER
+// ************************************************************************
+// 
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+// 
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+// 
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact  H. Carter Edwards (hcedwar@sandia.gov)
+// 
+// ************************************************************************
+//@HEADER
+
+#ifndef KOKKOS_GLOBAL_TO_LOCAL_IDS_HPP
+#define KOKKOS_GLOBAL_TO_LOCAL_IDS_HPP
+
+#include <Kokkos_Core.hpp>
+
+#include <Kokkos_UnorderedMap.hpp>
+
+#include <vector>
+#include <algorithm>
+#include <iomanip>
+
+#include <impl/Kokkos_Timer.hpp>
+
+// This test will simulate global ids
+
+namespace G2L {
+
+// Id-count parameters: 256, 2^25 and 2 respectively.
+// NOTE(review): none of these is referenced elsewhere in this header;
+// presumably they bound a sweep over the number of ids (start, end,
+// multiplicative step) -- confirm against the callers before relying on it.
+static const unsigned begin_id_size = 256u;
+static const unsigned end_id_size = 1u << 25;
+static const unsigned id_step = 2u;
+
+// Byte-addressable overlay of a 32-bit word.
+// generate_ids below does not use it (reading one union member after
+// writing another is undefined behaviour in C++); the type is kept so that
+// any other code naming G2L::helper continues to compile.
+union helper
+{
+  uint32_t word;
+  uint8_t byte[4];
+};
+
+
+// Generate a unique global id from each local id.
+//
+// Constructing the functor immediately runs a parallel_for over every entry
+// of 'ids' and stores the generated global id of local id 'i' in ids[i].
+// The view is captured by handle, so the caller's view receives the result.
+template <typename Device>
+struct generate_ids
+{
+  typedef Device execution_space;
+  typedef typename execution_space::size_type size_type;
+  typedef Kokkos::View<uint32_t*,execution_space> local_id_view;
+
+  local_id_view local_2_global;
+
+  generate_ids( local_id_view & ids)
+    : local_2_global(ids)
+  {
+    Kokkos::parallel_for(local_2_global.size(), *this);
+  }
+
+
+  // Map local id 'i' to a semi-random global id.
+  //
+  // The id is the bitwise complement of 'i' with its upper and lower 16-bit
+  // halves exchanged.  Complement and rotation are both bijections on
+  // 32-bit words, so distinct local ids yield distinct global ids.
+  KOKKOS_INLINE_FUNCTION
+  void operator()(size_type i) const
+  {
+    const uint32_t inverted = ~static_cast<uint32_t>(i);
+
+    // Swapping byte 3 with byte 1 and byte 2 with byte 0 is a rotation by
+    // 16 bits regardless of endianness; expressed with shifts it needs no
+    // union type-punning.
+    local_2_global[i] = (inverted >> 16) | (inverted << 16);
+  }
+
+};
+
+// Fill a map of global_id -> local_id.
+//
+// Constructing the functor immediately runs a parallel_for over every entry
+// of 'lIds'; work item 'i' inserts the pair (lIds[i], i) into 'gIds'.
+// The return value of insert() is not inspected here.
+template <typename Device>
+struct fill_map
+{
+  typedef Device execution_space;
+  typedef typename execution_space::size_type size_type;
+  typedef Kokkos::View<const uint32_t*,execution_space, Kokkos::MemoryRandomAccess> local_id_view;
+  typedef Kokkos::UnorderedMap<uint32_t,size_type,execution_space> global_id_view;
+
+  global_id_view global_2_local;
+  local_id_view local_2_global;
+
+  fill_map( global_id_view gIds, local_id_view lIds)
+    : global_2_local(gIds)
+    , local_2_global(lIds)
+  {
+    Kokkos::parallel_for(local_2_global.size(), *this);
+  }
+
+  // Insert the entry for local id 'i'.
+  KOKKOS_INLINE_FUNCTION
+  void operator()(size_type i) const
+  {
+    const uint32_t global_id = local_2_global[i];
+    global_2_local.insert( global_id, i);
+  }
+
+};
+
+// Check that every global id is found and maps back to its local id.
+//
+// Constructing the functor immediately runs a parallel_reduce over every
+// entry of 'lIds' and hands 'num_errors' to it as the reduction result.
+// Each work item contributes one error when its lookup does not succeed.
+// NOTE(review): presumably parallel_reduce overwrites (rather than adds to)
+// 'num_errors', since init() starts each partial sum at zero -- confirm.
+template <typename Device>
+struct find_test
+{
+  typedef Device execution_space;
+  typedef typename execution_space::size_type size_type;
+  typedef Kokkos::View<const uint32_t*,execution_space, Kokkos::MemoryRandomAccess> local_id_view;
+  typedef Kokkos::UnorderedMap<const uint32_t, const size_type,execution_space> global_id_view;
+
+  global_id_view global_2_local;
+  local_id_view local_2_global;
+
+  typedef size_t value_type;
+
+  find_test( global_id_view gIds, local_id_view lIds, value_type & num_errors)
+    : global_2_local(gIds)
+    , local_2_global(lIds)
+  {
+    Kokkos::parallel_reduce(local_2_global.size(), *this, num_errors);
+  }
+
+  // Reduction identity: no errors.
+  KOKKOS_INLINE_FUNCTION
+  void init(value_type & v) const
+  {
+    v = 0;
+  }
+
+  // Combine two partial error counts.
+  KOKKOS_INLINE_FUNCTION
+  void join(volatile value_type & dst, volatile value_type const & src) const
+  {
+    dst += src;
+  }
+
+  // Look up the global id of local id 'i'; count an error unless the slot
+  // is valid, holds that same key, and stores 'i' as its value.
+  KOKKOS_INLINE_FUNCTION
+  void operator()(size_type i, value_type & num_errors) const
+  {
+    const uint32_t expected_key = local_2_global[i];
+    const uint32_t slot = global_2_local.find( expected_key );
+
+    const bool found_and_consistent =
+         global_2_local.valid_at(slot)
+      && global_2_local.key_at(slot) == expected_key
+      && global_2_local.value_at(slot) == i;
+
+    if ( !found_and_consistent ) {
+      ++num_errors;
+    }
+  }
+
+};
+
+// Run one allocate / generate / fill / lookup cycle and time each phase.
+//
+//   num_ids              number of local ids to generate
+//   capacity             capacity requested for the global_id -> local_id map
+//   num_find_iterations  how many times the lookup pass is repeated
+//
+// Each phase prints its elapsed seconds to std::cout.  If the map reports a
+// failed insert the lookup phase is skipped and "!!! Fill Failed !!!" is
+// printed.  Returns the value of failed_insert() in that case, otherwise the
+// error count left in 'num_errors' by the lookup passes.
+template <typename Device>
+size_t test_global_to_local_ids(unsigned num_ids, unsigned capacity, unsigned num_find_iterations)
+{
+
+  typedef Device execution_space;
+  typedef typename execution_space::size_type size_type;
+
+  typedef Kokkos::View<uint32_t*,execution_space> local_id_view;
+  typedef Kokkos::UnorderedMap<uint32_t,size_type,execution_space> global_id_view;
+
+  const int label_width = 15;
+
+  // The stopwatch is started before the containers are constructed so that
+  // the first lap covers their allocation.
+  Kokkos::Timer stopwatch;
+
+  local_id_view local_2_global("local_ids", num_ids);
+  global_id_view global_2_local(capacity);
+
+  // create
+  double lap_seconds = stopwatch.seconds();
+  std::cout << std::setw(label_width) <<  "allocate: " <<  lap_seconds << std::endl;
+  stopwatch.reset();
+
+  // generate unique ids (the functor does its work in its constructor)
+  {
+    generate_ids<Device> gen(local_2_global);
+  }
+
+  lap_seconds = stopwatch.seconds();
+  std::cout << std::setw(label_width) << "generate: " <<  lap_seconds << std::endl;
+  stopwatch.reset();
+
+  // fill the map (again, all work happens in the constructor)
+  {
+    fill_map<Device> fill(global_2_local, local_2_global);
+  }
+
+  lap_seconds = stopwatch.seconds();
+  std::cout << std::setw(label_width) << "fill: " <<  lap_seconds << std::endl;
+  stopwatch.reset();
+
+
+  size_t num_errors = global_2_local.failed_insert();
+
+  // Looking ids up in an incompletely filled map is pointless; bail out.
+  if (num_errors != 0u) {
+    std::cout << "    !!! Fill Failed !!!" << std::endl;
+    return num_errors;
+  }
+
+  // find
+  for (unsigned pass = 0; pass < num_find_iterations; ++pass)
+  {
+    find_test<Device> find(global_2_local, local_2_global,num_errors);
+  }
+
+  lap_seconds = stopwatch.seconds();
+  std::cout << std::setw(label_width) << "lookup: " <<  lap_seconds << std::endl;
+
+  return num_errors;
+}
+
+// Run the global-to-local id test at 66%, 100% and 150% of the capacity
+// needed for 'num_ids' entries.
+//
+// The 66% run is expected to fail, so its result is discarded.  Returns the
+// sum of the results of the 100% and 150% runs.
+//
+// The capacities are computed without forming 2*num_ids or 3*num_ids, which
+// would wrap around for large 'num_ids'; the values are identical to
+// (num_ids*2u)/3u and (num_ids*3u)/2u whenever those do not overflow.
+template <typename Device>
+size_t run_test(unsigned num_ids, unsigned num_find_iterations)
+{
+  // expect to fail
+  // floor(2n/3) == n - ceil(n/3)
+  const unsigned one_third_rounded_up = num_ids/3u + ( (num_ids%3u) != 0u ? 1u : 0u );
+  unsigned capacity = num_ids - one_third_rounded_up;
+  std::cout << " 66% of needed capacity (should fail)" << std::endl;
+  test_global_to_local_ids<Device>(num_ids, capacity, num_find_iterations);
+
+  //should not fail
+  std::cout << " 100% of needed capacity" << std::endl;
+  capacity = num_ids;
+  size_t num_errors = test_global_to_local_ids<Device>(num_ids, capacity, num_find_iterations);
+
+  //should not fail
+  std::cout << " 150% of needed capacity" << std::endl;
+  // floor(3n/2) == n + floor(n/2)
+  capacity = num_ids + num_ids/2u;
+  num_errors += test_global_to_local_ids<Device>(num_ids, capacity, num_find_iterations);
+
+  return num_errors;
+}
+
+
+} // namespace G2L
+
+
+#endif //KOKKOS_GLOBAL_TO_LOCAL_IDS_HPP
+
diff --git a/lib/kokkos/example/global_2_local_ids/G2L_Main.cpp b/lib/kokkos/example/global_2_local_ids/G2L_Main.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..171ed4b5f6d05bd7ddacd14f915fe44d4e8913a2
--- /dev/null
+++ b/lib/kokkos/example/global_2_local_ids/G2L_Main.cpp
@@ -0,0 +1,149 @@
+/*
+//@HEADER
+// ************************************************************************
+// 
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+// 
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+// 
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact  H. Carter Edwards (hcedwar@sandia.gov)
+// 
+// ************************************************************************
+//@HEADER
+*/
+
+#include <Kokkos_Core.hpp>
+
+#include <G2L.hpp>
+
+namespace G2L {
+
+// Each runner below executes run_test on one Kokkos execution space when
+// that space is enabled at compile time, announcing the space on std::cout
+// first.  When the space is not enabled the runner does nothing and
+// reports zero errors.
+
+// Serial execution space.
+size_t run_serial(unsigned num_ids, unsigned num_find_iterations)
+{
+  size_t num_errors = 0;
+#ifdef KOKKOS_HAVE_SERIAL
+  std::cout << "Serial" << std::endl;
+  num_errors = run_test<Kokkos::Serial>(num_ids,num_find_iterations);
+#endif // KOKKOS_HAVE_SERIAL
+  return num_errors;
+}
+
+// Threads execution space (enabled by KOKKOS_HAVE_PTHREAD).
+size_t run_threads(unsigned num_ids, unsigned num_find_iterations)
+{
+  size_t num_errors = 0;
+#ifdef KOKKOS_HAVE_PTHREAD
+  std::cout << "Threads" << std::endl;
+  num_errors = run_test<Kokkos::Threads>(num_ids,num_find_iterations);
+#endif // KOKKOS_HAVE_PTHREAD
+  return num_errors;
+}
+
+// OpenMP execution space.
+size_t run_openmp(unsigned num_ids, unsigned num_find_iterations)
+{
+  size_t num_errors = 0;
+#ifdef KOKKOS_HAVE_OPENMP
+  std::cout << "OpenMP" << std::endl;
+  num_errors = run_test<Kokkos::OpenMP>(num_ids,num_find_iterations);
+#endif // KOKKOS_HAVE_OPENMP
+  return num_errors;
+}
+
+// Cuda execution space.
+size_t run_cuda(unsigned num_ids, unsigned num_find_iterations)
+{
+  size_t num_errors = 0;
+#ifdef KOKKOS_HAVE_CUDA
+  std::cout << "Cuda" << std::endl;
+  num_errors = run_test<Kokkos::Cuda>(num_ids,num_find_iterations);
+#endif // KOKKOS_HAVE_CUDA
+  return num_errors;
+}
+
+} // namespace G2L
+
+
+// Entry point of the global_2_local_ids example.
+//
+// Usage:  <exe> [num_ids num_find_iterations]
+//
+// With no arguments the defaults (100000 ids, 1000 find iterations) are
+// used; with any other argument count the usage line is printed and the
+// program exits with status 0.  The test is then run on every execution
+// space enabled at compile time.
+//
+// Returns 0 when no errors were counted and 1 otherwise.
+int main(int argc, char *argv[])
+{
+  unsigned num_ids = 100000;
+  unsigned num_find_iterations = 1000;
+
+  if (argc == 3) {
+    num_ids = atoi(argv[1]);
+    num_find_iterations = atoi(argv[2]);
+  }
+  else if (argc != 1) {
+    std::cout << argv[0] << " num_ids num_find_iterations" << std::endl;
+    return 0;
+  }
+
+
+  // query the topology of the host; fall back to 4 threads without hwloc
+  unsigned threads_count = 4 ;
+
+  if (Kokkos::hwloc::available()) {
+    threads_count = Kokkos::hwloc::get_available_numa_count() *
+                    Kokkos::hwloc::get_available_cores_per_numa() *
+                    Kokkos::hwloc::get_available_threads_per_core();
+
+  }
+
+  std::cout << "Threads: " << threads_count << std::endl;
+  std::cout << "Number of ids: " << num_ids << std::endl;
+  std::cout << "Number of find iterations: " << num_find_iterations << std::endl;
+
+  size_t num_errors = 0;
+
+  num_errors += G2L::run_serial(num_ids,num_find_iterations);
+
+#ifdef KOKKOS_HAVE_CUDA
+  // The host execution space is initialized before, and finalized after,
+  // the Cuda space.
+  Kokkos::HostSpace::execution_space::initialize(threads_count);
+  Kokkos::Cuda::initialize( Kokkos::Cuda::SelectDevice(0) );
+  num_errors += G2L::run_cuda(num_ids,num_find_iterations);
+  Kokkos::Cuda::finalize();
+  Kokkos::HostSpace::execution_space::finalize();
+#endif
+
+#ifdef KOKKOS_HAVE_PTHREAD
+  Kokkos::Threads::initialize( threads_count );
+  num_errors += G2L::run_threads(num_ids,num_find_iterations);
+  Kokkos::Threads::finalize();
+#endif
+
+#ifdef KOKKOS_HAVE_OPENMP
+  Kokkos::OpenMP::initialize( threads_count );
+  num_errors += G2L::run_openmp(num_ids,num_find_iterations);
+  Kokkos::OpenMP::finalize();
+#endif
+
+
+  // Do not return the raw count: it is narrowed to int and the parent
+  // process only sees the low 8 bits of the status, so e.g. 256 errors
+  // would be reported as success.
+  return num_errors == 0 ? 0 : 1;
+}
+
diff --git a/lib/kokkos/example/global_2_local_ids/Makefile b/lib/kokkos/example/global_2_local_ids/Makefile
new file mode 100644
index 0000000000000000000000000000000000000000..bf8fbea3e09a5d71f900de85ff2100cf41bd5738
--- /dev/null
+++ b/lib/kokkos/example/global_2_local_ids/Makefile
@@ -0,0 +1,53 @@
+# Builds the example in this directory with the Kokkos Makefile build system.
+# KOKKOS_PATH may be overridden from the environment or the command line.
+KOKKOS_PATH ?= ../..
+
+# Locate the sources relative to this Makefile instead of the current
+# directory, so the example can be built from a separate build directory.
+MAKEFILE_PATH := $(abspath $(lastword $(MAKEFILE_LIST)))
+SRC_DIR := $(dir $(MAKEFILE_PATH))
+
+SRC = $(wildcard $(SRC_DIR)/*.cpp)
+OBJ = $(SRC:$(SRC_DIR)/%.cpp=%.o)
+
+#SRC = $(wildcard *.cpp)
+#OBJ = $(SRC:%.cpp=%.o)
+
+# First rule in the file, hence the default goal.  The echo runs after the
+# 'build' prerequisite has been brought up to date.
+default: build
+	echo "Start Build"
+
+# These targets name actions rather than files; declaring them phony keeps
+# them working even if a file called 'default', 'build' or 'clean' exists.
+.PHONY: default build clean
+
+# use installed Makefile.kokkos
+include $(KOKKOS_PATH)/Makefile.kokkos
+
+# Choose compiler and executable suffix depending on whether Cuda is among
+# the enabled Kokkos devices.
+ifneq (,$(findstring Cuda,$(KOKKOS_DEVICES)))
+CXX = $(NVCC_WRAPPER)
+CXXFLAGS = -I$(SRC_DIR) -O3
+LINK = $(CXX)
+LINKFLAGS =
+EXE = $(addsuffix .cuda, $(shell basename $(SRC_DIR)))
+#KOKKOS_DEVICES = "Cuda,OpenMP"
+#KOKKOS_ARCH = "SNB,Kepler35"
+else
+CXX = g++
+CXXFLAGS = -I$(SRC_DIR) -O3
+LINK = $(CXX)
+LINKFLAGS =
+EXE = $(addsuffix .host, $(shell basename $(SRC_DIR)))
+#KOKKOS_DEVICES = "OpenMP"
+#KOKKOS_ARCH = "SNB"
+endif
+
+DEPFLAGS = -M
+
+LIB =
+
+
+build: $(EXE)
+
+# Link the example executable.
+$(EXE): $(OBJ) $(KOKKOS_LINK_DEPENDS)
+	$(LINK) $(KOKKOS_LDFLAGS) $(LINKFLAGS) $(EXTRA_PATH) $(OBJ) $(KOKKOS_LIBS) $(LIB) -o $(EXE)
+
+clean:
+	rm -f *.a *.o *.cuda *.host
+
+# Compilation rules
+
+%.o:$(SRC_DIR)/%.cpp $(KOKKOS_CPP_DEPENDS)
+	$(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) $(EXTRA_INC) -c $<
+
diff --git a/lib/kokkos/example/grow_array/CMakeLists.txt b/lib/kokkos/example/grow_array/CMakeLists.txt
new file mode 100644
index 0000000000000000000000000000000000000000..d9ff17049290af181d4f693cf9936627b28d087e
--- /dev/null
+++ b/lib/kokkos/example/grow_array/CMakeLists.txt
@@ -0,0 +1,14 @@
+
+# Example executable: grow_array.
+# Headers are searched in this example's build and source directories.
+INCLUDE_DIRECTORIES(${CMAKE_CURRENT_BINARY_DIR})
+INCLUDE_DIRECTORIES(${CMAKE_CURRENT_SOURCE_DIR})
+
+# Sources are listed explicitly rather than globbed: a glob is evaluated
+# only at configure time, so files added or removed later would be missed
+# until CMake is re-run.
+SET(SOURCES
+  main.cpp
+  )
+
+TRIBITS_ADD_EXECUTABLE(
+  grow_array
+  SOURCES ${SOURCES}
+  COMM serial mpi
+  )
+
diff --git a/lib/kokkos/example/grow_array/Makefile b/lib/kokkos/example/grow_array/Makefile
new file mode 100644
index 0000000000000000000000000000000000000000..bf8fbea3e09a5d71f900de85ff2100cf41bd5738
--- /dev/null
+++ b/lib/kokkos/example/grow_array/Makefile
@@ -0,0 +1,53 @@
+# Builds the example in this directory with the Kokkos Makefile build system.
+# KOKKOS_PATH may be overridden from the environment or the command line.
+KOKKOS_PATH ?= ../..
+
+# Locate the sources relative to this Makefile instead of the current
+# directory, so the example can be built from a separate build directory.
+MAKEFILE_PATH := $(abspath $(lastword $(MAKEFILE_LIST)))
+SRC_DIR := $(dir $(MAKEFILE_PATH))
+
+SRC = $(wildcard $(SRC_DIR)/*.cpp)
+OBJ = $(SRC:$(SRC_DIR)/%.cpp=%.o)
+
+#SRC = $(wildcard *.cpp)
+#OBJ = $(SRC:%.cpp=%.o)
+
+# First rule in the file, hence the default goal.  The echo runs after the
+# 'build' prerequisite has been brought up to date.
+default: build
+	echo "Start Build"
+
+# These targets name actions rather than files; declaring them phony keeps
+# them working even if a file called 'default', 'build' or 'clean' exists.
+.PHONY: default build clean
+
+# use installed Makefile.kokkos
+include $(KOKKOS_PATH)/Makefile.kokkos
+
+# Choose compiler and executable suffix depending on whether Cuda is among
+# the enabled Kokkos devices.
+ifneq (,$(findstring Cuda,$(KOKKOS_DEVICES)))
+CXX = $(NVCC_WRAPPER)
+CXXFLAGS = -I$(SRC_DIR) -O3
+LINK = $(CXX)
+LINKFLAGS =
+EXE = $(addsuffix .cuda, $(shell basename $(SRC_DIR)))
+#KOKKOS_DEVICES = "Cuda,OpenMP"
+#KOKKOS_ARCH = "SNB,Kepler35"
+else
+CXX = g++
+CXXFLAGS = -I$(SRC_DIR) -O3
+LINK = $(CXX)
+LINKFLAGS =
+EXE = $(addsuffix .host, $(shell basename $(SRC_DIR)))
+#KOKKOS_DEVICES = "OpenMP"
+#KOKKOS_ARCH = "SNB"
+endif
+
+DEPFLAGS = -M
+
+LIB =
+
+
+build: $(EXE)
+
+# Link the example executable.
+$(EXE): $(OBJ) $(KOKKOS_LINK_DEPENDS)
+	$(LINK) $(KOKKOS_LDFLAGS) $(LINKFLAGS) $(EXTRA_PATH) $(OBJ) $(KOKKOS_LIBS) $(LIB) -o $(EXE)
+
+clean:
+	rm -f *.a *.o *.cuda *.host
+
+# Compilation rules
+
+%.o:$(SRC_DIR)/%.cpp $(KOKKOS_CPP_DEPENDS)
+	$(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) $(EXTRA_INC) -c $<
+
diff --git a/lib/kokkos/example/grow_array/grow_array.hpp b/lib/kokkos/example/grow_array/grow_array.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..9daef1a4b084564e84559de634cd5e6ee5bb9425
--- /dev/null
+++ b/lib/kokkos/example/grow_array/grow_array.hpp
@@ -0,0 +1,257 @@
+/*
+//@HEADER
+// ************************************************************************
+// 
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+// 
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+// 
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact  H. Carter Edwards (hcedwar@sandia.gov)
+// 
+// ************************************************************************
+//@HEADER
+*/
+
+#ifndef EXAMPLE_GROW_ARRAY
+#define EXAMPLE_GROW_ARRAY
+
+#include <stdlib.h>
+
+#include <Kokkos_Core.hpp>
+
+#include <algorithm>
+
+#if defined(KOKKOS_HAVE_CUDA)
+#include <thrust/device_ptr.h>
+#include <thrust/sort.h>
+#endif
+
+namespace Example {
+
+//----------------------------------------------------------------------------
+
+// Sort entries [begin,end) of a rank-one view in place.
+// The work is done by the constructor; the object itself carries no state.
+// This general version applies std::sort directly to the view's pointer.
+template< class ExecSpace >
+struct SortView {
+
+  template< typename ValueType >
+  SortView( const Kokkos::View<ValueType*,ExecSpace> v , int begin , int end )
+    {
+      ValueType * const base = v.ptr_on_device();
+
+      std::sort( base + begin , base + end );
+    }
+};
+
+#if defined(KOKKOS_HAVE_CUDA)
+// Cuda specialization: the same range is sorted with thrust::sort through
+// thrust::device_ptr wrappers around the view's pointer.
+template<>
+struct SortView< Kokkos::Cuda > {
+  template< typename ValueType >
+  SortView( const Kokkos::View<ValueType*,Kokkos::Cuda> v , int begin , int end )
+    {
+      typedef thrust::device_ptr<ValueType> device_iterator ;
+
+      ValueType * const base = v.ptr_on_device();
+
+      const device_iterator first( base + begin );
+      const device_iterator last(  base + end );
+
+      thrust::sort( first , last );
+    }
+};
+#endif
+
+
+
+//----------------------------------------------------------------------------
+
+// Team functor that scans a bit-flag array and appends the index of every
+// set bit to a shared output array.
+//
+// The search range [0,m_search_total) is divided into chunks of
+// 'm_search_team_chunk' indices, one chunk per league rank.  Threads buffer
+// the indices they find locally and periodically claim space in the output
+// array through 'team_scan' on the shared counter 'm_search_count'.
+template< class ExecSpace >
+struct GrowArrayFunctor {
+
+  typedef ExecSpace  execution_space ;
+
+  // SHIFT is log2 of the number of bits assumed per 'int' flag word and
+  // MASK selects the bit position within one word.
+  enum { SHIFT = sizeof(int) == 8 ? 6 : 5 }; // 8 or 4 byte int
+  enum { MASK  = ( 1 << SHIFT ) - 1 };
+
+  const Kokkos::View<int*,ExecSpace>  m_search_flags ; // bit flags for values to append
+  const Kokkos::View<int*,ExecSpace>  m_search_array ; // array to append values
+  const Kokkos::View<int,ExecSpace>   m_search_count ; // offset
+  const int m_search_total ;
+  const int m_search_team_chunk ;
+
+  // Allocate the flag words (enough for 'search_length' bits), an output
+  // array of 'array_length' entries and the rank-zero counter.
+  // The chunk size per team is fixed at 2048.  'print' is not used here.
+  GrowArrayFunctor( int array_length , int search_length , int print = 1 )
+    : m_search_flags( "flags" , ( search_length + MASK ) >> SHIFT ) // One bit per search entry
+    , m_search_array( "array" , array_length )
+    , m_search_count( "count" )
+    , m_search_total( search_length )
+    , m_search_team_chunk( 2048 )
+    {}
+
+  // Return whether bit 'index' is set in the flag array.
+  // An index whose flag word lies beyond the end of the array yields false.
+  KOKKOS_INLINE_FUNCTION
+  bool flag_is_set( const int index ) const
+    {
+      // 64 or 32 bit integer:
+
+      const int j = index >> SHIFT ; // which integer flag
+      const int k = 1 << ( index & MASK ); // which bit in that integer
+      const int s = ( j < int(m_search_flags.dimension_0()) ) && ( 0 != ( m_search_flags(j) & k ) );
+
+      return s ;
+    }
+
+  typedef typename Kokkos::TeamPolicy<ExecSpace>::member_type team_member ;
+
+  // Search this team's chunk and append the indices of set flags to
+  // 'm_search_array'.
+  KOKKOS_INLINE_FUNCTION
+  void operator()( const team_member & member ) const
+    {
+      enum { LOCAL_BUFFER_LENGTH = 16 };
+
+      // Per-thread staging buffer; 'local_count' entries are in use.
+      int local_buffer[ LOCAL_BUFFER_LENGTH ] ;
+      int local_count = 0 ;
+
+      // Each team searches 'm_search_team_chunk' indices.
+      // The threads of a team must iterate together because all
+      // threads in the team must call 'team_scan' to prevent deadlock in the team.
+
+            int search_team_begin = member.league_rank() * m_search_team_chunk ;
+      const int search_team_end   = search_team_begin + m_search_team_chunk ;
+
+      // Iterations since the last flush.  A thread adds at most one entry
+      // per iteration, so flushing every LOCAL_BUFFER_LENGTH iterations
+      // keeps 'local_count' within the buffer.
+      int k = 0 ;
+
+      while ( search_team_begin < search_team_end ) {
+
+        // This iteration searches [ search_team_begin .. search_team_begin + member.team_size() ]
+        const int thread_search_index = search_team_begin + member.team_rank();
+
+        // If this thread's search index is in the range
+        // and the flag is set, push into this thread's local buffer.
+        if ( thread_search_index < m_search_total && flag_is_set(thread_search_index) ) {
+          local_buffer[ local_count ] = thread_search_index ;
+          ++local_count ;
+        }
+
+        // Move the team's search range forward
+        search_team_begin += member.team_size(); // Striding team by team size
+
+        // Count number of times a thread's buffer might have grown:
+        ++k ;
+
+        // Write buffer if end of search or a thread might have filled its buffer.
+        if ( k == LOCAL_BUFFER_LENGTH /* A thread in my team might have filled its buffer */ ||
+             ! ( search_team_begin < search_team_end ) /* Team is at the end of its search */ ) {
+
+          // Team's exclusive scan of threads' contributions, with global offset.
+          // This thread writes its buffer into [ team_offset .. team_offset + local_count )
+          const int team_offset = member.team_scan( local_count , & *m_search_count );
+
+          // Copy locally buffered entries into global array:
+          for ( int i = 0 ; i < local_count ; ++i ) {
+            m_search_array( team_offset + i ) = local_buffer[i] ;
+          }
+
+          // Start a fresh buffering round.
+          k = 0 ;
+          local_count = 0 ;
+        }
+      }
+    }
+};
+
+
+// Driver for GrowArrayFunctor: set random flags, collect the flagged
+// indices in parallel, then sort and verify the result.
+//
+//   array_length   number of random bits to set (duplicates are possible)
+//                  and length of the output array
+//   search_length  number of flag bits that are searched
+//   print          when non-zero, every individual mismatch is written to
+//                  std::cerr; the final summary is written regardless
+//
+// Verification checks that every collected index refers to a set flag and
+// that no set flag was left uncollected.  Nothing is returned; problems are
+// only reported on std::cerr.
+template< class ExecSpace >
+void grow_array( int array_length , int search_length , int print = 1 )
+{
+  typedef GrowArrayFunctor< ExecSpace > FunctorType ;
+
+  FunctorType functor( array_length , search_length , print );
+
+  typename Kokkos::View<int,ExecSpace>::HostMirror  count = Kokkos::create_mirror_view( functor.m_search_count );
+  typename Kokkos::View<int*,ExecSpace>::HostMirror flags = Kokkos::create_mirror_view( functor.m_search_flags );
+
+  // Set at most 'array_length' random bits over the search length.
+  for ( int i = 0 ; i < array_length ; ++i ) {
+    // 'lrand48()' returns a value in [0 .. 2^31), so
+    //   index = ( lrand48() * search_length ) / ( 2^31 )
+    // lies in [0 .. search_length).  The product can need up to 62 bits,
+    // so it is formed in 'long long' to stay correct where 'long' is 32 bits.
+    const long int index =
+      static_cast<long int>( ( static_cast<long long>( lrand48() ) * search_length ) >> 31 );
+    // set the bit within the flags:
+    flags( index >> FunctorType::SHIFT ) |= ( 1 << ( index & FunctorType::MASK ) );
+  }
+
+  Kokkos::deep_copy( functor.m_search_flags , flags );
+
+  // Each team works on 'functor.m_search_team_chunk' span of the search_length
+  Kokkos::TeamPolicy< ExecSpace >
+    work( /* #teams */ ( search_length + functor.m_search_team_chunk - 1 ) / functor.m_search_team_chunk
+        , /* threads/team */ Kokkos::TeamPolicy< ExecSpace >::team_size_max( functor ) );
+
+  // Fill array:
+  Kokkos::parallel_for( work , functor );
+
+  // How much was filled:
+  Kokkos::deep_copy( count , functor.m_search_count );
+
+  // Sort array:
+  SortView< ExecSpace >( functor.m_search_array , 0 , *count );
+
+  // Mirror the results:
+  typename Kokkos::View<int*,ExecSpace>::HostMirror results = Kokkos::create_mirror_view( functor.m_search_array );
+  Kokkos::deep_copy( results , functor.m_search_array );
+
+  // Verify results:
+  int result_error_count = 0 ;
+  int flags_error_count = 0 ;
+  for ( int i = 0 ; i < *count ; ++i ) {
+    const int index = results(i);
+    const int entry = index >> FunctorType::SHIFT ;
+    const int bit   = 1 << ( index & FunctorType::MASK );
+    const bool flag = 0 != ( flags( entry ) & bit );
+    // A collected index must refer to a flag that is (still) set.
+    if ( ! flag ) {
+      if ( print ) std::cerr << "result( " << i << " : " << index << " )";
+      ++result_error_count ;
+    }
+    flags( entry ) &= ~bit ; // Clear that verified bit
+  }
+
+  for ( int i = 0 ; i < int(flags.dimension_0()) ; ++i ) {
+    // If any uncleared bits then an error
+    if ( flags(i) ) {
+      if ( print ) std::cerr << "flags( " << i << " : " << flags(i) << " )" ;
+      ++flags_error_count ;
+    }
+  }
+
+  if ( result_error_count || flags_error_count ) {
+    std::cerr << std::endl << "Example::GrowArrayFunctor( " << array_length
+              << " , " << search_length
+              << " ) result_error_count( " << result_error_count << " )"
+              << " flags_error_count( " << flags_error_count << " )"
+              << std::endl ;
+  }
+}
+
+
+} // namespace Example
+
+//----------------------------------------------------------------------------
+
+#endif /* #ifndef EXAMPLE_GROW_ARRAY */
+
diff --git a/lib/kokkos/example/grow_array/main.cpp b/lib/kokkos/example/grow_array/main.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..4693aa3af47676957ffb11468550b2dadc7fa748
--- /dev/null
+++ b/lib/kokkos/example/grow_array/main.cpp
@@ -0,0 +1,110 @@
+/*
+//@HEADER
+// ************************************************************************
+// 
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+// 
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+// 
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact  H. Carter Edwards (hcedwar@sandia.gov)
+// 
+// ************************************************************************
+//@HEADER
+*/
+
+#include <iostream>
+#include <sstream>
+
+#include <Kokkos_Core.hpp>
+
+#include <grow_array.hpp>
+
+//----------------------------------------------------------------------------
+//----------------------------------------------------------------------------
+
+// Entry point of the grow_array example.
+//
+// Runs Example::grow_array with a 1000000-entry array over a search span of
+// 100000000 on every execution space enabled at compile time, initializing
+// and finalizing each space around its run.  Command line arguments are
+// ignored.  Always returns 0.
+int main( int argc , char ** argv )
+{
+  const int array_length = 1000000 ;
+  const int search_span  = 100000000 ;
+
+  // Defaults used when hwloc cannot report the host topology.
+  int numa_count     = 1 ;
+  int cores_per_numa = 1 ;
+  int thread_count   = 4 ;
+
+  if ( Kokkos::hwloc::available() ) {
+    numa_count     = Kokkos::hwloc::get_available_numa_count();
+    // One core fewer per NUMA region than hwloc reports as available.
+    cores_per_numa = Kokkos::hwloc::get_available_cores_per_numa() - 1 ;
+    thread_count   = numa_count * cores_per_numa * Kokkos::hwloc::get_available_threads_per_core();
+  }
+
+#if defined( KOKKOS_HAVE_SERIAL )
+  {
+    std::cout << "Kokkos::Serial" << std::endl ;
+    // The Serial device accepts these arguments, though it may ignore them.
+    Kokkos::Serial::initialize( thread_count , numa_count , cores_per_numa );
+    Example::grow_array< Kokkos::Serial >( array_length , search_span );
+    Kokkos::Serial::finalize ();
+  }
+#endif // defined( KOKKOS_HAVE_SERIAL )
+
+#if defined( KOKKOS_HAVE_PTHREAD )
+  {
+    std::cout << "Kokkos::Threads" << std::endl ;
+    Kokkos::Threads::initialize( thread_count , numa_count , cores_per_numa );
+    Example::grow_array< Kokkos::Threads >( array_length , search_span );
+    Kokkos::Threads::finalize();
+  }
+#endif // defined( KOKKOS_HAVE_PTHREAD )
+
+#if defined( KOKKOS_HAVE_OPENMP )
+  {
+    std::cout << "Kokkos::OpenMP" << std::endl ;
+    Kokkos::OpenMP::initialize( thread_count , numa_count , cores_per_numa );
+    Example::grow_array< Kokkos::OpenMP >( array_length , search_span );
+    Kokkos::OpenMP::finalize();
+  }
+#endif // defined( KOKKOS_HAVE_OPENMP )
+
+#if defined( KOKKOS_HAVE_CUDA )
+  {
+    std::cout << "Kokkos::Cuda" << std::endl ;
+    // The host execution space is brought up (with one thread) before,
+    // and torn down after, the Cuda space.
+    Kokkos::HostSpace::execution_space::initialize(1);
+    Kokkos::Cuda::initialize();
+    Example::grow_array< Kokkos::Cuda >( array_length , search_span );
+    Kokkos::Cuda::finalize();
+    Kokkos::HostSpace::execution_space::finalize();
+  }
+#endif // defined( KOKKOS_HAVE_CUDA )
+
+  return 0 ;
+}
+
diff --git a/lib/kokkos/example/ichol/Makefile b/lib/kokkos/example/ichol/Makefile
new file mode 100644
index 0000000000000000000000000000000000000000..57e972f042d94c337e8d6b73fffcec2e0d40ad90
--- /dev/null
+++ b/lib/kokkos/example/ichol/Makefile
@@ -0,0 +1,63 @@
+# Builds the ichol examples, which link against SCOTCH; SCOTCH_PATH may be set in the environment or on the command line.
+SCOTCH_PATH ?= /home/hcedwar/scotch/6.0.0
+KOKKOS_PATH = ../..
+
+vpath %.cpp ${KOKKOS_PATH}/example/ichol/src ${KOKKOS_PATH}/example/ichol/example
+
+EXAMPLE_HEADERS = $(wildcard $(KOKKOS_PATH)/example/ichol/src/*.hpp ${KOKKOS_PATH}/example/ichol/example/*.hpp )
+
+default: build_all
+	echo "End Build"
+
+include $(KOKKOS_PATH)/Makefile.kokkos
+
+ifeq ($(KOKKOS_INTERNAL_USE_CUDA), 1)
+	CXX = $(NVCC_WRAPPER)
+	CXXFLAGS ?= -O3
+	LINK = $(CXX)
+	LDFLAGS ?= -lpthread
+else
+	CXX ?= g++
+	CXXFLAGS ?= -O3
+	LINK ?= $(CXX)
+	LDFLAGS ?= -lpthread
+endif
+
+KOKKOS_CXXFLAGS +=	\
+	-I${KOKKOS_PATH}/example/ichol/src	\
+	-I${KOKKOS_PATH}/example/ichol/example	\
+	-I${SCOTCH_PATH}/include
+
+EXE_EXAMPLE_ICHOL_THREADS = KokkosExample_ichol_threads
+OBJ_EXAMPLE_ICHOL_THREADS = example_chol_performance_device_pthread.o
+
+EXE_EXAMPLE_ICHOL_CUDA = KokkosExample_ichol_cuda
+OBJ_EXAMPLE_ICHOL_CUDA = example_chol_performance_device_cuda.o
+
+TARGETS = $(EXE_EXAMPLE_ICHOL_THREADS) $(EXE_EXAMPLE_ICHOL_CUDA)
+
+.PHONY: default build_all test clean
+
+$(EXE_EXAMPLE_ICHOL_THREADS) : $(OBJ_EXAMPLE_ICHOL_THREADS) $(KOKKOS_LINK_DEPENDS)
+	$(LINK) $(KOKKOS_LDFLAGS) $(LDFLAGS) $(EXTRA_PATH) \
+	$(OBJ_EXAMPLE_ICHOL_THREADS) $(KOKKOS_LIBS) $(LIB) \
+	-L${SCOTCH_PATH}/lib -lscotch  -lscotcherr  -lscotcherrexit \
+	-o $(EXE_EXAMPLE_ICHOL_THREADS)
+
+$(EXE_EXAMPLE_ICHOL_CUDA) : $(OBJ_EXAMPLE_ICHOL_CUDA) $(KOKKOS_LINK_DEPENDS)
+	$(LINK) $(KOKKOS_LDFLAGS) $(LDFLAGS) $(EXTRA_PATH) \
+	$(OBJ_EXAMPLE_ICHOL_CUDA) $(KOKKOS_LIBS) $(LIB) \
+	-L${SCOTCH_PATH}/lib -lscotch  -lscotcherr  -lscotcherrexit \
+	-o $(EXE_EXAMPLE_ICHOL_CUDA)
+
+build_all : $(TARGETS)
+
+test : build_all
+
+clean: kokkos-clean
+	rm -f *.o $(TARGETS)
+
+# Compilation rules: every object also depends on all example headers, so a header edit rebuilds everything.
+
+%.o:%.cpp $(KOKKOS_CPP_DEPENDS) $(EXAMPLE_HEADERS)
+	$(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) $(EXTRA_INC) -c $<
diff --git a/lib/kokkos/example/ichol/example/example_chol_performance_device.hpp b/lib/kokkos/example/ichol/example/example_chol_performance_device.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..ca819e4f97028eb0782c7e6c5638945d40f7597b
--- /dev/null
+++ b/lib/kokkos/example/ichol/example/example_chol_performance_device.hpp
@@ -0,0 +1,240 @@
+#pragma once
+#ifndef __EXAMPLE_CHOL_PERFORMANCE_DEVICE_HPP__
+#define __EXAMPLE_CHOL_PERFORMANCE_DEVICE_HPP__
+
+#include <Kokkos_Core.hpp>
+#include <impl/Kokkos_Timer.hpp>
+
+#include "util.hpp"
+
+#include "crs_matrix_base.hpp"
+#include "crs_matrix_view.hpp"
+#include "crs_row_view.hpp"
+
+#include "graph_helper_scotch.hpp"
+#include "symbolic_factor_helper.hpp"
+#include "crs_matrix_helper.hpp"
+
+#include "task_view.hpp"
+
+#include "task_factory.hpp"
+
+#include "chol.hpp"
+
+namespace Tacho {
+
+  using namespace std;
+  /// \brief Imports a matrix file, reorders and symbolically factors it on the host, then runs the by-blocks Cholesky task on SpaceType while printing timings; returns 0, or 1 when the input file cannot be opened.
+  template<typename ValueType,
+           typename OrdinalType,
+           typename SizeType = OrdinalType,
+           typename SpaceType = void>
+  int exampleCholPerformanceDevice(const string file_input,        // path of the matrix file opened with ifstream
+                                   const int treecut,              // forwarded to the graph helper's computeOrdering()
+                                   const int prunecut,             // forwarded to the graph helper's pruneTree()
+                                   const int seed,                 // forwarded to the graph helper constructor
+                                   const int nthreads,             // not read inside this function
+                                   const int max_task_dependence,  // forwarded to the task policy constructor
+                                   const int max_concurrency,      // forwarded to the task policy constructor
+                                   const int team_size,            // forwarded to the task policy constructor
+                                   const int fill_level,           // forwarded to createNonZeroPattern()
+                                   const int league_size,          // forwarded to the symbolic factor helper constructor
+                                   const bool skip_serial,         // not read inside this function
+                                   const bool verbose) {           // when true, dumps helpers and matrices via showMe()
+    typedef ValueType   value_type;
+    typedef OrdinalType ordinal_type;
+    typedef SizeType    size_type;
+    typedef typename
+       Kokkos::Impl::is_space< SpaceType >::host_mirror_space::execution_space
+         HostSpaceType ;
+    // Task factory bound to the task policy and the int-valued future type of SpaceType.
+    typedef TaskFactory<Kokkos::Experimental::TaskPolicy<SpaceType>,
+      Kokkos::Experimental::Future<int,SpaceType> > TaskFactoryType;
+    // Flat (scalar-valued) matrix types: one in SpaceType, one in its host mirror space.
+    typedef CrsMatrixBase<value_type,ordinal_type,size_type,SpaceType>
+      CrsMatrixBaseType;
+
+    typedef CrsMatrixBase<value_type,ordinal_type,size_type,HostSpaceType>
+      CrsMatrixBaseHostType;
+
+    typedef Kokkos::MemoryUnmanaged MemoryUnmanaged ;
+
+    typedef CrsMatrixBase<value_type,ordinal_type,size_type,SpaceType,MemoryUnmanaged >
+      CrsMatrixNestedType;
+
+    // Host-side helpers used for reordering and symbolic factorization.
+    typedef GraphHelper_Scotch<CrsMatrixBaseHostType> GraphHelperType;
+    typedef SymbolicFactorHelper<CrsMatrixBaseHostType> SymbolicFactorHelperType;
+
+    typedef CrsMatrixView<CrsMatrixNestedType> CrsMatrixViewType;
+    typedef TaskView<CrsMatrixViewType,TaskFactoryType> CrsTaskViewType;
+    // Hierarchical matrix: a CrsMatrixBase whose entries are task views of blocks.
+    typedef CrsMatrixBase<CrsTaskViewType,ordinal_type,size_type,SpaceType> CrsHierMatrixBaseType;
+
+    typedef CrsMatrixView<CrsHierMatrixBaseType> CrsHierMatrixViewType;
+    typedef TaskView<CrsHierMatrixViewType,TaskFactoryType> CrsHierTaskViewType;
+
+    int r_val = 0;
+    // One timer is reset at the start of each stage; the t_* variables hold the seconds per stage.
+    Kokkos::Timer timer;
+    double
+      t_import = 0.0,
+      t_reorder = 0.0,
+      t_symbolic = 0.0,
+      t_flat2hier = 0.0,
+      t_factor_task = 0.0;
+    // Stage 1: import the input matrix into the host matrix AA.
+    cout << "CholPerformanceDevice:: import input file = " << file_input << endl;
+    CrsMatrixBaseHostType AA("AA");
+    {
+      timer.reset();
+
+      ifstream in;
+      in.open(file_input);
+      if (!in.good()) {
+        cout << "Failed in open the file: " << file_input << endl;
+        return ++r_val;
+      }
+      AA.importMatrixMarket(in);
+
+      t_import = timer.seconds();
+
+      if (verbose) {
+        AA.showMe( std::cout );
+        std::cout << endl;
+      }
+    }
+    cout << "CholPerformanceDevice:: import input file::time = " << t_import << endl;
+    // Stage 2: reorder AA into PA using the Scotch graph helper.
+    cout << "CholPerformanceDevice:: reorder the matrix" << endl;
+    CrsMatrixBaseHostType PA("Permuted AA");
+
+    // '*_UU' is the permuted base upper triangular matrix
+    CrsMatrixBaseHostType host_UU("host_UU");
+    CrsMatrixBaseType     device_UU("UU");
+    CrsHierMatrixBaseType device_HU("HU");;
+
+    // typename CrsMatrixBaseHostType host_UU("host_UU");
+    // The graph helper S is scoped to this block because stages 2 through 4 all use it.
+    {
+      typename GraphHelperType::size_type_array rptr("Graph::RowPtrArray", AA.NumRows() + 1);
+      typename GraphHelperType::ordinal_type_array cidx("Graph::ColIndexArray", AA.NumNonZeros());
+
+      AA.convertGraph(rptr, cidx);
+      GraphHelperType S("ScotchHelper",
+                        AA.NumRows(),
+                        rptr,
+                        cidx,
+                        seed);
+      {
+        timer.reset();
+
+        S.computeOrdering(treecut, 0);
+        S.pruneTree(prunecut);
+
+        PA.copy(S.PermVector(), S.InvPermVector(), AA);
+
+        t_reorder = timer.seconds();
+
+        if (verbose) {
+          S.showMe( std::cout );
+          std::cout << std::endl ;
+          PA.showMe( std::cout );
+          std::cout << std::endl ;
+        }
+      }
+
+      // Symbolic factorization adds non-zero entries
+      // for factorization levels.
+      // Runs on the host process and currently requires std::sort.
+      // Stage 3: the reorder time is reported first, then the symbolic pattern is built into host_UU.
+      cout << "CholPerformanceDevice:: reorder the matrix::time = " << t_reorder << endl;
+      {
+        SymbolicFactorHelperType F(PA, league_size);
+        timer.reset();
+        F.createNonZeroPattern(fill_level, Uplo::Upper, host_UU);
+        t_symbolic = timer.seconds();
+        cout << "CholPerformanceDevice:: AA (nnz) = " << AA.NumNonZeros() << ", host_UU (nnz) = " << host_UU.NumNonZeros() << endl;
+
+        if (verbose) {
+          F.showMe( std::cout );
+          std::cout << std::endl ;
+          host_UU.showMe( std::cout );
+          std::cout << std::endl ;
+        }
+      }
+      cout << "CholPerformanceDevice:: symbolic factorization::time = " << t_symbolic << endl;
+
+    //----------------------------------------------------------------------
+    // Allocate device_UU conformal to host_UU
+    // and deep_copy host_UU arrays to device_UU arrays.
+    // Set up device_HU referencing blocks of device_UU
+
+      {
+        timer.reset();
+
+        device_UU.copy( host_UU );
+
+        CrsMatrixHelper::flat2hier(Uplo::Upper, device_UU, device_HU,
+                                   S.NumBlocks(),
+                                   S.RangeVector(),
+                                   S.TreeVector());
+
+        // Filling non-zero block matrixes' row ranges within block view.
+        // This is performed entirely in the 'device_HU' space.
+
+        CrsMatrixHelper::fillRowViewArray( device_HU );
+
+        t_flat2hier = timer.seconds();
+
+        cout << "CholPerformanceDevice:: Hier (dof, nnz) = " << device_HU.NumRows() << ", " << device_HU.NumNonZeros() << endl;
+      }
+      cout << "CholPerformanceDevice:: copy base matrix and construct hierarchical matrix::time = " << t_flat2hier << endl;
+    }
+
+    cout << "CholPerformanceDevice:: max concurrency = " << max_concurrency << endl;
+    // NOTE(review): the 4*sizeof(...)+128 task-size bound is a heuristic whose derivation is not visible here; confirm.
+    const size_t max_task_size = 4*sizeof(CrsTaskViewType)+128;
+    cout << "CholPerformanceDevice:: max task size   = " << max_task_size << endl;
+
+    //----------------------------------------------------------------------
+    // From here onward all work is on the device.
+    //----------------------------------------------------------------------
+
+    {
+      typename TaskFactoryType::policy_type policy(max_concurrency,
+                                                   max_task_size,
+                                                   max_task_dependence,
+                                                   team_size);
+
+      cout << "CholPerformanceDevice:: ByBlocks factorize the matrix:: team_size = " << team_size << endl;
+      CrsHierTaskViewType H( device_HU );
+      {
+        timer.reset();
+        {
+          // auto future = policy.proc_create_team(Chol<Uplo::Upper,AlgoChol::ByBlocks>::
+          auto future = policy.proc_create_team(Chol<Uplo::Upper,AlgoChol::ByBlocks,Variant::Two>::
+                                                TaskFunctor<CrsHierTaskViewType>(policy,H), 0);
+          policy.spawn(future);
+          Kokkos::Experimental::wait(policy);
+        }
+        t_factor_task += timer.seconds();
+
+        cout << "CholPerformanceDevice:: policy.allocated_task_count = "
+             << policy.allocated_task_count()
+             << endl ;
+
+        if (verbose) {
+          host_UU.copy( device_UU );
+          host_UU.showMe( std::cout );
+          std::cout << endl;
+        }
+      }
+      cout << "CholPerformanceDevice:: ByBlocks factorize the matrix::time = " << t_factor_task << endl;
+    }
+
+    return r_val;
+  }
+}
+
+#endif
diff --git a/lib/kokkos/example/ichol/example/example_chol_performance_device_cuda.cpp b/lib/kokkos/example/ichol/example/example_chol_performance_device_cuda.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..3a0df586b5af15a9c56582d216ecac6e5221853d
--- /dev/null
+++ b/lib/kokkos/example/ichol/example/example_chol_performance_device_cuda.cpp
@@ -0,0 +1,70 @@
+#include <Kokkos_Core.hpp>
+
+#include <Cuda/Kokkos_Cuda_TaskPolicy.hpp>
+
+using namespace std;
+
+typedef double value_type;
+typedef int    ordinal_type;
+typedef int    size_type;
+
+#include "example_chol_performance_device.hpp"
+
+using namespace Tacho;
+
+int main (int argc, char *argv[]) {
+  // Parses "--option value" arguments over the defaults below, then runs the example on Kokkos::Cuda; returns its status, or 1 on a malformed command line.
+  string file_input = "test.mtx";
+  int nthreads = 1;
+  int max_task_dependence = 3;
+  int max_concurrency = 1024;
+  int team_size = 1;
+  int fill_level = 0;
+  int treecut = 0;
+  int prunecut = 0;
+  int seed = 0;
+  int league_size = 1;
+  bool verbose = false;
+  for (int i=1;i<argc;++i) {  // argv[0] is the program name, not an option
+    const char * const arg = argv[i];
+    if (strcmp(arg,"--enable-verbose")==0) { verbose = true; continue; }
+    int * ival = nullptr;  // destination when arg names an integer-valued option
+    if      (strcmp(arg,"--nthreads")           ==0) ival = &nthreads;
+    else if (strcmp(arg,"--max-task-dependence")==0) ival = &max_task_dependence;
+    else if (strcmp(arg,"--max-concurrency")    ==0) ival = &max_concurrency;
+    else if (strcmp(arg,"--team-size")          ==0) ival = &team_size;
+    else if (strcmp(arg,"--fill-level")         ==0) ival = &fill_level;
+    else if (strcmp(arg,"--league-size")        ==0) ival = &league_size;
+    else if (strcmp(arg,"--treecut")            ==0) ival = &treecut;
+    else if (strcmp(arg,"--prunecut")           ==0) ival = &prunecut;
+    else if (strcmp(arg,"--seed")               ==0) ival = &seed;
+    else if (strcmp(arg,"--file-input")         !=0) continue;  // unrecognized argument: ignored, as before
+    if (i+1 >= argc) { cerr << "Missing value for option " << arg << endl; return 1; }  // argv[argc] is null: never parse it
+    if (ival) *ival = atoi(argv[++i]); else file_input = argv[++i];
+  }
+
+  int r_val = 0;
+  {
+    typedef Kokkos::Cuda exec_space;
+    Kokkos::DefaultHostExecutionSpace::initialize(nthreads);  // host space first, finalized last
+    exec_space::initialize();
+    exec_space::print_configuration(cout, true);
+
+    r_val = exampleCholPerformanceDevice
+      <value_type,ordinal_type,size_type,exec_space>
+      (file_input,
+       treecut,
+       prunecut,
+       seed,
+       nthreads,
+       max_task_dependence, max_concurrency, team_size,
+       fill_level, league_size,
+       (nthreads != 1), // skip_serial
+       verbose);
+
+    exec_space::finalize();
+    Kokkos::DefaultHostExecutionSpace::finalize();
+  }
+
+  return r_val;
+}
diff --git a/lib/kokkos/example/ichol/example/example_chol_performance_device_pthread.cpp b/lib/kokkos/example/ichol/example/example_chol_performance_device_pthread.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..68f520cf6620888c2a8de2f8cabe06a5b9e8b607
--- /dev/null
+++ b/lib/kokkos/example/ichol/example/example_chol_performance_device_pthread.cpp
@@ -0,0 +1,67 @@
+#include <Kokkos_Core.hpp>
+
+#include <Kokkos_Threads.hpp>
+#include <Threads/Kokkos_Threads_TaskPolicy.hpp>
+
+using namespace std;
+
+typedef double value_type;
+typedef int    ordinal_type;
+typedef int    size_type;
+
+typedef Kokkos::Threads exec_space;
+
+#include "example_chol_performance_device.hpp"
+
+using namespace Tacho;
+
+int main (int argc, char *argv[]) {
+  // Parses "--option value" arguments over the defaults below, then runs the example on Kokkos::Threads; returns its status, or 1 on a malformed command line.
+  string file_input = "test.mtx";
+  int nthreads = 1;
+  int max_task_dependence = 3;
+  int max_concurrency = 1024;
+  int team_size = 1;
+  int fill_level = 0;
+  int treecut = 0;
+  int prunecut = 0;
+  int seed = 0;
+  int league_size = 1;
+  bool verbose = false;
+  for (int i=1;i<argc;++i) {  // argv[0] is the program name, not an option
+    const char * const arg = argv[i];
+    if (strcmp(arg,"--enable-verbose")==0) { verbose = true; continue; }
+    int * ival = nullptr;  // destination when arg names an integer-valued option
+    if      (strcmp(arg,"--nthreads")           ==0) ival = &nthreads;
+    else if (strcmp(arg,"--max-task-dependence")==0) ival = &max_task_dependence;
+    else if (strcmp(arg,"--max-concurrency")    ==0) ival = &max_concurrency;
+    else if (strcmp(arg,"--team-size")          ==0) ival = &team_size;
+    else if (strcmp(arg,"--fill-level")         ==0) ival = &fill_level;
+    else if (strcmp(arg,"--league-size")        ==0) ival = &league_size;
+    else if (strcmp(arg,"--treecut")            ==0) ival = &treecut;
+    else if (strcmp(arg,"--prunecut")           ==0) ival = &prunecut;
+    else if (strcmp(arg,"--seed")               ==0) ival = &seed;
+    else if (strcmp(arg,"--file-input")         !=0) continue;  // unrecognized argument: ignored, as before
+    if (i+1 >= argc) { cerr << "Missing value for option " << arg << endl; return 1; }  // argv[argc] is null: never parse it
+    if (ival) *ival = atoi(argv[++i]); else file_input = argv[++i];
+  }
+
+  int r_val = 0;
+  {
+    exec_space::initialize(nthreads);
+    exec_space::print_configuration(cout, true);
+    r_val = exampleCholPerformanceDevice
+      <value_type,ordinal_type,size_type,exec_space>
+      (file_input,
+       treecut,
+       prunecut,
+       seed,
+       nthreads,
+       max_task_dependence, max_concurrency, team_size,
+       fill_level, league_size,
+       (nthreads != 1), // skip_serial
+       verbose);
+    exec_space::finalize();
+  }
+  return r_val;
+}
diff --git a/lib/kokkos/example/ichol/src/chol.hpp b/lib/kokkos/example/ichol/src/chol.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..e8aa4e9189ffb607c91cc2b86811084b69a45393
--- /dev/null
+++ b/lib/kokkos/example/ichol/src/chol.hpp
@@ -0,0 +1,92 @@
+#pragma once
+#ifndef __CHOL_HPP__
+#define __CHOL_HPP__
+
+/// \file chol.hpp
+/// \brief Incomplete Cholesky factorization front interface.
+/// \author Kyungjoo Kim (kyukim@sandia.gov)
+
+#include "util.hpp"
+#include "control.hpp"
+#include "partition.hpp"
+
+namespace Tacho {
+
+  using namespace std;
+
+  // tasking interface
+  // * default behavior is for non-by-blocks tasks
+  // * control is only used for by-blocks algorithms
+  // ===============================================
+  template<int ArgUplo, int ArgAlgo,
+           int ArgVariant = Variant::One,
+           template<int,int> class ControlType = Control>
+  class Chol {
+  public:
+    // invoke() is only declared here; definitions come from specializations in the algorithm headers included at the end of this file.
+    // function interface
+    // ==================
+    template<typename ExecViewType>
+    KOKKOS_INLINE_FUNCTION
+    static int invoke(typename ExecViewType::policy_type &policy,
+                      const typename ExecViewType::policy_type::member_type &member,
+                      typename ExecViewType::matrix_type &A);
+
+    // task-data parallel interface
+    // ============================
+    template<typename ExecViewType>
+    class TaskFunctor {
+    public:
+      typedef typename ExecViewType::policy_type policy_type;
+      typedef typename policy_type::member_type member_type;
+      typedef int value_type;
+      // The functor stores its own copies of the matrix view and of the policy.
+    private:
+      typename ExecViewType::matrix_type _A;
+
+      policy_type _policy;
+
+    public:
+      KOKKOS_INLINE_FUNCTION
+      TaskFunctor(const policy_type & P ,
+                  const typename ExecViewType::matrix_type & A)
+        : _A(A),
+          _policy(P)
+      { }
+      // Label() is not marked KOKKOS_INLINE_FUNCTION, unlike the other members.
+      string Label() const { return "Chol"; }
+
+      // task execution
+      KOKKOS_INLINE_FUNCTION
+      void apply(value_type &r_val) {
+        r_val = Chol::invoke<ExecViewType>(_policy, _policy.member_single(), _A);
+      }
+
+      // task-data execution
+      KOKKOS_INLINE_FUNCTION
+      void apply(const member_type &member, value_type &r_val) {
+        // Every team member calls invoke(); only team rank 0 stores the result.
+        const int result = Chol::invoke<ExecViewType>(_policy, member, _A);
+
+        if ( 0 == member.team_rank() ) { r_val = result ; }
+
+      }
+
+    };
+
+  };
+}
+
+
+// unblocked version blas operations
+#include "scale.hpp"
+
+// blocked version blas operations
+#include "gemm.hpp"
+#include "trsm.hpp"
+#include "herk.hpp"
+
+// cholesky
+#include "chol_u.hpp"
+
+#endif
diff --git a/lib/kokkos/example/ichol/src/chol_u.hpp b/lib/kokkos/example/ichol/src/chol_u.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..0465ef8f35c8574189c767b6f97dfc7a6344f2cb
--- /dev/null
+++ b/lib/kokkos/example/ichol/src/chol_u.hpp
@@ -0,0 +1,23 @@
+#pragma once
+#ifndef __CHOL_U_HPP__
+#define __CHOL_U_HPP__
+
+/// \file chol_u.hpp
+/// \brief Upper Cholesky factorization variations
+/// \author Kyungjoo Kim (kyukim@sandia.gov)
+
+// testing task-data parallelism
+// #include "chol_u_unblocked_dummy.hpp"
+
+// flame style implementation
+//#include "chol_unblocked.hpp"  
+//#include "chol_u_blocked.hpp"
+
+// triple for loop
+#include "chol_u_unblocked_opt1.hpp"
+#include "chol_u_unblocked_opt2.hpp"
+
+// partitioned block algorithms: see control.hpp
+#include "chol_u_right_look_by_blocks.hpp"
+
+#endif
diff --git a/lib/kokkos/example/ichol/src/chol_u_right_look_by_blocks.hpp b/lib/kokkos/example/ichol/src/chol_u_right_look_by_blocks.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..e21bafa9f1db5e9dda1a0e24f21a4552f011d27a
--- /dev/null
+++ b/lib/kokkos/example/ichol/src/chol_u_right_look_by_blocks.hpp
@@ -0,0 +1,394 @@
+#pragma once
+#ifndef __CHOL_U_RIGHT_LOOK_BY_BLOCKS_HPP__
+#define __CHOL_U_RIGHT_LOOK_BY_BLOCKS_HPP__
+
+/// \file chol_u_right_look_by_blocks.hpp
+/// \brief Cholesky factorization by-blocks
+/// \author Kyungjoo Kim (kyukim@sandia.gov)
+
+/// The Partitioned-Block Matrix (PBM) is sparse and a block itself is a view of a sparse matrix. 
+/// The algorithm generates tasks with a given sparse block matrix structure.
+
+// basic utils
+#include "util.hpp"
+#include "control.hpp"
+#include "partition.hpp"
+
+namespace Tacho { 
+  
+  using namespace std;
+
+  /// Replaces the future of every entry in row 0 of A with a default-constructed one; returns the entry count.
+  template< typename CrsTaskViewType >
+  KOKKOS_INLINE_FUNCTION
+  int releaseFutures( typename CrsTaskViewType::matrix_type & A )
+  {
+    typedef typename CrsTaskViewType::ordinal_type   index_t ;
+    typedef typename CrsTaskViewType::row_view_type  row_t ;
+    typedef typename CrsTaskViewType::future_type    future_t ;
+
+    row_t first_row( A , 0 );
+    const index_t count = first_row.NumNonZeros();
+    index_t k = 0 ;
+    while ( k < count ) {
+      first_row.Value(k).setFuture( future_t() );
+      ++k ;
+    }
+    return count ;
+  }
+  
+  // ========================================
+  // detailed workflow of by-blocks algorithm
+  // ========================================
+  template<int ArgVariant,
+           template<int,int> class ControlType,
+           typename CrsTaskViewType>
+  class CholUpperRightLookByBlocks {  // static task generators used by one step of the by-blocks factorization
+  public:
+    KOKKOS_INLINE_FUNCTION
+    static int genScalarTask(typename CrsTaskViewType::policy_type &policy,
+                             typename CrsTaskViewType::matrix_type &A) {
+      typedef typename CrsTaskViewType::value_type        value_type;
+      typedef typename CrsTaskViewType::row_view_type     row_view_type;
+      // Creates one Chol task on the first entry of row 0 of A, chains it after that entry's current future, and spawns it. Returns 1.
+      typedef typename CrsTaskViewType::future_type       future_type;
+      typedef typename CrsTaskViewType::task_factory_type task_factory_type;
+
+      row_view_type a(A, 0);
+      value_type &aa = a.Value(0);
+
+      // construct a task
+      future_type f = task_factory_type::create(policy,
+                                                typename Chol<Uplo::Upper,
+                                                CtrlDetail(ControlType,AlgoChol::ByBlocks,ArgVariant,Chol)>
+                                                ::template TaskFunctor<value_type>(policy,aa));
+
+      // Debug trace; the constant-false condition keeps it disabled.
+if ( false ) {
+ printf("Chol [%d +%d)x[%d +%d) spawn depend %d\n"
+       , aa.OffsetRows()
+       , aa.NumRows()
+       , aa.OffsetCols()
+       , aa.NumCols()
+       , int( ! aa.Future().is_null() )
+       );
+}
+
+      // manage dependence
+      task_factory_type::addDependence(policy, f, aa.Future());
+      aa.setFuture(f);
+
+      // spawn a task
+      task_factory_type::spawn(policy, f, true /* high priority */ );
+
+      return 1;
+    }
+    // Creates one Trsm task per entry of row 0 of B, each depending on the first entry of row 0 of A and on its own entry's previous future. Returns the task count.
+    KOKKOS_INLINE_FUNCTION
+    static int genTrsmTasks(typename CrsTaskViewType::policy_type &policy,
+                            typename CrsTaskViewType::matrix_type &A,
+                            typename CrsTaskViewType::matrix_type &B) {
+      typedef typename CrsTaskViewType::ordinal_type      ordinal_type;
+      typedef typename CrsTaskViewType::row_view_type     row_view_type;
+      typedef typename CrsTaskViewType::value_type        value_type;
+
+      typedef typename CrsTaskViewType::future_type       future_type;
+      typedef typename CrsTaskViewType::task_factory_type task_factory_type;
+
+      row_view_type a(A,0), b(B,0);
+      value_type &aa = a.Value(0);
+      // Disabled debug trace (runs before the loop, although its message says "after").
+if ( false ) {
+  printf("genTrsmTasks after aa.Future().reference_count = %d\n"
+        , aa.Future().reference_count());
+}
+      const ordinal_type nnz = b.NumNonZeros();
+      for (ordinal_type j=0;j<nnz;++j) {
+        typedef typename
+           Trsm< Side::Left,Uplo::Upper,Trans::ConjTranspose,
+                 CtrlDetail(ControlType,AlgoChol::ByBlocks,ArgVariant,Trsm)>
+           ::template TaskFunctor<double,value_type,value_type>
+             FunctorType ;
+
+        value_type &bb = b.Value(j);
+
+        future_type f = task_factory_type
+          ::create(policy, FunctorType(policy,Diag::NonUnit, 1.0, aa, bb));
+
+if ( false ) {
+ printf("Trsm [%d +%d)x[%d +%d) spawn depend %d %d\n"
+       , bb.OffsetRows()
+       , bb.NumRows()
+       , bb.OffsetCols()
+       , bb.NumCols()
+       , int( ! aa.Future().is_null() )
+       , int( ! bb.Future().is_null() )
+       );
+}
+
+        // trsm dependence
+        task_factory_type::addDependence(policy, f, aa.Future());
+
+        // self
+        task_factory_type::addDependence(policy, f, bb.Future());
+
+        // place task signature on b
+        bb.setFuture(f);
+
+        // spawn a task
+        task_factory_type::spawn(policy, f, true /* high priority */);
+      }
+
+if ( false ) {
+  printf("genTrsmTasks after aa.Future().reference_count = %d\n"
+        , aa.Future().reference_count());
+}
+
+      return nnz ;
+    }
+    // For each pair of entries (i, j >= i) in row 0 of A, spawns a Herk (same column) or Gemm (different column) task on the matching entry of C, if present. Returns the task count.
+    KOKKOS_INLINE_FUNCTION
+    static int genHerkTasks(typename CrsTaskViewType::policy_type &policy,
+                            typename CrsTaskViewType::matrix_type &A,
+                            typename CrsTaskViewType::matrix_type &C) {
+      typedef typename CrsTaskViewType::ordinal_type      ordinal_type;
+      typedef typename CrsTaskViewType::value_type        value_type;
+      typedef typename CrsTaskViewType::row_view_type     row_view_type;
+
+      typedef typename CrsTaskViewType::future_type       future_type;
+      typedef typename CrsTaskViewType::task_factory_type task_factory_type;
+
+      // case that X.transpose, A.no_transpose, Y.no_transpose
+
+      row_view_type a(A,0), c;
+
+      const ordinal_type nnz = a.NumNonZeros();
+      ordinal_type herk_count = 0 ;
+      ordinal_type gemm_count = 0 ;
+
+      // update herk
+      for (ordinal_type i=0;i<nnz;++i) {
+        const ordinal_type row_at_i = a.Col(i);
+        value_type &aa = a.Value(i);
+        // The column of entry i in A selects the row of C that this entry updates.
+        c.setView(C, row_at_i);
+        // NOTE(review): idx is fed back into c.Index() as its second argument, and the loop stops once it is <= -2; the meaning of the negative codes lives in the row view class - confirm there.
+        ordinal_type idx = 0;
+        for (ordinal_type j=i;j<nnz && (idx > -2);++j) {
+          const ordinal_type col_at_j = a.Col(j);
+          value_type &bb = a.Value(j);
+
+          if (row_at_i == col_at_j) {
+            idx = c.Index(row_at_i, idx);
+            if (idx >= 0) {
+              ++herk_count ;
+              value_type &cc = c.Value(idx);
+              future_type f = task_factory_type
+                ::create(policy,
+                         typename Herk<Uplo::Upper,Trans::ConjTranspose,
+                         CtrlDetail(ControlType,AlgoChol::ByBlocks,ArgVariant,Herk)>
+                         ::template TaskFunctor<double,value_type,value_type>(policy,-1.0, aa, 1.0, cc));
+
+
+if ( false ) {
+ printf("Herk [%d +%d)x[%d +%d) spawn %d %d\n"
+       , cc.OffsetRows()
+       , cc.NumRows()
+       , cc.OffsetCols()
+       , cc.NumCols()
+       , int( ! aa.Future().is_null() )
+       , int( ! cc.Future().is_null() )
+       );
+}
+
+              // dependence
+              task_factory_type::addDependence(policy, f, aa.Future());
+
+              // self
+              task_factory_type::addDependence(policy, f, cc.Future());
+
+              // place task signature on cc
+              cc.setFuture(f);
+
+              // spawn a task
+              task_factory_type::spawn(policy, f);
+            }
+          } else {
+            idx = c.Index(col_at_j, idx);
+            if (idx >= 0) {
+              ++gemm_count ;
+              value_type &cc = c.Value(idx);
+              future_type f = task_factory_type
+                ::create(policy,
+                         typename Gemm<Trans::ConjTranspose,Trans::NoTranspose,
+                         CtrlDetail(ControlType,AlgoChol::ByBlocks,ArgVariant,Gemm)>
+                         ::template TaskFunctor<double,value_type,value_type,value_type>(policy,-1.0, aa, bb, 1.0, cc));
+
+
+if ( false ) {
+ printf("Gemm [%d +%d)x[%d +%d) spawn %d %d %d\n"
+       , cc.OffsetRows()
+       , cc.NumRows()
+       , cc.OffsetCols()
+       , cc.NumCols()
+       , int( ! aa.Future().is_null() )
+       , int( ! bb.Future().is_null() )
+       , int( ! cc.Future().is_null() )
+       );
+}
+
+              // dependence
+              task_factory_type::addDependence(policy, f, aa.Future());
+              task_factory_type::addDependence(policy, f, bb.Future());
+
+              // self
+              task_factory_type::addDependence(policy, f, cc.Future());
+
+              // place task signature on cc
+              cc.setFuture(f);
+
+              // spawn a task
+              task_factory_type::spawn(policy, f);
+            }
+          }
+        }
+      }
+
+if ( false ) {
+printf("genHerkTask Herk(%ld) Gemm(%ld)\n",(long)herk_count,(long)gemm_count);
+}
+
+      return herk_count + gemm_count ;
+    }
+
+  };
+  
+  // specialization for different task generation in right looking by-blocks algorithm
+  // =================================================================================
+  template<int ArgVariant, template<int,int> class ControlType>
+  class Chol<Uplo::Upper,AlgoChol::RightLookByBlocks,ArgVariant,ControlType> {
+  public:
+    // invoke() advances the factorization by at most CYCLE block steps starting at 'checkpoint' and returns ATL.NumRows(), which the task functor stores as the next checkpoint.
+    // function interface
+    // ==================
+    template<typename ExecViewType>
+    KOKKOS_INLINE_FUNCTION
+    static int invoke(typename ExecViewType::policy_type &policy,
+                      const typename ExecViewType::policy_type::member_type &member,
+                      typename ExecViewType::matrix_type & A,
+                      int checkpoint )
+      {
+        typedef typename ExecViewType::row_view_type  row_view_type ;
+        // Upper bound on the number of block steps performed per call.
+        enum { CYCLE = 2 };
+
+        typename ExecViewType::matrix_type
+          ATL, ATR,      A00, A01, A02,
+          ABL, ABR,      A10, A11, A12,
+                         A20, A21, A22;
+        // Start from a 2x2 partition whose top-left part is sized by the checkpoint.
+        Part_2x2(A,  ATL, ATR,
+                 /**/ABL, ABR,
+                 checkpoint, checkpoint, Partition::TopLeft);
+        // These counters are only read by the disabled debug trace below.
+        int tasks_spawned = 0 ;
+        int futures_released = 0 ;
+
+        for ( int i = 0 ; i < CYCLE && ATL.NumRows() < A.NumRows() ; ++i ) {
+          Part_2x2_to_3x3(ATL, ATR, /**/  A00, A01, A02,
+                          /*******/ /**/  A10, A11, A12,
+                          ABL, ABR, /**/  A20, A21, A22,
+                          1, 1, Partition::BottomRight);
+          // -----------------------------------------------------
+          // Spawning tasks:
+
+          // A11 = chol(A11) : #task = 1
+          tasks_spawned +=
+          CholUpperRightLookByBlocks<ArgVariant,ControlType,ExecViewType>
+            ::genScalarTask(policy, A11);
+
+          // A12 = inv(triu(A11)') * A12 : #tasks = non-zero row blocks
+          tasks_spawned +=
+          CholUpperRightLookByBlocks<ArgVariant,ControlType,ExecViewType>
+            ::genTrsmTasks(policy, A11, A12);
+
+          // A22 = A22 - A12' * A12 : #tasks = highly variable
+          tasks_spawned +=
+          CholUpperRightLookByBlocks<ArgVariant,ControlType,ExecViewType>
+            ::genHerkTasks(policy, A12, A22);
+
+          // -----------------------------------------------------
+          // Can release futures of A11 and A12
+
+          futures_released += releaseFutures<ExecViewType>( A11 );
+          futures_released += releaseFutures<ExecViewType>( A12 );
+
+if ( false ) {
+  printf("Chol iteration(%d) task_count(%d) cumulative: spawn(%d) release(%d)\n"
+        , int(ATL.NumRows())
+        , policy.allocated_task_count()
+        , tasks_spawned , futures_released
+        );
+}
+
+          // -----------------------------------------------------
+          Merge_3x3_to_2x2(A00, A01, A02, /**/ ATL, ATR,
+                           A10, A11, A12, /**/ /******/
+                           A20, A21, A22, /**/ ABL, ABR,
+                           Partition::TopLeft);
+
+        }
+
+      return ATL.NumRows();
+    }
+
+    // task-data parallel interface
+    // ============================
+    template<typename ExecViewType>
+    class TaskFunctor {
+    public:
+      typedef typename ExecViewType::policy_type  policy_type;
+      typedef typename ExecViewType::future_type  future_type;
+      typedef typename policy_type::member_type   member_type;
+      typedef int value_type;
+
+    private:
+      typename ExecViewType::matrix_type _A;
+
+      policy_type _policy;
+      int         _checkpoint ;  // value returned by the last invoke(); starts at 0
+
+    public:
+      KOKKOS_INLINE_FUNCTION
+      TaskFunctor(const policy_type & P ,
+                  const typename ExecViewType::matrix_type & A)
+        : _A(A),
+          _policy(P),
+          _checkpoint(0)
+      { }
+
+      string Label() const { return "Chol"; }
+
+      // task-data execution
+      KOKKOS_INLINE_FUNCTION
+      void apply(const member_type &member, value_type &r_val)
+      {
+        if (member.team_rank() == 0) {
+          // Clear out previous dependence
+          _policy.clear_dependence( this );
+          // Do a bounded amount of work, remembering how far the factorization has advanced.
+          _checkpoint = Chol::invoke<ExecViewType>(_policy, member, _A, _checkpoint);
+          // Rows remain: ask the policy to run this same task again.
+          if ( _checkpoint < _A.NumRows() ) _policy.respawn_needing_memory(this);
+
+          r_val = 0 ;
+        }
+        return ;
+      }
+
+    };
+
+  };
+}
+
+#endif
diff --git a/lib/kokkos/example/ichol/src/chol_u_unblocked_opt1.hpp b/lib/kokkos/example/ichol/src/chol_u_unblocked_opt1.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..3bb99c71424f491bbb5bea712475fcac116ad24e
--- /dev/null
+++ b/lib/kokkos/example/ichol/src/chol_u_unblocked_opt1.hpp
@@ -0,0 +1,90 @@
+#pragma once
+#ifndef __CHOL_U_UNBLOCKED_OPT1_HPP__
+#define __CHOL_U_UNBLOCKED_OPT1_HPP__
+
+/// \file chol_u_unblocked_opt1.hpp
+/// \brief Unblocked incomplete Cholesky factorization.
+/// \author Kyungjoo Kim (kyukim@sandia.gov)
+
+#include "util.hpp"
+#include "partition.hpp"
+
+namespace Tacho {
+
+  using namespace std;
+
+  template<>
+  template<typename CrsExecViewType>
+  KOKKOS_INLINE_FUNCTION
+  int
+  Chol<Uplo::Upper,AlgoChol::UnblockedOpt,Variant::One>
+  ::invoke(typename CrsExecViewType::policy_type &policy,
+           const typename CrsExecViewType::policy_type::member_type &member,
+           typename CrsExecViewType::matrix_type &A) {
+    // Factors A in place, one row per iteration. Returns 0, or -(k+1) on team rank 0 when row k has a bad diagonal.
+    typedef typename CrsExecViewType::value_type        value_type;
+    typedef typename CrsExecViewType::ordinal_type      ordinal_type;
+    typedef typename CrsExecViewType::row_view_type     row_view_type;
+
+    // row_view_type r1t, r2t;
+
+    for (ordinal_type k=0;k<A.NumRows();++k) {
+      //r1t.setView(A, k);
+      row_view_type &r1t = A.RowView(k);
+
+      // alpha refers to the first stored entry of row k (checked below to sit on the diagonal)
+      value_type &alpha = r1t.Value(0);
+
+      if (member.team_rank() == 0) {
+        // on a zero diagonal, or when the first column of the row is not k, return -(row + 1)
+        if (abs(alpha) == 0.0 || r1t.Col(0) != k)
+          return -(k + 1);
+        // NOTE(review): only rank 0 returns here, ahead of the team_barrier below -- confirm the other ranks cannot stall.
+        // error handling should be more carefully designed
+
+        // sqrt on diag
+        // alpha = sqrt(real(alpha));
+        alpha = sqrt(alpha);
+      }
+      member.team_barrier();
+
+      const ordinal_type nnz_r1t = r1t.NumNonZeros();
+
+      if (nnz_r1t) {
+        // inverse scale: divide entries 1 .. nnz_r1t-1 of the row by alpha
+        Kokkos::parallel_for(Kokkos::TeamThreadRange(member, 1, nnz_r1t),
+                             [&](const ordinal_type j) {
+                               r1t.Value(j) /= alpha;
+                             });
+
+        member.team_barrier();
+
+        // hermitian rank update: entry i of row k updates the row of A selected by Col(i)
+        Kokkos::parallel_for(Kokkos::TeamThreadRange(member, 1, nnz_r1t),
+                             [&](const ordinal_type i) {
+                               const ordinal_type row_at_i = r1t.Col(i);
+                               // const value_type   val_at_i = conj(r1t.Value(i));
+                               const value_type   val_at_i = r1t.Value(i);
+
+                               //r2t.setView(A, row_at_i);
+                               row_view_type &r2t = A.RowView(row_at_i);
+                               ordinal_type idx = 0;
+                               // idx is handed back to Index() as its second argument; the loop ends once idx <= -2
+                               for (ordinal_type j=i;j<nnz_r1t && (idx > -2);++j) {
+                                 const ordinal_type col_at_j = r1t.Col(j);
+                                 idx = r2t.Index(col_at_j, idx);
+                                 
+                                 if (idx >= 0) {
+                                   const value_type val_at_j = r1t.Value(j);
+                                   r2t.Value(idx) -= val_at_i*val_at_j;
+                                 }
+                               }
+                             });
+      }
+    }
+    return 0;
+  }
+
+}
+
+#endif
diff --git a/lib/kokkos/example/ichol/src/chol_u_unblocked_opt2.hpp b/lib/kokkos/example/ichol/src/chol_u_unblocked_opt2.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..e7d1dc826235120a84af25ff239fb705c65489f0
--- /dev/null
+++ b/lib/kokkos/example/ichol/src/chol_u_unblocked_opt2.hpp
@@ -0,0 +1,154 @@
+#pragma once
+#ifndef __CHOL_U_UNBLOCKED_OPT2_HPP__
+#define __CHOL_U_UNBLOCKED_OPT2_HPP__
+
+/// \file chol_u_unblocked_opt2.hpp
+/// \brief Unblocked incomplete Cholesky factorization; version for data parallel sharing L1 cache.
+/// \author Kyungjoo Kim (kyukim@sandia.gov)
+
+#include "util.hpp"
+#include "partition.hpp"
+
+namespace Tacho {
+
+  using namespace std;
+
+  template<>
+  template<typename CrsExecViewType>
+  KOKKOS_INLINE_FUNCTION
+  int
+  Chol<Uplo::Upper,AlgoChol::UnblockedOpt,Variant::Two>
+  ::invoke(typename CrsExecViewType::policy_type &policy,
+           const typename CrsExecViewType::policy_type::member_type &member,
+           typename CrsExecViewType::matrix_type &A) {
+    // Factors A in place, one row per iteration. Returns 0, or -(k+1) on team rank 0 when row k has a bad diagonal.
+    typedef typename CrsExecViewType::value_type        value_type;
+    typedef typename CrsExecViewType::ordinal_type      ordinal_type;
+    typedef typename CrsExecViewType::row_view_type     row_view_type;
+    // Debug trace, disabled: the leading `false &&` keeps the printf from ever running.
+if ( false && member.team_rank() == 0 ) {
+ printf("Chol [%d +%d)x[%d +%d) begin\n"
+       , A.OffsetRows()
+       , A.NumRows()
+       , A.OffsetCols()
+       , A.NumCols()
+       );
+}
+
+    // row_view_type r1t, r2t;
+
+    for (ordinal_type k=0;k<A.NumRows();++k) {
+      //r1t.setView(A, k);
+      row_view_type &r1t = A.RowView(k);
+
+      // alpha refers to the first stored entry of row k (checked below to sit on the diagonal)
+      value_type &alpha = r1t.Value(0);
+
+      if (member.team_rank() == 0) {
+        // on a zero diagonal, or when the first column of the row is not k, return -(row + 1)
+        if (abs(alpha) == 0.0 || r1t.Col(0) != k)
+          return -(k + 1);
+        // NOTE(review): only rank 0 returns here, ahead of the team_barrier below -- confirm the other ranks cannot stall.
+        // error handling should be more carefully designed
+
+        // sqrt on diag
+        // alpha = sqrt(real(alpha));
+        alpha = sqrt(alpha);
+      }
+      member.team_barrier();
+
+      // Debug trace, disabled (see the `false &&`).
+if ( false && member.team_rank() == 0 ) {
+ printf("Chol [%d +%d)x[%d +%d) local row %d\n"
+       , A.OffsetRows()
+       , A.NumRows()
+       , A.OffsetCols()
+       , A.NumCols()
+       , int(k)
+       );
+}
+
+
+      const ordinal_type nnz_r1t = r1t.NumNonZeros();
+
+      if (nnz_r1t) {
+        // inverse scale: divide entries 1 .. nnz_r1t-1 of the row by alpha
+        Kokkos::parallel_for(Kokkos::TeamThreadRange(member, 1, nnz_r1t),
+                             [&](const ordinal_type j) {
+                               r1t.Value(j) /= alpha;
+                             });
+
+        member.team_barrier();
+
+        // Debug trace, disabled (see the `false &&`).
+if ( false && member.team_rank() == 0 ) {
+ printf("Chol [%d +%d)x[%d +%d) local row %d nnz_r1t\n"
+       , A.OffsetRows()
+       , A.NumRows()
+       , A.OffsetCols()
+       , A.NumCols()
+       , int(k)
+       );
+}
+
+        // hermitian rank update: the loop over i is serial, the team splits the inner loop over j
+        for (ordinal_type i=1;i<nnz_r1t;++i) {
+          const ordinal_type row_at_i = r1t.Col(i);
+          // const value_type   val_at_i = conj(r1t.Value(i));
+          const value_type   val_at_i = r1t.Value(i);
+
+          //r2t.setView(A, row_at_i);
+          row_view_type &r2t = A.RowView(row_at_i);
+
+          ordinal_type member_idx = 0 ;
+          // NOTE(review): member_idx is captured by reference and rewritten inside the team loop -- confirm each member has its own copy.
+          Kokkos::parallel_for(Kokkos::TeamThreadRange(member, i, nnz_r1t),
+                               [&](const ordinal_type j) {
+                                 if (member_idx > -2) {
+                                   const ordinal_type col_at_j = r1t.Col(j);
+                                   member_idx = r2t.Index(col_at_j, member_idx);
+                                   if (member_idx >= 0) {
+                                     const value_type   val_at_j = r1t.Value(j);
+                                     r2t.Value(member_idx) -= val_at_i*val_at_j;
+                                   }
+                                 }
+                               });
+        }
+      }
+
+      // Debug trace, disabled: `if ( false )` also keeps the extra team_barrier below from running.
+if ( false ) {
+member.team_barrier();
+if ( member.team_rank() == 0 ) {
+ printf("Chol [%d +%d)x[%d +%d) local row %d end\n"
+       , A.OffsetRows()
+       , A.NumRows()
+       , A.OffsetCols()
+       , A.NumCols()
+       , int(k)
+       );
+}
+}
+
+    }
+
+    // Debug trace, disabled: `if ( false )` also keeps the extra team_barrier below from running.
+if ( false ) {
+member.team_barrier();
+if ( member.team_rank() == 0 ) {
+ printf("Chol [%d +%d)x[%d +%d) end\n"
+       , A.OffsetRows()
+       , A.NumRows()
+       , A.OffsetCols()
+       , A.NumCols()
+       );
+}
+}
+
+
+    return 0;
+  }
+
+}
+
+#endif
diff --git a/lib/kokkos/example/ichol/src/control.hpp b/lib/kokkos/example/ichol/src/control.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..bf5efef9fded8685f646d81855469f6f363b1e73
--- /dev/null
+++ b/lib/kokkos/example/ichol/src/control.hpp
@@ -0,0 +1,110 @@
+#pragma once
+#ifndef __CONTROL_HPP__
+#define __CONTROL_HPP__
+
+#include "util.hpp"
+
+/// \file control.hpp
+/// \brief A collection of control trees composing high-level variants of algorithms.
+/// \author Kyungjoo Kim (kyukim@sandia.gov)
+
+/// description is a bit wrong
+
+using namespace std;
+
+namespace Tacho {
+
+  // primary template of the control tree; it only records its own (algorithm, variant) pair
+  template<int ArgAlgo, int ArgVariant>
+  struct Control {
+    static constexpr int Self[2] = { ArgAlgo, ArgVariant };
+  };
+
+  // ----------------------------------------------------------------------------------
+
+  // - CholByBlocks Variant 1
+  // * partitioned block matrix (blocks are sparse)
+  template<> struct Control<AlgoChol::ByBlocks,Variant::One> {
+    // chol var 1 : nested data parallel for is applied in the second inner loop
+    // chol var 2 : nested data parallel for is applied in the most inner loop
+    static constexpr int Chol[2] = { AlgoChol::UnblockedOpt,     Variant::Two };
+    static constexpr int Trsm[2] = { AlgoTrsm::ForFactorBlocked, Variant::One };
+    static constexpr int Herk[2] = { AlgoHerk::ForFactorBlocked, Variant::One };
+    static constexpr int Gemm[2] = { AlgoGemm::ForFactorBlocked, Variant::One };
+  };
+
+  // - CholByBlocks Variant 2
+  // * diagonal blocks have nested dense blocks
+  template<> struct Control<AlgoChol::ByBlocks,Variant::Two> {
+    static constexpr int Chol[2] = { AlgoChol::UnblockedOpt, Variant::One }; 
+    static constexpr int Trsm[2] = { AlgoTrsm::ForFactorBlocked, Variant::One };
+    static constexpr int Herk[2] = { AlgoHerk::ForFactorBlocked, Variant::One };
+    static constexpr int Gemm[2] = { AlgoGemm::ForFactorBlocked, Variant::One };
+  };
+
+  // - CholByBlocks Variant 3 (disabled)
+  // * all blocks have nested dense blocks (full supernodal algorithm)
+  // template<> struct Control<AlgoChol::ByBlocks,Variant::Three> {
+  //   static constexpr int Chol[2] = { AlgoChol::NestedDenseBlock, Variant::One }; 
+  //   static constexpr int Trsm[2] = { AlgoTrsm::NestedDenseBlock, Variant::One };
+  //   static constexpr int Herk[2] = { AlgoHerk::NestedDenseBlock, Variant::One };
+  //   static constexpr int Gemm[2] = { AlgoGemm::NestedDenseBlock, Variant::One };
+  // };
+
+  // - CholByBlocks Variant 4 (disabled)
+  // * diagonal blocks have nested hier dense blocks (hierarchical task scheduling)
+  // template<> struct Control<AlgoChol::ByBlocks,Variant::Four> {
+  //  static constexpr int Chol[2] = { AlgoChol::NestedDenseByBlocks, Variant::One }; 
+  //  static constexpr int Trsm[2] = { AlgoTrsm::ForFactorBlocked,    Variant::One };
+  //  static constexpr int Herk[2] = { AlgoHerk::ForFactorBlocked,    Variant::One };
+  //  static constexpr int Gemm[2] = { AlgoGemm::ForFactorBlocked,    Variant::One };
+  //};
+
+  // - CholByBlocks Variant 5 (disabled; NOTE(review): the code below still names Variant::Four)
+  // * diagonal blocks have nested hier dense blocks (hierarchical task scheduling)
+  // template<> struct Control<AlgoChol::ByBlocks,Variant::Four> {
+  //   static constexpr int Chol[2] = { AlgoChol::NestedDenseByBlocks, Variant::One }; 
+  //   static constexpr int Trsm[2] = { AlgoTrsm::NestedDenseByBlocks, Variant::One };
+  //   static constexpr int Herk[2] = { AlgoHerk::NestedDenseByBlocks, Variant::One };
+  //   static constexpr int Gemm[2] = { AlgoGemm::NestedDenseByBlocks, Variant::One };
+  // };
+
+  // ----------------------------------------------------------------------------------
+
+  // - CholNestedDenseBlock
+  // * branch control between sparse and dense operations
+  template<> struct Control<AlgoChol::NestedDenseBlock,Variant::One> {
+    static constexpr int CholSparse[2] = { AlgoChol::UnblockedOpt,   Variant::One };
+    static constexpr int CholDense[2]  = { AlgoChol::ExternalLapack, Variant::One }; 
+  };
+
+  // - CholNestedDenseByBlocks
+  // * branch control between sparse and dense-by-blocks operations
+  template<> struct Control<AlgoChol::NestedDenseByBlocks,Variant::One> {
+    static constexpr int CholSparse[2]        = { AlgoChol::UnblockedOpt,  Variant::One };
+    static constexpr int CholDenseByBlocks[2] = { AlgoChol::DenseByBlocks, Variant::One }; 
+  };
+
+  // ----------------------------------------------------------------------------------
+
+  // - CholDenseByBlocks
+  // * dense matrix Cholesky-by-blocks
+  template<> struct Control<AlgoChol::DenseByBlocks,Variant::One> {
+    static constexpr int Chol[2] = { AlgoChol::ExternalLapack, Variant::One };
+    static constexpr int Trsm[2] = { AlgoTrsm::ExternalBlas,   Variant::One };
+    static constexpr int Herk[2] = { AlgoHerk::ExternalBlas,   Variant::One };
+    static constexpr int Gemm[2] = { AlgoGemm::ExternalBlas,   Variant::One };
+  };
+  // - GemmDenseByBlocks
+  template<> struct Control<AlgoGemm::DenseByBlocks,Variant::One> {
+    static constexpr int Gemm[2] = { AlgoGemm::ExternalBlas, Variant::One };
+  };
+  // - TrsmDenseByBlocks
+  template<> struct Control<AlgoTrsm::DenseByBlocks,Variant::One> {
+    static constexpr int Gemm[2] = { AlgoGemm::ExternalBlas, Variant::One };
+    static constexpr int Trsm[2] = { AlgoTrsm::ExternalBlas, Variant::One };
+  };
+
+}
+
+#endif
diff --git a/lib/kokkos/example/ichol/src/coo.hpp b/lib/kokkos/example/ichol/src/coo.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..977f17e5c5fb2d9ce520548cc04bc15c107a4c60
--- /dev/null
+++ b/lib/kokkos/example/ichol/src/coo.hpp
@@ -0,0 +1,75 @@
+#pragma once
+#ifndef __COO_HPP__
+#define __COO_HPP__
+
+/// \file coo.hpp
+/// \author Kyungjoo Kim (kyukim@sandia.gov)
+
+namespace Tacho { 
+  
+  using namespace std;
+
+  /// \class Coo
+  /// \brief Sparse coordinate format entry (i, j, val); ordering and equality look at (i, j) only.
+  template<typename CrsMatType>
+  class Coo {
+  public:
+    typedef typename CrsMatType::ordinal_type ordinal_type;
+    typedef typename CrsMatType::value_type   value_type;
+
+    // Data members (public).
+    ordinal_type _i,_j;  //!< row and column index
+    value_type _val;     //!< value stored at (i, j)
+
+    // Mutable accessors: references to the stored fields.
+    ordinal_type& Row() { return _i;   }
+    ordinal_type& Col() { return _j;   }
+    value_type&   Val() { return _val; }
+    // Read-only accessors: copies of the stored fields.
+    ordinal_type  Row() const { return _i;   }
+    ordinal_type  Col() const { return _j;   }
+    value_type    Val() const { return _val; }
+    /// \brief Default constructor; value-initializes the indices and the value.
+    Coo() : _i(), _j(), _val() {}
+
+    Coo(const ordinal_type i,
+        const ordinal_type j,
+        const value_type val)
+      : _i(i),
+        _j(j),
+        _val(val)
+    { }
+
+    Coo(const Coo& b)
+      : _i(b._i),
+        _j(b._j),
+        _val(b._val)
+    { }
+
+    Coo<CrsMatType>& operator=(const Coo<CrsMatType> &y) {
+      this->_i = y._i;
+      this->_j = y._j;
+      this->_val = y._val;
+
+      return *this;
+    }
+
+    /// \brief Lexicographic "less" on (i, j); direct comparison, so unsigned ordinals order correctly.
+    bool operator<(const Coo<CrsMatType> &y) const {
+      if (this->_i != y._i) return this->_i < y._i;
+      return this->_j < y._j;
+    }
+
+    /// \brief Compare "equality" only index i and j.
+    bool operator==(const Coo<CrsMatType> &y) const {
+      return (this->_i == y._i) && (this->_j == y._j);
+    }
+
+    /// \brief Compare "in-equality" only index i and j.
+    bool operator!=(const Coo<CrsMatType> &y) const {
+      return !(*this == y);
+    }
+  };
+  
+}
+#endif
diff --git a/lib/kokkos/example/ichol/src/crs_matrix_base.hpp b/lib/kokkos/example/ichol/src/crs_matrix_base.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..ad08b8757e83c68b8a9224a1d41c3087930a2eb4
--- /dev/null
+++ b/lib/kokkos/example/ichol/src/crs_matrix_base.hpp
@@ -0,0 +1,598 @@
+#pragma once
+#ifndef __CRS_MATRIX_BASE_HPP__
+#define __CRS_MATRIX_BASE_HPP__
+
+/// \file crs_matrix_base.hpp
+/// \brief CRS matrix base object interfaces to user provided input matrices.
+/// \author Kyungjoo Kim (kyukim@sandia.gov)
+
+#include "util.hpp"
+#include "coo.hpp"
+
+namespace Tacho { 
+
+  using namespace std;
+
+  // Forward declaration: only the name is needed by the specialization below.
+  template<typename, typename> class TaskView;
+
+  /// \brief Trait mapping a type to a row-view type.
+  ///        Generic case: the nested type is int.
+  template<typename CrsMatrixType>
+  struct GetCrsMatrixRowViewType { using type = int; };
+
+  /// \brief TaskView case: the nested type is the wrapped view's row_view_type.
+  template<typename CrsMatrixViewType, typename TaskFactoryType>
+  struct GetCrsMatrixRowViewType< TaskView<CrsMatrixViewType,TaskFactoryType> >
+  {
+    using type = typename CrsMatrixViewType::row_view_type;
+  };
+
+  /// \class CrsMatrixBase
+  /// \brief CRS matrix base object using Kokkos view and subview
+  template<typename ValueType,
+           typename OrdinalType, 
+           typename SizeType = OrdinalType,
+           typename SpaceType = void,
+           typename MemoryTraits = void>
+  class CrsMatrixBase {
+  public:
+    typedef ValueType    value_type;
+    typedef OrdinalType  ordinal_type;
+    typedef SpaceType    space_type;
+    typedef SizeType     size_type;
+    typedef MemoryTraits memory_traits;
+
+    // 1D view, layout does not matter; no template parameters for that
+    typedef Kokkos::View<size_type*,   space_type,memory_traits> size_type_array;
+    typedef Kokkos::View<ordinal_type*,space_type,memory_traits> ordinal_type_array;
+    typedef Kokkos::View<value_type*,  space_type,memory_traits> value_type_array;
+
+    typedef typename size_type_array::value_type*    size_type_array_ptr;
+    typedef typename ordinal_type_array::value_type* ordinal_type_array_ptr;
+    typedef typename value_type_array::value_type*   value_type_array_ptr;
+
+    // range type
+    template<typename T> using range_type = pair<T,T>;
+
+    // external interface
+    typedef Coo<CrsMatrixBase> ijv_type;
+    
+    friend class CrsMatrixHelper;
+
+  private:
+
+    ordinal_type       _m;       //!< # of rows
+    ordinal_type       _n;       //!< # of cols
+    size_type          _nnz;     //!< # of nonzeros
+    size_type_array    _ap;      //!< pointers to column index and values
+    ordinal_type_array _aj;      //!< column index compressed format
+    value_type_array   _ax;      //!< values
+
+  public:
+
+    typedef typename GetCrsMatrixRowViewType< ValueType >::type row_view_type ;
+    typedef Kokkos::View<row_view_type*,space_type> row_view_type_array;
+
+    row_view_type_array _all_row_views ;
+
+  protected:
+
+    /// \brief Record the dimensions and make sure the three CRS arrays are
+    ///        large enough; an array that already fits is left as it is.
+    void createInternalArrays(const ordinal_type m,
+                              const ordinal_type n,
+                              const size_type nnz) {
+      _m = m;  _n = n;  _nnz = nnz;
+
+      // a view is re-allocated only when its current extent is too small
+      const bool grow_ap = static_cast<ordinal_type>(_ap.dimension_0()) < m+1;
+      const bool grow_aj = static_cast<size_type>(_aj.dimension_0()) < nnz;
+      const bool grow_ax = static_cast<size_type>(_ax.dimension_0()) < nnz;
+
+      if (grow_ap) _ap = size_type_array("CrsMatrixBase::RowPtrArray", m+1);
+      if (grow_aj) _aj = ordinal_type_array("CrsMatrixBase::ColsArray", nnz);
+      if (grow_ax) _ax = value_type_array("CrsMatrixBase::ValuesArray", nnz);
+    }
+
+    /// \brief Convert the sorted coordinate list 'mm' into CRS form in _ap, _aj, _ax.
+    ///
+    /// Expects 'mm' sorted by (row, col) with rows in [0, _m), and _m and the
+    /// arrays already set up by createInternalArrays().
+    ///
+    /// Consecutive entries with the same (row, col) are summed into one
+    /// nonzero. Rows without entries -- leading, in between or trailing -- get
+    /// an empty range in _ap, and an empty 'mm' is handled without touching
+    /// mm[0]. On return _nnz holds the number of stored nonzeros.
+    void ijv2crs(const vector<ijv_type> &mm) {
+
+      ordinal_type ii = 0;   // next row whose _ap entry is still to be written
+      size_type jj = 0;      // nonzeros stored so far
+
+      for (typename vector<ijv_type>::const_iterator it=mm.begin();it<mm.end();++it) {
+        const ijv_type &aij = (*it);
+
+        // same (row, col) as the entry just stored: accumulate, do not append
+        if (it != mm.begin() && aij == *(it-1)) {
+          _ax[jj-1] += aij.Val();
+          continue;
+        }
+
+        // start every row up to aij.Row(); rows skipped on the way stay empty
+        while (ii <= aij.Row() && ii < _m)
+          _ap[ii++] = jj;
+
+        _aj[jj] = aij.Col();
+        _ax[jj] = aij.Val();
+        ++jj;
+      }
+
+      // close the remaining rows and add the last index to terminate the storage
+      while (ii <= _m)
+        _ap[ii++] = jj;
+
+      _nnz = jj;
+    }
+    
+  public:
+
+    KOKKOS_INLINE_FUNCTION
+    void setNumNonZeros() { 
+      if (_m)             // with zero rows _nnz is left as it is
+        _nnz = _ap[_m];   // take the count from the last row pointer
+    }
+    /// \brief Number of rows (_m).
+    KOKKOS_INLINE_FUNCTION
+    ordinal_type NumRows() const { return _m; }
+    /// \brief Number of columns (_n).
+    KOKKOS_INLINE_FUNCTION
+    ordinal_type NumCols() const { return _n; }
+    /// \brief Number of nonzeros as recorded in _nnz.
+    KOKKOS_INLINE_FUNCTION
+    size_type NumNonZeros() const { return _nnz; }
+    /// \brief Address of the first row-pointer entry.
+    KOKKOS_INLINE_FUNCTION
+    size_type_array_ptr RowPtr() const { return &_ap[0]; }
+    /// \brief Address of the first column-index entry.
+    KOKKOS_INLINE_FUNCTION
+    ordinal_type_array_ptr ColPtr() const { return &_aj[0]; }
+    /// \brief Address of the first value entry.
+    KOKKOS_INLINE_FUNCTION
+    value_type_array_ptr ValuePtr() const { return &_ax[0];}
+    /// \brief Row pointer of row i, i.e. _ap[i].
+    KOKKOS_INLINE_FUNCTION
+    size_type RowPtr(const ordinal_type i) const { return _ap[i]; }
+    /// \brief Pointer into the column-index array at offset _ap[i].
+    KOKKOS_INLINE_FUNCTION
+    ordinal_type_array_ptr ColsInRow(const ordinal_type i) const { return _aj.data() + _ap[i] ; }
+    /// \brief Pointer into the value array at offset _ap[i].
+    KOKKOS_INLINE_FUNCTION
+    value_type_array_ptr ValuesInRow(const ordinal_type i) const { return _ax.data() + _ap[i] ; }
+    /// \brief Difference of the two consecutive row pointers _ap[i+1] and _ap[i].
+    KOKKOS_INLINE_FUNCTION
+    ordinal_type NumNonZerosInRow(const ordinal_type i) const { return (_ap[i+1] - _ap[i]); } 
+    /// \brief Reference to the k-th entry of the value array (k indexes _ax directly).
+    KOKKOS_INLINE_FUNCTION
+    value_type& Value(const ordinal_type k) { return _ax[k]; }
+    /// \brief Copy of the k-th entry of the value array.
+    KOKKOS_INLINE_FUNCTION
+    value_type Value(const ordinal_type k) const { return _ax[k]; }
+
+    /// \brief Default constructor: zero sizes and default-constructed views.
+    KOKKOS_INLINE_FUNCTION
+    CrsMatrixBase() 
+      : _m(0),
+        _n(0),
+        _nnz(0),
+        _ap(),
+        _aj(),
+        _ax()
+    { }
+
+    /// \brief Constructor with label; the label argument is unnamed and unused, the result equals the default one.
+    CrsMatrixBase(const string & ) 
+      : _m(0),
+        _n(0),
+        _nnz(0),
+        _ap(),
+        _aj(),
+        _ax()
+    { }
+
+    /// \brief Converting constructor from another CrsMatrixBase instantiation (shallow copy of the views); for a deep copy use copy().
+    template<typename VT,
+             typename OT,
+             typename ST,
+             typename SpT,
+             typename MT>
+    CrsMatrixBase(const CrsMatrixBase<VT,OT,ST,SpT,MT> &b) 
+      : _m(b._m),
+        _n(b._n),
+        _nnz(b._nnz),
+        _ap(b._ap), 
+        _aj(b._aj),
+        _ax(b._ax) 
+    { }
+
+    /// \brief Constructor to allocate internal data structures: m+1 row pointers, nnz column indices and nnz values. The label is unused.
+    CrsMatrixBase(const string & ,
+                  const ordinal_type m, 
+                  const ordinal_type n, 
+                  const ordinal_type nnz) 
+      : _m(m),
+        _n(n),
+        _nnz(nnz),
+        _ap("CrsMatrixBase::RowPtrArray", m+1),
+        _aj("CrsMatrixBase::ColsArray", nnz),
+        _ax("CrsMatrixBase::ValuesArray", nnz)
+    { }
+
+    /// \brief Constructor to attach external arrays to the matrix; the views ap, aj, ax are copied as views. The label is unused.
+    CrsMatrixBase(const string &,
+                  const ordinal_type m, 
+                  const ordinal_type n, 
+                  const ordinal_type nnz,
+                  const size_type_array &ap,
+                  const ordinal_type_array &aj,
+                  const value_type_array &ax) 
+      : _m(m),
+        _n(n),
+        _nnz(nnz),
+        _ap(ap), 
+        _aj(aj),
+        _ax(ax) 
+    { }
+    
+  // Allow the copy function access to the input CrsMatrixBase
+  // private data.
+  template<typename, typename, typename, typename, typename>
+  friend class CrsMatrixBase ;
+
+  public:
+    /// \brief Deep copy of matrix b, which may use a different space type.
+    ///        Each array is copied over the extent common to source and destination.
+    template< typename SpT >
+    int
+    copy(const CrsMatrixBase<ValueType,OrdinalType,SizeType,SpT,MemoryTraits> &b) {
+
+      space_type::execution_space::fence();
+      createInternalArrays(b._m, b._n, b._nnz);
+      space_type::execution_space::fence();
+
+      // copy extents: the smaller of the destination and source lengths
+      const range_type<ordinal_type> row_ptr_span(0, min(_ap.dimension_0(), b._ap.dimension_0()));
+      const range_type<size_type>    col_idx_span(0, min(_aj.dimension_0(), b._aj.dimension_0()));
+      const range_type<size_type>    values_span (0, min(_ax.dimension_0(), b._ax.dimension_0()));
+
+      // row pointers first, then column indices, then values
+      Kokkos::deep_copy(Kokkos::subview(  _ap, row_ptr_span),
+                        Kokkos::subview(b._ap, row_ptr_span));
+      Kokkos::deep_copy(Kokkos::subview(  _aj, col_idx_span),
+                        Kokkos::subview(b._aj, col_idx_span));
+      Kokkos::deep_copy(Kokkos::subview(  _ax, values_span),
+                        Kokkos::subview(b._ax, values_span));
+
+      space_type::execution_space::fence();
+
+      return 0;
+    }
+
+    /// \brief Deep copy of the lower or upper triangle of b, diagonal included.
+    ///        Any other uplo value only runs createInternalArrays(). Returns 0.
+    int
+    copy(const int uplo,
+         const CrsMatrixBase &b) {
+
+      createInternalArrays(b._m, b._n, b._nnz);
+
+      // assume that matrix b is sorted.
+      if (uplo == Uplo::Lower) {
+        _nnz = 0;
+        for (ordinal_type row=0;row<_m;++row) {
+          const size_type first = b._ap[row], last = b._ap[row+1];
+          _ap[row] = _nnz;
+          // take entries while the column is on or left of the diagonal
+          for (size_type src=first;src<last && (row >= b._aj[src]);++src) {
+            _aj[_nnz] = b._aj[src];
+            _ax[_nnz] = b._ax[src];
+            ++_nnz;
+          }
+        }
+        _ap[_m] = _nnz;
+      } else if (uplo == Uplo::Upper) {
+        _nnz = 0;
+        for (ordinal_type row=0;row<_m;++row) {
+          size_type src = b._ap[row];
+          const size_type last = b._ap[row+1];
+          _ap[row] = _nnz;
+          // skip the strictly lower part, then take the rest of the row
+          while (src<last && (row > b._aj[src])) ++src;
+          for ( ;src<last;++src) {
+            _aj[_nnz] = b._aj[src];
+            _ax[_nnz] = b._ax[src];
+            ++_nnz;
+          }
+        }
+        _ap[_m] = _nnz;
+      }
+
+      return 0;
+    }
+
+
+    /// \brief deep copy of matrix b with given permutation vectors: row i is read from row ip[i] of b, column c becomes p[c]
+    template<typename VT,
+             typename OT,
+             typename ST,
+             typename SpT,
+             typename MT>
+    int
+    copy(const typename CrsMatrixBase<VT,OT,ST,SpT,MT>::ordinal_type_array &p,
+         const typename CrsMatrixBase<VT,OT,ST,SpT,MT>::ordinal_type_array &ip,
+         const CrsMatrixBase<VT,OT,ST,SpT,MT> &b) {
+
+      createInternalArrays(b._m, b._n, b._nnz);
+
+      // Question:: do I need to use Kokkos::vector ? 
+      //            in other words, where do we permute matrix in factorization ?
+      //            permuting a matrix is a kernel ? 
+      vector<ijv_type> tmp;   // entries of one permuted row, reused for every row
+
+      // any chance to use parallel_for ?
+      _nnz = 0;
+      for (ordinal_type i=0;i<_m;++i) {
+        ordinal_type ii = ip[i];   // source row in b
+
+        size_type jbegin = b._ap[ii];
+        size_type jend   = b._ap[ii+1];
+
+        _ap[i] = _nnz;
+        for (size_type j=jbegin;j<jend;++j) {
+          ordinal_type jj = p[b._aj[j]];   // permuted column index
+          ijv_type aij(i, jj, b._ax[j]);
+          tmp.push_back(aij);
+        }
+        // all entries in tmp share row i, so this orders them by the new column index
+        sort(tmp.begin(), tmp.end(), less<ijv_type>());
+        for (auto it=tmp.begin();it<tmp.end();++it) {
+          ijv_type aij = (*it);
+
+          _aj[_nnz] = aij.Col();
+          _ax[_nnz] = aij.Val();
+          ++_nnz;
+        }
+        tmp.clear();
+      }
+      _ap[_m] = _nnz;   // terminate the row pointers
+
+      return 0;
+    }
+
+    /// \brief Add the entries of b onto the matching nonzeros of this matrix.
+    ///        Entries of b without a counterpart in this pattern are dropped;
+    ///        the single forward scan assumes ascending column indices per row.
+    template<typename VT,
+             typename OT,
+             typename ST,
+             typename SpT,
+             typename MT>
+    int
+    add(const CrsMatrixBase<VT,OT,ST,SpT,MT> &b) {
+      const ordinal_type m = min(b._m, _m);
+      for (ordinal_type i=0;i<m;++i) {
+        const size_type jaend = _ap[i+1];
+        const size_type jbend = b._ap[i+1];
+
+        size_type ja = _ap[i];
+        for (size_type jb=b._ap[i];jb<jbend && ja<jaend;++jb) {
+          // stay inside row i: test the bound before reading _aj[ja]
+          while (ja<jaend && _aj[ja]<b._aj[jb]) ++ja;
+          if (ja<jaend && _aj[ja] == b._aj[jb])
+            _ax[ja] += b._ax[jb];
+        }
+      }
+      return 0;
+    }
+
+    /// \brief Rebuild the matrix as a full symmetric one from the chosen triangle.
+    ///        Strict-triangle entries selected by uplo are mirrored across the
+    ///        diagonal, diagonal entries are kept, all other entries are dropped.
+    ///        With conjugate == true every value is passed through conj().
+    ///        NOTE(review): the same (conjugated) value goes to both (i,j) and
+    ///        (j,i) -- confirm this is the intended Hermitian convention.
+    int symmetrize(const int uplo,
+                   const bool conjugate = false) {
+      vector<ijv_type> mm;
+      mm.reserve(_nnz*2);
+
+      for (ordinal_type i=0;i<_m;++i) {
+        const size_type jbegin = _ap[i];
+        const size_type jend   = _ap[i+1];
+        for (size_type jj=jbegin;jj<jend;++jj) {
+          const ordinal_type j = _aj[jj];
+          // the value lives at the nonzero position jj, not at the column index j
+          const value_type val = (conjugate ? conj(_ax[jj]) : _ax[jj]);
+          if ((uplo == Uplo::Lower && i > j) || (uplo == Uplo::Upper && i < j)) {
+            mm.push_back(ijv_type(i, j, val));
+            mm.push_back(ijv_type(j, i, val));
+          } else if (i == j) {
+            mm.push_back(ijv_type(i, i, val));
+          }
+        }
+      }
+      sort(mm.begin(), mm.end(), less<ijv_type>());
+      createInternalArrays(_m, _n, mm.size());
+      ijv2crs(mm);
+      return 0;
+    }
+
+    /// \brief symmetrize() with conjugation switched on.
+    int hermitianize(int uplo) { return symmetrize(uplo, true); }
+
+    /// \brief Print the matrix sizes and, when all three arrays are non-empty,
+    ///        every (row, col, value) triple. The stream precision is restored.
+    ostream& showMe(ostream &os) const {
+      const streamsize saved_prec = os.precision(8);  // precision(n) hands back the old setting
+      os << scientific;
+
+      os << " -- CrsMatrixBase -- " << endl
+         << "    # of Rows          = " << _m << endl
+         << "    # of Cols          = " << _n << endl
+         << "    # of NonZeros      = " << _nnz << endl
+         << endl
+         << "    RowPtrArray length = " << _ap.dimension_0() << endl
+         << "    ColArray    length = " << _aj.dimension_0() << endl
+         << "    ValueArray  length = " << _ax.dimension_0() << endl
+         << endl;
+
+      const int width = 10;
+      const bool has_data = _ap.size() && _aj.size() && _ax.size();
+      if (has_data) {
+        os << setw(width) <<  "Row" << "  "
+           << setw(width) <<  "Col" << "  "
+           << setw(width) <<  "Val" << endl;
+        for (ordinal_type row=0;row<_m;++row) {
+          const size_type last = _ap[row+1];
+          for (size_type k=_ap[row];k<last;++k) {
+            const value_type val = _ax[k];
+            os << setw(width) <<    row << "  "
+               << setw(width) << _aj[k] << "  "
+               << setw(width) <<    val << endl;
+          }
+        }
+      }
+      os.unsetf(ios::scientific);
+      os.precision(saved_prec);
+      return os;
+    }
+
+    /// \brief Read a MatrixMarket coordinate file into this matrix.
+    ///        If the banner line contains "symmetric", off-diagonal entries are
+    ///        mirrored. '%' and empty lines after the banner are skipped.
+    /// \return 0 on success, -1 if the "rows cols nnz" line cannot be read.
+    int importMatrixMarket(ifstream &file) {
+      vector<ijv_type> mm;
+      const ordinal_type mm_base = 1;  // MatrixMarket indices are one-based
+
+      string header;
+      if (file.is_open()) {
+        getline(file, header);
+        string skipped;
+        while (file.good()) {
+          const int c = file.peek();  // keep the int so EOF is not narrowed
+          if (c != '%' && c != '\n') break;
+          getline(file, skipped);     // drops the whole line, however long
+        }
+      } else {
+        ERROR(MSG_INVALID_INPUT(file));
+      }
+
+      // check the header
+      const bool symmetry = (header.find("symmetric") != string::npos);
+
+      // read matrix specification; zero-initialized so a failed read is harmless
+      ordinal_type m = 0, n = 0;
+      size_type nnz = 0;
+      if (!(file >> m >> n >> nnz)) return -1;
+
+      mm.reserve(nnz*(symmetry ? 2 : 1));
+      for (size_type i=0;i<nnz;++i) {
+        ordinal_type row, col;
+        value_type val;
+        // stop at a truncated or malformed entry instead of storing garbage
+        if (!(file >> row >> col >> val)) break;
+
+        row -= mm_base;
+        col -= mm_base;
+
+        mm.push_back(ijv_type(row, col, val));
+        if (symmetry && row != col)
+          mm.push_back(ijv_type(col, row, val));
+      }
+      sort(mm.begin(), mm.end(), less<ijv_type>());
+
+      // construct workspace and set variables
+      createInternalArrays(m, n, mm.size());
+      // change mm to crs; with no entries every row pointer is simply zero
+      if (mm.empty())
+        for (ordinal_type i=0;i<=m;++i) _ap[i] = 0;
+      else
+        ijv2crs(mm);
+      return 0;
+    }
+    
+    int exportMatrixMarket(ofstream &file,
+                           const string comment,
+                           const int uplo = 0) {
+      streamsize prec = file.precision();   // saved so it can be restored before returning
+      file.precision(8);
+      file << scientific;
+      // Output order: banner, the caller's comment line, the size line, then one-based (row, col, value) triples.
+      file << "%%MatrixMarket matrix coordinate "
+           << (is_fundamental<value_type>::value ? "real " : "complex ")
+           << ((uplo == Uplo::Upper || uplo == Uplo::Lower) ? "symmetric " : "general ")
+           << endl;
+
+      file << comment << endl;   // written verbatim; NOTE(review): presumably the caller supplies the leading '%' -- confirm
+      
+      // count the entries that will be written (same three tests as the output loop below)
+      size_type nnz = 0;
+      for (ordinal_type i=0;i<_m;++i) {
+        const size_type jbegin = _ap[i], jend = _ap[i+1];
+        for (size_type j=jbegin;j<jend;++j) {
+          if (uplo == Uplo::Upper && i <= _aj[j]) ++nnz;
+          if (uplo == Uplo::Lower && i >= _aj[j]) ++nnz;
+          if (!uplo) ++nnz;   // uplo == 0: every entry counts
+        }
+      }
+      file << _m << " " << _n << " " << nnz << endl;
+
+      const int w = 10;   // field width of each column
+      for (ordinal_type i=0;i<_m;++i) {
+        const size_type jbegin = _ap[i], jend = _ap[i+1];
+        for (size_type j=jbegin;j<jend;++j) {
+          bool flag = false;   // true when this entry belongs to the requested part
+          if (uplo == Uplo::Upper && i <= _aj[j]) flag = true;
+          if (uplo == Uplo::Lower && i >= _aj[j]) flag = true;
+          if (!uplo) flag = true;
+          if (flag) {
+            value_type val = _ax[j];
+            file << setw(w) << (     i+1) << "  " 
+                 << setw(w) << (_aj[j]+1) << "  " 
+                 << setw(w) <<    val << endl;
+          }
+        }
+      }
+
+      file.unsetf(ios::scientific);
+      file.precision(prec);
+
+      return 0;
+    }
+
+    //----------------------------------------------------------------------
+
+    /// \brief Write the pattern without diagonal entries: row offsets into rptr, column indices into cidx.
+    int convertGraph(size_type_array rptr,
+                     ordinal_type_array cidx) const {
+      size_type kept = 0;     // off-diagonal entries written so far
+      ordinal_type row = 0;   // declared outside the loop: also indexes the final offset
+      for ( ;row<_m;++row) {
+        const size_type first = _ap[row], last = _ap[row+1];
+        rptr[row] = kept;
+        for (size_type k=first;k<last;++k) {
+          if (row == _aj[k]) continue;   // drop the diagonal
+          cidx[kept++] = _aj[k];
+        }
+      }
+      rptr[row] = kept;
+      return 0;
+    }
+
+    //----------------------------------------------------------------------
+
+  };
+
+}
+
+#endif
diff --git a/lib/kokkos/example/ichol/src/crs_matrix_base_import.hpp b/lib/kokkos/example/ichol/src/crs_matrix_base_import.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..e1ff0f3a9fd403ae51d68f77358409e1e3cd5cca
--- /dev/null
+++ b/lib/kokkos/example/ichol/src/crs_matrix_base_import.hpp
@@ -0,0 +1,104 @@
+#pragma once
+#ifndef __CRS_MATRIX_BASE_IMPL_HPP__
+#define __CRS_MATRIX_BASE_IMPL_HPP__
+
+/// \file crs_matrix_base_import.hpp
+/// \brief Implementation of external interfaces to CrsMatrixBase
+/// \author Kyungjoo Kim (kyukim@sandia.gov)
+
+namespace Tacho { 
+
+  using namespace std;
+
+  /// Read a MatrixMarket coordinate file and store it in this CRS matrix.
+  /// Returns 0 on success and -1 when the header or an entry cannot be read
+  /// or an index lies outside of the declared m x n range.
+  template<typename VT,
+           typename OT,
+           typename ST,
+           typename SpT,
+           typename MT>
+  inline int 
+  CrsMatrixBase<VT,OT,ST,SpT,MT>::importMatrixMarket(ifstream &file) {
+    ordinal_type m = 0, n = 0;
+    size_type nnz = 0;
+
+    // skip initial title comments
+    {
+      // the format limits lines to 1024 characters (plus the newline)
+      const int max_line = 1025;
+      while (file.good()) {
+        const int c = file.peek();
+        if (c == '%' || c == '\n') {
+          file.ignore(max_line, '\n');
+          continue;
+        }
+        break;
+      }
+      // read matrix specification; bail out on a malformed header
+      file >> m >> n >> nnz;
+      if (file.fail()) return -1;
+
+      // construct workspace and set variables
+      createInternalArrays(m, n, nnz);
+    }
+
+    // read the coordinate format (matrix-market)
+    vector<ijv_type> mm; 
+    mm.reserve(_nnz);
+    {
+      // matrix market use one base index
+      const ordinal_type mm_base = 1; 
+
+      for (size_type i=0;i<_nnz;++i) {
+        ijv_type aij;
+        file >> aij.Row() >> aij.Col() >> aij.Val();
+
+        // reject truncated input and indices outside of [1,m] x [1,n]
+        if (file.fail() ||
+            aij.Row() < mm_base || aij.Row() > m ||
+            aij.Col() < mm_base || aij.Col() > n) return -1;
+
+        // one base to zero base
+        aij.Row() -= mm_base;
+        aij.Col() -= mm_base;
+
+        mm.push_back(aij);
+      }
+      sort(mm.begin(), mm.end(), less<ijv_type>());
+    }
+
+    // change mm to crs; like before, this relies on the sort grouping rows in increasing order
+    {
+      ordinal_type ii = 0;  // next row whose start offset is not yet written
+      size_type jj = 0;     // number of entries stored so far
+      ijv_type prev;        // entry handled last; meaningful once jj > 0
+
+      for (typename vector<ijv_type>::iterator it=mm.begin();it<mm.end();++it) {
+        ijv_type aij = (*it);
+
+        if (jj > 0 && aij == prev) {
+          // repeated coordinate: accumulate into the entry stored last
+          _ax[jj-1] += aij.Val();
+        } else {
+          // open every row up to this one so that empty rows get empty ranges
+          while (ii <= aij.Row()) _ap[ii++] = jj;
+          _aj[jj] = aij.Col();
+          _ax[jj] = aij.Val();
+          ++jj;
+        }
+        prev = aij;
+      }
+
+      // terminate the storage; rows behind the last entry are empty
+      while (ii <= m) _ap[ii++] = jj;
+      _nnz = jj;
+    }
+
+    return 0;
+  }
+  
+}
+
+
+#endif
diff --git a/lib/kokkos/example/ichol/src/crs_matrix_helper.hpp b/lib/kokkos/example/ichol/src/crs_matrix_helper.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..5b80e77935fcb968bff8f05e9876a10299a82182
--- /dev/null
+++ b/lib/kokkos/example/ichol/src/crs_matrix_helper.hpp
@@ -0,0 +1,71 @@
+#pragma once
+#ifndef __CRS_MATRIX_HELPER_HPP__
+#define __CRS_MATRIX_HELPER_HPP__
+
+/// \file crs_matrix_helper.hpp
+/// \brief This file includes utility functions to convert between flat and hierarchical matrices.
+/// \author Kyungjoo Kim (kyukim@sandia.gov)  
+
+#include "util.hpp"
+
+namespace Tacho { 
+
+  using namespace std;
+
+  class CrsMatrixHelper {  // static helpers converting between flat and hierarchical CRS matrices
+  public:
+    /// \brief Allocate the row views of a hierarchical matrix and attach to each block its own range.
+    template< typename CrsHierBase >
+    static int fillRowViewArray( CrsHierBase & HU );
+    /// \brief Replace flat by a copy without its explicitly stored zero values (no-op if there are none).
+    template<typename CrsFlatBase>
+    static int
+    filterZeros(CrsFlatBase &flat);
+    
+    /// \brief Transform a scalar flat matrix to hierarchical matrix of matrices 1x1; testing only.
+    template<typename CrsFlatBase,
+             typename CrsHierBase>
+    static int
+    flat2hier(CrsFlatBase &flat, 
+              CrsHierBase &hier);
+
+    /// \brief Dispatch on uplo to flat2hier_upper (Uplo::Upper) or flat2hier_lower (Uplo::Lower); returns -1 for any other value.
+    template<typename CrsFlatBase,
+             typename CrsHierBase,
+             typename HostOrdinalTypeArray >
+    static int
+    flat2hier(int uplo, 
+              CrsFlatBase &flat, 
+              CrsHierBase &hier,
+              const typename CrsHierBase::ordinal_type       nblks,
+              const HostOrdinalTypeArray range,
+              const HostOrdinalTypeArray tree);
+
+    /// \brief Transform a scalar flat matrix to upper hierarchical matrix given scotch info. 
+    template<typename CrsFlatBase,
+             typename CrsHierBase,
+             typename HostOrdinalTypeArray >
+    static int
+    flat2hier_upper(CrsFlatBase &flat, 
+                    CrsHierBase &hier,
+                    const typename CrsHierBase::ordinal_type       nblks,
+                    const HostOrdinalTypeArray range,
+                    const HostOrdinalTypeArray tree);
+
+    /// \brief Transform a scalar flat matrix to lower hierarchical matrix given scotch info (not yet implemented).
+    template<typename CrsFlatBase,
+             typename CrsHierBase,
+             typename HostOrdinalTypeArray >
+    static int
+    flat2hier_lower(CrsFlatBase &flat, 
+                    CrsHierBase &hier,
+                    const typename CrsHierBase::ordinal_type       nblks,
+                    const HostOrdinalTypeArray range,
+                    const HostOrdinalTypeArray tree);
+  };
+
+}
+
+#include "crs_matrix_helper_impl.hpp"
+
+#endif
diff --git a/lib/kokkos/example/ichol/src/crs_matrix_helper_impl.hpp b/lib/kokkos/example/ichol/src/crs_matrix_helper_impl.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..0fc4c9f1b83c0e48d3e42eb61e8e1cea12b1c187
--- /dev/null
+++ b/lib/kokkos/example/ichol/src/crs_matrix_helper_impl.hpp
@@ -0,0 +1,364 @@
+
+#ifndef __CRS_MATRIX_HELPER_IMPL_HPP__
+#define __CRS_MATRIX_HELPER_IMPL_HPP__
+
+/// \file crs_matrix_helper_impl.hpp
+/// \brief This file includes utility functions to convert between flat and hierarchical matrices.
+/// \author Kyungjoo Kim (kyukim@sandia.gov)
+
+#include "util.hpp"
+
+namespace Tacho {
+
+  using namespace std;
+
+  /// Reduce/scan functor over the blocks of a hierarchical matrix: sums the
+  /// block row counts and, in the final scan pass, hands each block its slice.
+  template< typename CrsHierBase >
+  struct FunctorFillRowViewArray {
+    typedef typename CrsHierBase::ordinal_type         ordinal_type ;
+    typedef typename CrsHierBase::row_view_type_array  row_view_type_array ;
+    typedef typename CrsHierBase::value_type_array     ax_type ;
+
+    typedef ordinal_type value_type ;  // value type of the reduction and scan
+
+    row_view_type_array _all_row_views ;  // shared storage for all row views
+    ax_type             _ax ;             // blocks of the hierarchical matrix
+
+    FunctorFillRowViewArray( const row_view_type_array & all_row_views
+                           , const ax_type             & blocks )
+      : _all_row_views( all_row_views )
+      , _ax( blocks )
+      {}
+
+    // reduction: accumulate the number of rows of block k
+    KOKKOS_INLINE_FUNCTION
+    void operator()( ordinal_type k , ordinal_type & value ) const
+      { value += _ax(k).NumRows(); }
+
+    // scan: in the final pass 'value' is used as the first row-view slot of block k
+    KOKKOS_INLINE_FUNCTION
+    void operator()( ordinal_type k , ordinal_type & value , bool final ) const
+      {
+        if ( final ) {
+          const int first = value ;
+          const int last  = first + _ax(k).NumRows();
+          _ax(k).setRowViewArray(
+            Kokkos::subview( _all_row_views, Kokkos::pair<int,int>(first,last) ) );
+        }
+        value += _ax(k).NumRows();
+      }
+  };
+
+  /// Allocate one row view per row of every block in device_HU and attach to
+  /// each block the part of that allocation covering its rows.  Returns 0.
+  template< typename CrsHierBase >
+  int CrsMatrixHelper::fillRowViewArray( CrsHierBase & device_HU )
+  {
+    // ordinal_type comes from the matrix so the reduction result matches the functor's value_type
+    typedef typename CrsHierBase::ordinal_type        ordinal_type ;
+    typedef typename CrsHierBase::row_view_type_array row_view_type_array ;
+    typedef typename CrsHierBase::space_type          space_type ;
+
+    ordinal_type total_row_view_count = 0 ;
+
+    Kokkos::RangePolicy< space_type >
+      range_policy( 0 , device_HU.NumNonZeros() );
+
+    space_type::fence();
+
+    { // pass 1: count the rows of all blocks
+      FunctorFillRowViewArray< CrsHierBase >
+         functor( row_view_type_array() , device_HU._ax );
+      Kokkos::parallel_reduce( range_policy , functor , total_row_view_count );
+    }
+
+    device_HU._all_row_views =
+      row_view_type_array("RowViews",total_row_view_count);
+
+    space_type::fence();
+
+    { // pass 2: give every block its range of the new array
+      FunctorFillRowViewArray< CrsHierBase >
+         functor( device_HU._all_row_views , device_HU._ax );
+      Kokkos::parallel_scan( range_policy , functor );
+    }
+
+    space_type::fence();
+    return 0 ;
+  }
+  
+  /// Drop explicitly stored zeros from flat: if any stored value equals zero, a compacted
+  /// copy labeled "<label>::ZeroFiltered" replaces flat; otherwise flat is untouched.  Returns 0.
+  template<typename CrsFlatBase>
+  int
+  CrsMatrixHelper::filterZeros(CrsFlatBase &flat) {
+    typedef typename CrsFlatBase::ordinal_type           ordinal_type;
+    typedef typename CrsFlatBase::size_type              size_type;
+    typedef typename CrsFlatBase::value_type             value_type;
+    typedef typename CrsFlatBase::ordinal_type_array_ptr ordinal_type_array_ptr;
+    typedef typename CrsFlatBase::value_type_array_ptr   value_type_array_ptr;
+
+    // count the stored entries that are exactly zero
+    size_type nz = 0;
+    const value_type zero(0);
+    for (ordinal_type k=0;k<flat.NumNonZeros();++k) 
+      nz += (flat.Value(k) == zero) ;
+
+    if (nz) {
+      CrsFlatBase resized(flat.Label() + "::ZeroFiltered", 
+                          flat.NumRows(),
+                          flat.NumCols(),
+                          flat.NumNonZeros() - nz);
+      ordinal_type_array_ptr rows = resized.RowPtr(); rows[0] = 0;
+      ordinal_type_array_ptr cols = resized.ColPtr();
+      value_type_array_ptr vals = resized.ValuePtr();    
+      // copy the surviving entries row by row and rebuild the row offsets
+      size_type nnz = 0;
+      for (ordinal_type i=0;i<flat.NumRows();++i) {
+        const ordinal_type nnz_in_row = flat.NumNonZerosInRow(i);
+        const ordinal_type_array_ptr cols_in_row = flat.ColsInRow(i);
+        const value_type_array_ptr vals_in_row = flat.ValuesInRow(i);
+        for (ordinal_type j=0;j<nnz_in_row;++j) {
+          if (vals_in_row[j] != zero) {
+            cols[nnz] = cols_in_row[j];
+            vals[nnz] = vals_in_row[j];
+            ++nnz;
+          }
+        }
+        rows[i+1] = nnz;
+      }
+      // finalize the count on the new matrix first, then hand it over to flat
+      resized.setNumNonZeros();
+      flat = resized;
+    }
+
+    return 0;
+  }
+
+
+  /// Testing helper: give hier the sparsity of flat, with every entry of hier
+  /// set to a 1x1 view onto the matching entry of flat.  Returns 0.
+  template<typename CrsFlatBase,
+           typename CrsHierBase>
+  int
+  CrsMatrixHelper::flat2hier(CrsFlatBase &flat,
+                             CrsHierBase &hier) {
+    typedef typename CrsHierBase::ordinal_type           ordinal_type;
+    typedef typename CrsHierBase::size_type              size_type;
+    typedef typename CrsHierBase::ordinal_type_array_ptr ordinal_type_array_ptr;
+
+    hier.createInternalArrays(flat.NumRows(), flat.NumCols(), flat.NumNonZeros());
+
+    size_type count = 0;   // 1x1 blocks created so far
+    for (ordinal_type row=0;row<flat.NumRows();++row) {
+      const ordinal_type row_size = flat.NumNonZerosInRow(row);
+      hier._ap[row] = count;
+      ordinal_type_array_ptr col = flat.ColsInRow(row);
+      for (ordinal_type k=0;k<row_size;++k) {
+        hier._aj[count] = col[k];
+        hier._ax[count].setView(flat, row, 1, col[k], 1);
+        ++count;
+      }
+    }
+
+    hier._ap[flat.NumRows()] = count;
+    hier._nnz = count;
+
+    return 0;
+  }
+
+  /// Build the hierarchical matrix for the requested triangle: forwards to
+  /// flat2hier_upper or flat2hier_lower and returns -1 for any other uplo.
+  template< typename CrsFlatBase ,
+            typename CrsHierBase ,
+            typename HostOrdinalTypeArray >
+  int
+  CrsMatrixHelper::flat2hier(int uplo,
+                             CrsFlatBase &flat,
+                             CrsHierBase &hier,
+                             const typename CrsHierBase::ordinal_type       nblks,
+                             const HostOrdinalTypeArray range ,
+                             const HostOrdinalTypeArray tree) {
+    if (uplo == Uplo::Upper) return flat2hier_upper(flat, hier, nblks, range, tree);
+    if (uplo == Uplo::Lower) return flat2hier_lower(flat, hier, nblks, range, tree);
+    return -1;
+  }
+
+  /// Build the upper triangular block matrix device_hier over device_flat.
+  /// Block row i receives one block for i itself and one for every node
+  /// reached by following tree[] from i until -1.  Block (i,j) is a view of
+  /// rows [range[i],range[i+1]) and columns [range[j],range[j+1]) of
+  /// device_flat.  The arrays are filled through host mirrors and copied to
+  /// device_hier afterwards.  Returns 0.
+  template<typename CrsFlatBase,
+           typename CrsHierBase,
+           typename HostOrdinalTypeArray >
+  int
+  CrsMatrixHelper::flat2hier_upper(CrsFlatBase & device_flat, 
+                                   CrsHierBase & device_hier,
+                                   const typename CrsHierBase::ordinal_type       nblks,
+                                   const HostOrdinalTypeArray range,
+                                   const HostOrdinalTypeArray tree) {
+    typedef typename CrsHierBase::ordinal_type            ordinal_type;
+    typedef typename CrsHierBase::size_type               size_type;
+    typedef typename CrsHierBase::size_type_array::HostMirror    host_ap_type;
+    typedef typename CrsHierBase::ordinal_type_array::HostMirror host_aj_type;
+    typedef typename CrsHierBase::value_type_array::HostMirror   host_ax_type;
+
+    // count nnz: one block per node on the path from i up to the root
+    size_type nnz = 0;
+    for (ordinal_type i=0;i<nblks;++i)
+      for (ordinal_type j=i;j != -1;j=tree[j]) ++nnz;
+
+    // create upper triangular block matrix
+    device_hier.createInternalArrays(nblks, nblks, nnz);
+
+    // host-side mirrors of the three CRS arrays
+    host_ap_type host_ap = Kokkos::create_mirror_view( device_hier._ap );
+    host_aj_type host_aj = Kokkos::create_mirror_view( device_hier._aj );
+    host_ax_type host_ax = Kokkos::create_mirror_view( device_hier._ax );
+
+    // second walk over the same paths, now recording column and block view
+    nnz = 0;
+    for (ordinal_type i=0;i<nblks;++i) {
+      host_ap[i] = nnz;
+      for (ordinal_type j=i;j != -1;j=tree[j]) {
+        host_aj[nnz] = j;
+        host_ax[nnz].setView( device_flat, range[i], (range[i+1] - range[i]),
+                             /**/          range[j], (range[j+1] - range[j]));
+        ++nnz;
+      }
+    }
+    host_ap[nblks] = nnz;
+
+    // blocks without nonzeros are kept: counting them might be more expensive
+    // and would attempt to access device memory from the host
+
+    // publish the host-side result to the device arrays
+    Kokkos::deep_copy( device_hier._ap , host_ap );
+    Kokkos::deep_copy( device_hier._aj , host_aj );
+    Kokkos::deep_copy( device_hier._ax , host_ax );
+
+    device_hier._nnz = nnz;
+
+    return 0;
+  }
+
+  // template<typename CrsFlatBase,
+  //          typename CrsHierBase>
+  // int
+  // CrsMatrixHelper::flat2hier_upper(CrsFlatBase &flat,
+  //                                  CrsHierBase &hier,
+  //                                  const typename CrsHierBase::ordinal_type       nblks,
+  //                                  const typename CrsHierBase::ordinal_type_array range,
+  //                                  const typename CrsHierBase::ordinal_type_array tree) {
+  //   typedef typename CrsHierBase::ordinal_type            ordinal_type;
+  //   typedef typename CrsHierBase::size_type               size_type;
+
+  //   typedef typename CrsHierBase::ordinal_type_array     ordinal_type_array;
+  //   //typedef typename CrsHierBase::ordinal_type_array_ptr ordinal_type_array_ptr;
+  //   //typedef typename CrsHierBase::value_type_array_ptr   value_type_array_ptr;
+
+  //   ordinal_type_array sibling("CrsMatrixHelper::flat2hier_upper::sibling", nblks);
+
+  //   // check the end of adjacent siblings (if not adjacent, they are separators)
+  //   ordinal_type p = tree[0];
+  //   for (ordinal_type i=1;i<nblks;++i) {
+  //     const ordinal_type j = tree[i];
+  //     if (p != j) {
+  //       p = j;
+  //       sibling[i-1] = -1;
+  //     }
+  //   }
+  //   sibling[nblks-1] = -1;
+
+  //   size_type nnz = 0;
+
+  //   // count nnz and nnz in rows for the upper triangular hier matrix
+  //   for (ordinal_type i=0;i<nblks;++i) {                  // search for all rows
+  //     for (ordinal_type j=i;j != -1;j=tree[j]) {          // move up
+  //       ordinal_type k=j;
+  //       do {
+  //         ++nnz;
+  //       } while (sibling[k++] != -1);
+  //     }
+  //   }
+
+  //   // create upper triangular block matrix
+  //   hier.createInternalArrays(nblks, nblks, nnz);
+
+  //   nnz = 0;
+  //   for (ordinal_type i=0;i<nblks;++i) {
+  //     hier._ap[i] = nnz;
+  //     for (ordinal_type j=i;j != -1;j=tree[j]) {
+  //       ordinal_type k=j;
+  //       do {
+  //         hier._aj[nnz] = k;
+  //         hier._ax[nnz].setView( flat, range[i], (range[i+1] - range[i]),
+  //                               /**/   range[k], (range[k+1] - range[k]));
+
+  //         // this checking might more expensive
+  //         if (hier._ax[nnz].hasNumNonZeros())
+  //           ++nnz;
+  //       } while (sibling[k++] != -1);
+  //     }
+  //   }
+  //   hier._ap[nblks] = nnz;
+  //   hier._nnz = nnz;
+
+  //   return 0;
+  // }
+
+  template<typename CrsFlatBase,   // placeholder: the lower triangular conversion is not implemented
+           typename CrsHierBase,
+           typename HostOrdinalTypeArray >
+  int
+  CrsMatrixHelper::flat2hier_lower(CrsFlatBase &flat,
+                                   CrsHierBase &hier,
+                                   const typename CrsHierBase::ordinal_type       nblks,
+                                   const HostOrdinalTypeArray range,
+                                   const HostOrdinalTypeArray tree) {
+    ERROR(MSG_NOT_YET_IMPLEMENTED);  // none of the arguments is used
+    // Draft of the intended algorithm, kept for reference only (everything below is commented out):
+    // typedef typename CrsHierBase::ordinal_type           ordinal_type;
+    // typedef typename CrsHierBase::size_type              size_type;
+
+    // typedef typename CrsHierBase::ordinal_type_array     ordinal_type_array;
+    // //typedef typename CrsHierBase::ordinal_type_array_ptr ordinal_type_array_ptr;
+    // //typedef typename CrsHierBase::value_type_array_ptr   value_type_array_ptr;
+
+    // ordinal_type_array tmp = ordinal_type_array("flat2hier:tmp", nblks+1);
+    // size_type nnz = 0;
+
+    // // count nnz and nnz in rows for lower triangular matrix
+    // for (ordinal_type i=0;i<nblks;++i)
+    //   for (ordinal_type j=i;j != -1;++nnz) {
+    //     ++tmp[j];
+    //     j = tree[j];
+    //   }
+
+    // // count nnz and nnz in rows for lower triangular matrix
+    // hier.createInternalArrays(nblks, nblks, nnz);
+    // for (ordinal_type i=1;i<(nblks+1);++i)
+    //   hier._ap[i] = hier._ap[i-1] + tmp[i-1];
+
+    // for (ordinal_type i=0;i<(nblks+1);++i)
+    //   tmp[i] = hier._ap[i];
+
+    // for (ordinal_type i=0;i<nblks;++i)
+    //   for (ordinal_type j=i;j != -1;j=tree[j]) {
+    //     hier._aj[tmp[j]] = i;
+    //     hier._ax[tmp[j]].setView( flat, range[j], (range[j+1] - range[j]),
+    //                              /**/   range[i], (range[i+1] - range[i]));
+    //     ++tmp[j];
+    //   }
+
+    return 0;  // NOTE(review): reached only if ERROR() returns; confirm what the macro does
+  }
+
+}
+
+
+#endif
+
diff --git a/lib/kokkos/example/ichol/src/crs_matrix_view.hpp b/lib/kokkos/example/ichol/src/crs_matrix_view.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..2a55e6fac9b64eca3eade412a1511913baafab85
--- /dev/null
+++ b/lib/kokkos/example/ichol/src/crs_matrix_view.hpp
@@ -0,0 +1,226 @@
+#pragma once
+#ifndef __CRS_MATRIX_VIEW_HPP__
+#define __CRS_MATRIX_VIEW_HPP__
+
+/// \file crs_matrix_view.hpp
+/// \brief CRS matrix view object creates 2D view to setup a computing region.
+/// \author Kyungjoo Kim (kyukim@sandia.gov)
+
+#include "util.hpp"
+
+namespace Tacho { 
+
+  using namespace std;
+
+  template<typename CrsMatBaseType>  
+  class CrsRowView;
+  /// \brief Rectangular window (row/column offset plus extent) onto a CrsMatBaseType.
+  template<typename CrsMatBaseType>
+  class CrsMatrixView {
+  public:
+    typedef typename CrsMatBaseType::space_type    space_type;
+    
+    typedef typename CrsMatBaseType::value_type    value_type;
+    typedef typename CrsMatBaseType::ordinal_type  ordinal_type;
+    typedef typename CrsMatBaseType::size_type     size_type;
+
+    typedef CrsMatBaseType             mat_base_type;
+    typedef CrsRowView<mat_base_type>  row_view_type;
+
+    // be careful this use rcp and atomic operation
+    // - use setView to create a view if _rows is not necessary
+    // - copy constructor and assignment operator will do soft copy of the object
+    typedef Kokkos::View<row_view_type*,space_type,Kokkos::MemoryUnmanaged> row_view_type_array;
+    
+  private:
+    CrsMatBaseType _base;    // shallow copy of the base object
+    ordinal_type  _offm;     // offset in rows
+    ordinal_type  _offn;     // offset in cols
+    ordinal_type  _m;        // # of rows
+    ordinal_type  _n;        // # of cols
+
+    row_view_type_array _rows;  // assigned by setRowViewArray(); the constructors leave it default-constructed
+    
+  public:
+    // Adopt arg_rows as row-view storage and point its first _m entries at the rows of this view.
+    KOKKOS_INLINE_FUNCTION
+    void setRowViewArray( const row_view_type_array & arg_rows )
+      {
+        _rows = arg_rows ;
+
+        for (ordinal_type i=0;i<_m;++i) {
+          _rows[i].setView(*this, i);
+        }
+      }
+    // Row view i out of the array handed to setRowViewArray().
+    KOKKOS_INLINE_FUNCTION
+    row_view_type& RowView(const ordinal_type i) const { return _rows[i]; }
+    // Re-target this view at the m x n window of base starting at (offm, offn); _rows is left unchanged.
+    KOKKOS_INLINE_FUNCTION
+    void setView(const CrsMatBaseType &base,
+                 const ordinal_type offm, const ordinal_type m,
+                 const ordinal_type offn, const ordinal_type n) {
+      _base = base;
+
+      _offm = offm; _m = m;
+      _offn = offn; _n = n;
+    }
+
+    KOKKOS_INLINE_FUNCTION
+    const CrsMatBaseType & BaseObject() const { return _base; }
+
+    KOKKOS_INLINE_FUNCTION
+    ordinal_type  OffsetRows() const { return _offm; }
+
+    KOKKOS_INLINE_FUNCTION
+    ordinal_type  OffsetCols() const { return _offn; }
+
+    KOKKOS_INLINE_FUNCTION    
+    ordinal_type  NumRows() const { return _m; }
+
+    KOKKOS_INLINE_FUNCTION
+    ordinal_type  NumCols() const { return _n; }
+    // True as soon as one row of the window stores an entry; uses temporary row views, not _rows.
+    KOKKOS_INLINE_FUNCTION
+    bool hasNumNonZeros() const { 
+      const ordinal_type m = NumRows();
+      for (ordinal_type i=0;i<m;++i) {
+        row_view_type row;
+        row.setView(*this, i);
+        if (row.NumNonZeros()) return true;
+      }
+      return false;
+    }
+    // Number of stored entries inside the window, summed over temporary row views.
+    inline  // NOTE(review): plain inline, unlike the neighbouring KOKKOS_INLINE_FUNCTION members
+    size_type countNumNonZeros() const { 
+      size_type nnz = 0;
+      const ordinal_type m = NumRows();
+      for (ordinal_type i=0;i<m;++i) {
+        row_view_type row;
+        row.setView(*this, i);
+        nnz += row.NumNonZeros();
+      }
+      return nnz; 
+    }
+    // Empty view: default base object, zero offsets and extents.
+    KOKKOS_INLINE_FUNCTION
+    CrsMatrixView()
+      : _base(),
+        _offm(0),
+        _offn(0),
+        _m(0),
+        _n(0),
+        _rows()
+    { } 
+    // Member-wise copy, including the row-view array handle.
+    KOKKOS_INLINE_FUNCTION
+    CrsMatrixView(const CrsMatrixView &b)
+      : _base(b._base),
+        _offm(b._offm),
+        _offn(b._offn),
+        _m(b._m),
+        _n(b._n),
+        _rows(b._rows)
+    { } 
+    // View covering all rows and columns of b.
+    KOKKOS_INLINE_FUNCTION
+    CrsMatrixView(const CrsMatBaseType & b)
+      : _base(b),
+        _offm(0),
+        _offn(0),
+        _m(b.NumRows()),
+        _n(b.NumCols()),
+        _rows()
+    { } 
+    // View of the m x n window of b at (offm, offn). NOTE(review): not marked KOKKOS_INLINE_FUNCTION like the other constructors.
+    CrsMatrixView(const CrsMatBaseType & b,
+                  const ordinal_type offm, const ordinal_type m,
+                  const ordinal_type offn, const ordinal_type n) 
+      : _base(b),
+        _offm(offm),
+        _offn(offn),
+        _m(m),
+        _n(n),
+        _rows()
+    { } 
+    // Print offsets, dimensions and the counted number of nonzeros to os.
+    ostream& showMe(ostream &os) const {
+      const int w = 4;
+      os << "CrsMatrixView, "
+         << " Offs ( " << setw(w) << _offm << ", " << setw(w) << _offn << " ); "
+         << " Dims ( " << setw(w) << _m    << ", " << setw(w) << _n    << " ); "
+         << " NumNonZeros = " << countNumNonZeros() << ";";
+
+      return os;
+    }
+
+  };
+}
+
+
+//----------------------------------------------------------------------------
+//----------------------------------------------------------------------------
+
+#if ! KOKKOS_USING_EXP_VIEW
+
+namespace Kokkos {
+  namespace Impl {
+    
+    //  The Kokkos::View allocation will by default assign each allocated datum to zero.
+    //  This is not the required initialization behavior when
+    //  Tacho::CrsRowView and Tacho::CrsMatrixView
+    //  are used within a Kokkos::View.
+    //  Create a partial specialization of the Kokkos::Impl::ViewDefaultConstruct
+    //  to replace the assignment initialization with placement new initialization.
+    //
+    //  This work-around is necessary until a TBD design refactorization of Kokkos::View.
+    
+    // Default construction of Tacho::CrsRowView<T> elements by placement new.
+    template< class ExecSpace , typename T >
+    struct ViewDefaultConstruct< ExecSpace , Tacho::CrsRowView<T> , true >
+    {
+      typedef Tacho::CrsRowView<T> type ;
+      type * const m_ptr ;
+      // construct element i in place
+      KOKKOS_FORCEINLINE_FUNCTION
+      void operator()( const typename ExecSpace::size_type& i ) const
+      { new(m_ptr+i) type(); }
+      // run the construction over [0,capacity) and fence afterwards
+      ViewDefaultConstruct( type * pointer , size_t capacity )
+        : m_ptr( pointer )
+      {
+        parallel_for( Kokkos::RangePolicy< ExecSpace >( 0 , capacity ) , *this );
+        ExecSpace::fence();
+      }
+    };
+    
+    // Default construction of Tacho::CrsMatrixView<T> elements by placement new.
+    template< class ExecSpace , typename T >
+    struct ViewDefaultConstruct< ExecSpace , Tacho::CrsMatrixView<T> , true >
+    {
+      typedef Tacho::CrsMatrixView<T> type ;
+      type * const m_ptr ;
+      // construct element i in place
+      KOKKOS_FORCEINLINE_FUNCTION
+      void operator()( const typename ExecSpace::size_type& i ) const
+      { new(m_ptr+i) type(); }
+      // run the construction over [0,capacity) and fence afterwards
+      ViewDefaultConstruct( type * pointer , size_t capacity )
+        : m_ptr( pointer )
+      {
+        parallel_for( Kokkos::RangePolicy< ExecSpace >( 0 , capacity ) , *this );
+        ExecSpace::fence();
+      }
+    };
+
+  } // namespace Impl
+} // namespace Kokkos
+
+#endif /* #if ! KOKKOS_USING_EXP_VIEW */
+
+
+//----------------------------------------------------------------------------
+//----------------------------------------------------------------------------
+
+#endif
diff --git a/lib/kokkos/example/ichol/src/crs_row_view.hpp b/lib/kokkos/example/ichol/src/crs_row_view.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..8556bcb9e637dd64afdf92f4ef6b526a14562d09
--- /dev/null
+++ b/lib/kokkos/example/ichol/src/crs_row_view.hpp
@@ -0,0 +1,185 @@
+#pragma once
+#ifndef __CRS_ROW_VIEW_HPP__
+#define __CRS_ROW_VIEW_HPP__
+
+/// \file crs_row_view.hpp
+/// \brief A view to a row extracted from CrsMatrixView.
+/// \author Kyungjoo Kim (kyukim@sandia.gov)
+
+namespace Tacho { 
+
+  using namespace std;
+
+  /// \brief View of one matrix row: the stored entries whose column lies in [_offn, _offn+_n).
+  template<typename CrsMatBaseType>
+  class CrsRowView {
+  public:
+    typedef typename CrsMatBaseType::ordinal_type           ordinal_type;
+    typedef typename CrsMatBaseType::value_type             value_type;
+    typedef typename CrsMatBaseType::ordinal_type_array_ptr ordinal_type_array_ptr;
+    typedef typename CrsMatBaseType::value_type_array_ptr   value_type_array_ptr;
+    
+  private:
+    // row info
+    ordinal_type _offn, _n;    // first column of the window and its width
+
+    // this assumes a contiguous memory buffer
+    ordinal_type_array_ptr _aj, _ajn; // [_aj,_ajn): column indices of this row that fall inside the window
+    value_type_array_ptr   _ax;                // values, aligned with _aj
+    // Binary search: first position in [begin,end) whose value is not less than val (input must be sorted).
+    static KOKKOS_INLINE_FUNCTION
+    typename CrsMatBaseType::ordinal_type_array_ptr
+    lower_bound( typename CrsMatBaseType::ordinal_type_array_ptr begin ,
+                 typename CrsMatBaseType::ordinal_type_array_ptr const end ,
+                 typename CrsMatBaseType::ordinal_type           const val )
+      {
+         typename CrsMatBaseType::ordinal_type_array_ptr it = begin ;
+         int count = end - begin ;  // size of the range still to be searched
+         int step = 0 ;
+         while (count>0) {
+           it = begin ;
+           it += ( step = (count >> 1) );  // probe the middle of the remaining range
+           if (*it<val) {
+             begin=++it;
+             count-=step+1;
+           }
+           else { count=step; }
+         }
+         return begin;
+      }
+
+  public:
+    KOKKOS_INLINE_FUNCTION
+    ordinal_type OffsetCols() const { return _offn; }
+
+    KOKKOS_INLINE_FUNCTION
+    ordinal_type NumCols() const { return _n; }
+
+    KOKKOS_INLINE_FUNCTION
+    ordinal_type NumNonZeros() const { return _ajn - _aj; } 
+
+    KOKKOS_INLINE_FUNCTION
+    ordinal_type Col(const ordinal_type j) const { return _aj[j] - _offn; }  // column of entry j relative to the window
+
+    KOKKOS_INLINE_FUNCTION
+    value_type& Value(const ordinal_type j) { return _ax[j]; }
+
+    KOKKOS_INLINE_FUNCTION
+    value_type Value(const ordinal_type j) const { return _ax[j]; }
+    // Position of window-relative column col among the stored entries, or a negative code (see below).
+    KOKKOS_INLINE_FUNCTION
+    ordinal_type Index(const ordinal_type col ) const {
+      const ordinal_type loc = _offn + col ;
+      // binary search
+      ordinal_type_array_ptr aj = CrsRowView::lower_bound(_aj, _ajn, loc);
+
+      // if found, return index for the location, 
+      // otherwise return -1 (not found), -2 (end of array)
+      return (aj < _ajn ? (*aj == loc ? aj - _aj : -1) : -2);
+    }
+    // Same lookup, but scanning linearly from entry prev onwards instead of bisecting the whole row.
+    KOKKOS_INLINE_FUNCTION
+    ordinal_type Index(const ordinal_type col,
+                       const ordinal_type prev ) const {
+      const ordinal_type loc = _offn + col;
+      ordinal_type_array_ptr aj = _aj + prev;
+
+      // binary search
+      // aj = lower_bound(aj, _ajn, loc);
+
+      // linear search from prev: this gains about 45 % faster
+      for ( ;aj < _ajn && *aj<loc; ++aj); 
+
+      // if found, return index for the location, 
+      // otherwise return -1 (not found), -2 (end of array)
+      return (aj < _ajn ? (*aj == loc ? aj - _aj : -1) : -2);
+    }
+    // Value stored at window-relative column col, or zero when Index(col) reports a negative code.
+    KOKKOS_INLINE_FUNCTION
+    value_type ValueAtColumn(const ordinal_type col) const {
+      const ordinal_type j = Index(col);
+      return (j < 0 ? value_type(0) : _ax[j]);
+    }
+
+    KOKKOS_INLINE_FUNCTION
+    CrsRowView()
+      : _offn(0),
+        _n(0),
+        _aj(),
+        _ajn(),
+        _ax() 
+    { }
+
+    // Build directly from the five members; the arguments are stored as given.
+    KOKKOS_INLINE_FUNCTION
+    CrsRowView(const ordinal_type           offn,
+               const ordinal_type           n,
+               const ordinal_type_array_ptr aj,
+               const ordinal_type_array_ptr ajn,
+               const value_type_array_ptr   ax) 
+      : _offn(offn),
+        _n(n),
+        _aj(aj),
+        _ajn(ajn),
+        _ax(ax) 
+    { }
+    // Row i of the matrix view A; all members are set by setView.
+    KOKKOS_INLINE_FUNCTION
+    CrsRowView(const CrsMatrixView<CrsMatBaseType> &A, 
+               const ordinal_type i) {
+      this->setView(A, i);
+    }
+    // Row i of the base matrix A; all members are set by setView.
+    KOKKOS_INLINE_FUNCTION
+    CrsRowView(const CrsMatBaseType &A, 
+               const ordinal_type i) {
+      this->setView(A, i);
+    }
+    // Select row i of view A and narrow it to A's column window by two binary searches.
+    KOKKOS_INLINE_FUNCTION
+    void setView(const CrsMatrixView<CrsMatBaseType> &A, 
+                 const ordinal_type i) {
+      _offn = A.OffsetCols();
+      _n    = A.NumCols();
+
+      const ordinal_type ii = A.OffsetRows() + i;  // row index in the base matrix
+
+      const typename CrsMatBaseType::ordinal_type_array_ptr cols = A.BaseObject().ColsInRow(ii);
+      const typename CrsMatBaseType::ordinal_type_array_ptr next = A.BaseObject().ColsInRow(ii+1);  // used as the end of row ii
+      const typename CrsMatBaseType::value_type_array_ptr   vals = A.BaseObject().ValuesInRow(ii);
+
+      // [cols..next) is sorted so a log(N) search can be performed
+      _aj  = CrsRowView::lower_bound(cols, next, _offn);
+      _ajn = CrsRowView::lower_bound(_aj,  next, _offn+_n);
+
+      _ax  = &vals[_aj - cols];  // values start at the same distance from the row start as _aj
+    }
+    // Select the complete row i of base matrix A (offset 0, width A.NumCols()).
+    KOKKOS_INLINE_FUNCTION
+    void setView(const CrsMatBaseType &A, 
+                 const ordinal_type i) {
+      _offn = 0;
+      _n    = A.NumCols();
+      _aj   = A.ColsInRow(i);
+      _ajn  = A.ColsInRow(i+1);
+      _ax   = A.ValuesInRow(i);
+    }
+    // Print the column offset, the entry count and one "(col, value)" line per entry.
+    ostream& showMe(ostream &os) const {                                                
+      const ordinal_type nnz = NumNonZeros();
+      const ordinal_type offset = OffsetCols();
+      os << "  offset = " << offset
+         << ", nnz = " << nnz
+         << endl; 
+      for (ordinal_type j=0;j<nnz;++j) {
+        const value_type val = _ax[j];
+        os << "(" << Col(j) << ", "
+           << val << ")"
+           << endl;
+      }
+      return os;
+    }
+  };
+}
+
+#endif
diff --git a/lib/kokkos/example/ichol/src/dot.hpp b/lib/kokkos/example/ichol/src/dot.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..acf927e0689759873b441012e187131a54055f88
--- /dev/null
+++ b/lib/kokkos/example/ichol/src/dot.hpp
@@ -0,0 +1,74 @@
+#pragma once
+#ifndef __DOT_HPP__
+#define __DOT_HPP__
+
+/// \file dot.hpp
+/// \brief Sparse dot product.
+/// \author Kyungjoo Kim (kyukim@sandia.gov)
+
+/// dot_type result = x^H y
+
+namespace Tacho { 
+
+  using namespace std;
+
+  template<typename T> struct DotTraits {
+    typedef T dot_type;  // result type of a single elementwise product
+    // Elementwise product used by the dot kernels; returns x*y with no conjugation applied.
+    static KOKKOS_FORCEINLINE_FUNCTION 
+    dot_type 
+    // conjugating variant, currently disabled: dot(const T &x, const T &y) { return conj<T>(x)*y; }
+    dot(const T &x, const T &y) { return x*y; }
+  }; 
+
+  /// Sparse dot product of two row views: sums DotTraits::dot(x_k, y_k) over the
+  /// columns present in both rows. Both rows are walked as a sorted merge, so the
+  /// column indices of each view must be in ascending order.
+  template<typename CrsRowViewType>
+  KOKKOS_INLINE_FUNCTION 
+  typename CrsRowViewType::value_type
+  dot(const CrsRowViewType x, const CrsRowViewType y) {
+    typedef typename CrsRowViewType::ordinal_type ordinal_type;
+    typedef typename CrsRowViewType::value_type   value_type;
+    typedef DotTraits<value_type> dot_traits;
+
+    value_type r_val(0);
+    const ordinal_type nnz_x = x.NumNonZeros();
+    const ordinal_type nnz_y = y.NumNonZeros();
+
+    for (ordinal_type jx=0, jy=0;jx<nnz_x && jy<nnz_y;) {
+      // Compare the columns directly: the sign of a difference is wrong for
+      // unsigned ordinal types and can overflow for signed ones.
+      const ordinal_type col_x = x.Col(jx);
+      const ordinal_type col_y = y.Col(jy);
+      if      (col_x < col_y) ++jx;   // x is behind: advance x
+      else if (col_y < col_x) ++jy;   // y is behind: advance y
+      else                            // matching column: accumulate and advance both
+        r_val += dot_traits::dot(x.Value(jx++), y.Value(jy++));
+    }
+
+    return r_val;
+  }
+
+  /// Dot product of a row view with itself: the sum over its stored entries of
+  /// DotTraits::dot(v, v).
+  template<typename CrsRowViewType>
+  KOKKOS_INLINE_FUNCTION 
+  typename CrsRowViewType::value_type
+  dot(const CrsRowViewType x) {
+    typedef typename CrsRowViewType::ordinal_type ordinal_type;
+    typedef typename CrsRowViewType::value_type   value_type;
+    typedef DotTraits<value_type>                 traits;
+
+    const ordinal_type count = x.NumNonZeros();
+    value_type total(0);
+    for (ordinal_type k=0;k<count;++k) {
+      const value_type entry = x.Value(k);
+      total += traits::dot(entry, entry);
+    }
+    return total;
+  }
+
+}
+
+#endif
diff --git a/lib/kokkos/example/ichol/src/gemm.hpp b/lib/kokkos/example/ichol/src/gemm.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..33c6058ec6fc6727dc62a320cab7bbb1855ea93f
--- /dev/null
+++ b/lib/kokkos/example/ichol/src/gemm.hpp
@@ -0,0 +1,99 @@
+#pragma once
+#ifndef __GEMM_HPP__
+#define __GEMM_HPP__
+
+/// \file gemm.hpp
+/// \brief Sparse matrix-matrix multiplication on given sparse patterns.
+/// \author Kyungjoo Kim (kyukim@sandia.gov)
+
+#include "util.hpp"
+#include "control.hpp"
+#include "partition.hpp"
+
+namespace Tacho {
+
+  using namespace std;
+
+  template<int ArgTransA, int ArgTransB, int ArgAlgo,
+           int ArgVariant = Variant::One,
+           template<int,int> class ControlType = Control>
+  struct Gemm {
+    // invoke() is only declared here; definitions come from specializations in included headers.
+    // data-parallel interface
+    // =======================
+    template<typename ScalarType,
+             typename ExecViewTypeA,
+             typename ExecViewTypeB,
+             typename ExecViewTypeC>
+    KOKKOS_INLINE_FUNCTION
+    static int invoke(typename ExecViewTypeA::policy_type &policy,
+                      const typename ExecViewTypeA::policy_type::member_type &member,
+                      const ScalarType alpha,
+                      typename ExecViewTypeA::matrix_type &A,
+                      typename ExecViewTypeB::matrix_type &B,
+                      const ScalarType beta,
+                      typename ExecViewTypeC::matrix_type &C);
+    // TaskFunctor stores copies of its arguments and forwards them to invoke() from apply().
+    // task-data parallel interface
+    // ============================
+    template<typename ScalarType,
+             typename ExecViewTypeA,
+             typename ExecViewTypeB,
+             typename ExecViewTypeC>
+    class TaskFunctor {
+    public:
+      typedef typename ExecViewTypeA::policy_type policy_type;
+      typedef typename policy_type::member_type member_type;
+      typedef int value_type;  // apply() stores the int returned by invoke() in a value_type
+
+    private:
+      ScalarType _alpha, _beta;
+      typename ExecViewTypeA::matrix_type _A;
+      typename ExecViewTypeB::matrix_type _B;
+      typename ExecViewTypeC::matrix_type _C;
+
+      policy_type _policy;
+
+    public:
+      KOKKOS_INLINE_FUNCTION
+      TaskFunctor(const policy_type & P,
+                  const ScalarType alpha,
+                  const typename ExecViewTypeA::matrix_type & A,
+                  const typename ExecViewTypeB::matrix_type & B,
+                  const ScalarType beta,
+                  const typename ExecViewTypeC::matrix_type & C)
+        : _alpha(alpha),
+          _beta(beta),
+          _A(A),
+          _B(B),
+          _C(C),
+          _policy(P)
+      { }
+
+      string Label() const { return "Gemm"; }  // not marked KOKKOS_INLINE_FUNCTION, unlike apply()
+
+      // task execution
+      KOKKOS_INLINE_FUNCTION
+      void apply(value_type &r_val) {
+        r_val = Gemm::invoke<ScalarType,ExecViewTypeA,ExecViewTypeB,ExecViewTypeC>(_policy, _policy.member_single(),
+                             _alpha, _A, _B, _beta, _C);
+      }
+
+      // task-data execution
+      KOKKOS_INLINE_FUNCTION
+      void apply(const member_type &member, value_type &r_val) {
+        r_val = Gemm::invoke<ScalarType,ExecViewTypeA,ExecViewTypeB,ExecViewTypeC>(_policy, member,
+                             _alpha, _A, _B, _beta, _C);
+      }
+
+    };
+
+  };
+
+}
+
+
+// #include "gemm_nt_nt.hpp"
+#include "gemm_ct_nt.hpp"
+
+#endif
diff --git a/lib/kokkos/example/ichol/src/gemm_ct_nt.hpp b/lib/kokkos/example/ichol/src/gemm_ct_nt.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..13d2518cab90896929ecb58645e61aeb51849394
--- /dev/null
+++ b/lib/kokkos/example/ichol/src/gemm_ct_nt.hpp
@@ -0,0 +1,12 @@
+#pragma once
+#ifndef __GEMM_CT_NT_HPP__
+#define __GEMM_CT_NT_HPP__
+
+/// \file gemm_ct_nt.hpp
+/// \brief Sparse matrix-matrix multiplication on given sparse patterns.
+/// \author Kyungjoo Kim (kyukim@sandia.gov)
+
+#include "gemm_ct_nt_for_factor_blocked.hpp"
+// #include "gemm_ct_nt_for_tri_solve_blocked.hpp"
+
+#endif
diff --git a/lib/kokkos/example/ichol/src/gemm_ct_nt_for_factor_blocked.hpp b/lib/kokkos/example/ichol/src/gemm_ct_nt_for_factor_blocked.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..88a4658482a1504ab6ad6334d65bd34a7dea055f
--- /dev/null
+++ b/lib/kokkos/example/ichol/src/gemm_ct_nt_for_factor_blocked.hpp
@@ -0,0 +1,108 @@
+#pragma once
+#ifndef __GEMM_CT_NT_FOR_FACTOR_BLOCKED_HPP__
+#define __GEMM_CT_NT_FOR_FACTOR_BLOCKED_HPP__
+
+/// \file gemm_ct_nt_for_factor_blocked.hpp
+/// \brief Sparse matrix-matrix multiplication on given sparse patterns.
+/// \author Kyungjoo Kim (kyukim@sandia.gov)
+
+namespace Tacho {
+
+  using namespace std;
+
+  // Gemm used in the factorization phase
+  // ====================================
+  template<>
+  template<typename ScalarType,
+           typename CrsExecViewTypeA,
+           typename CrsExecViewTypeB,
+           typename CrsExecViewTypeC>
+  KOKKOS_INLINE_FUNCTION
+  int
+  Gemm<Trans::ConjTranspose,Trans::NoTranspose,
+       AlgoGemm::ForFactorBlocked>
+  ::invoke(typename CrsExecViewTypeA::policy_type &policy,
+           const typename CrsExecViewTypeA::policy_type::member_type &member,
+           const ScalarType alpha,
+           typename CrsExecViewTypeA::matrix_type &A,
+           typename CrsExecViewTypeB::matrix_type &B,
+           const ScalarType beta,
+           typename CrsExecViewTypeC::matrix_type &C) {
+    typedef typename CrsExecViewTypeA::ordinal_type      ordinal_type;
+    typedef typename CrsExecViewTypeA::value_type        value_type;
+    typedef typename CrsExecViewTypeA::row_view_type     row_view_type;
+    // Scales C by beta, then adds alpha*A'(i,k)*B(k,j) into entries already present in C.
+    // Always returns 0. The printf below is debug output disabled by the constant false.
+if ( false && member.team_rank() == 0 ) {
+ printf("Gemm [%d +%d)x[%d +%d)\n"
+       , C.OffsetRows()
+       , C.NumRows()
+       , C.OffsetCols()
+       , C.NumCols()
+       );
+}
+
+    // scale the matrix C with beta
+    scaleCrsMatrix<ScalarType,CrsExecViewTypeC>(member, beta, C);
+
+    // Sparse matrix-matrix multiply:
+    // C(i,j) += alpha*A'(i,k)*B(k,j)
+    // (no conjugate is applied to A's values; the conj variants below are commented out)
+    const ordinal_type mA = A.NumRows();
+    for (ordinal_type k=0;k<mA;++k) {
+      row_view_type &a = A.RowView(k);
+      const ordinal_type nnz_a = a.NumNonZeros();
+
+      row_view_type &b = B.RowView(k);
+      const ordinal_type nnz_b = b.NumNonZeros();
+      // row k contributes only when both A(k,:) and B(k,:) have entries
+      if (nnz_a > 0 && nnz_b > 0 ) {
+#if 0
+        Kokkos::parallel_for(
+          Kokkos::TeamThreadRange(member, 0, nnz_a),
+          [&](const ordinal_type i) {
+             const ordinal_type row_at_i  = a.Col(i);
+             const value_type   val_at_ik = a.Value(i);
+             // const value_type   val_at_ik = conj(a.Value(i));
+
+             row_view_type &c = C.RowView(row_at_i);
+
+             ordinal_type idx = 0;
+             for (ordinal_type j=0;j<nnz_b && (idx > -2);++j) {
+                const ordinal_type col_at_j  = b.Col(j);
+                const value_type   val_at_kj = b.Value(j);
+
+                idx = c.Index(col_at_j, idx);
+                if (idx >= 0)
+                  c.Value(idx) += alpha*val_at_ik*val_at_kj;
+                }
+          });
+#else
+        Kokkos::parallel_for(
+          Kokkos::TeamThreadRange(member, 0, nnz_a * nnz_b ),
+          [&](const ordinal_type ii) {
+             const ordinal_type i = ii / nnz_b ;  // entry of a, in [0, nnz_a)
+             const ordinal_type j = ii % nnz_b ;  // entry of b, in [0, nnz_b)
+             // flattened (i,j) pair: a.Col(i) selects the row of C, b.Col(j) its column
+             row_view_type &c = C.RowView( a.Col(i) );
+
+             // Binary search for c's index of b.Col(j)
+             const ordinal_type idx = c.Index( b.Col(j) );
+
+             if (idx >= 0) {
+               // const value_type   val_at_ik = conj(a.Value(i));
+               c.Value(idx) += alpha * a.Value(i) * b.Value(j);
+             }
+          });
+#endif
+        // synchronize the team before moving on to the next row k
+        member.team_barrier();
+      }
+    }
+
+    return 0;
+  }
+
+}
+
+#endif
diff --git a/lib/kokkos/example/ichol/src/graph_helper_scotch.hpp b/lib/kokkos/example/ichol/src/graph_helper_scotch.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..d2dd004579a507439b457a12a6f0de909bf33acd
--- /dev/null
+++ b/lib/kokkos/example/ichol/src/graph_helper_scotch.hpp
@@ -0,0 +1,427 @@
+#pragma once
+#ifndef __GRAPH_HELPER_SCOTCH_HPP__
+#define __GRAPH_HELPER_SCOTCH_HPP__
+
+/// \file graph_helper_scotch.hpp
+/// \brief Interface to scotch reordering
+/// \author Kyungjoo Kim (kyukim@sandia.gov)
+
+#include "scotch.h"
+#include "util.hpp"
+
+namespace Tacho {
+
+  using namespace std;
+
+  template<class CrsMatBaseType>
+  class GraphHelper_Scotch : public Disp {
+  public:
+    typedef typename CrsMatBaseType::ordinal_type ordinal_type;
+    typedef typename CrsMatBaseType::size_type    size_type;
+
+    typedef typename CrsMatBaseType::ordinal_type_array ordinal_type_array;
+    typedef typename CrsMatBaseType::size_type_array    size_type_array;
+
+  private:
+    string _label;
+
+    // scotch main data structure
+    SCOTCH_Graph _graph;
+    SCOTCH_Num _strat;
+    int _level;
+
+    // scotch input has no diagonal contribution
+    ordinal_type _base,_m;
+    ordinal_type_array _cidx;
+
+    size_type _nnz;
+    size_type_array _rptr;
+
+    // scotch output
+    ordinal_type _cblk;
+    ordinal_type_array _perm,_peri,_range,_tree;
+
+    // status flag
+    bool _is_ordered;
+
+  public:
+
+    void setLabel(string label) { _label = label; }
+    string Label() const { return _label; }
+    // sizes of the graph handed to Scotch
+    size_type NumNonZeros() const { return _nnz; }
+    ordinal_type NumRows() const { return _m; }
+    // graph arrays as stored by the constructor
+    size_type_array RowPtrVector() const { return _rptr; }
+    ordinal_type_array ColIndexVector() const { return _cidx; }
+    // ordering outputs: these arrays are passed to SCOTCH_graphOrder in computeOrdering()
+    ordinal_type_array PermVector()    const { return _perm; }
+    ordinal_type_array InvPermVector() const { return _peri; }
+
+    ordinal_type_array RangeVector()   const { return _range; }
+    ordinal_type_array TreeVector()    const { return _tree; }
+    // value written through &_cblk by SCOTCH_graphOrder and reduced by pruneTree()
+    ordinal_type NumBlocks() const { return _cblk; }
+    // NOTE(review): the defaulted constructor never calls SCOTCH_graphInit on _graph.
+    GraphHelper_Scotch() = default;
+
+    // Store the given CRS graph (m vertices, rptr/cidx) and build the Scotch graph from it.
+    GraphHelper_Scotch(const string label,
+                       const ordinal_type m,
+                       const size_type_array rptr,
+                       const ordinal_type_array cidx,
+                       const int seed = GraphHelper::DefaultRandomSeed) {
+
+      _label = "GraphHelper_Scotch::" + label;
+
+      _is_ordered = false;
+      _cblk  = 0;
+
+      // scotch does not allow self-contribution (diagonal term in sparse matrix)
+      _base  = 0; //A.BaseVal();
+      _m     = m; // A.NumRows();
+      _nnz   = rptr[m]; //A.NumNonZeros();
+
+      _rptr  = rptr; //size_type_array(_label+"::RowPtrArray", _m+1);
+      _cidx  = cidx; //ordinal_type_array(_label+"::ColIndexArray", _nnz);
+      // range needs _m+1 entries for SCOTCH_graphOrder; pruneTree also indexes _tree[_cblk]
+      _perm  = ordinal_type_array(_label+"::PermutationArray", _m);
+      _peri  = ordinal_type_array(_label+"::InvPermutationArray", _m);
+      _range = ordinal_type_array(_label+"::RangeArray", _m+1);
+      _tree  = ordinal_type_array(_label+"::TreeArray", _m+1);
+
+      // create a graph structure without diagonals
+      _strat = 0;
+      _level = 0;
+
+      //A.convertGraph(_nnz, _rptr, _cidx);
+      // NOTE(review): the casts below assume size_type and ordinal_type have the same width
+      int ierr = 0;
+      ordinal_type *rptr_ptr = reinterpret_cast<ordinal_type*>(_rptr.ptr_on_device());
+      ordinal_type *cidx_ptr = reinterpret_cast<ordinal_type*>(_cidx.ptr_on_device());
+      // reseed Scotch only when the caller supplied a non-default seed
+      if (seed != GraphHelper::DefaultRandomSeed) {
+        SCOTCH_randomSeed(seed);
+        SCOTCH_randomReset();
+      }
+
+      ierr = SCOTCH_graphInit(&_graph);CHKERR(ierr);
+      ierr = SCOTCH_graphBuild(&_graph,             // scotch graph
+                               _base,               // base value
+                               _m,                  // # of vertices
+                               rptr_ptr,                // row pointer array: begin of each adjacency list
+                               rptr_ptr+1,              // row pointer array: end of each adjacency list
+                               NULL,                // weights on vertices (optional)
+                               NULL,                // label array on vertices (optional)
+                               _nnz,                // # of nonzeros
+                               cidx_ptr,                // column index array
+                               NULL);CHKERR(ierr);  // edge load array (optional)
+      ierr = SCOTCH_graphCheck(&_graph);CHKERR(ierr);
+    }
+    GraphHelper_Scotch(const GraphHelper_Scotch &b) = default;
+    // NOTE(review): the defaulted copy duplicates _graph, so a copy and its source both free it.
+    virtual~GraphHelper_Scotch() {
+      SCOTCH_graphFree(&_graph);
+    }
+
+    void setStratGraph(const SCOTCH_Num strat = 0) {
+      _strat = strat;  // passed as the flag value to SCOTCH_stratGraphOrderBuild in computeOrdering()
+    }
+    // A nonzero level replaces the level otherwise derived from log2(_m) in computeOrdering().
+    void setTreeLevel(const int level = 0) {
+      _level = level;
+    }
+
+    int computeOrdering(const ordinal_type treecut = 0,
+                        const ordinal_type minblksize = 0) {
+      int ierr = 0;
+      // Runs SCOTCH_graphOrder on _graph; minblksize is only used by the commented-out code below.
+      // pointers for global graph ordering
+      ordinal_type *perm  = _perm.ptr_on_device();
+      ordinal_type *peri  = _peri.ptr_on_device();
+      ordinal_type *range = _range.ptr_on_device();
+      ordinal_type *tree  = _tree.ptr_on_device();
+
+      {
+        const int level = (_level ? _level : max(1, int(log2(_m)-treecut))); // level = log2(_nnz)+10;
+        SCOTCH_Strat stradat;
+        SCOTCH_Num straval = _strat;
+                              //(SCOTCH_STRATLEVELMAX));//   |
+                              //SCOTCH_STRATLEVELMIN   |
+                              //SCOTCH_STRATLEAFSIMPLE |
+                              //SCOTCH_STRATSEPASIMPLE);
+
+        ierr = SCOTCH_stratInit(&stradat);CHKERR(ierr);
+
+        // if both are zero, do not run strategy
+        if (_strat || _level) {
+          cout << "GraphHelper_Scotch:: User provide a strategy and/or level" << endl
+               << "                     strategy = " << _strat << ", level =  " << _level << endl;
+          ierr = SCOTCH_stratGraphOrderBuild (&stradat, straval, level, 0.2);CHKERR(ierr);
+        }
+        ierr = SCOTCH_graphOrder(&_graph,
+                                 &stradat,
+                                 perm,
+                                 peri,
+                                 &_cblk,
+                                 range,
+                                 tree);CHKERR(ierr);
+        SCOTCH_stratExit(&stradat);
+      }
+      // the block below is compiled out
+#if 0
+      {
+        // assume there are multiple roots
+        range[_cblk+1] = range[_cblk]; // dummy range
+        tree[_cblk] = -1;              // dummy root
+        for (ordinal_type i=0;i<_cblk;++i)
+          if (tree[i] == -1)           // multiple roots become children of the dummy root
+            tree[i] = (_cblk+1);
+        ++_cblk;                       // include the dummy root
+      }
+#endif
+
+      // provided blksize is greater than 0, reorder internally
+      // if (treecut > 0 && minblksize > 0) {
+      //   // graph array
+      //   ordinal_type *rptr_ptr = reinterpret_cast<ordinal_type*>(_rptr.ptr_on_device());
+      //   ordinal_type *cidx_ptr = reinterpret_cast<ordinal_type*>(_cidx.ptr_on_device());
+
+      //   // create workspace in
+      //   size_type_array    rptr_work = size_type_array(_label+"::Block::RowPtrArray", _m+1);
+      //   ordinal_type_array cidx_work = ordinal_type_array(_label+"::Block::ColIndexArray", _nnz);
+
+      //   // create workspace output
+      //   ordinal_type_array perm_work  = ordinal_type_array(_label+"::Block::PermutationArray", _m);
+      //   ordinal_type_array peri_work  = ordinal_type_array(_label+"::Block::InvPermutationArray", _m);
+      //   ordinal_type_array range_work = ordinal_type_array(_label+"::Block::RangeArray", _m);
+      //   ordinal_type_array tree_work  = ordinal_type_array(_label+"::Block::TreeArray", _m);
+
+      //   // scotch input
+      //   ordinal_type *rptr_blk = reinterpret_cast<ordinal_type*>(rptr_work.ptr_on_device());
+      //   ordinal_type *cidx_blk = reinterpret_cast<ordinal_type*>(cidx_work.ptr_on_device());
+
+      //   size_type nnz = 0;
+      //   rptr_blk[0] = nnz;
+
+      //   for (ordinal_type iblk=0;iblk<_cblk;++iblk) {
+      //     // allocate graph
+      //     SCOTCH_Graph graph;
+
+      //     ierr = SCOTCH_graphInit(&graph);CHKERR(ierr);
+
+      //     SCOTCH_Strat stradat;
+      //     SCOTCH_Num straval = (/*SCOTCH_STRATLEVELMAX   |
+      //                             SCOTCH_STRATLEVELMIN   |*/
+      //                           SCOTCH_STRATLEAFSIMPLE |
+      //                           SCOTCH_STRATSEPASIMPLE);
+
+      //     ierr = SCOTCH_stratInit(&stradat);CHKERR(ierr);
+      //     ierr = SCOTCH_stratGraphOrderBuild(&stradat, straval, 0, 0.2);CHKERR(ierr);
+
+      //     const ordinal_type ibegin = range[iblk], iend = range[iblk+1], m = iend - ibegin;
+
+      //     // scotch output
+      //     ordinal_type cblk_blk = 0;
+
+      //     ordinal_type *perm_blk  = perm_work.ptr_on_device()  + ibegin;
+      //     ordinal_type *peri_blk  = peri_work.ptr_on_device()  + ibegin;
+      //     ordinal_type *range_blk = range_work.ptr_on_device() + ibegin;
+      //     ordinal_type *tree_blk  = tree_work.ptr_on_device()  + ibegin;
+
+      //     // if each blk is greater than the given minblksize, reorder internally
+      //     if (m > minblksize) {
+      //       for (int i=ibegin;i<iend;++i) {
+      //         const ordinal_type ii = peri[i];
+      //         const ordinal_type jbegin = rptr_ptr[ii];
+      //         const ordinal_type jend = rptr_ptr[ii+1];
+
+      //         for (int j=jbegin;j<jend;++j) {
+      //           const ordinal_type jj = perm[cidx_ptr[j]];
+      //           if (ibegin <= jj && jj < iend)
+      //             cidx_blk[nnz++] = (jj - ibegin);
+      //         }
+      //         rptr_blk[i+1] = nnz;
+      //       }
+      //       const size_type nnz_blk = nnz - rptr_blk[ibegin];
+
+      //       ierr = SCOTCH_graphBuild(&graph,             // scotch graph
+      //                                0,                  // base value
+      //                                m,                  // # of vertices
+      //                                &rptr_blk[ibegin],  // column index array pointer begin
+      //                                &rptr_blk[ibegin]+1,// column index array pointer end
+      //                                NULL,               // weights on vertices (optional)
+      //                                NULL,               // label array on vertices (optional)
+      //                                nnz_blk,            // # of nonzeros
+      //                                cidx_blk,           // column index array
+      //                                NULL);CHKERR(ierr); // edge load array (optional)
+      //       ierr = SCOTCH_graphCheck(&graph);CHKERR(ierr);
+      //       ierr = SCOTCH_graphOrder(&graph,
+      //                                &stradat,
+      //                                perm_blk,
+      //                                peri_blk,
+      //                                &cblk_blk,
+      //                                range_blk,
+      //                                tree_blk);CHKERR(ierr);
+      //     } else {
+      //       for (ordinal_type i=0;i<m;++i) {
+      //         perm_blk[i] = i;
+      //         peri_blk[i] = i;
+      //       }
+      //       range_blk[1] = m;
+      //       tree_blk[0] = -1;
+      //     }
+
+      //     SCOTCH_stratExit(&stradat);
+      //     SCOTCH_graphFree(&graph);
+
+      //     for (ordinal_type i=0;i<m;++i) {
+      //       const ordinal_type ii = peri_blk[i] + ibegin;
+      //       peri_blk[i] = peri[ii];
+      //     }
+      //     for (ordinal_type i=0;i<m;++i) {
+      //       const ordinal_type ii = i + ibegin;
+      //       peri[ii] = peri_blk[i];
+      //     }
+
+      //   }
+
+      //   for (ordinal_type i=0;i<_m;++i)
+      //     perm[peri[i]] = i;
+      // }
+      // showMe() prints the ordering header only once this flag is set
+      _is_ordered = true;
+
+      //cout << "SCOTCH level = " << level << endl;
+      //cout << "Range   Tree " << endl;
+      //for (int i=0;i<_cblk;++i)
+      //  cout << _range[i] << " :: " << i << " " << _tree[i] << endl;
+      // always returns 0; every Scotch status above is passed to CHKERR (defined elsewhere)
+      return 0;
+    }
+
+    int pruneTree(const ordinal_type cut) {
+      if (cut <=0 ) return 0;
+      // Runs up to `cut` merge passes over _tree/_range, stopping once _cblk is 1; returns 0.
+      ordinal_type_array work = ordinal_type_array(_label+"::WorkArray", _cblk+1);
+      for (ordinal_type iter=0;iter<cut && _cblk > 1;++iter) {
+        // horizontal merging
+        {
+          ordinal_type cnt = 0;
+          ordinal_type parent = _tree[0];
+          work[0] = cnt;  // work[i] is the new block id of old block i
+          for (ordinal_type i=1;i<_cblk;++i) {
+            const ordinal_type myparent = _tree[i];
+            if (myparent == parent) {
+              work[i] = cnt;  // same parent as the previous block: share its new id
+            } else {
+              parent = _tree[i];
+              work[i] = ++cnt;
+            }
+          }
+          work[_cblk] = ++cnt;  // new id for the one-past-the-end position; cnt is the new block count
+          // rewrite parents and range starts under the new ids (first block of each group wins)
+          ordinal_type prev = -2;
+          const ordinal_type root = _cblk - 1;  // the last block is treated as the root
+          for (ordinal_type i=0;i<root;++i) {
+            const ordinal_type myparent = _tree[i];
+            const ordinal_type me = work[i];
+
+            _tree[me] = work[myparent];
+            if (prev != me) {
+              _range[me] = _range[i];
+              prev = me;
+            }
+          }
+          {
+            const ordinal_type me = work[root];
+            _tree[me] = -1;
+            _range[me] = _range[root];
+            // reads _range[root+1], i.e. _range[_cblk]: _range needs at least _cblk+1 entries
+            _range[work[root+1]] = _range[root+1];
+            _cblk = cnt;
+          }
+        }
+
+        // vertical merging
+        if (_cblk == 2) {
+          _tree[0] = -1;
+          _range[0] = 0;
+          _range[1] = _range[2];
+          _cblk = 1;
+        } else {
+          ordinal_type cnt = 0;
+          for (ordinal_type i=0;i<_cblk;++i) {
+            const ordinal_type diff = _tree[i+1] - _tree[i];  // reads _tree[_cblk] on the last pass
+            work[i] = (diff == 1 ? cnt : cnt++);  // diff == 1: block i shares its new id with block i+1
+          }
+          work[_cblk] = cnt;
+          // same remapping of parents and range starts as in the horizontal step
+          ordinal_type prev = -2;
+          const ordinal_type root = _cblk - 1;
+          for (ordinal_type i=0;i<root;++i) {
+            const ordinal_type myparent = _tree[i];
+            const ordinal_type me = work[i];
+
+            _tree[me] = work[myparent];
+            if (prev != me) {
+              _range[me] = _range[i];
+              prev = me;
+            }
+          }
+          {
+            const ordinal_type me = work[root];
+            _tree[me] = -1;
+            _range[me] = _range[root];
+
+            _range[work[root+1]] = _range[root+1];
+            _cblk = cnt;
+          }
+        }
+      }
+
+      // cleaning
+      {
+        for (ordinal_type i=(_cblk+1);i<_m;++i) {
+          _tree[i] = 0;
+          _range[i] = 0;
+        }
+        _tree[_cblk] = 0;  // _range[_cblk] is left as is; NOTE(review): index _cblk must exist in _tree
+      }
+
+      return 0;
+    }
+
+    /// Print the Scotch input summary and one PERM/PERI/RANG/TREE row per vertex.
+    /// The ordering header is printed only when _is_ordered; the rows always are.
+    ostream& showMe(ostream &os) const {
+      // precision(15) returns the previous precision, which is restored before returning
+      const streamsize saved_precision = os.precision(15);
+      os << scientific;
+
+      os << " -- Scotch input -- " << endl;
+      os << "    Base Value     = " << _base << endl;
+      os << "    # of Rows      = " << _m << endl;
+      os << "    # of NonZeros  = " << _nnz << endl;
+
+      if (_is_ordered) {
+        os << " -- Ordering -- " << endl;
+        os << "    CBLK   = " << _cblk << endl;
+        os << "  PERM     PERI     RANG     TREE" << endl;
+      }
+
+      const int width = 6;
+      for (ordinal_type row=0;row<_m;++row) {
+        os << setw(width) << _perm[row]  << "   " << setw(width) << _peri[row] << "   ";
+        os << setw(width) << _range[row] << "   " << setw(width) << _tree[row] << endl;
+      }
+      os.unsetf(ios::scientific);
+      os.precision(saved_precision);
+      return os;
+    }
+
+  };
+
+}
+
+#endif
diff --git a/lib/kokkos/example/ichol/src/herk.hpp b/lib/kokkos/example/ichol/src/herk.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..548c495c448604d2bffd7a5dd1d9745ce440fc9e
--- /dev/null
+++ b/lib/kokkos/example/ichol/src/herk.hpp
@@ -0,0 +1,91 @@
+#pragma once
+#ifndef __HERK_HPP__
+#define __HERK_HPP__
+
+/// \file herk.hpp
+/// \brief Sparse hermitian rank-k update on given sparse patterns.
+/// \author Kyungjoo Kim (kyukim@sandia.gov)
+
+#include "util.hpp"
+#include "control.hpp"
+#include "partition.hpp"
+
+namespace Tacho {
+
+  using namespace std;
+
+  template<int ArgUplo, int ArgTrans, int ArgAlgo,
+           int ArgVariant = Variant::One,
+           template<int,int> class ControlType = Control>
+  struct Herk {
+    // invoke() is only declared here; definitions come from specializations in included headers.
+    // data-parallel interface
+    // =======================
+    template<typename ScalarType,
+             typename ExecViewTypeA,
+             typename ExecViewTypeC>
+    KOKKOS_INLINE_FUNCTION
+    static int invoke(typename ExecViewTypeA::policy_type &policy,
+                      const typename ExecViewTypeA::policy_type::member_type &member,
+                      const ScalarType alpha,
+                      typename ExecViewTypeA::matrix_type &A,
+                      const ScalarType beta,
+                      typename ExecViewTypeC::matrix_type &C);
+    // TaskFunctor stores copies of its arguments and forwards them to invoke() from apply().
+    // task-data parallel interface
+    // ============================
+    template<typename ScalarType,
+             typename ExecViewTypeA,
+             typename ExecViewTypeC>
+    class TaskFunctor {
+    public:
+      typedef typename ExecViewTypeA::policy_type policy_type;
+      typedef typename policy_type::member_type member_type;
+      typedef int value_type;  // apply() stores the int returned by invoke() in a value_type
+
+    private:
+      ScalarType _alpha, _beta;
+      typename ExecViewTypeA::matrix_type _A;
+      typename ExecViewTypeC::matrix_type _C;
+
+      policy_type _policy;
+
+    public:
+      KOKKOS_INLINE_FUNCTION
+      TaskFunctor(const policy_type & P,
+                  const ScalarType alpha,
+                  const typename ExecViewTypeA::matrix_type & A,
+                  const ScalarType beta,
+                  const typename ExecViewTypeC::matrix_type & C)
+        : _alpha(alpha),
+          _beta(beta),
+          _A(A),
+          _C(C),
+          _policy(P)
+      { }
+
+      string Label() const { return "Herk"; }  // not marked KOKKOS_INLINE_FUNCTION, unlike apply()
+
+      // task execution
+      KOKKOS_INLINE_FUNCTION
+      void apply(value_type &r_val) {
+        r_val = Herk::invoke<ScalarType,ExecViewTypeA,ExecViewTypeC>(_policy, _policy.member_single(), 
+                             _alpha, _A, _beta, _C);
+      }
+
+      // task-data execution
+      KOKKOS_INLINE_FUNCTION
+      void apply(const member_type &member, value_type &r_val) {
+        r_val = Herk::invoke<ScalarType,ExecViewTypeA,ExecViewTypeC>(_policy, member, 
+                             _alpha, _A, _beta, _C);
+      }
+
+    };
+
+  };
+
+}
+
+#include "herk_u_ct.hpp"
+
+#endif
diff --git a/lib/kokkos/example/ichol/src/herk_u_ct.hpp b/lib/kokkos/example/ichol/src/herk_u_ct.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..6de4a2fa5628f0bdd77da6fdfc916ad112569fce
--- /dev/null
+++ b/lib/kokkos/example/ichol/src/herk_u_ct.hpp
@@ -0,0 +1,11 @@
+#pragma once
+#ifndef __HERK_U_CT_HPP__
+#define __HERK_U_CT_HPP__
+
+/// \file herk_u_ct.hpp
+/// \brief Sparse hermitian rank-k update on given sparse patterns.
+/// \author Kyungjoo Kim (kyukim@sandia.gov)
+
+#include "herk_u_ct_for_factor_blocked.hpp"
+
+#endif
diff --git a/lib/kokkos/example/ichol/src/herk_u_ct_for_factor_blocked.hpp b/lib/kokkos/example/ichol/src/herk_u_ct_for_factor_blocked.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..58bba2be3c9c5fba07a3a36a77545bca917778c3
--- /dev/null
+++ b/lib/kokkos/example/ichol/src/herk_u_ct_for_factor_blocked.hpp
@@ -0,0 +1,103 @@
+#pragma once
+#ifndef __HERK_U_CT_FOR_FACTOR_BLOCKED_HPP__
+#define __HERK_U_CT_FOR_FACTOR_BLOCKED_HPP__
+
+/// \file herk_u_ct_for_factor_blocked.hpp
+/// \brief Sparse hermitian rank-k update on given sparse patterns.
+/// \author Kyungjoo Kim (kyukim@sandia.gov)
+
+namespace Tacho {
+
+  using namespace std;
+
+
+  // Herk used in the factorization phase
+  // ====================================
+  template<>
+  template<typename ScalarType,
+           typename CrsExecViewTypeA,
+           typename CrsExecViewTypeC>
+  KOKKOS_INLINE_FUNCTION
+  int
+  Herk<Uplo::Upper,Trans::ConjTranspose,
+       AlgoHerk::ForFactorBlocked>
+  ::invoke(typename CrsExecViewTypeA::policy_type &policy,
+           const typename CrsExecViewTypeA::policy_type::member_type &member,
+           const ScalarType alpha,
+           typename CrsExecViewTypeA::matrix_type &A,
+           const ScalarType beta,
+           typename CrsExecViewTypeC::matrix_type &C) {
+    typedef typename CrsExecViewTypeA::ordinal_type      ordinal_type;
+    typedef typename CrsExecViewTypeA::value_type        value_type;
+    typedef typename CrsExecViewTypeA::row_view_type     row_view_type;
+    // Scales C by beta, then adds alpha*A'(i,k)*A(k,j) into entries already present in C.
+    // Always returns 0. The printf below is debug output disabled by the constant false.
+if ( false && member.team_rank() == 0 ) {
+ printf("Herk [%d +%d)x[%d +%d)\n"
+       , C.OffsetRows()
+       , C.NumRows()
+       , C.OffsetCols()
+       , C.NumCols()
+       );
+}
+
+    // scale the matrix C with beta
+    scaleCrsMatrix<ScalarType,CrsExecViewTypeC>(member, beta, C);
+
+    // C(i,j) += alpha*A'(i,k)*A(k,j)
+    for (ordinal_type k=0;k<A.NumRows();++k) {
+      row_view_type &a = A.RowView(k);
+      const ordinal_type nnz = a.NumNonZeros();
+      // rows of A without entries contribute nothing
+      if (nnz > 0) {
+        // the first variant is compiled out; the #else branch is the active one
+#if 0
+
+        Kokkos::parallel_for(
+          Kokkos::TeamThreadRange(member, 0, nnz),
+            [&](const ordinal_type i) {
+              const ordinal_type row_at_i  = a.Col(i);
+               // const value_type   val_at_ik = conj(a.Value(i));
+               const value_type   val_at_ik = a.Value(i);
+
+               row_view_type &c = C.RowView(row_at_i);
+
+               ordinal_type idx = 0;
+               for (ordinal_type j=i;j<nnz && (idx > -2);++j) {
+                 const ordinal_type col_at_j  = a.Col(j);
+                 const value_type   val_at_kj = a.Value(j);
+
+                 idx = c.Index(col_at_j, idx);
+                 if (idx >= 0)
+                   c.Value(idx) += alpha*val_at_ik*val_at_kj;
+               }
+             });
+#else
+        // one work item per (i,j) pair of entries of row k, flattened as ii = i*nnz + j
+        Kokkos::parallel_for(
+          Kokkos::TeamThreadRange(member, 0, nnz*nnz),
+            [&](const ordinal_type ii) {
+               const ordinal_type i = ii / nnz ;
+               const ordinal_type j = ii % nnz ;
+               // a.Col(i) selects the row of C, a.Col(j) the column looked up in that row
+               row_view_type &c = C.RowView( a.Col(i) );
+
+               const ordinal_type idx = c.Index( a.Col(j) );
+               // a negative idx skips the update; no conjugate is applied to a.Value(i)
+               if (idx >= 0) {
+                 c.Value(idx) += alpha* a.Value(i) * a.Value(j);
+               }
+             });
+
+#endif
+        // synchronize the team before moving on to the next row k
+        member.team_barrier();
+      }
+    }
+
+    return 0;
+  }
+
+}
+
+#endif
diff --git a/lib/kokkos/example/ichol/src/norm.hpp b/lib/kokkos/example/ichol/src/norm.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..be77ee0dcf2b27f6a7e50fb8eeacb45dc9d50e82
--- /dev/null
+++ b/lib/kokkos/example/ichol/src/norm.hpp
@@ -0,0 +1,82 @@
+#pragma once
+#ifndef __NORM_HPP__
+#define __NORM_HPP__
+
+/// \file norm.hpp
+/// \brief Compute norm of sparse or dense matrices.
+/// \author Kyungjoo Kim (kyukim@sandia.gov)
+
+namespace Tacho {
+
+  using namespace std;
+
+  template<typename DenseExecViewType>
+  KOKKOS_INLINE_FUNCTION
+  auto
+  normOneDenseMatrix(DenseExecViewType &A) -> decltype(real(typename DenseExecViewType::value_type())) {
+    typedef typename DenseExecViewType::ordinal_type  ordinal_type;
+    typedef typename DenseExecViewType::value_type    value_type;
+    typedef decltype(real(value_type())) norm_type;
+
+    const ordinal_type mA = A.NumRows();
+    const ordinal_type nA = A.NumCols();
+
+    norm_type r_val = 0.0;
+
+    for (ordinal_type j=0;j<nA;++j) {
+      norm_type col_sum_at_j = 0.0;
+      for (ordinal_type i=0;i<mA;++i)
+        col_sum_at_j += abs(A.Value(i,j));
+      r_val = max(r_val, col_sum_at_j);
+    }
+    return r_val;
+  }
+
+  template<typename DenseExecViewType>
+  KOKKOS_INLINE_FUNCTION
+  auto
+  normInfDenseMatrix(DenseExecViewType &A) -> decltype(real(typename DenseExecViewType::value_type())) {
+    typedef typename DenseExecViewType::ordinal_type  ordinal_type;
+    typedef typename DenseExecViewType::value_type    value_type;
+    typedef decltype(real(value_type())) norm_type;
+
+    const ordinal_type mA = A.NumRows();
+    const ordinal_type nA = A.NumCols();
+
+    norm_type r_val = 0.0;
+
+    for (ordinal_type i=0;i<mA;++i) {
+      norm_type row_sum_at_i = 0.0;
+      for (ordinal_type j=0;j<nA;++j) 
+        row_sum_at_i += abs(A.Value(i,j));
+      r_val = max(r_val, row_sum_at_i);
+    }
+    return r_val;
+  }
+  
+  template<typename DenseExecViewType>
+  KOKKOS_INLINE_FUNCTION
+  auto
+  normFrobeniusDenseMatrix(DenseExecViewType &A) -> decltype(real(typename DenseExecViewType::value_type())) {
+    typedef typename DenseExecViewType::ordinal_type  ordinal_type;
+    typedef typename DenseExecViewType::value_type    value_type;
+    typedef decltype(real(value_type())) norm_type;
+    // Frobenius norm of A: square root of the sum of squared magnitudes of all entries.
+    const ordinal_type mA = A.NumRows();
+    const ordinal_type nA = A.NumCols();
+
+    norm_type r_val = 0.0;
+
+    for (ordinal_type i=0;i<mA;++i)
+      for (ordinal_type j=0;j<nA;++j) {
+        // |a_ij|^2 == conj(a_ij)*a_ij; going through abs(), as the one/inf norms
+        // do, keeps the accumulator real even when value_type is complex.
+        const norm_type mag = abs(A.Value(i,j)); r_val += mag*mag;
+      }
+    return sqrt(r_val);
+  }
+
+}
+
+#endif
+
diff --git a/lib/kokkos/example/ichol/src/partition.hpp b/lib/kokkos/example/ichol/src/partition.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..a3e9f7095a6b82b62e6c27bc5f91db0e253b0451
--- /dev/null
+++ b/lib/kokkos/example/ichol/src/partition.hpp
@@ -0,0 +1,381 @@
+
+#ifndef __PARTITION_HPP__
+#define __PARTITION_HPP__
+
+/// \file partition.hpp
+/// \brief Matrix partitioning utilities.
+/// \author Kyungjoo Kim (kyukim@sandia.gov)
+
+namespace Tacho { 
+
+  using namespace std;
+
+  template<typename MatView>
+  KOKKOS_INLINE_FUNCTION 
+  void 
+  Part_2x2(const MatView A, MatView &ATL, MatView &ATR, 
+           /**************/ MatView &ABL, MatView &ABR,
+           const typename MatView::ordinal_type bm, 
+           const typename MatView::ordinal_type bn,
+           const int quadrant) {
+    typename MatView::ordinal_type bmm, bnn;
+
+    switch (quadrant) {
+    case Partition::TopLeft:
+      bmm = min(bm, A.NumRows());
+      bnn = min(bn, A.NumCols());                
+      
+      ATL.setView(A.BaseObject(),
+                  A.OffsetRows(), bmm,
+                  A.OffsetCols(), bnn);
+      break;
+    case Partition::TopRight:
+    case Partition::BottomLeft:
+      Kokkos::abort("Tacho::Part_2x2 Not yet implemented");
+      break;
+    case Partition::BottomRight:
+      bmm = A.NumRows() - min(bm, A.NumRows());
+      bnn = A.NumCols() - min(bn, A.NumCols());                
+      
+      ATL.setView(A.BaseObject(),
+                  A.OffsetRows(), bmm,
+                  A.OffsetCols(), bnn);
+      break;
+    default:
+      Kokkos::abort("Tacho::Part_2x2 Invalid Input");
+      break;
+    }
+    
+    ATR.setView(A.BaseObject(),
+                A.OffsetRows(),                 ATL.NumRows(),
+                A.OffsetCols() + ATL.NumCols(), A.NumCols() - ATL.NumCols());
+    
+    ABL.setView(A.BaseObject(),
+                A.OffsetRows() + ATL.NumRows(), A.NumRows() - ATL.NumRows(),
+                A.OffsetCols(),                 ATL.NumCols());
+    
+    ABR.setView(A.BaseObject(),
+                A.OffsetRows() + ATL.NumRows(), A.NumRows() - ATL.NumRows(),
+                A.OffsetCols() + ATL.NumCols(), A.NumCols() - ATL.NumCols());
+  }
+
+  template<typename MatView>
+  KOKKOS_INLINE_FUNCTION 
+  void 
+  Part_1x2(const MatView A, MatView &AL, MatView &AR, 
+           const typename MatView::ordinal_type bn,
+           const int side) {
+    typename MatView::ordinal_type bmm, bnn;
+
+    switch (side) {
+    case Partition::Left:
+      bmm = A.NumRows();
+      bnn = min(bn, A.NumCols());
+      
+      AL.setView(A.BaseObject(),
+                 A.OffsetRows(), bmm,
+                 A.OffsetCols(), bnn);
+      break;
+    case Partition::Right:
+      bmm = A.NumRows();
+      bnn = A.NumCols() - min(bn, A.NumCols());
+
+      AL.setView(A.BaseObject(),
+                 A.OffsetRows(), bmm,
+                 A.OffsetCols(), bnn);
+      break;
+    default:
+      Kokkos::abort("Tacho::Part_1x2 Invalid Input");
+      break;
+    }
+
+    AR.setView(A.BaseObject(),
+               A.OffsetRows(),                A.NumRows(),
+               A.OffsetCols() + AL.NumCols(), A.NumCols() - AL.NumCols());
+  }
+
+  template<typename MatView>
+  KOKKOS_INLINE_FUNCTION 
+  void 
+  Part_2x1(const MatView A, MatView &AT, 
+           /*************/  MatView &AB, 
+           const typename MatView::ordinal_type bm,
+           const int side) {
+    typename MatView::ordinal_type bmm, bnn;
+    
+    switch (side) {
+    case Partition::Top:
+      bmm = min(bm, A.NumRows());
+      bnn = A.NumCols();
+      
+      AT.setView(A.BaseObject(),
+                 A.OffsetRows(), bmm,
+                 A.OffsetCols(), bnn);
+      break;
+    case Partition::Bottom:
+      bmm = A.NumRows() - min(bm, A.NumRows());
+      bnn = A.NumCols();
+
+      AT.setView(A.BaseObject(),
+                 A.OffsetRows(), bmm,
+                 A.OffsetCols(), bnn);
+      break;
+    default:
+      Kokkos::abort("Tacho::Part_2x1 Invalid Input");
+      break;
+    }
+    
+    AB.setView(A.BaseObject(),
+               A.OffsetRows() + AT.NumRows(), A.NumRows() - AT.NumRows(),
+               A.OffsetCols(),                A.NumCols());
+  }
+
+  template<typename MatView>
+  KOKKOS_INLINE_FUNCTION
+  void
+  Part_2x2_to_3x3(const MatView ATL, const MatView ATR, MatView &A00, MatView &A01, MatView &A02,
+                  /***********************************/ MatView &A10, MatView &A11, MatView &A12,
+                  const MatView ABL, const MatView ABR, MatView &A20, MatView &A21, MatView &A22,
+                  const typename MatView::ordinal_type bm,  // requested row count of A11 (clamped by Part_2x2)
+                  const typename MatView::ordinal_type bn,  // requested col count of A11 (clamped by Part_2x2)
+                  const int quadrant) {                     // quadrant of the 2x2 that A11 is carved from
+    switch (quadrant) {
+    case Partition::TopLeft:  // A11 becomes the bottom-right corner of ATL
+      Part_2x2(ATL, A00, A01,
+               /**/ A10, A11,
+               bm, bn, Partition::BottomRight);
+
+      Part_2x1(ATR, A02,
+               /**/ A12,
+               bm, Partition::Bottom);
+
+      Part_1x2(ABL, A20, A21,
+               bn, Partition::Right);
+
+      A22.setView(ABR.BaseObject(),  // ABR is passed through unchanged as A22
+                  ABR.OffsetRows(), ABR.NumRows(),
+                  ABR.OffsetCols(), ABR.NumCols());
+      break;
+    case Partition::TopRight:
+    case Partition::BottomLeft:
+      Kokkos::abort("Tacho::Part_2x2_to_3x3 Not yet implemented");
+      break;
+    case Partition::BottomRight:  // A11 becomes the top-left corner of ABR
+      A00.setView(ATL.BaseObject(),  // ATL is passed through unchanged as A00
+                  ATL.OffsetRows(), ATL.NumRows(),
+                  ATL.OffsetCols(), ATL.NumCols());
+
+      Part_1x2(ATR, A01, A02,
+               bn, Partition::Left);
+
+      Part_2x1(ABL, A10,
+               /**/ A20,
+               bm, Partition::Top);
+
+      Part_2x2(ABR, A11, A12,
+               /**/ A21, A22,
+               bm, bn, Partition::TopLeft);
+      break;
+    default:
+      Kokkos::abort("Tacho::Part_2x2_to_3x3 Invalid Input");
+      break;
+    }
+  }
+
+  template<typename MatView>
+  KOKKOS_INLINE_FUNCTION
+  void
+  Part_2x1_to_3x1(const MatView AT, MatView &A0,
+                  /***************/ MatView &A1,
+                  const MatView AB, MatView &A2,
+                  const typename MatView::ordinal_type bm,  // requested row count of A1 (clamped by Part_2x1)
+                  const int side) {                         // half (Top/Bottom) that A1 is carved from
+    switch (side) {
+    case Partition::Top:  // A1 becomes the bottom rows of AT
+      Part_2x1(AT,  A0,
+               /**/ A1,
+               bm, Partition::Bottom);
+
+      A2.setView(AB.BaseObject(),  // AB is passed through unchanged as A2
+                 AB.OffsetRows(), AB.NumRows(),
+                 AB.OffsetCols(), AB.NumCols());
+      break;
+    case Partition::Bottom:  // A1 becomes the top rows of AB
+      A0.setView(AT.BaseObject(),  // AT is passed through unchanged as A0
+                 AT.OffsetRows(), AT.NumRows(),
+                 AT.OffsetCols(), AT.NumCols());
+
+      Part_2x1(AB,  A1,
+               /**/ A2,
+               bm, Partition::Top);
+      break;
+    default:
+      Kokkos::abort("Tacho::Part_2x1_to_3x1 Invalid Input");
+      break;
+    }
+  }
+
+  template<typename MatView>
+  KOKKOS_INLINE_FUNCTION
+  void
+  Part_1x2_to_1x3(const MatView AL, const MatView AR,       // current 1x2 partition (inputs)
+                  MatView &A0, MatView &A1, MatView &A2,    // resulting 1x3 partition (outputs)
+                  const typename MatView::ordinal_type bn,  // requested col count of A1 (clamped by Part_1x2)
+                  const int side) {                         // half (Left/Right) that A1 is carved from
+    switch (side) {
+    case Partition::Left:  // A1 becomes the rightmost columns of AL
+      Part_1x2(AL,  A0, A1,
+               bn, Partition::Right);
+
+      A2.setView(AR.BaseObject(),  // AR is passed through unchanged as A2
+                 AR.OffsetRows(), AR.NumRows(),
+                 AR.OffsetCols(), AR.NumCols());
+      break;
+    case Partition::Right:  // A1 becomes the leftmost columns of AR
+      A0.setView(AL.BaseObject(),  // AL is passed through unchanged as A0
+                 AL.OffsetRows(), AL.NumRows(),
+                 AL.OffsetCols(), AL.NumCols());
+
+      Part_1x2(AR,  A1, A2,
+               bn, Partition::Left);
+      break;
+    default:
+      Kokkos::abort("Tacho::Part_1x2_to_1x3 Invalid Input");
+      break;
+    }
+  }
+
+  template<typename MatView>
+  KOKKOS_INLINE_FUNCTION 
+  void 
+  Merge_2x2(const MatView ATL, const MatView ATR, 
+            const MatView ABL, const MatView ABR, MatView &A) {
+    A.setView(ATL.BaseObject(),
+              ATL.OffsetRows(), ATL.NumRows() + ABR.NumRows(), 
+              ATL.OffsetCols(), ATL.NumCols() + ABR.NumCols());
+  }
+
+  template<typename MatView>
+  KOKKOS_INLINE_FUNCTION 
+  void 
+  Merge_1x2(const MatView AL, const MatView AR, MatView &A) {
+    A.setView(AL.BaseObject(),
+              AL.OffsetRows(), AL.NumRows(),
+              AL.OffsetCols(), AL.NumCols() + AR.NumCols());
+  }
+
+  template<typename MatView>
+  KOKKOS_INLINE_FUNCTION 
+  void 
+  Merge_2x1(const MatView AT, 
+            const MatView AB, MatView &A) {
+    A.setView(AT.BaseObject(),
+              AT.OffsetRows(), AT.NumRows() + AB.NumRows(),
+              AT.OffsetCols(), AT.NumCols());
+  }
+
+  template<typename MatView>
+  KOKKOS_INLINE_FUNCTION
+  void
+  Merge_3x3_to_2x2(const MatView A00, const MatView A01, const MatView A02, MatView &ATL, MatView &ATR,
+                   const MatView A10, const MatView A11, const MatView A12,
+                   const MatView A20, const MatView A21, const MatView A22, MatView &ABL, MatView &ABR,
+                   const int quadrant) {  // quadrant of the 2x2 that absorbs the middle block A11
+    switch (quadrant) {
+    case Partition::TopLeft:  // A11 (with A00, A01, A10) is folded into ATL
+      Merge_2x2(A00, A01,
+                A10, A11, ATL);
+
+      Merge_2x1(A02,
+                A12, ATR);
+
+      Merge_1x2(A20, A21, ABL);
+
+      ABR.setView(A22.BaseObject(),  // A22 is passed through unchanged as ABR
+                  A22.OffsetRows(), A22.NumRows(),
+                  A22.OffsetCols(), A22.NumCols());
+      break;
+    case Partition::TopRight:
+    case Partition::BottomLeft:
+      Kokkos::abort("Tacho::Merge_3x3_to_2x2 Not yet implemented");
+      break;
+    case Partition::BottomRight:  // A11 (with A12, A21, A22) is folded into ABR
+      ATL.setView(A00.BaseObject(),  // A00 is passed through unchanged as ATL
+                  A00.OffsetRows(), A00.NumRows(),
+                  A00.OffsetCols(), A00.NumCols());
+
+      Merge_1x2(A01, A02, ATR);
+
+      Merge_2x1(A10,
+                A20, ABL);
+
+      Merge_2x2(A11, A12,
+                A21, A22, ABR);
+      break;
+    default:
+      Kokkos::abort("Tacho::Merge_3x3_to_2x2 Invalid Input");
+      break;
+    }
+  }
+
+  template<typename MatView>
+  KOKKOS_INLINE_FUNCTION
+  void
+  Merge_3x1_to_2x1(const MatView A0, MatView &AT,
+                   const MatView A1,
+                   const MatView A2, MatView &AB,
+                   const int side) {  // half (Top/Bottom) that absorbs the middle block A1
+    switch (side) {
+    case Partition::Top:  // A0 and A1 are stacked into AT
+      Merge_2x1(A0,
+                A1, AT);
+
+      AB.setView(A2.BaseObject(),  // A2 is passed through unchanged as AB
+                 A2.OffsetRows(), A2.NumRows(),
+                 A2.OffsetCols(), A2.NumCols());
+      break;
+    case Partition::Bottom:  // A1 and A2 are stacked into AB
+      AT.setView(A0.BaseObject(),  // A0 is passed through unchanged as AT
+                 A0.OffsetRows(), A0.NumRows(),
+                 A0.OffsetCols(), A0.NumCols());
+
+      Merge_2x1(A1,
+                A2, AB);
+      break;
+    default:
+      Kokkos::abort("Tacho::Merge_3x1_to_2x1 Invalid Input");
+      break;
+    }
+  }
+
+  template<typename MatView>
+  KOKKOS_INLINE_FUNCTION
+  void
+  Merge_1x3_to_1x2(const MatView A0, const MatView A1, const MatView A2,
+                   MatView &AL, MatView &AR,
+                   const int side) {  // half (Left/Right) that absorbs the middle block A1
+    switch (side) {
+    case Partition::Left:  // A0 and A1 are joined into AL
+      Merge_1x2(A0, A1, AL);
+
+      AR.setView(A2.BaseObject(),  // A2 is passed through unchanged as AR
+                 A2.OffsetRows(), A2.NumRows(),
+                 A2.OffsetCols(), A2.NumCols());
+      break;
+    case Partition::Right:  // A1 and A2 are joined into AR
+      AL.setView(A0.BaseObject(),  // A0 is passed through unchanged as AL
+                 A0.OffsetRows(), A0.NumRows(),
+                 A0.OffsetCols(), A0.NumCols());
+
+      Merge_1x2(A1, A2, AR);
+      break;
+    default:
+      Kokkos::abort("Tacho::Merge_1x3_to_1x2 Invalid Input");
+      break;
+    }
+  }
+
+
+}
+
+#endif
diff --git a/lib/kokkos/example/ichol/src/scale.hpp b/lib/kokkos/example/ichol/src/scale.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..3152520966d88caeaede7d81c8a9bf826400d610
--- /dev/null
+++ b/lib/kokkos/example/ichol/src/scale.hpp
@@ -0,0 +1,92 @@
+#pragma once
+#ifndef __SCALE_HPP__
+#define __SCALE_HPP__
+
+/// \file scale.hpp
+/// \brief Scaling sparse matrix.
+/// \author Kyungjoo Kim (kyukim@sandia.gov)
+
+namespace Tacho {
+
+  using namespace std;
+
+  template<typename T> struct ScaleTraits {
+    typedef T scale_type;
+    // assume built-in types have appropriate type conversion
+    static constexpr T one = 1 ;
+    static constexpr T zero = 0 ;
+  };
+
+
+  template<typename ScalarType,
+           typename CrsExecViewType>
+  KOKKOS_INLINE_FUNCTION
+  int
+  scaleCrsMatrix(const typename CrsExecViewType::policy_type::member_type &member,
+                 const ScalarType alpha,
+                 typename CrsExecViewType::matrix_type &A) {
+    typedef typename CrsExecViewType::ordinal_type  ordinal_type;
+    typedef typename CrsExecViewType::value_type    value_type;
+    typedef typename CrsExecViewType::row_view_type row_view_type;
+
+    if (alpha == ScaleTraits<value_type>::one) {
+      // do nothing
+    } else {
+      const ordinal_type mA = A.NumRows();
+      if (mA > 0) {
+        Kokkos::parallel_for(Kokkos::TeamThreadRange(member, 0, mA),
+                             [&](const ordinal_type i) {
+                               row_view_type &row = A.RowView(i);
+                               for (ordinal_type j=0;j<row.NumNonZeros();++j)
+                                 row.Value(j) *= alpha;
+                             });
+        member.team_barrier();
+      }
+    }
+
+    return 0;
+  }
+
+  template<typename ScalarType,
+           typename DenseExecViewType>
+  KOKKOS_INLINE_FUNCTION
+  int
+  scaleDenseMatrix(const typename DenseExecViewType::policy_type::member_type &member,
+                   const ScalarType alpha,
+                   DenseExecViewType &A) {
+    typedef typename DenseExecViewType::ordinal_type  ordinal_type;
+    typedef typename DenseExecViewType::value_type    value_type;
+
+    if (alpha == ScaleTraits<value_type>::one) {
+      // do nothing
+    } else {
+      if (A.BaseObject().ColStride() > A.BaseObject().RowStride()) {
+        const ordinal_type nA = A.NumCols();
+        if (nA > 0) {
+          Kokkos::parallel_for(Kokkos::TeamThreadRange(member, 0, nA),
+                               [&](const ordinal_type j) {
+                                 for (ordinal_type i=0;i<A.NumRows();++i)
+                                   A.Value(i, j) *= alpha;
+                               });
+          member.team_barrier();
+        }
+      } else {
+        const ordinal_type mA = A.NumRows();
+        if (mA > 0) {
+          Kokkos::parallel_for(Kokkos::TeamThreadRange(member, 0, mA),
+                               [&](const ordinal_type i) {
+                                 for (ordinal_type j=0;j<A.NumCols();++j)
+                                   A.Value(i, j) *= alpha;
+                               });
+          member.team_barrier();
+        }
+      }
+    }
+
+    return 0;
+  }
+
+}
+
+#endif
+
diff --git a/lib/kokkos/example/ichol/src/symbolic_factor_helper.hpp b/lib/kokkos/example/ichol/src/symbolic_factor_helper.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..f6c381a99817ca5254ef3563fe48941410870ad7
--- /dev/null
+++ b/lib/kokkos/example/ichol/src/symbolic_factor_helper.hpp
@@ -0,0 +1,379 @@
+#pragma once
+#ifndef __SYMBOLIC_FACTOR_HELPER_HPP__
+#define __SYMBOLIC_FACTOR_HELPER_HPP__
+
+/// \file symbolic_factor_helper.hpp
+/// \brief This class computes a nonzero pattern for a given level of fill.
+/// \author Kyungjoo Kim (kyukim@sandia.gov)
+
+#include "util.hpp"
+
+namespace Tacho {
+
+  using namespace std;
+
+  template<class CrsMatrixType>
+  class SymbolicFactorHelper : public Disp {
+  public:
+    typedef typename CrsMatrixType::ordinal_type ordinal_type;
+    typedef typename CrsMatrixType::size_type    size_type;
+
+    typedef typename Kokkos::HostSpace::execution_space  host_exec_space ;
+
+    typedef typename CrsMatrixType::ordinal_type_array ordinal_type_array;
+    typedef typename CrsMatrixType::size_type_array    size_type_array;
+    typedef typename CrsMatrixType::value_type_array   value_type_array;
+
+  private:
+    string _label;                   // name of this class
+
+    // matrix index base
+    CrsMatrixType _A;                // input matrix
+    ordinal_type _m, _n;             // matrix dimension
+
+    struct crs_graph {
+      size_type_array _ap;           // row ptr array
+      ordinal_type_array _aj;        // col index array
+      size_type _nnz;                // # of nonzeros
+    };
+    typedef struct crs_graph crs_graph_type;
+    crs_graph_type _in, _out;
+
+    typedef Kokkos::View<ordinal_type**, Kokkos::LayoutLeft, host_exec_space> league_specific_ordinal_type_array;
+    typedef typename league_specific_ordinal_type_array::value_type* league_specific_ordinal_type_array_ptr;
+
+    int _lsize;
+    league_specific_ordinal_type_array _queue, _visited, _distance;
+
+    void createInternalWorkSpace() {
+      _queue    = league_specific_ordinal_type_array(_label+"::QueueArray",    _m, _lsize);
+      _visited  = league_specific_ordinal_type_array(_label+"::VisitedArray",  _m, _lsize);
+      _distance = league_specific_ordinal_type_array(_label+"::DistanceArray", _m, _lsize);
+    }
+
+    void freeInternalWorkSpace() {
+      _queue    = league_specific_ordinal_type_array();
+      _visited  = league_specific_ordinal_type_array();
+      _distance = league_specific_ordinal_type_array();
+    }
+
+  public:
+
+    void setLabel(string label) { _label = label; }
+    string Label() const { return _label; }
+
+    SymbolicFactorHelper(const CrsMatrixType &A,
+                         const int lsize = (host_exec_space::thread_pool_size(0)/
+                                            host_exec_space::thread_pool_size(2)))  {
+      // no trailing "::" here: every use below appends its own "::Suffix"
+      _label = "SymbolicFactorHelper" ;
+
+      // keep a copy of the input matrix and cache its dimensions
+      _A = A;
+
+      _m = _A.NumRows();
+      _n = _A.NumCols();
+
+      // allocate memory for input crs matrix
+      _in._nnz = _A.NumNonZeros();
+      _in._ap  = size_type_array(_label+"::Input::RowPtrArray", _m+1);
+      _in._aj  = ordinal_type_array(_label+"::Input::ColIndexArray", _in._nnz);
+
+      // adjust graph structure; A is assumed to have a graph without its diagonal
+      A.convertGraph(_in._ap, _in._aj);
+      _in._nnz = _in._ap[_m];  // take the nonzero count from the converted row pointers
+
+      // league size (second dimension of the per-league work arrays)
+      _lsize = lsize;
+
+      // create workspace per league
+      createInternalWorkSpace();
+    }
+    virtual~SymbolicFactorHelper() {
+      freeInternalWorkSpace();
+    }
+
+    class Queue {
+    private:
+      league_specific_ordinal_type_array_ptr _q;
+      ordinal_type _begin, _end;
+
+    public:
+      Queue(league_specific_ordinal_type_array_ptr q)
+        : _q(q),_begin(0),_end(0) { }
+
+      ordinal_type size() const { return _end - _begin; }
+      bool empty() const { return !size(); }
+
+      void push(const ordinal_type val) { _q[_end++] = val; }
+      ordinal_type pop() { return _q[_begin++]; }
+      ordinal_type end() { return _end; }
+      void reset() { _begin = 0; _end = 0; }
+    };
+
+    class FunctorComputeNonZeroPatternInRow {
+    public:
+      typedef Kokkos::TeamPolicy<host_exec_space> policy_type;
+
+    private:
+      ordinal_type _level, _m;
+      crs_graph_type _graph;
+
+      league_specific_ordinal_type_array _queue;
+      league_specific_ordinal_type_array _visited;
+      league_specific_ordinal_type_array _distance;
+
+      size_type_array _ap;
+      ordinal_type_array _aj;
+
+      ordinal_type _phase;
+
+    public:
+      FunctorComputeNonZeroPatternInRow(const ordinal_type level,
+                                        const ordinal_type m,
+                                        const crs_graph_type &graph,
+                                        league_specific_ordinal_type_array &queue,
+                                        league_specific_ordinal_type_array &visited,
+                                        league_specific_ordinal_type_array &distance,
+                                        size_type_array &ap,
+                                        ordinal_type_array &aj)
+        : _level(level), _m(m), _graph(graph),
+          _queue(queue), _visited(visited), _distance(distance),
+          _ap(ap), _aj(aj), _phase(0)
+      { }
+
+      void setPhaseCountNumNonZeros() { _phase = 0; }
+      void setPhaseComputeColIndex()  { _phase = 1; }
+
+      inline
+      void operator()(const typename policy_type::member_type &member) const {
+        const int lrank = member.league_rank();
+        const int lsize = member.league_size();
+
+        league_specific_ordinal_type_array_ptr queue    = &_queue(0, lrank);
+        league_specific_ordinal_type_array_ptr distance = &_distance(0, lrank);
+        league_specific_ordinal_type_array_ptr visited  = &_visited(0, lrank);
+
+        for (ordinal_type i=0;i<_m;++i)
+          visited[i] = 0;
+
+        // shuffle rows to get better load balance;
+        // for instance, if ND is applied, more fills are generated in the last separator.
+        for (ordinal_type i=lrank;i<_m;i+=lsize) {
+
+          size_type cnt = 0;
+
+          // account for the diagonal
+          switch (_phase) {
+          case 0:
+            cnt = 1;
+            break;
+          case 1:
+            cnt = _ap[i];
+            _aj[cnt++] = i;
+            break;
+          }
+
+          {
+            Queue q(queue); // fixed size queue
+
+            // initialize work space
+            q.push(i);
+            distance[i] = 0;
+
+            const ordinal_type id = (i+1);
+            visited[i] = id;
+
+            // breadth-first search starting from i
+            while (!q.empty()) {
+              const ordinal_type h = q.pop();
+              // loop over the adjacency list of h
+              const ordinal_type jbegin = _graph._ap[h], jend = _graph._ap[h+1];
+              for (ordinal_type j=jbegin;j<jend;++j) {
+                const ordinal_type t = _graph._aj[j];
+                if (visited[t] != id) {
+                  visited[t] = id;
+
+                  if (t < i && (_level < 0 || distance[h] < _level)) {
+                    q.push(t);
+                    distance[t] = distance[h] + 1;
+                  }
+                  if (t > i) {
+                    switch (_phase) {
+                    case 0:
+                      ++cnt;
+                      break;
+                    case 1:
+                      _aj[cnt++] = t;
+                      break;
+                    }
+                  }
+                }
+              }
+            }
+
+            // clear work space
+            for (ordinal_type j=0;j<q.end();++j) {
+              const ordinal_type jj = queue[j];
+              distance[jj] = 0;
+            }
+            q.reset();
+          }
+          switch (_phase) {
+          case 0:
+            _ap[i+1] = cnt;
+            break;
+          case 1:
+            sort(_aj.data() + _ap[i] , _aj.data() + _ap[i+1]);
+            break;
+          }
+        }
+      }
+    };
+
+    class FunctorCountOffsetsInRow {
+    public:
+      typedef Kokkos::RangePolicy<host_exec_space> policy_type;
+      typedef size_type value_type;
+
+    private:
+      size_type_array _off_in_rows;
+
+    public:
+      FunctorCountOffsetsInRow(size_type_array &off_in_rows)
+        : _off_in_rows(off_in_rows)
+      { }
+
+      KOKKOS_INLINE_FUNCTION
+      void init(value_type &update) const {
+        update = 0;
+      }
+
+      KOKKOS_INLINE_FUNCTION
+      void operator()(const typename policy_type::member_type &i, value_type &update, const bool final) const {
+        update += _off_in_rows(i);
+        if (final)
+          _off_in_rows(i) = update;
+      }
+
+      KOKKOS_INLINE_FUNCTION
+      void join(volatile value_type &update,
+                volatile const value_type &input) const {
+        update += input;
+      }
+    };
+
+    int createNonZeroPattern(const ordinal_type level,
+                             const int uplo,
+                             CrsMatrixType &F) {
+      // all output arrays should be local; reference counting in Kokkos::View manages memory (de)allocation
+      size_type_array ap = size_type_array(_label+"::Output::RowPtrArray", _m+1);
+
+      // later determined
+      ordinal_type_array aj;
+      value_type_array ax;
+      size_type nnz  = 0;
+
+      {
+        FunctorComputeNonZeroPatternInRow functor(level, _m, _in,
+                                                  _queue,
+                                                  _visited,
+                                                  _distance,
+                                                  ap,
+                                                  aj);
+
+        functor.setPhaseCountNumNonZeros();
+        Kokkos::parallel_for(typename FunctorComputeNonZeroPatternInRow::policy_type(_lsize, 1), functor);
+      }
+      {
+        FunctorCountOffsetsInRow functor(ap);
+        Kokkos::parallel_scan(typename FunctorCountOffsetsInRow::policy_type(0, _m+1), functor);
+      }
+
+      nnz  = ap[_m];
+      aj = ordinal_type_array(_label+"::Output::ColIndexArray", nnz);
+      ax = value_type_array(_label+"::Output::ValueArray", nnz);
+
+      {
+        FunctorComputeNonZeroPatternInRow functor(level, _m, _in,
+                                                  _queue,
+                                                  _visited,
+                                                  _distance,
+                                                  ap,
+                                                  aj);
+
+        functor.setPhaseComputeColIndex();
+        Kokkos::parallel_for(typename FunctorComputeNonZeroPatternInRow::policy_type(_lsize, 1), functor);
+      }
+
+      {
+        F = CrsMatrixType("dummy", _m, _n, nnz, ap, aj, ax);
+        F.add(_A);
+      }
+
+      // record the symbolic factors
+      _out._nnz = nnz;
+      _out._ap = ap;
+      _out._aj = aj;
+
+      return 0;
+    }
+
+    int createNonZeroPattern(const int uplo,
+                             CrsMatrixType &F) {
+      return createNonZeroPattern(-1, uplo, F);
+    }
+
+    ostream& showMe(ostream &os) const {
+      streamsize prec = os.precision();
+      os.precision(15);
+      os << scientific;
+
+      const int w = 6;
+
+      os << " -- Matrix Dimension -- " << endl
+         << "    # of Rows  = " << _m << endl
+         << "    # of Cols  = " << _n << endl;
+
+      os << endl;
+
+      os << " -- Input Graph Without Diagonals -- " << endl
+         << "    # of NonZeros  = " << _in._nnz << endl ;
+
+      os << " -- Input Graph :: RowPtr -- " << endl;
+      {
+        const ordinal_type n0 = _in._ap.dimension_0();
+        for (ordinal_type i=0;i<n0;++i)
+          os << setw(w) << i
+             << setw(w) << _in._ap[i]
+             << endl;
+      }
+
+      os << endl;
+
+      os << " -- Output Graph With Diagonals-- " << endl
+         << "    # of NonZeros  = " << _out._nnz << endl ;
+
+      os << " -- Output Graph :: RowPtr -- " << endl;
+      {
+        const ordinal_type n0 = _out._ap.dimension_0();
+        for (ordinal_type i=0;i<n0;++i)
+          os << setw(w) << i
+             << setw(w) << _out._ap[i]
+             << endl;
+      }
+
+      os.unsetf(ios::scientific);
+      os.precision(prec);
+
+      return os;
+    }
+
+  };
+
+}
+
+#endif
+
+
+
diff --git a/lib/kokkos/example/ichol/src/symbolic_task.hpp b/lib/kokkos/example/ichol/src/symbolic_task.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..f6cdc28ab133d123803fff40d5906cfaa58371ea
--- /dev/null
+++ b/lib/kokkos/example/ichol/src/symbolic_task.hpp
@@ -0,0 +1,118 @@
+#pragma once
+#ifndef __SYMBOLIC_TASK_HPP__
+#define __SYMBOLIC_TASK_HPP__
+
+/// \file symbolic_task.hpp
+/// \brief Provides tasking interface with graphviz output.
+/// \author Kyungjoo Kim (kyukim@sandia.gov)
+
+namespace Tacho { 
+  
+  using namespace std;
+
+  /// \brief Graphviz color mapping for the generated tasks.
+  static map<string,string> g_graphviz_color = {
+    { "chol/scalar", "indianred2"},
+    { "chol/trsm",   "orange2"   },
+    { "chol/gemm",   "lightblue2"} };
+
+  class SymbolicTaskQueue;
+
+  class SymbolicTask {
+  private:
+    string _name;                   // task name; also the key looked up in g_graphviz_color
+    set<SymbolicTask*> _dep_tasks;  // tasks this one depends on (pointers are not owned here)
+
+  public:
+    // For now the queue is global; eventually it should be
+    // local so that multiple queues can work with
+    // separate thread teams.
+    typedef SymbolicTaskQueue queue;
+
+    SymbolicTask()
+      : _name("no-name")
+    { }
+
+    SymbolicTask(const SymbolicTask &b)  // copies the name only; dependencies are not copied
+      : _name(b._name)
+    { }
+
+    SymbolicTask(const string name)
+      : _name(name)
+    { }
+
+    int addDependence(SymbolicTask *b) {  // null pointers are ignored; always returns 0
+      if (b != NULL)
+        _dep_tasks.insert(b);
+      return 0;
+    }
+
+    int clearDependence() {  // drops all recorded dependencies; always returns 0
+      _dep_tasks.clear();
+      return 0;
+    }
+
+    ostream& showMe(ostream &os) const {  // one line for this task, then one line per dependency
+      os << "    uid = " << this << " , name = " << _name << ", # of deps = " << _dep_tasks.size()  << endl;
+      if (_dep_tasks.size()) {
+        for (auto it=_dep_tasks.begin();it!=_dep_tasks.end();++it)
+          os << "          " << (*it) << " , name = " << (*it)->_name << endl;
+      }
+      return os;
+    }
+
+    ostream& graphviz(ostream &os) const {
+      // node statement; the label quote is closed whether or not a color is found
+      os << (long)(this) << " [label=\"" << _name << "\"";
+      auto color = g_graphviz_color.find(_name);
+      if (color != g_graphviz_color.end())
+        os << " ,style=filled,color=\"" << color->second << "\" ";
+      os << "];";
+      for (auto dep=_dep_tasks.begin();dep!=_dep_tasks.end();++dep)
+        os << (long)(*dep) << " -> " << (long)this << ";";  // edge: dependency -> this task
+      return (os << endl);
+    }
+
+  };
+
+  static vector<SymbolicTask*> g_queue;
+
+  class SymbolicTaskQueue {
+  public:
+    static SymbolicTask* push(SymbolicTask *task) {
+      g_queue.push_back(task);
+      return g_queue.back();
+    }
+
+    static int clear() {
+      for (auto it=g_queue.begin();it!=g_queue.end();++it)
+        delete (*it);
+      g_queue.clear();
+      return 0;
+    }
+
+    static ostream& showMe(ostream &os) {
+      if (g_queue.size()) {
+        os << " -- Symbolic Task Queue -- " << endl;
+        for (auto it=g_queue.begin();it!=g_queue.end();++it)
+          (*it)->showMe(os);
+      } else {
+        os << " -- Symbolic Task Queue is empty -- " << endl;
+      }
+      return os;
+    }
+
+    static ostream& graphviz(ostream &os, 
+                             const double width = 7.5,
+                             const double length = 10.0) {
+      os << "digraph TaskGraph {" << endl;
+      os << "size=\"" << width << "," << length << "\";" << endl;
+      for (auto it=g_queue.begin();it!=g_queue.end();++it) 
+        (*it)->graphviz(os);
+      os << "}" << endl;
+      return (os << endl);
+    }
+  };
+  
+}
+#endif
diff --git a/lib/kokkos/example/ichol/src/task_factory.hpp b/lib/kokkos/example/ichol/src/task_factory.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..b829da6737dfa3423b800aa6021b2c33e94b2c78
--- /dev/null
+++ b/lib/kokkos/example/ichol/src/task_factory.hpp
@@ -0,0 +1,77 @@
+#pragma once
+#ifndef __TASK_FACTORY_HPP__
+#define __TASK_FACTORY_HPP__
+
+/// \file task_factory.hpp
+/// \brief A wrapper for task policy and future with a provided space type.
+/// \author Kyungjoo Kim (kyukim@sandia.gov)
+
+namespace Tacho { 
+
+  using namespace std;
+
+  /// \class TaskFactory
+  /// \brief Minimal interface to Kokkos tasking.
+  ///
+  /// TaskFactory is attached to blocks as a template argument in order to 
+  /// create and manage tasking future objects. Note that the policy (shared 
+  /// pointer to the task generator) is not a member object of this class.
+  /// This class provides a minimal tasking interface: type declarations 
+  /// of the task policy and of the future type, so that future objects 
+  /// generated through this class match their policy and its execution space. 
+  ///
+  template<typename PolicyType,        
+           typename FutureType>
+  class TaskFactory {
+  private:
+    static constexpr int _max_task_dependence = 10 ;
+    // ^ forwarded as the second argument of task_create_team() in create()
+  public:
+    typedef PolicyType policy_type;
+    typedef FutureType future_type;
+    /// Create a team task for func through policy; aborts when the returned future is null.
+    template<typename TaskFunctorType>
+    static KOKKOS_INLINE_FUNCTION
+    future_type create(policy_type &policy, const TaskFunctorType &func) {
+      // Single attempt: the retry loop is commented out, so a null future is fatal.
+      future_type f ;
+      // while ( f.is_null() ) {
+        f = policy.task_create_team(func, _max_task_dependence);
+      // }
+      if ( f.is_null() ) Kokkos::abort("task_create_team FAILED, out of memory");
+      return f ;
+    }
+    /// Forward to policy.spawn(obj, priority); priority defaults to false.
+    static KOKKOS_INLINE_FUNCTION
+    void spawn(policy_type &policy, const future_type &obj, bool priority = false ) {
+      policy.spawn(obj,priority);
+    }
+    /// Forward to policy.add_dependence(after, before) for two futures.
+    static KOKKOS_INLINE_FUNCTION
+    void addDependence(policy_type &policy, 
+                       const future_type &after, const future_type &before) {
+      policy.add_dependence(after, before);
+    }
+    /// Overload of addDependence taking the dependent side as a task functor pointer.
+    template<typename TaskFunctorType>
+    static  KOKKOS_INLINE_FUNCTION
+    void addDependence(policy_type &policy, 
+                       TaskFunctorType *after, const future_type &before) {
+      policy.add_dependence(after, before);
+    }
+    /// Forward to policy.clear_dependence(func).
+    template<typename TaskFunctorType>
+    static  KOKKOS_INLINE_FUNCTION
+    void clearDependence(policy_type &policy, TaskFunctorType *func) {
+      policy.clear_dependence(func);
+    }
+    /// Forward to policy.respawn(func).
+    template<typename TaskFunctorType>
+    static KOKKOS_INLINE_FUNCTION
+    void respawn(policy_type &policy, TaskFunctorType *func) {
+      policy.respawn(func);
+    }
+  };
+}
+
+#endif
diff --git a/lib/kokkos/example/ichol/src/task_view.hpp b/lib/kokkos/example/ichol/src/task_view.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..ce280a325fd6a460c687f15e0a69c4aa6dd0e8b5
--- /dev/null
+++ b/lib/kokkos/example/ichol/src/task_view.hpp
@@ -0,0 +1,104 @@
+#pragma once
+#ifndef __TASK_VIEW_HPP__
+#define __TASK_VIEW_HPP__
+
+/// \file task_view.hpp
+/// \brief Task view inherits from a matrix view and adds a member holding the task future.
+/// \author Kyungjoo Kim (kyukim@sandia.gov)
+
+namespace Tacho { 
+
+  using namespace std;
+
+  template<typename MatrixViewType,
+           typename TaskFactoryType>
+  class TaskView : public MatrixViewType {
+  public:
+    typedef          MatrixViewType                matrix_type ;
+    typedef typename MatrixViewType::value_type    value_type;
+    typedef typename MatrixViewType::ordinal_type  ordinal_type;
+    // policy and future types are taken from the task factory
+    typedef TaskFactoryType task_factory_type;
+    typedef typename task_factory_type::policy_type policy_type;
+    typedef typename task_factory_type::future_type future_type;
+
+  private:
+    future_type _f;
+    // _f is only changed through setFuture(); every constructor default-constructs it
+  public:
+    KOKKOS_INLINE_FUNCTION
+    void setFuture(const future_type &f)
+      { _f = f; }
+    /// Return a copy of the stored future.
+    KOKKOS_INLINE_FUNCTION
+    future_type Future() const { return _f; }
+
+    KOKKOS_INLINE_FUNCTION
+    ~TaskView() = default ;
+    /// Default-construct the matrix view base and the future.
+    KOKKOS_INLINE_FUNCTION
+    TaskView() 
+      : MatrixViewType(), _f()
+    { } 
+    // Copy construction is disabled.
+    TaskView(const TaskView &b) = delete ;
+    /// Construct the matrix view base from the base matrix b.
+    KOKKOS_INLINE_FUNCTION
+    TaskView(typename MatrixViewType::mat_base_type const & b) 
+      : MatrixViewType(b), _f() 
+    { }
+    /// Construct the matrix view base from b and (offm, m, offn, n), forwarded unchanged.
+    KOKKOS_INLINE_FUNCTION
+    TaskView(typename MatrixViewType::mat_base_type const & b,
+             const ordinal_type offm, const ordinal_type m,
+             const ordinal_type offn, const ordinal_type n) 
+      : MatrixViewType(b, offm, m, offn, n), _f() 
+    { }
+
+  };
+}
+
+//----------------------------------------------------------------------------
+//----------------------------------------------------------------------------
+
+#if ! KOKKOS_USING_EXP_VIEW
+
+namespace Kokkos {
+  namespace Impl {
+
+    //  By default, a Kokkos::View allocation assigns zero to each allocated datum.
+    //  This is not the required initialization behavior when
+    //  non-trivial objects are used within a Kokkos::View.
+    //  Create a partial specialization of Kokkos::Impl::ViewDefaultConstruct
+    //  to replace the assignment initialization with placement-new initialization.
+    //
+    //  This work-around is necessary until a TBD design refactoring of Kokkos::View.
+
+    template< class ExecSpace , typename T1, typename T2 >
+    struct ViewDefaultConstruct< ExecSpace , Tacho::TaskView<T1,T2> , true >
+    {
+      typedef Tacho::TaskView<T1,T2> type ;
+      type * const m_ptr ;
+      // Default-construct element i in place with placement new.
+      KOKKOS_FORCEINLINE_FUNCTION
+      void operator()( const typename ExecSpace::size_type& i ) const
+      { new(m_ptr+i) type(); }
+      // Run operator() over [0, capacity) on ExecSpace, then fence before returning.
+      ViewDefaultConstruct( type * pointer , size_t capacity )
+        : m_ptr( pointer )
+      {
+        Kokkos::RangePolicy< ExecSpace > range( 0 , capacity );
+        parallel_for( range , *this );
+        ExecSpace::fence();
+      }
+    };
+
+  } // namespace Impl
+} // namespace Kokkos
+
+#endif /* #if ! KOKKOS_USING_EXP_VIEW */
+
+//----------------------------------------------------------------------------
+//----------------------------------------------------------------------------
+
+#endif
diff --git a/lib/kokkos/example/ichol/src/trsm.hpp b/lib/kokkos/example/ichol/src/trsm.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..b4a6a7df48967257f824ae73680bf918d457be76
--- /dev/null
+++ b/lib/kokkos/example/ichol/src/trsm.hpp
@@ -0,0 +1,92 @@
+#pragma once
+#ifndef __TRSM_HPP__
+#define __TRSM_HPP__
+
+/// \file trsm.hpp
+/// \brief Sparse triangular solve on given sparse patterns and multiple rhs.
+/// \author Kyungjoo Kim (kyukim@sandia.gov)
+
+#include "util.hpp"
+#include "control.hpp"
+#include "partition.hpp"
+
+namespace Tacho {
+
+  using namespace std;
+
+  template<int ArgSide,int ArgUplo, int ArgTrans, int ArgAlgo,
+           int ArgVariant = Variant::One,
+           template<int,int> class ControlType = Control>
+  struct Trsm {
+    // invoke() is only declared here; bodies come from the header included at the end of this file.
+    // data-parallel interface
+    // =======================
+    template<typename ScalarType,
+             typename ExecViewTypeA,
+             typename ExecViewTypeB>
+    KOKKOS_INLINE_FUNCTION
+    static int invoke(typename ExecViewTypeA::policy_type &policy,
+                      const typename ExecViewTypeA::policy_type::member_type &member,
+                      const int diagA,
+                      const ScalarType alpha,
+                      typename ExecViewTypeA::matrix_type &A,
+                      typename ExecViewTypeB::matrix_type &B);
+
+    // task-data parallel interface
+    // ============================
+    template<typename ScalarType,
+             typename ExecViewTypeA,
+             typename ExecViewTypeB>
+    class TaskFunctor {
+    public:
+      typedef typename ExecViewTypeA::policy_type policy_type;
+      typedef typename policy_type::member_type member_type;
+      typedef int value_type;
+      // arguments captured at construction and forwarded to invoke() by apply()
+    private:
+      int _diagA;
+      ScalarType _alpha;
+      typename ExecViewTypeA::matrix_type _A;
+      typename ExecViewTypeB::matrix_type _B;
+
+      policy_type _policy;
+
+    public:
+      KOKKOS_INLINE_FUNCTION
+      TaskFunctor(const policy_type & P,
+                  const int diagA,
+                  const ScalarType alpha,
+                  const ExecViewTypeA & A,
+                  const ExecViewTypeB & B)
+        : _diagA(diagA),
+          _alpha(alpha),
+          _A(A),
+          _B(B),
+          _policy(P)
+      { }
+      // Name of this task; always "Trsm".
+      string Label() const { return "Trsm"; }
+
+      // task execution: call invoke() with _policy.member_single() as the member
+      KOKKOS_INLINE_FUNCTION
+      void apply(value_type &r_val) {
+        r_val = Trsm::invoke<ScalarType,ExecViewTypeA,ExecViewTypeB>(_policy, _policy.member_single(),
+                             _diagA, _alpha, _A, _B);
+      }
+
+      // task-data execution: call invoke() with the supplied team member
+      KOKKOS_INLINE_FUNCTION
+      void apply(const member_type &member, value_type &r_val) {
+        r_val = Trsm::invoke<ScalarType,ExecViewTypeA,ExecViewTypeB>(_policy, member, 
+                             _diagA, _alpha, _A, _B);
+      }
+
+    };
+  };
+
+}
+
+// #include "trsm_l_u_nt.hpp"
+#include "trsm_l_u_ct.hpp"
+
+#endif
diff --git a/lib/kokkos/example/ichol/src/trsm_l_u_ct.hpp b/lib/kokkos/example/ichol/src/trsm_l_u_ct.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..b6f3289474518bd88e55db198e4d2ad8efa7e435
--- /dev/null
+++ b/lib/kokkos/example/ichol/src/trsm_l_u_ct.hpp
@@ -0,0 +1,14 @@
+#pragma once
+#ifndef __TRSM_L_U_CT_HPP__
+#define __TRSM_L_U_CT_HPP__
+
+/// \file trsm_l_u_ct.hpp
+/// \brief Sparse triangular solve on given sparse patterns and multiple rhs.
+/// \author Kyungjoo Kim (kyukim@sandia.gov)
+///
+#include "gemm.hpp"
+
+#include "trsm_l_u_ct_for_factor_blocked.hpp"
+// #include "trsm_l_u_ct_for_tri_solve_blocked.hpp"
+
+#endif
diff --git a/lib/kokkos/example/ichol/src/trsm_l_u_ct_for_factor_blocked.hpp b/lib/kokkos/example/ichol/src/trsm_l_u_ct_for_factor_blocked.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..7414e5d80f07f895a8cd4e5182acb3fc9976be58
--- /dev/null
+++ b/lib/kokkos/example/ichol/src/trsm_l_u_ct_for_factor_blocked.hpp
@@ -0,0 +1,185 @@
+#pragma once
+#ifndef __TRSM_L_U_CT_FOR_FACTOR_BLOCKED_HPP__
+#define __TRSM_L_U_CT_FOR_FACTOR_BLOCKED_HPP__
+
+/// \file trsm_l_u_ct_for_factor_blocked.hpp
+/// \brief Sparse triangular solve on given sparse patterns and multiple rhs.
+/// \author Kyungjoo Kim (kyukim@sandia.gov)
+///
+
+namespace Tacho {
+
+  using namespace std;
+
+  // Trsm used in the factorization phase: data parallel on b1t
+  // ==========================================================
+  template<>
+  template<typename ScalarType,
+           typename CrsExecViewTypeA,
+           typename CrsExecViewTypeB>
+  KOKKOS_INLINE_FUNCTION
+  int
+  Trsm<Side::Left,Uplo::Upper,Trans::ConjTranspose,
+       AlgoTrsm::ForFactorBlocked,Variant::One>
+  ::invoke(typename CrsExecViewTypeA::policy_type &policy,
+           const typename CrsExecViewTypeA::policy_type::member_type &member,
+           const int diagA,
+           const ScalarType alpha,
+           typename CrsExecViewTypeA::matrix_type &A,
+           typename CrsExecViewTypeB::matrix_type &B) {
+    typedef typename CrsExecViewTypeA::ordinal_type      ordinal_type;
+    typedef typename CrsExecViewTypeA::value_type        value_type;
+    typedef typename CrsExecViewTypeA::row_view_type     row_view_type;
+    // NOTE: the policy argument is not referenced in this body; always returns 0.
+    // Debug trace of the B block extents; disabled by the constant `false`.
+if ( false && member.team_rank() == 0 ) {
+ printf("Trsm [%d +%d)x[%d +%d)\n"
+       , B.OffsetRows()
+       , B.NumRows()
+       , B.OffsetCols()
+       , B.NumCols()
+       );
+}
+
+    // scale the matrix B with alpha
+    scaleCrsMatrix<ScalarType,CrsExecViewTypeB>(member, alpha, B);
+
+    // Solve a system: AX = B -> B := inv(A) B
+    const ordinal_type mA = A.NumRows();
+    const ordinal_type nB = B.NumCols();
+
+    if (nB > 0) {
+      for (ordinal_type k=0;k<mA;++k) {
+        row_view_type &a = A.RowView(k);
+        // const value_type cdiag = std::conj(a.Value(0)); // for complex<T>
+        const value_type cdiag = a.Value(0);
+        // NOTE: conj() is not applied here, so the conjugation is only correct for real value_type.
+        // invert
+        row_view_type &b1 = B.RowView(k);
+        const ordinal_type nnz_b1 = b1.NumNonZeros();
+
+        if (diagA != Diag::Unit && nnz_b1 > 0) {
+          // b1t = b1t / conj(diag)
+          Kokkos::parallel_for(Kokkos::TeamThreadRange(member, 0, nnz_b1),
+                               [&](const ordinal_type j) {
+                                 b1.Value(j) /= cdiag;
+                               });
+        }
+        // NOTE(review): no barrier between the two loops; presumably fine because both split the same j range -- confirm.
+        // update
+        const ordinal_type nnz_a = a.NumNonZeros();
+        if (nnz_a > 0) {
+          // B2 = B2 - trans(conj(a12t)) b1t
+          Kokkos::parallel_for(Kokkos::TeamThreadRange(member, 0, nnz_b1),
+                               [&](const ordinal_type j) {
+                                 // grab b1
+                                 const ordinal_type col_at_j = b1.Col(j);
+                                 const value_type   val_at_j = b1.Value(j);
+
+                                 for (ordinal_type i=1;i<nnz_a;++i) {
+                                   // grab a12t
+                                   const ordinal_type row_at_i = a.Col(i);
+                                   // const value_type   val_at_i = conj(a.Value(i));
+                                   const value_type   val_at_i = a.Value(i);
+
+                                   // grab b2
+                                   row_view_type &b2 = B.RowView(row_at_i);
+
+                                   // check and update
+                                   ordinal_type idx = 0;
+                                   idx = b2.Index(col_at_j, idx);
+                                   if (idx >= 0)
+                                     b2.Value(idx) -= val_at_i*val_at_j;
+                                 }
+                               });
+        }
+        member.team_barrier();
+      }
+    }
+
+    return 0;
+  }
+
+  // Trsm used in the factorization phase: data parallel on a1t
+  // ==========================================================
+  template<>
+  template<typename ScalarType,
+           typename CrsExecViewTypeA,
+           typename CrsExecViewTypeB>
+  KOKKOS_INLINE_FUNCTION
+  int
+  Trsm<Side::Left,Uplo::Upper,Trans::ConjTranspose,
+       AlgoTrsm::ForFactorBlocked,Variant::Two>
+  ::invoke(typename CrsExecViewTypeA::policy_type &policy,
+           const typename CrsExecViewTypeA::policy_type::member_type &member,
+           const int diagA,
+           const ScalarType alpha,
+           typename CrsExecViewTypeA::matrix_type &A,
+           typename CrsExecViewTypeB::matrix_type &B) {
+    typedef typename CrsExecViewTypeA::ordinal_type      ordinal_type;
+    typedef typename CrsExecViewTypeA::value_type        value_type;
+    typedef typename CrsExecViewTypeA::row_view_type     row_view_type;
+    // NOTE: the policy argument is not referenced in this body; always returns 0.
+    // scale the matrix B with alpha
+    scaleCrsMatrix<ScalarType,CrsExecViewTypeB>(member, alpha, B);
+
+    // Solve a system: AX = B -> B := inv(A) B
+    const ordinal_type mA = A.NumRows();
+    const ordinal_type nB = B.NumCols();
+
+    if (nB > 0) {
+      for (ordinal_type k=0;k<mA;++k) {
+        row_view_type &a = A.RowView(k);
+        // const value_type cdiag = conj(a.Value(0));
+        const value_type cdiag = a.Value(0);
+        // NOTE: conj() is not applied here, so the conjugation is only correct for real value_type.
+        // invert
+        row_view_type &b1 = B.RowView(k);
+        const ordinal_type nnz_b1 = b1.NumNonZeros();
+
+        if (diagA != Diag::Unit && nnz_b1 > 0) {
+          // b1t = b1t / conj(diag)
+          Kokkos::parallel_for(Kokkos::TeamThreadRange(member, 0, nnz_b1),
+                               [&](const ordinal_type j) {
+                                 b1.Value(j) /= cdiag;
+                               });
+          member.team_barrier();
+        }
+        // The update below is split over the entries i of row a (from 1), not over b1 as the scaling above.
+        // update
+        const ordinal_type nnz_a = a.NumNonZeros();
+        if (nnz_a > 0) {
+          // B2 = B2 - trans(conj(a12t)) b1t
+          Kokkos::parallel_for(Kokkos::TeamThreadRange(member, 1, nnz_a),
+                               [&](const ordinal_type i) {
+                                 // grab a12t
+                                 const ordinal_type row_at_i = a.Col(i);
+                                 // const value_type   val_at_i = conj(a.Value(i));
+                                 const value_type   val_at_i = a.Value(i);
+
+                                 // grab b2
+                                 row_view_type &b2 = B.RowView(row_at_i);
+                                 // idx is carried across j; the loop ends once Index() returns <= -2 (meaning defined elsewhere -- TODO confirm)
+                                 ordinal_type idx = 0;
+                                 for (ordinal_type j=0;j<nnz_b1 && (idx > -2);++j) {
+                                   // grab b1
+                                   const ordinal_type col_at_j = b1.Col(j);
+                                   const value_type   val_at_j = b1.Value(j);
+
+                                   // check and update
+                                   idx = b2.Index(col_at_j, idx);
+                                   if (idx >= 0)
+                                     b2.Value(idx) -= val_at_i*val_at_j;
+                                 }
+                               });
+          member.team_barrier();
+        }
+      }
+    }
+
+    return 0;
+  }
+
+}
+
+#endif
diff --git a/lib/kokkos/example/ichol/src/util.cpp b/lib/kokkos/example/ichol/src/util.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..ef220c48c1b7d58af2289dde4f226a7a102d63ee
--- /dev/null
+++ b/lib/kokkos/example/ichol/src/util.cpp
@@ -0,0 +1,4 @@
+
+
+static int dummy = 1;
+
diff --git a/lib/kokkos/example/ichol/src/util.hpp b/lib/kokkos/example/ichol/src/util.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..020475bc52daad5c864d7caa8ed34d03157a0046
--- /dev/null
+++ b/lib/kokkos/example/ichol/src/util.hpp
@@ -0,0 +1,237 @@
+#pragma once
+#ifndef __UTIL_HPP__
+#define __UTIL_HPP__
+
+#include <stdio.h>
+#include <string.h>
+
+#include <string>
+#include <iostream>
+#include <iomanip>
+#include <fstream>
+#include <vector>
+#include <set>
+#include <map>
+#include <algorithm>
+#include <memory>
+
+#include <cmath>
+#include <complex>
+
+#include <limits>
+
+/// \file util.hpp
+/// \brief Utility functions and constant integer class like an enum class.
+/// \author Kyungjoo Kim (kyukim@sandia.gov)
+///
+/// This provides utility functions for implementing mini-app for incomplete
+/// sparse matrix factorization with task-data parallelism e.g., parameter
+/// classes, error handling, ostream << overloading.
+///
+/// Note: The reference of the "static const int" members in the enum-like
+/// classes should not be used as function arguments but their values only.
+
+
+using namespace std;
+
+namespace Tacho {
+
+#undef CHKERR  // CHKERR(ierr): print file, line and the code to cout when ierr is nonzero; does not abort
+#define CHKERR(ierr)                                                    \
+  if ((ierr) != 0) { cout << endl << ">> Error in " << __FILE__ << ", " << __LINE__ << " : " << (ierr) << endl; }
+// Message fragments, and ERROR(msg), which prints file, line and msg to cout without aborting.
+#define MSG_NOT_YET_IMPLEMENTED ">> Not yet implemented"
+#define MSG_INVALID_INPUT(what) ">> Invalid input argument: " #what
+#define MSG_INVALID_TEMPLATE_ARGS ">> Invalid template arguments"
+#define ERROR(msg)                                                      \
+  { cout << endl << ">> Error in " << __FILE__ << ", " << __LINE__ << endl << msg << endl; }
+
+  // control id: Ctrl(name,algo,variant) expands to the template-id name<algo,variant>
+#undef  Ctrl
+#define Ctrl(name,algo,variant) name<algo,variant>
+
+  // control leaf: element `id` of the member `component` of that template-id
+#undef CtrlComponent
+#define CtrlComponent(name,algo,variant,component,id)                  \
+  Ctrl(name,algo,variant)::component[id]
+
+  // control recursion: expands to three comma-separated items: component[0], component[1], name
+#undef CtrlDetail
+#define CtrlDetail(name,algo,variant,component) \
+  CtrlComponent(name,algo,variant,component,0),CtrlComponent(name,algo,variant,component,1),name
+
+  /// \class GraphHelper
+  class GraphHelper {
+  public:
+    static const int DefaultRandomSeed = -1;
+  };
+
+
+  /// \class Partition
+  /// \brief Matrix partition parameters.
+  class Partition {
+  public:
+    static const int Top         = 101;
+    static const int Bottom      = 102;
+
+    static const int Left        = 201;
+    static const int Right       = 202;
+
+    static const int TopLeft     = 401;
+    static const int TopRight    = 402;
+    static const int BottomLeft  = 403;
+    static const int BottomRight = 404;
+  };
+
+  /// \class Uplo
+  /// \brief Matrix upper/lower parameters.
+  class Uplo {
+  public:
+    static const int Upper = 501;
+    static const int Lower = 502;
+  };
+
+  /// \class Side
+  /// \brief Matrix left/right parameters.
+  class Side {
+  public:
+    static const int Left  = 601;
+    static const int Right = 602;
+  };
+
+  /// \class Diag
+  /// \brief Matrix unit/non-unit diag parameters.
+  class Diag {
+  public:
+    static const int Unit    = 701;
+    static const int NonUnit = 702;
+  };
+
+  /// \class Trans
+  /// \brief Matrix transpose parameters (transpose / conjugate transpose / none).
+  class Trans {
+  public:
+    static const int Transpose     = 801;
+    static const int ConjTranspose = 802;
+    static const int NoTranspose   = 803;
+  };
+
+  /// \class Loop
+  /// \brief Outer/inner/fused loop parameters.
+  class Loop {
+  public:
+    static const int Outer = 901;
+    static const int Inner = 902;
+    static const int Fused = 903;
+  };
+  /// \brief Algorithm variant selectors (e.g. the ArgVariant template argument of Trsm).
+  class Variant {
+  public:
+    static const int One   = 1;
+    static const int Two   = 2;
+    static const int Three = 3;
+    static const int Four  = 4;
+  };
+
+  /// \class AlgoChol
+  /// \brief Algorithmic variants in sparse factorization and sparse BLAS operations.
+  class AlgoChol {
+  public:
+    // One side factorization on flat matrices
+    static const int Dummy                  = 1000;
+    static const int Unblocked              = 1001;
+    static const int UnblockedOpt           = 1002;
+    static const int Blocked                = 1101; // testing only
+
+    static const int RightLookByBlocks      = 1201; // backbone structure is right looking
+    static const int ByBlocks               = RightLookByBlocks;
+
+    static const int NestedDenseBlock       = 1211;
+    static const int NestedDenseByBlocks    = 1212;
+
+    static const int RightLookDenseByBlocks = 1221;
+    static const int DenseByBlocks          = RightLookDenseByBlocks;
+
+    static const int ExternalLapack         = 1231;
+    static const int ExternalPardiso        = 1232;
+  };
+
+  // aliasing name space
+  typedef AlgoChol AlgoTriSolve;
+  /// \brief Algorithm tags shared by AlgoGemm, AlgoTrsm and AlgoHerk, which derive from this class.
+  class AlgoBlasLeaf {
+  public:
+    // One side factorization on flat matrices
+    static const int ForFactorBlocked = 2001;
+
+    // B and C are dense matrices and used for solve phase
+    static const int ForTriSolveBlocked = 2011;
+
+    static const int ExternalBlas = 2021;
+  };
+  // Per-operation tag classes: each inherits the AlgoBlasLeaf tags and adds its own DenseByBlocks value.
+  class AlgoGemm : public AlgoBlasLeaf {
+  public:
+    static const int DenseByBlocks = 2101;
+  };
+
+  class AlgoTrsm : public AlgoBlasLeaf {
+  public:
+    static const int DenseByBlocks = 2201;
+  };
+
+  class AlgoHerk : public AlgoBlasLeaf {
+  public:
+    static const int DenseByBlocks = 2301;
+  };
+
+  /// \brief Stream a unique_ptr-held object through its showMe(os).
+  /// A null pointer writes nothing and returns os instead of being dereferenced.
+  template<typename T>
+  inline ostream& operator<<(ostream &os, const unique_ptr<T> &p) {
+    return p ? p->showMe(os) : os;
+  }
+
+  /// \class Disp
+  /// \brief Base class for printable objects; operator<< forwards to the virtual showMe().
+  class Disp { // NOTE(review): has a virtual function but no virtual destructor -- avoid deleting through Disp*
+    friend ostream& operator<<(ostream &os, const Disp &disp);
+  public:
+    Disp() { }
+    virtual ostream& showMe(ostream &os) const { // default implementation writes nothing
+      return os;
+    }
+  };
+
+  /// \brief Stream operator for Disp: returns disp.showMe(os).
+  inline
+  ostream& operator<<(ostream &os, const Disp &disp) {
+    return disp.showMe(os);
+  }
+
+  /// Scalar traits: real_type is the underlying real type, epsilon() its machine epsilon.
+  template<typename T> struct NumericTraits {};
+  // single-precision real
+  template<> struct NumericTraits<float> {
+    typedef float real_type;
+    static real_type epsilon() { return numeric_limits<real_type>::epsilon(); }
+  };
+  // double-precision real
+  template<> struct NumericTraits<double> {
+    typedef double real_type;
+    static real_type epsilon() { return numeric_limits<real_type>::epsilon(); }
+  };
+  // single-precision complex: traits of the float component
+  template<> struct NumericTraits<complex<float> > {
+    typedef float real_type;
+    static real_type epsilon() { return numeric_limits<real_type>::epsilon(); }
+  };
+  // double-precision complex: traits of the double component
+  template<> struct NumericTraits<complex<double> > {
+    typedef double real_type;
+    static real_type epsilon() { return numeric_limits<real_type>::epsilon(); }
+  };
+
+}
+
+#endif
diff --git a/lib/kokkos/example/md_skeleton/CMakeLists.txt b/lib/kokkos/example/md_skeleton/CMakeLists.txt
new file mode 100644
index 0000000000000000000000000000000000000000..28412c37847deb211db5b6256a78a0e904d8dcaf
--- /dev/null
+++ b/lib/kokkos/example/md_skeleton/CMakeLists.txt
@@ -0,0 +1,16 @@
+# md_skeleton example: one executable built from every .cpp file in this directory.
+INCLUDE_DIRECTORIES(${CMAKE_CURRENT_BINARY_DIR})
+INCLUDE_DIRECTORIES(${CMAKE_CURRENT_SOURCE_DIR})
+
+SET(SOURCES "")
+SET(LIBRARIES "")
+# NOTE: the glob is evaluated at configure time only; re-run CMake after adding a source file.
+FILE(GLOB SOURCES *.cpp )
+# LIBRARIES is left empty above, so DEPLIBS receives no extra libraries.
+TRIBITS_ADD_EXECUTABLE(
+  md_skeleton 
+  SOURCES ${SOURCES}
+  COMM serial mpi
+  DEPLIBS ${LIBRARIES}
+  )
+
diff --git a/lib/kokkos/example/md_skeleton/Makefile b/lib/kokkos/example/md_skeleton/Makefile
new file mode 100644
index 0000000000000000000000000000000000000000..bf8fbea3e09a5d71f900de85ff2100cf41bd5738
--- /dev/null
+++ b/lib/kokkos/example/md_skeleton/Makefile
@@ -0,0 +1,53 @@
+KOKKOS_PATH ?= ../..
+# Absolute path of this Makefile and the directory that contains it.
+MAKEFILE_PATH := $(abspath $(lastword $(MAKEFILE_LIST)))
+SRC_DIR := $(dir $(MAKEFILE_PATH))
+# Every .cpp file in SRC_DIR, and one object file name (without directory) per source.
+SRC = $(wildcard $(SRC_DIR)/*.cpp)
+OBJ = $(SRC:$(SRC_DIR)/%.cpp=%.o)
+
+#SRC = $(wildcard *.cpp)
+#OBJ = $(SRC:%.cpp=%.o)
+# NOTE: the recipe runs after the 'build' prerequisite, so the message prints once the build is done.
+default: build
+	echo "Start Build"
+
+# pull in the Kokkos build variables (KOKKOS_CPPFLAGS, KOKKOS_LIBS, ...) from $(KOKKOS_PATH)
+include $(KOKKOS_PATH)/Makefile.kokkos
+
+ifneq (,$(findstring Cuda,$(KOKKOS_DEVICES)))
+CXX = $(NVCC_WRAPPER)
+CXXFLAGS = -I$(SRC_DIR) -O3
+LINK = $(CXX)
+LINKFLAGS = 
+EXE = $(addsuffix .cuda, $(shell basename $(SRC_DIR)))
+#KOKKOS_DEVICES = "Cuda,OpenMP"
+#KOKKOS_ARCH = "SNB,Kepler35"
+else
+CXX = g++
+CXXFLAGS = -I$(SRC_DIR) -O3
+LINK = $(CXX)
+LINKFLAGS =  
+EXE = $(addsuffix .host, $(shell basename $(SRC_DIR)))
+#KOKKOS_DEVICES = "OpenMP"
+#KOKKOS_ARCH = "SNB"
+endif
+
+DEPFLAGS = -M
+
+LIB =
+
+.PHONY: default build clean
+build: $(EXE)
+# Link step: $(EXE) is relinked when an object or $(KOKKOS_LINK_DEPENDS) changes.
+$(EXE): $(OBJ) $(KOKKOS_LINK_DEPENDS)
+	$(LINK) $(KOKKOS_LDFLAGS) $(LINKFLAGS) $(EXTRA_PATH) $(OBJ) $(KOKKOS_LIBS) $(LIB) -o $(EXE)
+# Remove archives, objects and both executable flavours (.cuda / .host) from the current directory.
+clean: 
+	rm -f *.a *.o *.cuda *.host
+
+# Compilation rules
+
+%.o:$(SRC_DIR)/%.cpp $(KOKKOS_CPP_DEPENDS)
+	$(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) $(EXTRA_INC) -c $<
+
diff --git a/lib/kokkos/example/md_skeleton/README b/lib/kokkos/example/md_skeleton/README
new file mode 100644
index 0000000000000000000000000000000000000000..1ce682b0a6ec64175587d70c593e39ba8d304d75
--- /dev/null
+++ b/lib/kokkos/example/md_skeleton/README
@@ -0,0 +1,3 @@
+To build this example on a 2012-model Macbook Pro with NVIDIA Kepler GPU:
+
+./build.cuda_std g++_osx cuda_osx 30 opt
diff --git a/lib/kokkos/example/md_skeleton/force.cpp b/lib/kokkos/example/md_skeleton/force.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..a31944f859164e0aef5f9f29c90c59cb16cd526b
--- /dev/null
+++ b/lib/kokkos/example/md_skeleton/force.cpp
@@ -0,0 +1,192 @@
+/*
+//@HEADER
+// ************************************************************************
+// 
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+// 
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+// 
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact  H. Carter Edwards (hcedwar@sandia.gov)
+// 
+// ************************************************************************
+//@HEADER
+*/
+
+/* Define values which set the max number of registers used for the Force Kernel.
+ * It is 32 * 2048 / (KOKKOS_CUDA_MAX_THREADS * KOKKOS_CUDA_MIN_BLOCKS).
+ * These have to be set before including Kokkos header files.
+ */
+
+#define KOKKOS_CUDA_MAX_THREADS 512
+#define KOKKOS_CUDA_MIN_BLOCKS 3
+
+#include <system.h>
+#include <cstdio>
+
+
+/* Simple Lennard Jones Force Kernel using neighborlists
+ * Calculates for every pair of atoms (i,j) with distance smaller than r_cut
+ * the pair energy e_ij = 4*epsilon * ( (sigma/r_ij)^12 - (sigma/r_ij)^6 )
+ * and the corresponding pair force f_ij, where r_ij is the distance of atoms (i,j).
+ * The force on atom i is the sum over f_ij:
+ * f_i = sum_j (f_ij)
+ * Neighborlists are used in order to precalculate which atoms j are
+ * close enough to i to be able to contribute. By choosing a neighbor
+ * cutoff larger than the force cutoff, the neighbor list can be reused several times
+ * (typically 10 - 100).
+ */
+
+struct ForceFunctor {
+  // Functor passed to Kokkos::parallel_for / parallel_reduce over the atom index i.
+  typedef t_x_array::execution_space execution_space; //Device Type for running the kernel
+  typedef double2 value_type; // When energy calculation is requested return energy, and virial
+
+  t_x_array_randomread x;       //atom positions
+  t_f_array f;                  //atom forces
+  t_int_1d_const numneigh;      //number of neighbors per atom
+  t_neighbors_const neighbors;  //neighborlist
+  double cutforcesq;            //force cutoff
+  double epsilon;               //Potential parameter
+  double sigma6;                //Potential parameter
+
+  /* Copy the views and the squared force cutoff out of s; epsilon and sigma6 are fixed to 1.0 */
+  ForceFunctor(System s) {
+    x = s.d_x;
+    f = s.f;
+    numneigh = s.numneigh;
+    neighbors = s.neighbors;
+    cutforcesq = s.force_cutsq;
+    epsilon = 1.0;
+    sigma6 = 1.0;
+  }
+
+  /* Operator for not calculating energy and virial */
+
+  KOKKOS_INLINE_FUNCTION
+  void operator() (const int &i) const {
+    force<0>(i);
+  }
+
+  /* Operator for calculating energy and virial */
+
+  KOKKOS_INLINE_FUNCTION
+  void operator() (const int &i, double2 &energy_virial) const {
+    double2 ev = force<1>(i);
+    energy_virial.x += ev.x;
+    energy_virial.y += ev.y;
+  }
+  /* Add the force on atom i to f(i,:); returns (4*energy, 0.5*virial), both 0 when EVFLAG is 0 */
+  template<int EVFLAG>
+  KOKKOS_INLINE_FUNCTION
+  double2 force(const int &i) const
+  {
+    const int numneighs = numneigh[i];
+    const double xtmp = x(i, 0);
+    const double ytmp = x(i, 1);
+    const double ztmp = x(i, 2);
+    double fix = 0;
+    double fiy = 0;
+    double fiz = 0;
+    double energy = 0;
+    double virial = 0;
+
+    //pragma simd forces vectorization (ignoring the performance objections of the compiler)
+    //give hint to compiler that fix, fiy and fiz are used for reduction only
+
+  #ifdef USE_SIMD
+    #pragma simd reduction (+: fix,fiy,fiz,energy,virial)
+  #endif
+    for(int k = 0; k < numneighs; k++) {
+      const int j = neighbors(i, k);
+      const double delx = xtmp - x(j, 0);
+      const double dely = ytmp - x(j, 1);
+      const double delz = ztmp - x(j, 2);
+      const double rsq = delx * delx + dely * dely + delz * delz;
+
+      //if(i==0) printf("%i %i %lf %lf\n",i,j,rsq,cutforcesq);
+      if(rsq < cutforcesq) {
+        const double sr2 = 1.0 / rsq;
+        const double sr6 = sr2 * sr2 * sr2  * sigma6;
+        const double force = 48.0 * sr6 * (sr6 - 0.5) * sr2 * epsilon;
+        fix += delx * force;
+        fiy += dely * force;
+        fiz += delz * force;
+
+        if(EVFLAG) {
+          energy += sr6 * (sr6 - 1.0) * epsilon;
+          virial += delx * delx * force + dely * dely * force + delz * delz * force;
+        }
+      }
+    }
+
+    f(i, 0) += fix;
+    f(i, 1) += fiy;
+    f(i, 2) += fiz;
+    // scale the accumulated sums: energy by 4.0, virial by 0.5
+    double2 energy_virial ;
+    energy_virial.x = 4.0 * energy ;
+    energy_virial.y = 0.5 * virial ;
+    return energy_virial;
+  }
+
+  /* init and join functions when doing the reduction to obtain energy and virial */
+
+  KOKKOS_FUNCTION
+  static void init(volatile value_type &update) {
+    update.x = update.y = 0;
+  }
+  KOKKOS_FUNCTION
+  static void join(volatile value_type &update ,
+                   const volatile value_type &source) {
+    update.x += source.x ;
+    update.y += source.y ;
+  }
+
+};
+
+
+/* Host-side driver: runs ForceFunctor over the s.nlocal atoms. With a nonzero evflag the
+ * reduced energy/virial pair is returned; otherwise the returned pair stays zero. */
+double2 force(System &s,int evflag) {
+  ForceFunctor kernel(s);
+  double2 totals ;
+  totals.x = 0 ;
+  totals.y = 0 ;
+  if(evflag) {
+    Kokkos::parallel_reduce(s.nlocal,kernel,totals);
+  } else {
+    Kokkos::parallel_for(s.nlocal,kernel);
+  }
+  execution_space::fence();
+  return totals;
+}
+
diff --git a/lib/kokkos/example/md_skeleton/main.cpp b/lib/kokkos/example/md_skeleton/main.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..58cf76cab014fde0183d80a5dae347069a72f874
--- /dev/null
+++ b/lib/kokkos/example/md_skeleton/main.cpp
@@ -0,0 +1,205 @@
+/*
+//@HEADER
+// ************************************************************************
+// 
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+// 
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+// 
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact  H. Carter Edwards (hcedwar@sandia.gov)
+// 
+// ************************************************************************
+//@HEADER
+*/
+
+#include <cstdio>
+#include <cstring>
+#include <cstdlib>
+#include "system.h"
+
+int create_system(System &system, int nx, int ny, int nz, double rho);
+int neigh_setup(System &system);
+int neigh_build(System &system);
+double2 force(System &system,int evflag);
+
+/* simple MD Skeleton which
+ *   - constructs a simple FCC lattice,
+ *   - computes a neighborlist
+ *   - compute LJ-Force kernel a number of times
+ */
+
+int main(int argc, char** argv) {
+
+  printf("Running MD Skeleton\n");
+  /* Thread numbers for Host */
+
+  int num_threads = 1;
+  int teams = 1;
+  int device = 0; // Default device for GPU runs
+
+  /* avoid unused variable warnings */
+  (void)num_threads;
+  (void)teams;
+  (void)device;
+
+  /* Default value for number of force calculations */
+
+  int iter = 100;
+
+  /* Default value for system size (4*nx*ny*nz atoms)
+   * nx, ny and nz are set to system_size if not specified on commandline */
+
+  int system_size = 20;
+  int nx = -1;
+  int ny = -1;
+  int nz = -1;
+
+  int neighbor_size = 1; // Default bin size for neighbor list construction
+
+  double rho = 0.8442; // Number density of the system
+  double delta = 0; // Scaling factor for random offsets of atom positions
+
+
+  /* read in command-line arguments; every flag takes a value, so stop one
+   * short of argc: argv[++i] then never reads past the last real argument */
+  for(int i = 0; i + 1 < argc; i++) {
+    if((strcmp(argv[i], "-t") == 0) || (strcmp(argv[i], "--num_threads") == 0)) {
+      num_threads = atoi(argv[++i]);
+      continue;
+    }
+
+    if((strcmp(argv[i], "--teams") == 0)) {
+      teams = atoi(argv[++i]);
+      continue;
+    }
+
+    if((strcmp(argv[i], "-d") == 0) || (strcmp(argv[i], "--device") == 0))  {
+      device = atoi(argv[++i]);
+      continue;
+    }
+
+    if((strcmp(argv[i], "--delta") == 0)) {
+      delta = atof(argv[++i]);
+      continue;
+    }
+
+    if((strcmp(argv[i], "-i") == 0) || (strcmp(argv[i], "--iter") == 0))  {
+      iter = atoi(argv[++i]);
+      continue;
+    }
+
+    if((strcmp(argv[i], "-rho") == 0)) {
+      rho = atof(argv[++i]); // density is fractional; atoi would truncate it (0.8442 -> 0)
+      continue;
+    }
+
+    if((strcmp(argv[i], "-s") == 0) || (strcmp(argv[i], "--size") == 0)) {
+      system_size = atoi(argv[++i]);
+      continue;
+    }
+
+    if((strcmp(argv[i], "-nx") == 0)) {
+      nx = atoi(argv[++i]);
+      continue;
+    }
+
+    if((strcmp(argv[i], "-ny") == 0)) {
+      ny = atoi(argv[++i]);
+      continue;
+    }
+
+    if((strcmp(argv[i], "-nz") == 0)) {
+      nz = atoi(argv[++i]);
+      continue;
+    }
+
+    if((strcmp(argv[i], "-b") == 0) || (strcmp(argv[i], "--neigh_bins") == 0))  {
+      neighbor_size = atoi(argv[++i]);
+      continue;
+    }
+  }
+
+  if( nx < 0 ) nx = system_size;
+  if( ny < 0 ) ny = system_size;
+  if( nz < 0 ) nz = system_size;
+
+  printf("-> Init Device\n");
+
+#if defined( KOKKOS_HAVE_CUDA )
+  Kokkos::HostSpace::execution_space::initialize(teams*num_threads);
+  Kokkos::Cuda::SelectDevice select_device(device);
+  Kokkos::Cuda::initialize(select_device);
+#elif defined( KOKKOS_HAVE_OPENMP )
+  Kokkos::OpenMP::initialize(teams*num_threads);
+#elif defined( KOKKOS_HAVE_PTHREAD )
+  Kokkos::Threads::initialize(teams*num_threads);
+#endif
+
+  System system;
+  system.neigh_cut = 2.8;
+  system.force_cut = 2.5;
+  system.force_cutsq = system.force_cut*system.force_cut;
+  system.delta = delta;
+
+  printf("-> Build system\n");
+  create_system(system,nx,ny,nz,rho);
+
+  printf("-> Created %i atoms and %i ghost atoms\n",system.nlocal,system.nghost);
+
+  system.nbinx = system.box.xprd/neighbor_size+1;
+  system.nbiny = system.box.yprd/neighbor_size+1;
+  system.nbinz = system.box.zprd/neighbor_size+1;
+
+
+  printf("-> Building Neighborlist\n");
+
+  neigh_setup(system);
+  neigh_build(system);
+
+  double2 ev = force(system,1);
+
+  printf("-> Calculate Energy: %f Virial: %f\n",ev.x,ev.y);
+
+  printf("-> Running %i force calculations\n",iter);
+
+  Kokkos::Timer timer;
+
+  for(int i=0;i<iter;i++) {
+    force(system,0);
+  }
+
+
+  double time = timer.seconds();
+  printf("Time: %e s for %i iterations with %i atoms\n",time,iter,system.nlocal);
+
+  execution_space::finalize();
+}
diff --git a/lib/kokkos/example/md_skeleton/neighbor.cpp b/lib/kokkos/example/md_skeleton/neighbor.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..2a77932946f9dec0badb133ac1d6a08d5465d240
--- /dev/null
+++ b/lib/kokkos/example/md_skeleton/neighbor.cpp
@@ -0,0 +1,430 @@
+/*
+//@HEADER
+// ************************************************************************
+// 
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+// 
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+// 
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact  H. Carter Edwards (hcedwar@sandia.gov)
+// 
+// ************************************************************************
+//@HEADER
+*/
+
+#include <system.h>
+#include <cstdio>
+#include <Kokkos_Core.hpp>
+
+#define SMALL 1.0e-6
+#define FACTOR 0.999
+
+/* BinningFunctor puts atoms into bins of the simulation box
+ * Neighborlists are then created by checking only distances of atoms
+ * in adjacent bins. That makes neighborlist construction a O(N) operation.
+ */
+
+struct BinningFunctor {
+  typedef t_int_2d::execution_space execution_space;
+
+  System s;  // by-value copy of the system taken at construction
+
+  int atoms_per_bin;  // capacity of one bin: second extent of s.bins at construction
+
+  BinningFunctor(System _s): s(_s) {
+    atoms_per_bin = s.bins.dimension_1();
+  }
+
+  KOKKOS_INLINE_FUNCTION
+  void operator() (const int &i) const
+  {
+    const int ibin = coord2bin(s.d_x(i, 0), s.d_x(i, 1), s.d_x(i, 2));
+    // Reserve a slot in the bin; ac is the bin's count before this atom was added.
+    const int ac = Kokkos::atomic_fetch_add(&s.bincount[ibin], 1);
+
+    if(ac < atoms_per_bin) {
+      s.bins(ibin, ac) = i;
+    } else if(s.d_resize(0) < ac) {
+      s.d_resize(0) = ac;  // NOTE(review): plain read-then-write, not an atomic max -- confirm racing writers are acceptable
+    }
+  }
+
+  KOKKOS_INLINE_FUNCTION
+  int coord2bin(double x, double y, double z) const
+  {
+    int ix, iy, iz;  // per-axis bin indices, shifted by the lowest bin index (mbin*lo)
+
+    if(x >= s.box.xprd)
+      ix = (int)((x - s.box.xprd) * s.bininvx) + s.nbinx - s.mbinxlo;
+    else if(x >= 0.0)
+      ix = (int)(x * s.bininvx) - s.mbinxlo;
+    else
+      ix = (int)(x * s.bininvx) - s.mbinxlo - 1;  // (int) truncates toward zero, so step one bin down for x < 0
+
+    if(y >= s.box.yprd)
+      iy = (int)((y - s.box.yprd) * s.bininvy) + s.nbiny - s.mbinylo;
+    else if(y >= 0.0)
+      iy = (int)(y * s.bininvy) - s.mbinylo;
+    else
+      iy = (int)(y * s.bininvy) - s.mbinylo - 1;
+
+    if(z >= s.box.zprd)
+      iz = (int)((z - s.box.zprd) * s.bininvz) + s.nbinz - s.mbinzlo;
+    else if(z >= 0.0)
+      iz = (int)(z * s.bininvz) - s.mbinzlo;
+    else
+      iz = (int)(z * s.bininvz) - s.mbinzlo - 1;
+    // Flatten (ix, iy, iz) with x varying fastest; the result carries an extra +1.
+    return (iz * s.mbiny * s.mbinx + iy * s.mbinx + ix + 1);
+  }
+};
+
+/* Build the actual neighborlist*/
+
+struct BuildFunctor {
+
+  typedef t_int_2d::execution_space execution_space;
+
+  System s;  // by-value copy of the system taken at construction
+
+  int maxneighs;  // capacity of one neighbor row: second extent of s.neighbors at construction
+  BuildFunctor(System _s): s(_s) {
+    maxneighs = s.neighbors.dimension_1();
+  }
+
+  KOKKOS_INLINE_FUNCTION
+  void operator() (const int &i) const
+  {
+    int n = 0;  // neighbors stored so far for atom i; never grows past maxneighs
+
+    const t_int_1d_const_um bincount_c = s.bincount;
+
+    const double xtmp = s.d_x(i, 0);
+    const double ytmp = s.d_x(i, 1);
+    const double ztmp = s.d_x(i, 2);
+
+    const int ibin = coord2bin(xtmp, ytmp, ztmp);
+
+    // loop over all bins in neighborhood (includes ibin)
+    for(int k = 0; k < s.nstencil; k++) {
+      const int jbin = ibin + s.d_stencil[k];
+
+      // get subview of jbin
+      const t_int_1d_const_um loc_bin =
+          Kokkos::subview(s.bins,jbin,Kokkos::ALL());
+
+      if(ibin == jbin)
+        for(int m = 0; m < bincount_c[jbin]; m++) {
+          const int j = loc_bin[m];
+
+          //for same bin as atom i skip j if i==j
+          if (j == i) continue;
+
+          const double delx = xtmp - s.d_x(j, 0);
+          const double dely = ytmp - s.d_x(j, 1);
+          const double delz = ztmp - s.d_x(j, 2);
+          const double rsq = delx * delx + dely * dely + delz * delz;
+
+          if(rsq <= s.neigh_cutsq && n<maxneighs) s.neighbors(i,n++) = j;
+        }
+      else {
+        for(int m = 0; m < bincount_c[jbin]; m++) {
+          const int j = loc_bin[m];
+
+          const double delx = xtmp - s.d_x(j, 0);
+          const double dely = ytmp - s.d_x(j, 1);
+          const double delz = ztmp - s.d_x(j, 2);
+          const double rsq = delx * delx + dely * dely + delz * delz;
+
+          if(rsq <= s.neigh_cutsq && n<maxneighs) s.neighbors(i,n++) = j;
+        }
+      }
+    }
+
+    s.numneigh[i] = n;
+    // A full row (n == maxneighs) is reported as needing a resize, even if nothing was dropped.
+    if(n >= maxneighs) {
+      if(n >= s.d_resize(0)) s.d_resize(0) = n;  // NOTE(review): non-atomic max update -- confirm racing writers are acceptable
+    }
+  }
+
+  KOKKOS_INLINE_FUNCTION
+  int coord2bin(double x, double y, double z) const
+  {
+    int ix, iy, iz;  // per-axis bin indices, shifted by the lowest bin index (mbin*lo)
+
+    if(x >= s.box.xprd)
+      ix = (int)((x - s.box.xprd) * s.bininvx) + s.nbinx - s.mbinxlo;
+    else if(x >= 0.0)
+      ix = (int)(x * s.bininvx) - s.mbinxlo;
+    else
+      ix = (int)(x * s.bininvx) - s.mbinxlo - 1;  // (int) truncates toward zero, so step one bin down for x < 0
+
+    if(y >= s.box.yprd)
+      iy = (int)((y - s.box.yprd) * s.bininvy) + s.nbiny - s.mbinylo;
+    else if(y >= 0.0)
+      iy = (int)(y * s.bininvy) - s.mbinylo;
+    else
+      iy = (int)(y * s.bininvy) - s.mbinylo - 1;
+
+    if(z >= s.box.zprd)
+      iz = (int)((z - s.box.zprd) * s.bininvz) + s.nbinz - s.mbinzlo;
+    else if(z >= 0.0)
+      iz = (int)(z * s.bininvz) - s.mbinzlo;
+    else
+      iz = (int)(z * s.bininvz) - s.mbinzlo - 1;
+    // Flatten (ix, iy, iz) with x varying fastest; the result carries an extra +1.
+    return (iz * s.mbiny * s.mbinx + iy * s.mbinx + ix + 1);
+  }
+};
+
+/* Reset an array to zero */
+
+struct MemsetZeroFunctor {
+  typedef t_x_array::execution_space  execution_space ;
+  void* ptr;  // untyped start of the buffer; elements are written as int
+  KOKKOS_INLINE_FUNCTION void operator()(const int i) const {
+    static_cast<int*>(ptr)[i] = 0;  // clear element i
+  }
+};
+
+/* Calculate distance of two bins */
+
+double bindist(System &s, int i, int j, int k)
+{
+  // Squared separation between the central bin and the bin at integer offset
+  // (i, j, k). Along each axis the gap is (offset - 1) bin widths for positive
+  // offsets, (offset + 1) for negative ones and zero for offset 0, so bins
+  // that touch the central bin contribute nothing.
+  double gapx = 0.0;
+  double gapy = 0.0;
+  double gapz = 0.0;
+
+  if(i > 0)
+    gapx = (i - 1) * s.binsizex;
+  else if(i < 0)
+    gapx = (i + 1) * s.binsizex;
+
+  if(j > 0)
+    gapy = (j - 1) * s.binsizey;
+  else if(j < 0)
+    gapy = (j + 1) * s.binsizey;
+
+  if(k > 0)
+    gapz = (k - 1) * s.binsizez;
+  else if(k < 0)
+    gapz = (k + 1) * s.binsizez;
+
+  return (gapx * gapx + gapy * gapy + gapz * gapz);
+}
+
+/* Setup the neighborlist construction
+ * Determine binsizes, a stencil for defining adjacency, etc.
+ */
+
+void neigh_setup(System &s) {
+
+  s.neigh_cutsq = s.neigh_cut * s.neigh_cut;
+
+  /*
+  c bins must evenly divide into box size,
+  c   becoming larger than cutneigh if necessary
+  c binsize = 1/2 of cutoff is near optimal
+
+  if (flag == 0) {
+    nbinx = 2.0 * xprd / cutneigh;
+    nbiny = 2.0 * yprd / cutneigh;
+    nbinz = 2.0 * zprd / cutneigh;
+    if (nbinx == 0) nbinx = 1;
+    if (nbiny == 0) nbiny = 1;
+    if (nbinz == 0) nbinz = 1;
+  }
+  */
+  // Bin widths and their reciprocals; s.nbinx/y/z must already be set by the caller.
+  s.binsizex = s.box.xprd / s.nbinx;
+  s.binsizey = s.box.yprd / s.nbiny;
+  s.binsizez = s.box.zprd / s.nbinz;
+  s.bininvx = 1.0 / s.binsizex;
+  s.bininvy = 1.0 / s.binsizey;
+  s.bininvz = 1.0 / s.binsizez;
+  // Lowest/highest bin index per axis for the box padded by neigh_cut plus a small margin.
+  double coord = s.box.xlo - s.neigh_cut - SMALL * s.box.xprd;
+  s.mbinxlo = static_cast<int>(coord * s.bininvx);
+
+  if(coord < 0.0) s.mbinxlo = s.mbinxlo - 1;  // the cast truncates toward zero; step down for negative coords
+
+  coord = s.box.xhi + s.neigh_cut + SMALL * s.box.xprd;
+  int mbinxhi = static_cast<int>(coord * s.bininvx);
+
+  coord = s.box.ylo - s.neigh_cut - SMALL * s.box.yprd;
+  s.mbinylo = static_cast<int>(coord * s.bininvy);
+
+  if(coord < 0.0) s.mbinylo = s.mbinylo - 1;
+
+  coord = s.box.yhi + s.neigh_cut + SMALL * s.box.yprd;
+  int mbinyhi = static_cast<int>(coord * s.bininvy);
+
+  coord = s.box.zlo - s.neigh_cut - SMALL * s.box.zprd;
+  s.mbinzlo = static_cast<int>(coord * s.bininvz);
+
+  if(coord < 0.0) s.mbinzlo = s.mbinzlo - 1;
+
+  coord = s.box.zhi + s.neigh_cut + SMALL * s.box.zprd;
+  int mbinzhi = static_cast<int>(coord * s.bininvz);
+
+  /* extend bins by 1 in each direction to insure stencil coverage */
+
+  s.mbinxlo = s.mbinxlo - 1;
+  mbinxhi = mbinxhi + 1;
+  s.mbinx = mbinxhi - s.mbinxlo + 1;
+
+  s.mbinylo = s.mbinylo - 1;
+  mbinyhi = mbinyhi + 1;
+  s.mbiny = mbinyhi - s.mbinylo + 1;
+
+  s.mbinzlo = s.mbinzlo - 1;
+  mbinzhi = mbinzhi + 1;
+  s.mbinz = mbinzhi - s.mbinzlo + 1;
+
+  /*
+  compute bin stencil of all bins whose closest corner to central bin
+  is within neighbor cutoff
+  for partial Newton (newton = 0),
+  stencil is all surrounding bins including self
+  for full Newton (newton = 1),
+  stencil is bins to the "upper right" of central bin, does NOT include self
+  next(xyz) = how far the stencil could possibly extend
+  factor < 1.0 for special case of LJ benchmark so code will create
+  correct-size stencil when there are 3 bins for every 5 lattice spacings
+  */
+
+  int nextx = static_cast<int>(s.neigh_cut * s.bininvx);
+
+  if(nextx * s.binsizex < FACTOR * s.neigh_cut) nextx++;
+
+  int nexty = static_cast<int>(s.neigh_cut * s.bininvy);
+
+  if(nexty * s.binsizey < FACTOR * s.neigh_cut) nexty++;
+
+  int nextz = static_cast<int>(s.neigh_cut * s.bininvz);
+
+  if(nextz * s.binsizez < FACTOR * s.neigh_cut) nextz++;
+
+  int nmax = (2 * nextz + 1) * (2 * nexty + 1) * (2 * nextx + 1);  // upper bound on stencil entries
+  s.d_stencil = t_int_1d("stencil", nmax);
+  s.h_stencil = Kokkos::create_mirror_view(s.d_stencil);
+  s.nstencil = 0;
+  int kstart = -nextz;
+  // Stencil covers the full range in all three directions (central bin included).
+  for(int k = kstart; k <= nextz; k++) {
+    for(int j = -nexty; j <= nexty; j++) {
+      for(int i = -nextx; i <= nextx; i++) {
+        if(bindist(s,i, j, k) < s.neigh_cutsq) {
+          s.h_stencil(s.nstencil++) = k * s.mbiny * s.mbinx + j * s.mbinx + i;  // flattened bin-index offset
+        }
+      }
+    }
+  }
+
+  /* Allocate neighbor arrays */
+
+  Kokkos::deep_copy(s.d_stencil, s.h_stencil);
+  s.mbins = s.mbinx * s.mbiny * s.mbinz;
+  s.bincount = t_int_1d("bincount", s.mbins);
+  s.bins = t_int_2d("bins", s.mbins, 8);  // initial guess of 8 atoms per bin; neigh_build reallocates if too small
+
+  s.neighbors = t_neighbors("neighbors",s.natoms,80);  // initial guess of 80 neighbors per atom; neigh_build reallocates
+  s.numneigh = t_int_1d("numneigh",s.natoms);
+  s.d_resize = t_int_scalar("resize");
+  s.h_resize = Kokkos::create_mirror_view(s.d_resize);
+}
+
+
+/* Build the neighborlist
+ * This is a try and rerun algorithm for handling the case where the bins array
+ * and the neighbors array are not big enough. So if one is too small, it will
+ * reallocate and rerun the binning algorithm or the neighborlist construction.
+ */
+
+void neigh_build(System &s) {
+
+  /* Binning of atoms */
+
+  s.h_resize(0) = 1;  // force the first pass through the loop
+
+  while(s.h_resize(0) > 0) {
+    s.h_resize(0) = 0;
+    Kokkos::deep_copy(s.d_resize, s.h_resize);  // clear the device-side resize flag
+
+    MemsetZeroFunctor f_zero;
+    f_zero.ptr = (void*) s.bincount.ptr_on_device();
+    Kokkos::parallel_for(s.mbins, f_zero);  // reset every bin count to zero
+    execution_space::fence();
+
+    BinningFunctor f(s);
+    Kokkos::parallel_for(s.natoms, f);
+    execution_space::fence();
+
+    /* Check if bins was large enough; if not, reallocate and rerun */
+
+    deep_copy(s.h_resize, s.d_resize);
+
+    if(s.h_resize(0)) {
+      int atoms_per_bin = s.h_resize(0)+2;  // reported pre-insert count plus two slots
+      s.bins = t_int_2d("bins", s.mbins, atoms_per_bin);
+    }
+  }
+
+  /* Neighborlist construction */
+
+  s.h_resize(0) = 1;  // force the first pass through the loop
+
+  while(s.h_resize(0)) {
+    s.h_resize(0) = 0;
+
+    Kokkos::deep_copy(s.d_resize, s.h_resize);
+
+    BuildFunctor f(s);
+    Kokkos::parallel_for(s.nlocal, f);  // lists are built for local atoms only
+
+    execution_space::fence();
+
+    /* Check if neighbors was large enough; if not, reallocate and rerun */
+
+    deep_copy(s.h_resize, s.d_resize);
+
+    if(s.h_resize(0)) {
+      int maxneighs = s.h_resize(0) * 1.2;  // grow by 20% over the reported count (truncated to int)
+      s.neighbors = t_neighbors("neighbors", s.natoms, maxneighs);
+    }
+  }
+}
diff --git a/lib/kokkos/example/md_skeleton/setup.cpp b/lib/kokkos/example/md_skeleton/setup.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..7815a89101ce53dbe7c7b9cf51e0ab314acbd36b
--- /dev/null
+++ b/lib/kokkos/example/md_skeleton/setup.cpp
@@ -0,0 +1,271 @@
+/*
+//@HEADER
+// ************************************************************************
+// 
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+// 
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+// 
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact  H. Carter Edwards (hcedwar@sandia.gov)
+// 
+// ************************************************************************
+//@HEADER
+*/
+
+#include <system.h>
+#include <cmath>
+#include <cstdio>
+#include <cstdlib>
+/* initialize atoms on fcc lattice in parallel fashion */
+
+#define MAX(a,b) ((a)>(b)?(a):(b))  /* fully parenthesised: safe for any operand expression (operands still evaluated twice) */
+#define MIN(a,b) ((a)<(b)?(a):(b))  /* fully parenthesised: safe for any operand expression (operands still evaluated twice) */
+
+
+int create_system(System &system, int nx, int ny, int nz, double rho)
+{
+  /* Box Setup */
+
+  double lattice = pow((4.0 / rho), (1.0 / 3.0));  // cell edge for 4 atoms per cell at number density rho
+  system.box.xprd = nx * lattice;
+  system.box.yprd = ny * lattice;
+  system.box.zprd = nz * lattice;
+  system.box.xlo = 0;
+  system.box.ylo = 0;
+  system.box.zlo = 0;
+  system.box.xhi = system.box.xprd;
+  system.box.yhi = system.box.yprd;
+  system.box.zhi = system.box.zprd;
+
+
+  int ghost_dist = int(system.neigh_cut/lattice) + 1;  // ghost shell thickness in lattice cells; neigh_cut must be set by the caller
+
+  /* total # of atoms */
+
+  system.nlocal = 4 * nx * ny * nz;
+  system.nghost = 4 * (nx + 2 * ghost_dist) *
+                      (ny + 2 * ghost_dist) *
+                      (nz + 2 * ghost_dist) -
+                      system.nlocal;
+  system.natoms = system.nlocal + system.nghost;
+
+  system.d_x = t_x_array("X",system.natoms);
+  system.h_x = Kokkos::create_mirror_view(system.d_x);
+  system.f = t_f_array("F",system.natoms);
+
+  /* determine loop bounds of lattice subsection that overlaps my sub-box
+     insure loop bounds do not exceed nx,ny,nz */
+
+  double alat = pow((4.0 / rho), (1.0 / 3.0));  // same value as lattice above
+  int ilo = static_cast<int>(system.box.xlo / (0.5 * alat) - 1);
+  int ihi = static_cast<int>(system.box.xhi / (0.5 * alat) + 1);
+  int jlo = static_cast<int>(system.box.ylo / (0.5 * alat) - 1);
+  int jhi = static_cast<int>(system.box.yhi / (0.5 * alat) + 1);
+  int klo = static_cast<int>(system.box.zlo / (0.5 * alat) - 1);
+  int khi = static_cast<int>(system.box.zhi / (0.5 * alat) + 1);
+
+  ilo = MAX(ilo, 0);
+  ihi = MIN(ihi, 2 * nx - 1);
+  jlo = MAX(jlo, 0);
+  jhi = MIN(jhi, 2 * ny - 1);
+  klo = MAX(klo, 0);
+  khi = MIN(khi, 2 * nz - 1);
+
+
+
+  /* generates positions of atoms on fcc sublattice*/
+
+  srand(3718273);  // fixed seed: the random offsets are the same on every run
+  /* create non-ghost atoms */
+  {
+    double xtmp, ytmp, ztmp;
+    int sx = 0;  // sx,sy,sz: position inside the current sub-box
+    int sy = 0;
+    int sz = 0;
+    int ox = 0;  // ox,oy,oz: index of the current sub-box
+    int oy = 0;
+    int oz = 0;
+    int subboxdim = 8;  // sub-boxes are 8x8x8 half-lattice sites
+
+    int n = 0;  // atoms created so far
+    int iflag = 0;  // NOTE(review): never set to nonzero anywhere in this function
+
+    while(oz * subboxdim <= khi) {
+      const int k = oz * subboxdim + sz;
+      const int j = oy * subboxdim + sy;
+      const int i = ox * subboxdim + sx;
+
+      if(iflag) continue;  // NOTE(review): dead guard; it would spin forever if iflag were ever set
+
+      if(((i + j + k) % 2 == 0) &&
+          (i >= ilo) && (i <= ihi) &&
+          (j >= jlo) && (j <= jhi) &&
+          (k >= klo) && (k <= khi)) {
+
+        const int nold = n;
+        while(nold == n) {  // redraw the random offset until the atom lands inside the box
+          xtmp = 0.5 * alat * i + system.delta/1000*(rand()%1000-500);
+          ytmp = 0.5 * alat * j + system.delta/1000*(rand()%1000-500);
+          ztmp = 0.5 * alat * k + system.delta/1000*(rand()%1000-500);
+
+          if(xtmp >= system.box.xlo && xtmp < system.box.xhi &&
+              ytmp >= system.box.ylo && ytmp < system.box.yhi &&
+              ztmp >= system.box.zlo && ztmp < system.box.zhi) {
+            system.h_x(n,0) = xtmp;
+            system.h_x(n,1) = ytmp;
+            system.h_x(n,2) = ztmp;
+            n++;
+          }
+        }
+      }
+
+      sx++;  // advance to the next site: x fastest inside a sub-box, then sub-boxes in x, y, z
+
+      if(sx == subboxdim) {
+        sx = 0;
+        sy++;
+      }
+
+      if(sy == subboxdim) {
+        sy = 0;
+        sz++;
+      }
+
+      if(sz == subboxdim) {
+        sz = 0;
+        ox++;
+      }
+
+      if(ox * subboxdim > ihi) {
+        ox = 0;
+        oy++;
+      }
+
+      if(oy * subboxdim > jhi) {
+        oy = 0;
+        oz++;
+      }
+    }
+
+    /* check that correct # of atoms were created */
+
+    if(system.nlocal != n) {
+      printf("Created incorrect # of atoms\n");
+
+      return 1;  // positions are not copied to the device on this path
+    }
+  }
+
+  /* create ghost atoms */
+
+  {
+    double xtmp, ytmp, ztmp;
+
+    int ilo_g = ilo - 2 * ghost_dist;  // ghost range in half-lattice units: 2 sites per cell
+    int jlo_g = jlo - 2 * ghost_dist;
+    int klo_g = klo - 2 * ghost_dist;
+    int ihi_g = ihi + 2 * ghost_dist;
+    int jhi_g = jhi + 2 * ghost_dist;
+    int khi_g = khi + 2 * ghost_dist;
+
+    int subboxdim = 8;
+    int sx = 0;
+    int sy = 0;
+    int sz = 0;
+    int ox = subboxdim * ilo_g;  // NOTE(review): first site is ox*subboxdim = 64*ilo_g, well below ilo_g; extra sites are filtered out below
+    int oy = subboxdim * jlo_g;
+    int oz = subboxdim * klo_g;
+
+    int n = system.nlocal;  // ghost atoms are stored after the local atoms
+    int iflag = 0;  // NOTE(review): never set to nonzero anywhere in this function
+
+
+    while(oz * subboxdim <= khi_g) {
+      const int k = oz * subboxdim + sz;
+      const int j = oy * subboxdim + sy;
+      const int i = ox * subboxdim + sx;
+
+      if(iflag) continue;  // NOTE(review): dead guard; it would spin forever if iflag were ever set
+
+      if(((i + j + k) % 2 == 0) &&
+          (i >= ilo_g) && (i <= ihi_g) &&
+          (j >= jlo_g) && (j <= jhi_g) &&
+          (k >= klo_g) && (k <= khi_g) &&
+          ((i < ilo) || (i > ihi) ||
+           (j < jlo) || (j > jhi) ||
+           (k < klo) || (k > khi))
+          ) {
+
+        xtmp = 0.5 * alat * i;
+        ytmp = 0.5 * alat * j;
+        ztmp = 0.5 * alat * k;
+
+        system.h_x(n,0) = xtmp + system.delta/1000*(rand()%1000-500);;
+        system.h_x(n,1) = ytmp + system.delta/1000*(rand()%1000-500);;
+        system.h_x(n,2) = ztmp + system.delta/1000*(rand()%1000-500);;
+        n++;
+      }
+
+      sx++;
+
+      if(sx == subboxdim) {
+        sx = 0;
+        sy++;
+      }
+
+      if(sy == subboxdim) {
+        sy = 0;
+        sz++;
+      }
+
+      if(sz == subboxdim) {
+        sz = 0;
+        ox++;
+        //printf("%i %i %i // %i %i %i\n",ox,oy,oz,i,j,k);
+      }
+
+      if(ox * subboxdim > ihi_g) {
+        ox = subboxdim * ilo_g;
+        oy++;
+      }
+
+      if(oy * subboxdim > jhi_g) {
+        oy = subboxdim * jlo_g;
+        oz++;
+      }
+    }
+  }
+
+  Kokkos::deep_copy(system.d_x,system.h_x);  // copy host positions into d_x
+  return 0;
+}
+
diff --git a/lib/kokkos/example/md_skeleton/system.h b/lib/kokkos/example/md_skeleton/system.h
new file mode 100644
index 0000000000000000000000000000000000000000..0184a119ff2d260442e624cd1f5e8a890cefe24f
--- /dev/null
+++ b/lib/kokkos/example/md_skeleton/system.h
@@ -0,0 +1,92 @@
+/*
+//@HEADER
+// ************************************************************************
+// 
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+// 
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+// 
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact  H. Carter Edwards (hcedwar@sandia.gov)
+// 
+// ************************************************************************
+//@HEADER
+*/
+
+#ifndef SYSTEM_H_
+#define SYSTEM_H_
+
+#include <types.h>
+
+struct Box {
+  double xprd, yprd, zprd;  // box extent along each axis
+  double xlo, xhi;          // lower / upper box bound in x
+  double ylo, yhi;          // lower / upper box bound in y
+  double zlo, zhi;          // lower / upper box bound in z
+};
+
+struct System {
+  Box box;
+
+  int natoms;  // nlocal + nghost
+  int nlocal;  // atoms inside the box
+  int nghost;  // atoms in the ghost shell around the box
+
+  t_x_array d_x;       // atom positions
+  t_x_array_host h_x;  // host mirror of d_x
+
+  t_f_array f;  // per-atom force accumulator
+
+  t_neighbors neighbors;  // neighbor indices, one row per atom
+  t_int_1d numneigh;      // number of valid entries in each neighbors row
+
+  double delta;  // scaling factor for the random offsets of atom positions
+
+  double neigh_cut,neigh_cutsq;  // neighbor-list cutoff and its square
+
+  int mbins;                          // total bin count: mbinx * mbiny * mbinz
+  int nbinx,nbiny,nbinz;              // bins per axis across the box itself
+  int mbinx,mbiny,mbinz;              // bins per axis including the padding around the box
+  int mbinxlo,mbinylo,mbinzlo;        // lowest bin index per axis
+  double binsizex,binsizey,binsizez;  // bin width per axis
+  double bininvx,bininvy,bininvz;     // reciprocal bin width per axis
+
+  t_int_1d bincount;           // number of atoms placed in each bin
+  t_int_2d bins;               // atom indices stored per bin
+  t_int_scalar d_resize;       // set by the kernels when bins/neighbors was too small
+  t_int_scalar_host h_resize;  // host mirror of d_resize
+  t_int_1d d_stencil;          // flattened bin-index offsets of the neighbor stencil
+  t_int_1d_host h_stencil;     // host mirror of d_stencil
+  int nstencil;                // number of valid stencil entries
+
+  double force_cut,force_cutsq;  // force cutoff and its square
+};
+#endif
diff --git a/lib/kokkos/example/md_skeleton/types.h b/lib/kokkos/example/md_skeleton/types.h
new file mode 100644
index 0000000000000000000000000000000000000000..7f92b7cd0f8089d93c1e18e5dff3ad1508316867
--- /dev/null
+++ b/lib/kokkos/example/md_skeleton/types.h
@@ -0,0 +1,118 @@
+/*
+//@HEADER
+// ************************************************************************
+// 
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+// 
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+// 
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact  H. Carter Edwards (hcedwar@sandia.gov)
+// 
+// ************************************************************************
+//@HEADER
+*/
+
+#ifndef TYPES_H_
+#define TYPES_H_
+
+/* Determine default device type and necessary includes */
+
+#include <Kokkos_Core.hpp>
+
+typedef Kokkos::DefaultExecutionSpace execution_space ;
+
+#if ! defined( KOKKOS_HAVE_CUDA )
+  // Two-component double value; compiled only when KOKKOS_HAVE_CUDA is not defined.
+  struct double2 {
+    double x, y;
+    // Construct from explicit components.
+    KOKKOS_INLINE_FUNCTION
+    double2(double xinit, double yinit) : x(xinit), y(yinit) {}
+
+    // Default construction zeroes both components.
+    KOKKOS_INLINE_FUNCTION
+    double2() : x(0.0), y(0.0) {}
+
+    // Component-wise accumulation.
+    KOKKOS_INLINE_FUNCTION
+    double2& operator += (const double2& src) {
+      x += src.x;
+      y += src.y;
+      return *this;
+    }
+
+    // Component-wise accumulation on volatile operands.
+    KOKKOS_INLINE_FUNCTION
+    volatile double2& operator += (const volatile double2& src) volatile {
+      x += src.x;
+      y += src.y;
+      return *this;
+    }
+  };
+#endif
+
+#include <impl/Kokkos_Timer.hpp>
+
+/* Define types used throughout the code */
+
+// Position arrays: N x 3 double views in LayoutRight, with host-mirror, const and random-access const variants
+typedef Kokkos::View<double*[3], Kokkos::LayoutRight, execution_space>                                   t_x_array ;
+typedef t_x_array::HostMirror                                                                        t_x_array_host ;
+typedef Kokkos::View<const double*[3], Kokkos::LayoutRight, execution_space>                             t_x_array_const ;
+typedef Kokkos::View<const double*[3], Kokkos::LayoutRight, execution_space, Kokkos::MemoryRandomAccess >  t_x_array_randomread ;
+
+// Force array: N x 3 double view; no layout argument is given here
+typedef Kokkos::View<double*[3],  execution_space>                                                       t_f_array ;
+
+
+// Neighbor list: rank-2 int view and its const form, plus unmanaged rank-1 (_sub) views
+typedef Kokkos::View<int**, execution_space >                                                            t_neighbors ;
+typedef Kokkos::View<const int**, execution_space >                                                      t_neighbors_const ;
+typedef Kokkos::View<int*, execution_space, Kokkos::MemoryUnmanaged >                                    t_neighbors_sub ;
+typedef Kokkos::View<const int*, execution_space, Kokkos::MemoryUnmanaged >                              t_neighbors_const_sub ;
+
+// 1d int arrays: managed, host-mirror, const, and unmanaged (_um) variants
+typedef Kokkos::View<int*, execution_space >                                                             t_int_1d ;
+typedef t_int_1d::HostMirror                                                                         t_int_1d_host ;
+typedef Kokkos::View<const int*, execution_space >                                                       t_int_1d_const ;
+typedef Kokkos::View<int*, execution_space , Kokkos::MemoryUnmanaged>                                    t_int_1d_um ;
+typedef Kokkos::View<const int* , execution_space , Kokkos::MemoryUnmanaged>                             t_int_1d_const_um ;
+
+// 2d int array (LayoutRight) and its host mirror
+typedef Kokkos::View<int**, Kokkos::LayoutRight, execution_space >                                       t_int_2d ;
+typedef t_int_2d::HostMirror                                                                         t_int_2d_host ;
+
+// Scalar ints: single-element views (int[1], LayoutLeft) and their host mirror
+typedef Kokkos::View<int[1], Kokkos::LayoutLeft, execution_space>                                        t_int_scalar ;
+typedef t_int_scalar::HostMirror                                                                     t_int_scalar_host ;
+
+#endif /* TYPES_H_ */
diff --git a/lib/kokkos/example/multi_fem/BoxMeshFixture.hpp b/lib/kokkos/example/multi_fem/BoxMeshFixture.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..195bb4a6ae483dfc3ef07b78b17de4dea9b02226
--- /dev/null
+++ b/lib/kokkos/example/multi_fem/BoxMeshFixture.hpp
@@ -0,0 +1,610 @@
+/*
+//@HEADER
+// ************************************************************************
+// 
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+// 
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+// 
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact  H. Carter Edwards (hcedwar@sandia.gov)
+// 
+// ************************************************************************
+//@HEADER
+*/
+
+#ifndef KOKKOS_BOXMESHFIXTURE_HPP
+#define KOKKOS_BOXMESHFIXTURE_HPP
+
+#include <cmath>
+#include <stdexcept>
+#include <sstream>
+
+#include <Kokkos_Core.hpp>
+#include <BoxMeshPartition.hpp>
+#include <FEMesh.hpp>
+#include <HexElement.hpp>
+
+//----------------------------------------------------------------------------
+
+struct FixtureElementHex8 {
+  // Hexahedral element fixture with 8 nodes per element.
+  static const unsigned element_node_count = 8 ;
+
+  HybridFEM::HexElement_TensorData< element_node_count > elem_data ;
+  BoxBoundsLinear box_bounds ;
+
+  FixtureElementHex8() : elem_data(), box_bounds() {}
+
+  // Node boxes are plain copies of the corresponding vertex boxes.
+  static void create_node_boxes_from_vertex_boxes(
+    const BoxType & vertex_box_global , const std::vector< BoxType > & vertex_box_parts ,
+    BoxType & node_box_global , std::vector< BoxType > & node_box_parts )
+  {
+    node_box_global = vertex_box_global ;
+    node_box_parts  = vertex_box_parts  ;
+  }
+
+  // Shifts coord in place by eval_map[node_local], one axis at a time.
+  void elem_to_node( const unsigned node_local , unsigned coord[] ) const
+  {
+    for ( unsigned axis = 0 ; axis < 3 ; ++axis ) {
+      coord[axis] += elem_data.eval_map[ node_local ][axis] ;
+    }
+  }
+};
+
+struct FixtureElementHex27 {
+  static const unsigned element_node_count = 27 ;
+
+  HybridFEM::HexElement_TensorData< element_node_count > elem_data ;
+  BoxBoundsQuadratic box_bounds ;
+
+  FixtureElementHex27() : elem_data(), box_bounds() {}
+
+  // Derives node boxes from vertex boxes for the 27-node element:
+  // each upper bound U of the global box becomes 2*U - 1, each part's
+  // lower bounds are doubled, and each part's upper bounds are doubled
+  // and then capped at the new global upper bound.
+  static void create_node_boxes_from_vertex_boxes(
+    const BoxType                & vertex_box_global ,
+    const std::vector< BoxType > & vertex_box_parts ,
+          BoxType                & node_box_global ,
+          std::vector< BoxType > & node_box_parts )
+  {
+    node_box_global = vertex_box_global ;
+    node_box_parts  = vertex_box_parts  ;
+
+    for ( unsigned axis = 0 ; axis < 3 ; ++axis ) {
+      node_box_global[axis][1] = 2 * node_box_global[axis][1] - 1 ;
+    }
+
+    for ( unsigned p = 0 ; p < vertex_box_parts.size() ; ++p ) {
+      for ( unsigned axis = 0 ; axis < 3 ; ++axis ) {
+        node_box_parts[p][axis][0] = 2 * node_box_parts[p][axis][0] ;
+        node_box_parts[p][axis][1] = std::min( node_box_global[axis][1] , 2 * node_box_parts[p][axis][1] );
+      }
+    }
+  }
+
+  // Maps an element grid index to a node grid index in place:
+  // coord = 2 * coord + eval_map[node_local], per axis.
+  void elem_to_node( const unsigned node_local , unsigned coord[] ) const
+  {
+    for ( unsigned axis = 0 ; axis < 3 ; ++axis ) {
+      coord[axis] = 2 * coord[axis] + elem_data.eval_map[ node_local ][axis] ;
+    }
+  }
+};
+
+//----------------------------------------------------------------------------
+
+template< typename Scalar , class Device , class ElementSpec >
+struct BoxMeshFixture {
+
+  typedef Scalar  coordinate_scalar_type ;
+  typedef Device  execution_space ;
+
+  static const unsigned element_node_count = ElementSpec::element_node_count ;
+
+  typedef HybridFEM::FEMesh< coordinate_scalar_type ,
+                             element_node_count ,
+                             execution_space > FEMeshType ;
+
+  typedef typename FEMeshType::node_coords_type    node_coords_type ;
+  typedef typename FEMeshType::elem_node_ids_type  elem_node_ids_type ;
+  typedef typename FEMeshType::node_elem_ids_type  node_elem_ids_type ;
+
+
+  static void verify(
+    const typename FEMeshType::node_coords_type::HostMirror   & node_coords ,
+    const typename FEMeshType::elem_node_ids_type::HostMirror & elem_node_ids ,
+    const typename FEMeshType::node_elem_ids_type::HostMirror & node_elem_ids )
+  {
+    // Consistency check of the host-side mesh arrays; throws
+    // std::runtime_error describing the first mismatch found.
+    typedef typename FEMeshType::size_type  size_type ;
+
+    const size_type total_nodes = node_coords.dimension_0();
+    const size_type total_elems = elem_node_ids.dimension_0();
+
+    const ElementSpec element ;
+
+    // Pass 1: every (element, local node) entry listed for a node in
+    // node_elem_ids must map back to that node through elem_node_ids.
+    for ( size_type inode = 0 ; inode < total_nodes ; ++inode ) {
+      for ( size_type j = node_elem_ids.row_map[ inode ] ;
+                      j < node_elem_ids.row_map[ inode + 1 ] ; ++j ) {
+
+        const size_type ielem  = node_elem_ids.entries(j,0);
+        const size_type ilocal = node_elem_ids.entries(j,1);
+        const size_type mapped = elem_node_ids(ielem,ilocal);
+
+        if ( inode == mapped ) continue ;
+
+        std::ostringstream msg ;
+        msg << "BoxMeshFixture node_elem_ids error"
+            << " : node_index(" << inode
+            << ") entry(" << j
+            << ") elem_index(" << ielem
+            << ") node_local(" << ilocal
+            << ") elem_node_id(" << mapped
+            << ")" ;
+        throw std::runtime_error( msg.str() );
+      }
+    }
+
+    // Pass 2: for each element, the coordinates of local node nn must
+    // equal the coordinates of local node 0 plus the eval_map offset of
+    // nn, compared exactly on all three axes.
+    for ( size_type ielem = 0 ; ielem < total_elems ; ++ielem ) {
+
+      coordinate_scalar_type xyz[ element_node_count ][3] ;
+
+      for ( size_type nn = 0 ; nn < element_node_count ; ++nn ) {
+        const size_type inode = elem_node_ids( ielem , nn );
+        for ( size_type nc = 0 ; nc < 3 ; ++nc ) {
+          xyz[nn][nc] = node_coords( inode , nc );
+        }
+      }
+
+      for ( size_type nn = 0 ; nn < element_node_count ; ++nn ) {
+
+        const unsigned ix = element.elem_data.eval_map[nn][0] ;
+        const unsigned iy = element.elem_data.eval_map[nn][1] ;
+        const unsigned iz = element.elem_data.eval_map[nn][2] ;
+
+        const bool matches = xyz[nn][0] == xyz[0][0] + ix &&
+                             xyz[nn][1] == xyz[0][1] + iy &&
+                             xyz[nn][2] == xyz[0][2] + iz ;
+        if ( matches ) continue ;
+
+        std::ostringstream msg ;
+        msg << "BoxMeshFixture elem_node_coord mapping failure { "
+            << xyz[nn][0] << " "
+            << xyz[nn][1] << " "
+            << xyz[nn][2] << " } != { "
+            << xyz[ 0][0] + ix << " "
+            << xyz[ 0][1] + iy << " "
+            << xyz[ 0][2] + iz
+            << " }" ;
+        throw std::runtime_error( msg.str() );
+      }
+    }
+  }
+
+  //------------------------------------
+  // Initialize element-node connectivity:
+  // Order elements that only depend on owned nodes first.
+  // These elements could be computed while waiting for
+  // received node data.
+
+  static void layout_elements_interior_exterior(
+    const BoxType                vertex_box_local_used ,
+    const BoxType                vertex_box_local_owned ,
+    const BoxType                node_box_local_used ,
+    const std::vector<size_t> &  node_used_id_map ,
+    const ElementSpec            element_fixture ,
+    const size_t                 elem_count_interior ,
+    const typename elem_node_ids_type::HostMirror elem_node_ids )
+  {
+    // Interior elements are numbered from 0 and boundary elements from
+    // elem_count_interior; both counters advance in z, then y, then x order.
+    size_t next_interior = 0 ;
+    size_t next_boundary = elem_count_interior ;
+
+    for ( size_t iz = vertex_box_local_used[2][0] ;
+                 iz < vertex_box_local_used[2][1] - 1 ; ++iz ) {
+    for ( size_t iy = vertex_box_local_used[1][0] ;
+                 iy < vertex_box_local_used[1][1] - 1 ; ++iy ) {
+    for ( size_t ix = vertex_box_local_used[0][0] ;
+                 ix < vertex_box_local_used[0][1] - 1 ; ++ix ) {
+
+      // An element is interior when both its lower and upper corner
+      // vertices lie in the owned vertex box.
+      const bool is_interior =
+        contain( vertex_box_local_owned, ix,   iy,   iz ) &&
+        contain( vertex_box_local_owned, ix+1, iy+1, iz+1 ) ;
+
+      const size_t elem_index = is_interior ? next_interior++ : next_boundary++ ;
+
+      for ( size_t nn = 0 ; nn < element_node_count ; ++nn ) {
+        unsigned coord[3] = { static_cast<unsigned>(ix) ,
+                              static_cast<unsigned>(iy) ,
+                              static_cast<unsigned>(iz) };
+
+        element_fixture.elem_to_node( nn , coord );
+
+        const size_t node_id =
+          box_map_id( node_box_local_used , node_used_id_map ,
+                      coord[0] , coord[1] , coord[2] );
+
+        elem_node_ids( elem_index , nn ) = node_id ;
+      }
+    }}}
+  }
+
+  //------------------------------------
+  // Nested partitioning of elements by number of thread 'gangs'
+
+  static void layout_elements_partitioned(
+    const BoxType                vertex_box_local_used ,
+    const BoxType                /*vertex_box_local_owned*/ ,
+    const BoxType                node_box_local_used ,
+    const std::vector<size_t> &  node_used_id_map ,
+    const ElementSpec            element_fixture ,
+    const size_t                 thread_gang_count ,
+    const typename elem_node_ids_type::HostMirror elem_node_ids )
+  {
+    // The element box is the used vertex box with each upper bound reduced
+    // by one.  box_partition_rcb splits it into thread_gang_count boxes and
+    // elements are numbered box by box, each traversed in z, y, x order.
+    BoxType element_box_local_used = vertex_box_local_used ;
+    for ( unsigned axis = 0 ; axis < 3 ; ++axis ) { element_box_local_used[axis][1] -= 1 ; }
+
+    std::vector< BoxType > element_box_gangs( thread_gang_count );
+    box_partition_rcb( element_box_local_used , element_box_gangs );
+
+    size_t elem_index = 0 ;
+
+    for ( size_t ig = 0 ; ig < thread_gang_count ; ++ig ) {
+
+      const BoxType gang_box = element_box_gangs[ig] ;
+
+      for ( size_t iz = gang_box[2][0] ; iz < gang_box[2][1] ; ++iz ) {
+      for ( size_t iy = gang_box[1][0] ; iy < gang_box[1][1] ; ++iy ) {
+      for ( size_t ix = gang_box[0][0] ; ix < gang_box[0][1] ; ++ix , ++elem_index ) {
+
+        for ( size_t nn = 0 ; nn < element_node_count ; ++nn ) {
+          unsigned coord[3] = { static_cast<unsigned>(ix) ,
+                                static_cast<unsigned>(iy) ,
+                                static_cast<unsigned>(iz) };
+
+          element_fixture.elem_to_node( nn , coord );
+
+          const size_t node_id =
+            box_map_id( node_box_local_used , node_used_id_map ,
+                        coord[0] , coord[1] , coord[2] );
+
+          elem_node_ids( elem_index , nn ) = node_id ;
+        }
+      }}}
+    }
+  }
+
+  //------------------------------------
+
+  // Builds the FEMesh for partition 'proc_local' of 'proc_count' on a box of
+  // elems_x * elems_y * elems_z elements.  Elements are laid out per gang when
+  // gang_count > 1, otherwise interior-first.  Node coordinates are scaled to
+  // the problem extent after pow( unit_coordinate , *_coord_curve ).
+  // verify() is run on the host data and may throw std::runtime_error.
+  static FEMeshType create( const size_t proc_count ,
+                            const size_t proc_local ,
+                            const size_t gang_count ,
+                            const size_t elems_x ,
+                            const size_t elems_y ,
+                            const size_t elems_z ,
+                            const double x_coord_curve = 1 ,
+                            const double y_coord_curve = 1 ,
+                            const double z_coord_curve = 1 )
+  {
+    const size_t vertices_x = elems_x + 1 ;
+    const size_t vertices_y = elems_y + 1 ;
+    const size_t vertices_z = elems_z + 1 ;
+
+    const BoxBoundsLinear vertex_box_bounds ;
+    const ElementSpec element ;
+
+    // Partition based upon vertices:
+
+    BoxType vertex_box_global ;
+    std::vector< BoxType > vertex_box_parts( proc_count );
+
+    vertex_box_global[0][0] = 0 ; vertex_box_global[0][1] = vertices_x ;
+    vertex_box_global[1][0] = 0 ; vertex_box_global[1][1] = vertices_y ;
+    vertex_box_global[2][0] = 0 ; vertex_box_global[2][1] = vertices_z ;
+
+    box_partition_rcb( vertex_box_global , vertex_box_parts );
+
+    const BoxType vertex_box_local_owned = vertex_box_parts[ proc_local ];
+
+    // Determine interior and used vertices:
+
+    BoxType vertex_box_local_interior ;
+    BoxType vertex_box_local_used ;
+
+    vertex_box_bounds.apply( vertex_box_global ,
+                             vertex_box_local_owned ,
+                             vertex_box_local_interior ,
+                             vertex_box_local_used );
+
+    // Element counts:
+
+    const long local_elems_x =
+      ( vertex_box_local_used[0][1] - vertex_box_local_used[0][0] ) - 1 ;
+    const long local_elems_y =
+      ( vertex_box_local_used[1][1] - vertex_box_local_used[1][0] ) - 1 ;
+    const long local_elems_z =
+      ( vertex_box_local_used[2][1] - vertex_box_local_used[2][0] ) - 1 ;
+
+    const size_t elem_count_total = std::max( long(0) , local_elems_x ) *
+                                    std::max( long(0) , local_elems_y ) *
+                                    std::max( long(0) , local_elems_z );
+
+    const long interior_elems_x =
+      ( vertex_box_local_owned[0][1] - vertex_box_local_owned[0][0] ) - 1 ;
+    const long interior_elems_y =
+      ( vertex_box_local_owned[1][1] - vertex_box_local_owned[1][0] ) - 1 ;
+    const long interior_elems_z =
+      ( vertex_box_local_owned[2][1] - vertex_box_local_owned[2][0] ) - 1 ;
+
+    const size_t elem_count_interior = std::max( long(0) , interior_elems_x ) *
+                                       std::max( long(0) , interior_elems_y ) *
+                                       std::max( long(0) , interior_elems_z );
+
+    // Expand vertex boxes to node boxes:
+
+    BoxType node_box_global ;
+    BoxType node_box_local_used ;
+    std::vector< BoxType > node_box_parts ;
+
+    element.create_node_boxes_from_vertex_boxes(
+      vertex_box_global , vertex_box_parts ,
+      node_box_global , node_box_parts );
+
+    // Node communication maps:
+
+    size_t node_count_interior = 0 ;
+    size_t node_count_owned    = 0 ;
+    size_t node_count_total    = 0 ;
+    std::vector<size_t>                 node_used_id_map ;
+    std::vector<size_t>                 node_part_counts ;
+    std::vector< std::vector<size_t> >  node_send_map ;
+
+    box_partition_maps( node_box_global ,
+                        node_box_parts ,
+                        element.box_bounds ,
+                        proc_local ,
+                        node_box_local_used ,
+                        node_used_id_map ,
+                        node_count_interior ,
+                        node_count_owned ,
+                        node_count_total ,
+                        node_part_counts ,
+                        node_send_map );
+
+    size_t recv_msg_count = 0 ;
+    size_t send_msg_count = 0 ;
+    size_t send_count = 0 ;
+
+    for ( size_t i = 1 ; i < proc_count ; ++i ) {
+      if ( node_part_counts[i] ) ++recv_msg_count ;
+      if ( node_send_map[i].size() ) {
+        ++send_msg_count ;
+        send_count += node_send_map[i].size();
+      }
+    }
+
+    // Finite element mesh:
+
+    FEMeshType mesh ;
+
+    if ( node_count_total ) {
+      mesh.node_coords = node_coords_type( "node_coords", node_count_total );
+    }
+
+    if ( elem_count_total ) {
+      mesh.elem_node_ids =
+        elem_node_ids_type( "elem_node_ids", elem_count_total );
+    }
+
+    mesh.parallel_data_map.assign( node_count_interior ,
+                                   node_count_owned ,
+                                   node_count_total ,
+                                   recv_msg_count ,
+                                   send_msg_count ,
+                                   send_count );
+
+    typename node_coords_type::HostMirror node_coords =
+      Kokkos::create_mirror( mesh.node_coords );
+
+    typename elem_node_ids_type::HostMirror elem_node_ids =
+      Kokkos::create_mirror( mesh.elem_node_ids );
+
+    //------------------------------------
+    // set node coordinates to grid location for subsequent verification
+
+    for ( size_t iz = node_box_local_used[2][0] ;
+                 iz < node_box_local_used[2][1] ; ++iz ) {
+
+    for ( size_t iy = node_box_local_used[1][0] ;
+                 iy < node_box_local_used[1][1] ; ++iy ) {
+
+    for ( size_t ix = node_box_local_used[0][0] ;
+                 ix < node_box_local_used[0][1] ; ++ix ) {
+
+      const size_t node_local_id =
+        box_map_id( node_box_local_used , node_used_id_map , ix , iy , iz );
+
+      node_coords( node_local_id , 0 ) = ix ;
+      node_coords( node_local_id , 1 ) = iy ;
+      node_coords( node_local_id , 2 ) = iz ;
+    }}}
+
+    //------------------------------------
+    // Initialize element-node connectivity:
+
+    if ( 1 < gang_count ) {
+      layout_elements_partitioned( vertex_box_local_used ,
+                                   vertex_box_local_owned ,
+                                   node_box_local_used ,
+                                   node_used_id_map ,
+                                   element ,
+                                   gang_count ,
+                                   elem_node_ids );
+    }
+    else {
+      layout_elements_interior_exterior( vertex_box_local_used ,
+                                         vertex_box_local_owned ,
+                                         node_box_local_used ,
+                                         node_used_id_map ,
+                                         element ,
+                                         elem_count_interior ,
+                                         elem_node_ids );
+    }
+
+    //------------------------------------
+    // Populate node->element connectivity:
+
+    std::vector<size_t> node_elem_work( node_count_total , (size_t) 0 );
+
+    for ( size_t i = 0 ; i < elem_count_total ; ++i ) {
+      for ( size_t n = 0 ; n < element_node_count  ; ++n ) {
+        ++node_elem_work[ elem_node_ids(i,n) ];
+      }
+    }
+
+    mesh.node_elem_ids =
+      Kokkos::create_staticcrsgraph< node_elem_ids_type >( "node_elem_ids" , node_elem_work );
+
+    typename node_elem_ids_type::HostMirror
+      node_elem_ids = Kokkos::create_mirror( mesh.node_elem_ids );
+
+    for ( size_t i = 0 ; i < node_count_total ; ++i ) {
+      node_elem_work[i] = node_elem_ids.row_map[i];
+    }
+
+    // Looping in element order ensures the list of elements
+    // is sorted by element index.
+
+    for ( size_t i = 0 ; i < elem_count_total ; ++i ) {
+      for ( size_t n = 0 ; n < element_node_count ; ++n ) {
+        const size_t nid = elem_node_ids(i, n);
+        const size_t j = node_elem_work[nid] ; ++node_elem_work[nid] ;
+
+        node_elem_ids.entries( j , 0 ) = i ;
+        node_elem_ids.entries( j , 1 ) = n ;
+      }
+    }
+    //------------------------------------
+    // Verify setup with node coordinates matching grid indices.
+    verify( node_coords , elem_node_ids , node_elem_ids );
+
+    //------------------------------------
+    // Scale node coordinates to problem extent with
+    // nonlinear mapping.
+    {
+      const double problem_extent[3] =
+        { static_cast<double>( vertex_box_global[0][1] - 1 ) ,
+          static_cast<double>( vertex_box_global[1][1] - 1 ) ,
+          static_cast<double>( vertex_box_global[2][1] - 1 ) };
+
+      const double grid_extent[3] =
+        { static_cast<double>( node_box_global[0][1] - 1 ) ,
+          static_cast<double>( node_box_global[1][1] - 1 ) ,
+          static_cast<double>( node_box_global[2][1] - 1 ) };
+
+      for ( size_t i = 0 ; i < node_count_total ; ++i ) {
+        const double x_unit = node_coords(i,0) / grid_extent[0] ;
+        const double y_unit = node_coords(i,1) / grid_extent[1] ;
+        const double z_unit = node_coords(i,2) / grid_extent[2] ;
+
+        node_coords(i,0) = coordinate_scalar_type( problem_extent[0] * std::pow( x_unit , x_coord_curve ) );
+        node_coords(i,1) = coordinate_scalar_type( problem_extent[1] * std::pow( y_unit , y_coord_curve ) );
+        node_coords(i,2) = coordinate_scalar_type( problem_extent[2] * std::pow( z_unit , z_coord_curve ) );
+      }
+    }
+
+    Kokkos::deep_copy( mesh.node_coords ,   node_coords );
+    Kokkos::deep_copy( mesh.elem_node_ids , elem_node_ids );
+    Kokkos::deep_copy( mesh.node_elem_ids.entries , node_elem_ids.entries );
+
+    //------------------------------------
+    // Communication lists:
+    {
+      recv_msg_count = 0 ;
+      send_msg_count = 0 ;
+      send_count = 0 ;
+
+      for ( size_t i = 1 ; i < proc_count ; ++i ) {
+
+        // Order sending starting with the local processor rank
+        // to try to smooth out the amount of messages simultaneously
+        // sent to a particular processor.
+
+        const int proc = ( proc_local + i ) % proc_count ;
+        if ( node_part_counts[i] ) {
+          mesh.parallel_data_map.host_recv(recv_msg_count,0) = proc ;
+          mesh.parallel_data_map.host_recv(recv_msg_count,1) = node_part_counts[i] ;
+          ++recv_msg_count ;
+        }
+        if ( node_send_map[i].size() ) {
+          mesh.parallel_data_map.host_send(send_msg_count,0) = proc ;
+          mesh.parallel_data_map.host_send(send_msg_count,1) = node_send_map[i].size() ;
+          for ( size_t j = 0 ; j < node_send_map[i].size() ; ++j , ++send_count ) {
+            mesh.parallel_data_map.host_send_item(send_count) = node_send_map[i][j] - node_count_interior ;
+          }
+          ++send_msg_count ;
+        }
+      }
+    }
+
+    return mesh ;
+  }
+};
+
+//----------------------------------------------------------------------------
+//----------------------------------------------------------------------------
+
+#endif /* #ifndef KOKKOS_BOXMESHFIXTURE_HPP */
+
+
diff --git a/lib/kokkos/example/multi_fem/BoxMeshPartition.cpp b/lib/kokkos/example/multi_fem/BoxMeshPartition.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..17a6696fb9d934974a5c244802a93a2272e9c3da
--- /dev/null
+++ b/lib/kokkos/example/multi_fem/BoxMeshPartition.cpp
@@ -0,0 +1,381 @@
+/*
+//@HEADER
+// ************************************************************************
+// 
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+// 
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+// 
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact  H. Carter Edwards (hcedwar@sandia.gov)
+// 
+// ************************************************************************
+//@HEADER
+*/
+
+#include <iostream>
+#include <sstream>
+#include <stdexcept>
+#include <limits>
+#include <BoxMeshPartition.hpp>
+
+//----------------------------------------------------------------------------
+
+namespace {
+
+void box_partition( size_t ip , size_t up ,
+                    const BoxType & box ,
+                    BoxType * const p_box )
+{ // Recursively split 'box' among the parts [ip,up), writing one box per part into p_box[ip..up).
+  const size_t np = up - ip ; // Number of parts sharing 'box'
+  // NOTE(review): np == 0 ( up == ip ) would recurse without end; callers must pass up > ip.
+  if ( 1 == np ) {
+    p_box[ip] = box ; // A single part takes the whole box; the recursion ends here
+  }
+  else {
+    // Choose axis with largest count:
+
+    const size_t n0 = box[0][1] - box[0][0] ;
+    const size_t n1 = box[1][1] - box[1][0] ;
+    const size_t n2 = box[2][1] - box[2][0] ;
+
+    const size_t axis = n2 > n1 ? ( n2 > n0 ? 2 : ( n1 > n0 ? 1 : 0 ) ) :
+                                  ( n1 > n0 ? 1 : 0 ); // Ties resolve to the lower axis index
+
+    const size_t n = box[ axis ][1] - box[ axis ][0] ; // Extent along the chosen axis
+
+    if ( 0 == np % 3 ) { // Part count divisible by three: cut the axis into three pieces
+      const size_t np_part = np / 3 ; // exact
+
+      const size_t nbox_low = (size_t)(( (double) n ) * ( 1.0 / 3.0 )); // First cut offset, truncated
+      const size_t nbox_mid = (size_t)(( (double) n ) * ( 2.0 / 3.0 )); // Second cut offset, truncated
+
+      BoxType dbox_low = box ; // P = [ip,ip+np/3)
+      BoxType dbox_mid = box ; // P = [ip+np/3,ip+2*np/3)
+      BoxType dbox_upp = box ; // P = [ip+2*np/3,ip+np)
+
+      dbox_low[ axis ][1] = box[ axis ][0] + nbox_low ;
+      dbox_mid[ axis ][1] = box[ axis ][0] + nbox_mid ;
+      // Each piece starts where the previous piece ends.
+      dbox_mid[ axis ][0] = dbox_low[ axis ][1];
+      dbox_upp[ axis ][0] = dbox_mid[ axis ][1];
+
+      box_partition( ip,           ip +   np_part, dbox_low , p_box );
+      box_partition( ip+  np_part, ip + 2*np_part, dbox_mid , p_box );
+      box_partition( ip+2*np_part, up,             dbox_upp , p_box );
+    }
+    else { // Otherwise cut the axis in two, sized in proportion to the part counts
+      const size_t np_low = np / 2 ; /* Rounded down */
+      const size_t nbox_low = (size_t)
+        (((double)n) * ( ((double) np_low ) / ((double) np ) )); // Cut offset, truncated
+
+      BoxType dbox_low = box ;
+      BoxType dbox_upp = box ;
+
+      dbox_low[ axis ][1] = dbox_low[ axis ][0] + nbox_low ; // Lower piece ends at the cut
+      dbox_upp[ axis ][0] = dbox_low[ axis ][1]; // Upper piece starts at the cut
+
+      box_partition( ip, ip + np_low, dbox_low , p_box );
+      box_partition( ip + np_low, up, dbox_upp , p_box );
+    }
+  }
+}
+
+size_t box_map_offset( const BoxType & local_use ,
+                       const size_t global_i ,
+                       const size_t global_j ,
+                       const size_t global_k )
+// Offset of global ordinate (i,j,k) within local_use, axis 0 varying fastest.
+{
+  const size_t invalid = std::numeric_limits<size_t>::max();
+  const size_t ordinate[3] = { global_i , global_j , global_k };
+  size_t extent[3] ; // Span of local_use along each axis
+  size_t local[3] ;  // Ordinate relative to the lower bound of local_use
+  bool   inside = true ;
+
+  for ( size_t axis = 0 ; axis < 3 ; ++axis ) {
+    const size_t lower = local_use[axis][0] ;
+    extent[axis] = local_use[axis][1] - lower ;
+    local[axis]  = lower <= ordinate[axis] ? ordinate[axis] - lower : invalid ;
+    if ( ! ( local[axis] < extent[axis] ) ) { inside = false ; }
+  }
+
+  const size_t result = inside
+    ? local[0] + extent[0] * ( local[1] + extent[1] * local[2] ) : invalid ;
+
+  // An ordinate outside of local_use is reported by exception.
+  if ( result == invalid ) {
+    std::ostringstream msg ;
+    msg << "box_map_offset ERROR: "
+        << " use " << local_use
+        << " ( " << global_i
+        << " , " << global_j << " , " << global_k
+        << " )" ;
+    throw std::runtime_error( msg.str() );
+  }
+
+  return result ;
+}
+
+} // namespace
+
+//----------------------------------------------------------------------------
+
+void BoxBoundsLinear::apply(  const BoxType & box_global ,
+                              const BoxType & box_part ,
+                                    BoxType & box_interior ,
+                                    BoxType & box_use ) const
+{ // From a part's box derive its interior box (shrunk) and its use box (grown) with one ghost layer.
+  const unsigned ghost = 1 ;
+  // An empty part has an empty interior box and an empty use box.
+  if ( 0 == count( box_part ) ) {
+    box_interior = box_part ;
+    box_use      = box_part ;
+  }
+  else {
+    for ( size_t i = 0 ; i < 3 ; ++i ) {
+      // Interior lower bound: kept on the global boundary, otherwise moved inward by ghost.
+      box_interior[i][0] =
+        ( box_part[i][0] == box_global[i][0] )      ? box_part[i][0] : (
+        ( box_part[i][0] + ghost < box_part[i][1] ) ? box_part[i][0] + ghost : 
+                                                      box_part[i][1] ); // Part no wider than ghost
+      // Interior upper bound: kept on the global boundary, otherwise moved inward by ghost.
+      box_interior[i][1] =
+        ( box_part[i][1] == box_global[i][1] )      ? box_part[i][1] : (
+        ( box_part[i][0] + ghost < box_part[i][1] ) ? box_part[i][1] - ghost :
+                                                      box_part[i][0] ); // Part no wider than ghost
+      // Use bounds: the part grown by ghost on each side, clamped to the global box.
+      box_use[i][0] = 
+        ( box_part[i][0] > ghost + box_global[i][0] ) ? box_part[i][0] - ghost :
+                                                        box_global[i][0] ;
+      box_use[i][1] = 
+        ( box_part[i][1] + ghost < box_global[i][1] ) ? box_part[i][1] + ghost :
+                                                        box_global[i][1] ;
+    }
+  }
+}
+
+void BoxBoundsQuadratic::apply( const BoxType & box_global ,
+                                const BoxType & box_part ,
+                                      BoxType & box_interior ,
+                                      BoxType & box_use ) const
+{ // Same derivation as the linear bounds, but the ghost width is chosen per axis (one or two layers).
+  if ( 0 == count( box_part ) ) {
+    box_interior = box_part ;
+    box_use      = box_part ;
+  }
+  else {
+    for ( size_t i = 0 ; i < 3 ; ++i ) {
+      const bool odd = ( box_part[i][0] - box_global[i][0] ) & 01 ; // Lowest bit of the part's offset from the global lower bound
+      // An odd offset uses one ghost layer, an even offset uses two.
+      const unsigned ghost = odd ? 1 : 2 ;
+      // Interior lower bound: kept on the global boundary, otherwise moved inward by ghost.
+      box_interior[i][0] =
+        ( box_part[i][0] == box_global[i][0] )      ? box_part[i][0] : (
+        ( box_part[i][0] + ghost < box_part[i][1] ) ? box_part[i][0] + ghost : 
+                                                      box_part[i][1] ); // Part no wider than ghost
+      // Interior upper bound: kept on the global boundary, otherwise moved inward by ghost.
+      box_interior[i][1] =
+        ( box_part[i][1] == box_global[i][1] )      ? box_part[i][1] : (
+        ( box_part[i][0] + ghost < box_part[i][1] ) ? box_part[i][1] - ghost :
+                                                      box_part[i][0] ); // Part no wider than ghost
+      // Use bounds: the part grown by ghost on each side, clamped to the global box.
+      box_use[i][0] = 
+        ( box_part[i][0] > ghost + box_global[i][0] ) ? box_part[i][0] - ghost :
+                                                        box_global[i][0] ;
+      box_use[i][1] = 
+        ( box_part[i][1] + ghost < box_global[i][1] ) ? box_part[i][1] + ghost :
+                                                        box_global[i][1] ;
+    }
+  }
+}
+
+//----------------------------------------------------------------------------
+
+void box_partition_rcb( const BoxType        & root_box ,
+                        std::vector<BoxType> & part_boxes )
+{ // Partition root_box into part_boxes.size() boxes, then verify the result.
+  const BoxBoundsLinear use_boxes ;
+  // Throws std::runtime_error when any verification check below fails.
+  const size_t part_count = part_boxes.size();
+  // An empty part_boxes has no element [0] to address and nothing to split.
+  if ( 0 < part_count ) { box_partition( 0 , part_count , root_box , & part_boxes[0] ); }
+
+  // Verify partitioning
+
+  size_t total_cell = 0 ;
+
+  for ( size_t i = 0 ; i < part_count ; ++i ) {
+
+    total_cell += count( part_boxes[i] );
+
+    BoxType box_interior , box_use ;
+
+    use_boxes.apply( root_box , part_boxes[i] , box_interior , box_use );
+    // The use box must cover the part, and the part must cover its interior.
+    if ( count( box_use ) < count( part_boxes[i] ) ||
+         count( part_boxes[i] ) < count( box_interior ) ||
+         part_boxes[i] != intersect( part_boxes[i] , box_use ) ||
+         box_interior  != intersect( part_boxes[i] , box_interior )) {
+
+      std::ostringstream msg ;
+
+      msg << "box_partition_rcb ERROR : "
+          << "part_boxes[" << i << "] = "
+          << part_boxes[i]
+          << " use " << box_use
+          << " interior " << box_interior
+          << std::endl 
+          << "  part ^ use " << intersect( part_boxes[i] , box_use )
+          << "  part ^ interior " << intersect( part_boxes[i] , box_interior );
+
+      throw std::runtime_error( msg.str() );
+    }
+    // No two parts may share a cell.
+    for ( size_t j = i + 1 ; j < part_count ; ++j ) {
+      const BoxType tmp = intersect( part_boxes[i] , part_boxes[j] );
+
+      if ( count( tmp ) ) {
+        throw std::runtime_error( std::string("box partition intersection") );
+      }
+    }
+  }
+  // The parts together must hold every cell of the root box ( zero parts only fit an empty root box ).
+  if ( total_cell != count( root_box ) ) {
+    throw std::runtime_error( std::string("box partition count") );
+  }
+}
+
+//----------------------------------------------------------------------------
+         
+size_t box_map_id( const BoxType & local_use ,
+                   const std::vector<size_t> & local_use_id_map ,
+                   const size_t global_i ,
+                   const size_t global_j ,
+                   const size_t global_k )
+// Local id stored for the global (i,j,k) ordinate of the local_use box.
+{
+  // box_map_offset throws when the ordinate lies outside of local_use.
+  return local_use_id_map[
+    box_map_offset( local_use , global_i , global_j , global_k ) ];
+}
+         
+//----------------------------------------------------------------------------
+
+void box_partition_maps( const BoxType              & root_box ,
+                         const std::vector<BoxType> & part_boxes ,
+                         const BoxBounds            & use_boxes ,
+                         const size_t          my_part ,
+                         BoxType             & my_use_box ,
+                         std::vector<size_t> & my_use_id_map ,
+                         size_t              & my_count_interior ,
+                         size_t              & my_count_owned ,
+                         size_t              & my_count_uses ,
+                         std::vector<size_t> & my_part_counts ,
+                         std::vector<std::vector<size_t> > & my_send_map )
+{ // Build the local id numbering and the send/receive maps of part 'my_part'.
+  const size_t np = part_boxes.size();
+  // my_part must index an existing part; otherwise std::runtime_error is thrown.
+  if ( np <= my_part ) {
+    std::ostringstream msg ;
+    msg << "box_partition_maps ERROR : "
+        << " np(" << np << ") <= my_part(" << my_part << ")" ;
+    throw std::runtime_error( msg.str() );
+  }
+
+  const BoxType my_owned_box = part_boxes[my_part];
+  BoxType my_interior_box ;
+
+  // Derive this part's interior box and use (ghosted) box from its owned box.
+  use_boxes.apply( root_box, my_owned_box, my_interior_box, my_use_box );
+
+  my_count_interior = count( my_interior_box );
+  my_count_owned    = count( my_owned_box );
+  my_count_uses     = count( my_use_box );
+  // Every entry starts as the size_t maximum, i.e. "no id assigned yet".
+  my_use_id_map.assign( my_count_uses , std::numeric_limits<size_t>::max() );
+
+  // Order ids as { owned-interior , owned-parallel , received_{(p+i)%np} }
+
+  size_t offset_interior = 0 ;
+  size_t offset_parallel = my_count_interior ; // Non-interior ids follow the interior ids
+  // Number the owned cells: interior cells first, the remaining owned cells after them.
+  for ( size_t iz = my_owned_box[2][0] ; iz < my_owned_box[2][1] ; ++iz ) {
+  for ( size_t iy = my_owned_box[1][0] ; iy < my_owned_box[1][1] ; ++iy ) {
+  for ( size_t ix = my_owned_box[0][0] ; ix < my_owned_box[0][1] ; ++ix ) {
+    const size_t offset = box_map_offset( my_use_box , ix , iy , iz );
+    if ( contain( my_interior_box , ix , iy , iz ) ) {
+      my_use_id_map[ offset ] = offset_interior++ ;
+    }
+    else {
+      my_use_id_map[ offset ] = offset_parallel++ ;
+    }
+  }}}
+
+  // Both outputs are indexed by the relative part index i, where part = ( my_part + i ) % np.
+  my_part_counts.assign( np , (size_t) 0 );
+  my_send_map.assign( np , std::vector<size_t>() );
+
+  my_part_counts[0] = my_count_owned ; // Entry 0 is this part itself
+
+  for ( size_t i = 1 ; i < np ; ++i ) {
+
+    const size_t ip = ( my_part + i ) % np ; // Absolute index of the i-th following part
+
+    const BoxType p_owned_box = part_boxes[ip];
+    BoxType p_use_box , p_interior_box ;
+    use_boxes.apply( root_box, p_owned_box, p_interior_box, p_use_box );
+    // Received cells: owned by part ip and used here. Sent cells: owned here and used by part ip.
+    const BoxType recv_box = intersect( my_use_box , p_owned_box );
+    const BoxType send_box = intersect( my_owned_box , p_use_box );
+    // Received cells continue the id sequence after all previously numbered cells.
+    if ( 0 != ( my_part_counts[i] = count( recv_box ) ) ) {
+      for ( size_t iz = recv_box[2][0] ; iz < recv_box[2][1] ; ++iz ) {
+      for ( size_t iy = recv_box[1][0] ; iy < recv_box[1][1] ; ++iy ) {
+      for ( size_t ix = recv_box[0][0] ; ix < recv_box[0][1] ; ++ix ) {
+        const size_t offset = box_map_offset( my_use_box , ix , iy , iz );
+        my_use_id_map[ offset ] = offset_parallel++ ;
+      }}}
+    }
+    // Record the local ids of the owned cells that part ip uses.
+    if ( 0 != count( send_box ) ) {
+      for ( size_t iz = send_box[2][0] ; iz < send_box[2][1] ; ++iz ) {
+      for ( size_t iy = send_box[1][0] ; iy < send_box[1][1] ; ++iy ) {
+      for ( size_t ix = send_box[0][0] ; ix < send_box[0][1] ; ++ix ) {
+        const size_t offset = box_map_offset( my_use_box , ix , iy , iz );
+
+        my_send_map[ i ].push_back( my_use_id_map[ offset ] );
+      }}}
+    }
+  }
+}
+
+
diff --git a/lib/kokkos/example/multi_fem/BoxMeshPartition.hpp b/lib/kokkos/example/multi_fem/BoxMeshPartition.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..f2aa6f57cc2b8f5b3b64682dddfb2f94dba7e0d1
--- /dev/null
+++ b/lib/kokkos/example/multi_fem/BoxMeshPartition.hpp
@@ -0,0 +1,210 @@
+/*
+//@HEADER
+// ************************************************************************
+// 
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+// 
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+// 
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact  H. Carter Edwards (hcedwar@sandia.gov)
+// 
+// ************************************************************************
+//@HEADER
+*/
+
+#ifndef BOXMESHPARTITION_HPP
+#define BOXMESHPARTITION_HPP
+
+#include <cstddef>
+#include <utility>
+#include <vector>
+#include <iostream>
+
+//----------------------------------------------------------------------------
+
+struct BoxType {
+  size_t data[3][2] ; // data[axis][0] = lower bound , data[axis][1] = upper bound
+
+  typedef size_t range_type[2] ;
+
+  inline
+  const range_type & operator[]( size_t i ) const { return data[i]; }
+
+  inline
+  range_type & operator[]( size_t i ) { return data[i]; }
+
+  inline
+  bool operator == ( const BoxType & rhs ) const
+  { // True when all six bounds match; each bound is compared with the same bound of rhs.
+    return data[0][0] == rhs.data[0][0] && data[0][1] == rhs.data[0][1] &&
+           data[1][0] == rhs.data[1][0] && data[1][1] == rhs.data[1][1] &&
+           data[2][0] == rhs.data[2][0] && data[2][1] == rhs.data[2][1] ;
+  }
+
+  inline
+  bool operator != ( const BoxType & rhs ) const
+  { // True when at least one of the six bounds differs.
+    return data[0][0] != rhs.data[0][0] || data[0][1] != rhs.data[0][1] ||
+           data[1][0] != rhs.data[1][0] || data[1][1] != rhs.data[1][1] ||
+           data[2][0] != rhs.data[2][0] || data[2][1] != rhs.data[2][1] ;
+  }
+};
+
+inline
+size_t count( const BoxType & b )
+{
+  // Product of the per-axis extents; an empty or inverted axis contributes zero.
+  const size_t n0 = b[0][0] < b[0][1] ? b[0][1] - b[0][0] : 0 ;
+  const size_t n1 = b[1][0] < b[1][1] ? b[1][1] - b[1][0] : 0 ;
+  const size_t n2 = b[2][0] < b[2][1] ? b[2][1] - b[2][0] : 0 ;
+  return n0 * n1 * n2 ;
+}
+
+inline
+bool contain( const BoxType & b , size_t i , size_t j , size_t k )
+{ // True when (i,j,k) lies within the half-open range [ lower , upper ) of every axis.
+  const size_t ijk[3] = { i , j , k };
+  for ( size_t d = 0 ; d < 3 ; ++d ) { if ( ijk[d] < b[d][0] || b[d][1] <= ijk[d] ) return false ; }
+  return true ;
+}
+
+inline
+BoxType intersect( const BoxType & x , const BoxType & y )
+{
+  // Per axis: the larger lower bound and the smaller upper bound of x and y.
+  BoxType z ;
+  for ( size_t axis = 0 ; axis < 3 ; ++axis ) {
+    z[axis][0] = x[axis][0] < y[axis][0] ? y[axis][0] : x[axis][0] ;
+    z[axis][1] = y[axis][1] < x[axis][1] ? y[axis][1] : x[axis][1] ;
+  }
+  return z ;
+}
+
+inline
+std::ostream & operator << ( std::ostream & s , const BoxType & box )
+{
+  s << "{ " ; // Output format: { lower upper , lower upper , lower upper }
+  for ( size_t axis = 0 ; axis < 3 ; ++axis ) {
+    s << box[axis][0] << " " << box[axis][1] << ( axis < 2 ? " , " : " }" );
+  }
+  return s ;
+}
+
+//----------------------------------------------------------------------------
+
+class BoxBounds {
+public:
+  BoxBounds() {}
+  virtual ~BoxBounds() {}
+
+  /** \brief  Compute the interior box and the use (ghosted) box of a part of the global box */
+  virtual
+  void apply( const BoxType & box_global ,
+              const BoxType & box_part ,
+                    BoxType & box_interior ,
+                    BoxType & box_use ) const = 0 ;
+};
+
+class BoxBoundsLinear : public BoxBounds
+{
+public:
+  BoxBoundsLinear() {}
+  virtual ~BoxBoundsLinear() {}
+
+  /** \brief  Bounds with one layer of ghosting on every axis */
+  virtual
+  void apply( const BoxType & box_global ,
+              const BoxType & box_part ,
+                    BoxType & box_interior ,
+                    BoxType & box_use ) const ;
+};
+
+class BoxBoundsQuadratic : public BoxBounds {
+public:
+  BoxBoundsQuadratic() {}
+  virtual ~BoxBoundsQuadratic() {}
+  /** \brief  Quadratic mesh bounds: a part whose lower ordinate is at an even
+   *          offset from the global lower bound gets two ghost layers,
+   *          at an odd offset it gets one layer.
+   */
+  virtual
+  void apply( const BoxType & box_global ,
+              const BoxType & box_part ,
+                    BoxType & box_interior ,
+                    BoxType & box_use ) const ;
+};
+
+//----------------------------------------------------------------------------
+/* Partition box into part_boxes.size() sub-boxes */
+
+void box_partition_rcb( const BoxType        & root_box ,
+                        std::vector<BoxType> & part_boxes );
+
+//----------------------------------------------------------------------------
+/* Determine local id layout and communication maps for partitioned boxes.
+ *
+ *  Local ids are laid out as follows:
+ *    { [ owned-interior ids not sent ] ,
+ *      [ owned-boundary ids to be sent to other processes ] ,
+ *      [ received ids from processor ( my_part + 1 ) % part_count ]
+ *      [ received ids from processor ( my_part + 2 ) % part_count ]
+ *      [ received ids from processor ( my_part + 3 ) % part_count ]
+ *      ... };
+ *
+ *  This layout allows
+ *  (1) received data to be copied into a contiguous block of memory
+ *  (2) send data to be extracted from a contiguous block of memory.
+ */
+void box_partition_maps(
+  const BoxType              & root_box ,   // [in] Global box
+  const std::vector<BoxType> & part_boxes , // [in] Partitioned boxes
+  const BoxBounds            & use_boxes ,  // [in] Ghost boundaries
+  const size_t          my_part ,           // [in] My local part, must be < part_boxes.size()
+  BoxType             & my_use_box ,        // [out] My used box with ghost
+  std::vector<size_t> & my_use_id_map ,     // [out] Local ordering map
+  size_t              & my_count_interior , // [out] How many interior
+  size_t              & my_count_owned ,    // [out] How many owned
+  size_t              & my_count_uses ,     // [out] How many used
+  std::vector<size_t> & my_part_counts ,    // [out] Partitioning of my_use_id_map
+  std::vector<std::vector<size_t> > & my_send_map ); // [out] Send id map
+
+/*  Mapping of cartesian coordinate to local id */
+size_t box_map_id( const BoxType             & my_use_box ,
+                   const std::vector<size_t> & my_use_id_map ,
+                   const size_t global_i ,
+                   const size_t global_j ,
+                   const size_t global_k );
+
+//----------------------------------------------------------------------------
+
+#endif /* #ifndef BOXMESHPARTITION_HPP */
+
diff --git a/lib/kokkos/example/multi_fem/CMakeLists.txt b/lib/kokkos/example/multi_fem/CMakeLists.txt
new file mode 100644
index 0000000000000000000000000000000000000000..e3a40bc26f0fb45a12d59ddcfa0f767c3988a6f9
--- /dev/null
+++ b/lib/kokkos/example/multi_fem/CMakeLists.txt
@@ -0,0 +1,16 @@
+
+INCLUDE_DIRECTORIES(${CMAKE_CURRENT_BINARY_DIR})
+INCLUDE_DIRECTORIES(${CMAKE_CURRENT_SOURCE_DIR})
+# Every *.cpp file of this directory is a source of the example.
+SET(SOURCES "")
+# NOTE: the glob is evaluated when CMake configures; re-run CMake after adding a source file.
+FILE(GLOB SOURCES *.cpp)
+# NOTE(review): LIBRARIES is not referenced again in this file; presumably read elsewhere - confirm.
+SET(LIBRARIES kokkoscore)
+# Define the multi_fem example executable (COMM presumably selects serial and MPI builds - confirm).
+TRIBITS_ADD_EXECUTABLE(
+  multi_fem
+  SOURCES ${SOURCES}
+  COMM serial mpi
+  )
+
diff --git a/lib/kokkos/example/multi_fem/Explicit.hpp b/lib/kokkos/example/multi_fem/Explicit.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..cef1a37a1acc2598647134217c51c7f4085011cc
--- /dev/null
+++ b/lib/kokkos/example/multi_fem/Explicit.hpp
@@ -0,0 +1,452 @@
+/*
+//@HEADER
+// ************************************************************************
+// 
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+// 
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+// 
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact  H. Carter Edwards (hcedwar@sandia.gov)
+// 
+// ************************************************************************
+//@HEADER
+*/
+
+#ifndef EXPLICIT_DRIVER_HPP
+#define EXPLICIT_DRIVER_HPP
+
+#include <sys/time.h>
+#include <iostream>
+#include <iomanip>
+#include <cstdlib>
+#include <cmath>
+
+#include <impl/Kokkos_Timer.hpp>
+
+#include <ExplicitFunctors.hpp>
+
+//----------------------------------------------------------------------------
+
+namespace Explicit {
+
+struct PerformanceData {
+  double mesh_time ;
+  double init_time ;
+  double internal_force_time ;
+  double central_diff ;
+  double comm_time ;
+  size_t number_of_steps ;
+
+  // All timings and the step count start at zero.
+  PerformanceData()
+  : mesh_time(0) , init_time(0) , internal_force_time(0)
+  , central_diff(0) , comm_time(0) , number_of_steps(0)
+  {}
+
+  // Keep, for each timing, the smaller of the current value and the value of rhs.
+  // number_of_steps is left unchanged.
+  // NOTE(review): a default-constructed object holds zeros, so best() only lowers timings set beforehand.
+  void best( const PerformanceData & rhs )
+  {
+    mesh_time           = rhs.mesh_time < mesh_time ? rhs.mesh_time : mesh_time ;
+    init_time           = rhs.init_time < init_time ? rhs.init_time : init_time ;
+    internal_force_time = rhs.internal_force_time < internal_force_time ? rhs.internal_force_time : internal_force_time ;
+    central_diff        = rhs.central_diff < central_diff ? rhs.central_diff : central_diff ;
+    comm_time           = rhs.comm_time < comm_time ? rhs.comm_time : comm_time ;
+  }
+};
+
+template< typename Scalar , class FixtureType >
+PerformanceData run( const typename FixtureType::FEMeshType & mesh ,
+                     const int global_max_x ,
+                     const int global_max_y ,
+                     const int global_max_z ,
+                     const int steps ,
+                     const int print_sample )
+{
+  typedef Scalar                              scalar_type ;
+  typedef FixtureType                         fixture_type ;
+  typedef typename fixture_type::execution_space  execution_space ;
+  //typedef typename fixture_type::FEMeshType   mesh_type ; // unused
+
+  enum { ElementNodeCount = fixture_type::element_node_count };
+
+  const int NumStates = 2;
+
+  const int total_num_steps = steps ;
+
+  const Scalar user_dt = 5.0e-6;
+  //const Scalar  end_time = 0.0050;
+
+  // element block parameters
+  const Scalar  lin_bulk_visc = 0.0;
+  const Scalar  quad_bulk_visc = 0.0;
+
+  // const Scalar  lin_bulk_visc = 0.06;
+  // const Scalar  quad_bulk_visc = 1.2;
+  // const Scalar  hg_stiffness = 0.0;
+  // const Scalar  hg_viscosity = 0.0;
+  // const Scalar  hg_stiffness = 0.03;
+  // const Scalar  hg_viscosity = 0.001;
+
+  // material properties
+  const Scalar youngs_modulus=1.0e6;
+  const Scalar poissons_ratio=0.0;
+  const Scalar  density = 8.0e-4;
+
+  const comm::Machine machine = mesh.parallel_data_map.machine ;
+
+  PerformanceData perf_data ;
+
+  Kokkos::Timer wall_clock ;
+
+  //------------------------------------
+  // Generate fields
+
+  typedef Fields< scalar_type , execution_space > fields_type ;
+
+  fields_type mesh_fields( mesh ,
+                           lin_bulk_visc ,
+                           quad_bulk_visc ,
+                           youngs_modulus ,
+                           poissons_ratio ,
+                           density );
+
+  typename fields_type::node_coords_type::HostMirror
+    model_coords_h = Kokkos::create_mirror( mesh_fields.model_coords );
+
+  typename fields_type::geom_state_array_type::HostMirror
+    displacement_h = Kokkos::create_mirror( mesh_fields.displacement );
+
+  typename fields_type::geom_state_array_type::HostMirror
+    velocity_h = Kokkos::create_mirror( mesh_fields.velocity );
+
+  Kokkos::deep_copy( model_coords_h , mesh_fields.model_coords );
+
+  //------------------------------------
+  // Initialization
+
+  initialize_element<Scalar,execution_space>::apply( mesh_fields );
+  initialize_node<   Scalar,execution_space>::apply( mesh_fields );
+
+  const Scalar x_bc = global_max_x ;
+
+  // Initial condition on velocity to initiate a pulse along the X axis
+  {
+    const unsigned X = 0;
+    for (int inode = 0; inode< mesh_fields.num_nodes; ++inode) {
+      if ( model_coords_h(inode,X) == 0) {
+        velocity_h(inode,X,0) = 1.0e3;
+        velocity_h(inode,X,1) = 1.0e3;
+      }
+    }
+  }
+
+  Kokkos::deep_copy( mesh_fields.velocity , velocity_h );
+
+  //--------------------------------------------------------------------------
+  // We will call a sequence of functions.  These functions have been
+  // grouped into several functors to balance the number of global memory
+  // accesses versus requiring too many registers or too much L1 cache.
+  // Global memory accesses have read/write cost and memory subsystem contention cost.
+  //--------------------------------------------------------------------------
+
+  perf_data.init_time = comm::max( machine , wall_clock.seconds() );
+
+  // Parameters required for the internal force computations.
+
+  int current_state = 0;
+  int previous_state = 0;
+  int next_state = 0;
+
+  perf_data.number_of_steps = total_num_steps ;
+
+#if defined( KOKKOS_HAVE_MPI )
+
+  typedef typename
+    fields_type::geom_state_array_type::value_type  comm_value_type ;
+
+  const unsigned comm_value_count = 6 ;
+
+  Kokkos::AsyncExchange< comm_value_type , execution_space ,
+                              Kokkos::ParallelDataMap >
+    comm_exchange( mesh.parallel_data_map , comm_value_count );
+
+#endif
+
+  for (int step = 0; step < total_num_steps; ++step) {
+
+    wall_clock.reset();
+
+    //------------------------------------------------------------------------
+#if defined( KOKKOS_HAVE_MPI )
+    {
+      // Communicate "send" nodes' displacement and velocity next_state
+      // to the ghosted nodes.
+      // buffer packages: { { dx , dy , dz , vx , vy , vz }_node }
+
+      pack_state< Scalar , execution_space >
+        ::apply( comm_exchange.buffer() ,
+                 mesh.parallel_data_map.count_interior ,
+                 mesh.parallel_data_map.count_send ,
+                 mesh_fields , next_state );
+
+      comm_exchange.setup();
+
+      comm_exchange.send_receive();
+
+      unpack_state< Scalar , execution_space >
+        ::apply( mesh_fields , next_state ,
+                 comm_exchange.buffer() ,
+                 mesh.parallel_data_map.count_owned ,
+                 mesh.parallel_data_map.count_receive );
+
+      execution_space::fence();
+    }
+#endif
+
+    perf_data.comm_time += comm::max( machine , wall_clock.seconds() );
+
+    //------------------------------------------------------------------------
+    // rotate the states
+
+    previous_state = current_state;
+    current_state = next_state;
+    ++next_state;
+    next_state %= NumStates;
+
+    wall_clock.reset();
+
+    // First kernel 'grad_hgop' combines two functions:
+    // gradient, velocity gradient
+    grad< Scalar , execution_space >::apply( mesh_fields ,
+                                         current_state ,
+                                         previous_state );
+
+    // Combine tensor decomposition and rotation functions.
+    decomp_rotate< Scalar , execution_space >::apply( mesh_fields ,
+                                                  current_state ,
+                                                  previous_state );
+
+    internal_force< Scalar , execution_space >::apply( mesh_fields ,
+                                                   user_dt ,
+                                                   current_state );
+
+    execution_space::fence();
+
+    perf_data.internal_force_time +=
+      comm::max( machine , wall_clock.seconds() );
+
+    wall_clock.reset();
+
+    // Assembly of elements' contributions to nodal force into
+    // a nodal force vector.  Update the accelerations, velocities,
+    // displacements.
+    // The same pattern can be used for matrix-free residual computations.
+    nodal_step< Scalar , execution_space >::apply( mesh_fields ,
+                                               x_bc ,
+                                               current_state,
+                                               next_state );
+    execution_space::fence();
+
+    perf_data.central_diff +=
+      comm::max( machine , wall_clock.seconds() );
+
+    if ( print_sample && 0 == step % 100 ) {
+      Kokkos::deep_copy( displacement_h , mesh_fields.displacement );
+      Kokkos::deep_copy( velocity_h ,     mesh_fields.velocity );
+
+      if ( 1 == print_sample ) {
+
+        std::cout << "step " << step
+                  << " : displacement(*,0,0) =" ;
+        for ( int i = 0 ; i < mesh_fields.num_nodes_owned ; ++i ) {
+          if ( model_coords_h(i,1) == 0 && model_coords_h(i,2) == 0 ) {
+            std::cout << " " << displacement_h(i,0,next_state);
+          }
+        }
+        std::cout << std::endl ;
+
+        const float tol = 1.0e-6 ;
+        const int yb = global_max_y ;
+        const int zb = global_max_z ;
+        std::cout << "step " << step
+                  << " : displacement(*," << yb << "," << zb << ") =" ;
+        for ( int i = 0 ; i < mesh_fields.num_nodes_owned ; ++i ) {
+          if ( fabs( model_coords_h(i,1) - yb ) < tol &&
+               fabs( model_coords_h(i,2) - zb ) < tol ) {
+            std::cout << " " << displacement_h(i,0,next_state);
+          }
+        }
+        std::cout << std::endl ;
+      }
+      else if ( 2 == print_sample ) {
+
+        const float tol = 1.0e-6 ;
+        const int xb = global_max_x / 2 ;
+        const int yb = global_max_y / 2 ;
+        const int zb = global_max_z / 2 ;
+
+        for ( int i = 0 ; i < mesh_fields.num_nodes_owned ; ++i ) {
+          if ( fabs( model_coords_h(i,0) - xb ) < tol &&
+               fabs( model_coords_h(i,1) - yb ) < tol &&
+               fabs( model_coords_h(i,2) - zb ) < tol ) {
+            std::cout << "step " << step
+                      << " : displacement("
+                      << xb << "," << yb << "," << zb << ") = {"
+                      << std::setprecision(6)
+                      << " " << displacement_h(i,0,next_state)
+                      << std::setprecision(2)
+                      << " " << displacement_h(i,1,next_state)
+                      << std::setprecision(2)
+                      << " " << displacement_h(i,2,next_state)
+                      << " }" << std::endl ;
+          }
+        }
+      }
+    }
+  }
+
+  return perf_data ;
+}
+
+
+template <typename Scalar, typename Device>
+static void driver( const char * const label ,   // name shown in the table title
+                    comm::Machine machine ,      // handle passed to comm::size/rank
+                    const int gang_count ,       // forwarded to fixture_type::create
+                    const int elem_count_beg ,   // first target element count
+                    const int elem_count_end ,   // sweep ends before this count
+                    const int runs )             // timed repetitions per mesh size
+{
+  // Benchmark sweep.  For target element counts elem_count_beg, 2x, 4x, ...
+  // (strictly below elem_count_end) a box mesh is created, run() is timed
+  // 'runs' times, and rank 0 prints one comma-separated table row.
+  typedef Scalar              scalar_type ;
+  typedef Device              execution_space ;
+  typedef double              coordinate_scalar_type ;
+  typedef FixtureElementHex8  fixture_element_type ;
+
+  typedef BoxMeshFixture< coordinate_scalar_type ,
+                          execution_space ,
+                          fixture_element_type > fixture_type ;
+
+  typedef typename fixture_type::FEMeshType mesh_type ;
+
+  const size_t proc_count = comm::size( machine );
+  const size_t proc_rank  = comm::rank( machine );
+
+  const int space = 15 ;        // width of one table column
+  const int steps = 1000 ;      // time steps requested from each run()
+  const int print_sample = 0 ;  // 0 disables run()'s per-step sample output
+
+  if ( 0 == proc_rank ) {
+
+    std::cout << std::endl ;
+    std::cout << "\"MiniExplicitDynamics with Kokkos " << label
+              << " time_steps(" << steps << ")"
+              << "\"" << std::endl;
+    // One title per value printed in the data rows below (nine columns).
+    std::cout << std::left << std::setw(space) << "\"Element\" , ";
+    std::cout << std::left << std::setw(space) << "\"Node\" , ";
+    std::cout << std::left << std::setw(space) << "\"Steps\" , ";
+    std::cout << std::left << std::setw(space) << "\"Initialize\" , ";
+    std::cout << std::left << std::setw(space) << "\"ElemForce\" , ";
+    std::cout << std::left << std::setw(space) << "\"NodeUpdate\" , ";
+    std::cout << std::left << std::setw(space) << "\"NodeComm\" , ";
+    std::cout << std::left << std::setw(space) << "\"Time/Elem\" , ";
+    std::cout << std::left << std::setw(space) << "\"Time/Node\"";
+    std::cout << std::endl;
+
+    std::cout << std::left << std::setw(space) << "\"count\" , ";
+    std::cout << std::left << std::setw(space) << "\"count\" , ";
+    std::cout << std::left << std::setw(space) << "\"count\" , ";
+    std::cout << std::left << std::setw(space) << "\"microsec\" , ";
+    std::cout << std::left << std::setw(space) << "\"microsec\" , ";
+    std::cout << std::left << std::setw(space) << "\"microsec\" , ";
+    std::cout << std::left << std::setw(space) << "\"microsec\" , ";
+    std::cout << std::left << std::setw(space) << "\"microsec\" , ";
+    std::cout << std::left << std::setw(space) << "\"microsec\"";
+    std::cout << std::endl;
+  }
+
+  // Start from at least 1: doubling a count <= 0 would never reach the end.
+  for(int i = std::max( 1 , elem_count_beg ) ; i < elem_count_end ; i *= 2 )
+  {
+    const int iz = std::max( 1 , (int) cbrt( ((double) i) / 2.0 ) );
+    const int iy = iz + 1 ;
+    const int ix = 2 * iy ;
+    const int nelem = ix * iy * iz ;
+    const int nnode = ( ix + 1 ) * ( iy + 1 ) * ( iz + 1 );
+
+    mesh_type mesh =
+      fixture_type::create( proc_count , proc_rank , gang_count ,
+                            ix , iy , iz );
+
+    mesh.parallel_data_map.machine = machine ;
+
+    PerformanceData perf , best ;
+
+    for(int j = 0; j < runs; j++){
+      perf = run<scalar_type,fixture_type>(mesh,ix,iy,iz,steps,print_sample);
+      if( j == 0 ) { best = perf ; }
+      else         { best.best( perf ); }
+    }
+
+    if ( 0 == proc_rank ) {
+      // Divide in double: an int product count * steps can overflow.
+      const double time_per_element =
+        ( best.internal_force_time ) / ( double( nelem ) * perf.number_of_steps );
+      const double time_per_node =
+        ( best.comm_time + best.central_diff ) / ( double( nnode ) * perf.number_of_steps );
+
+      std::cout << std::setw(space-3) << nelem << " , "
+                << std::setw(space-3) << nnode << " , "
+                << std::setw(space-3) << best.number_of_steps << " , "
+                << std::setw(space-3) << best.init_time * 1000000 << " , "
+                << std::setw(space-3)
+                << ( best.internal_force_time * 1000000 ) / best.number_of_steps << " , "
+                << std::setw(space-3)
+                << ( best.central_diff * 1000000 ) / best.number_of_steps << " , "
+                << std::setw(space-3)
+                << ( best.comm_time * 1000000 ) / best.number_of_steps << " , "
+                << std::setw(space-3) << time_per_element * 1000000 << " , "
+                << std::setw(space-3) << time_per_node * 1000000
+                << std::endl ;
+    }
+  }
+}
+
+
+} // namespace Explicit
+
+#endif /* #ifndef EXPLICIT_DRIVER_HPP */
diff --git a/lib/kokkos/example/multi_fem/ExplicitFunctors.hpp b/lib/kokkos/example/multi_fem/ExplicitFunctors.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..feea82244f2a57571034a87e31a537c530b0062e
--- /dev/null
+++ b/lib/kokkos/example/multi_fem/ExplicitFunctors.hpp
@@ -0,0 +1,1471 @@
+/*
+//@HEADER
+// ************************************************************************
+// 
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+// 
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+// 
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact  H. Carter Edwards (hcedwar@sandia.gov)
+// 
+// ************************************************************************
+//@HEADER
+*/
+
+#ifndef KOKKOS_EXPLICITFUNCTORS_HPP
+#define KOKKOS_EXPLICITFUNCTORS_HPP
+
+#include <math.h>
+#include <Kokkos_Core.hpp>
+#include <FEMesh.hpp>
+
+namespace Explicit {
+
+template<typename Scalar , class Device >
+struct Fields {
+  // Mesh views, material constants and state/work arrays for the functors.
+  static const int NumStates     = 2 ;
+  static const int SpatialDim    = 3 ;
+  static const int ElemNodeCount = 8 ;
+
+  // Offsets of the nine components of a full 3x3 tensor:
+
+  static const int K_F_XX = 0 ;
+  static const int K_F_YY = 1 ;
+  static const int K_F_ZZ = 2 ;
+  static const int K_F_XY = 3 ;
+  static const int K_F_YZ = 4 ;
+  static const int K_F_ZX = 5 ;
+  static const int K_F_YX = 6 ;
+  static const int K_F_ZY = 7 ;
+  static const int K_F_XZ = 8 ;
+
+  //  Offsets into a symmetric 3x3 tensor stored as a length 6 vector;
+  //  transposed index pairs share one slot.
+  static const int K_S_XX = 0 ;
+  static const int K_S_YY = 1 ;
+  static const int K_S_ZZ = 2 ;
+  static const int K_S_XY = 3 ;
+  static const int K_S_YZ = 4 ;
+  static const int K_S_ZX = 5 ;
+  static const int K_S_YX = K_S_XY ;
+  static const int K_S_ZY = K_S_YZ ;
+  static const int K_S_XZ = K_S_ZX ;
+
+  //  Offsets into a skew symmetric 3x3 tensor stored as a length 3 vector
+
+  static const int K_V_XY = 0 ;
+  static const int K_V_YZ = 1 ;
+  static const int K_V_ZX = 2 ;
+
+  // Execution space and the mesh / view types built on it:
+  typedef Device                               execution_space ;
+  typedef typename execution_space::size_type  size_type ;
+
+  typedef HybridFEM::FEMesh< double , ElemNodeCount , execution_space >  FEMesh ;
+
+  typedef typename FEMesh::node_coords_type    node_coords_type ;
+  typedef typename FEMesh::elem_node_ids_type  elem_node_ids_type ;
+  typedef typename FEMesh::node_elem_ids_type  node_elem_ids_type ;
+  typedef typename Kokkos::ParallelDataMap     parallel_data_map ;
+
+  typedef Kokkos::View< double[][ SpatialDim ][ NumStates ] , execution_space >  geom_state_array_type ;
+  typedef Kokkos::View< Scalar[][ SpatialDim ] , execution_space >  geom_array_type ;
+  typedef Kokkos::View< Scalar[] , execution_space >  array_type ;
+  typedef Kokkos::View< Scalar , execution_space >  scalar_type ;
+
+  typedef Kokkos::View< Scalar[][ 6 ] , execution_space >  elem_sym_tensor_type ;
+  typedef Kokkos::View< Scalar[][ 9 ] , execution_space >  elem_tensor_type ;
+  typedef Kokkos::View< Scalar[][ 9 ][ NumStates ] , execution_space >  elem_tensor_state_type ;
+  typedef Kokkos::View< Scalar[][ SpatialDim ][ ElemNodeCount ] , execution_space >  elem_node_geom_type ;
+
+  // Sizes (num_nodes counts owned plus received nodes):
+  const int num_nodes ;
+  const int num_nodes_owned ;
+  const int num_elements ;
+
+  const Scalar  lin_bulk_visc;
+  const Scalar  quad_bulk_visc;
+  const Scalar  two_mu;
+  const Scalar  bulk_modulus;
+  const Scalar  density;
+
+  // Mesh views taken from the FEMesh:
+  const elem_node_ids_type  elem_node_connectivity ;
+  const node_elem_ids_type  node_elem_connectivity ;
+  const node_coords_type    model_coords ;
+
+  // Views allocated by the constructor:
+  const scalar_type             dt ;
+  const scalar_type             prev_dt ;
+  const geom_state_array_type   displacement ;
+  const geom_state_array_type   velocity ;
+  const geom_array_type         acceleration ;
+  const geom_array_type         internal_force ;
+  const array_type              nodal_mass ;
+  const array_type              elem_mass ;
+  const array_type              internal_energy ;
+  const elem_sym_tensor_type    stress_new ;
+  const elem_tensor_state_type  rotation ;
+  const elem_node_geom_type     element_force ;
+  const elem_tensor_type        vel_grad ;
+  const elem_sym_tensor_type    stretch ;
+  const elem_sym_tensor_type    rot_stretch ;
+
+  // Derives the sizes and elastic constants, then allocates every view.
+  Fields( const FEMesh & mesh ,
+          Scalar linear_viscosity ,
+          Scalar quadratic_viscosity ,
+          Scalar elastic_modulus ,
+          Scalar nu ,
+          Scalar mass_density )
+    : num_nodes( mesh.parallel_data_map.count_owned +
+                 mesh.parallel_data_map.count_receive ) ,
+      num_nodes_owned( mesh.parallel_data_map.count_owned ) ,
+      num_elements( mesh.elem_node_ids.dimension_0() ) ,
+      lin_bulk_visc( linear_viscosity ) ,
+      quad_bulk_visc( quadratic_viscosity ) ,
+      two_mu( elastic_modulus/(1.0+nu) ) ,
+      bulk_modulus( elastic_modulus/(3*(1.0-2.0*nu)) ) ,
+      density( mass_density ) ,
+
+      // views shared with the mesh
+
+      elem_node_connectivity( mesh.elem_node_ids ) , // ( num_elements , ElemNodeCount )
+      node_elem_connectivity( mesh.node_elem_ids ) , // ( num_nodes , ... )
+      model_coords( mesh.node_coords ) ,             // ( num_nodes , 3 )
+
+      // labelled views used as kernel input and output
+
+      dt( "dt" ) ,
+      prev_dt( "prev_dt" ) ,
+      displacement( "displacement" , num_nodes ) ,
+      velocity( "velocity" , num_nodes ) ,
+      acceleration( "acceleration" , num_nodes_owned ) ,
+      internal_force( "internal_force" , num_nodes_owned ) ,
+      nodal_mass( "nodal_mass" , num_nodes_owned ) ,
+      elem_mass( "elem_mass" , num_elements ) ,
+      internal_energy( "internal_energy" , num_elements ) ,
+      stress_new( "stress_new" , num_elements ) ,
+
+      // per-element work arrays
+
+      rotation( "rotation" , num_elements ) ,
+      element_force( "element_force" , num_elements ) ,
+      vel_grad( "vel_grad" , num_elements ) ,
+      stretch( "stretch" , num_elements ) ,
+      rot_stretch( "rot_stretch" , num_elements )
+  { }
+};
+
+
+//----------------------------------------------------------------------------
+
+template< typename Scalar , class DeviceType >
+KOKKOS_INLINE_FUNCTION
+Scalar dot8( const Scalar * a , const Scalar * b )  // sum of a[i]*b[i] for i = 0..7
+{ Scalar sum = a[0] * b[0] ;  // terms are added in index order
+  for ( int i = 1 ; i < 8 ; ++i ) { sum += a[i] * b[i] ; }  return sum ; }
+
+template< typename Scalar , class DeviceType >
+KOKKOS_INLINE_FUNCTION
+void comp_grad( const Scalar * const x ,  // in : 8 values, callers pass nodal x coordinates
+                const Scalar * const y ,  // in : 8 values, callers pass nodal y coordinates
+                const Scalar * const z,  // in : 8 values, callers pass nodal z coordinates
+                Scalar * const grad_x ,  // out: 8 values, built from y and the z differences
+                Scalar * const grad_y ,  // out: 8 values, built from z and the x differences
+                Scalar * const grad_z )  // out: 8 values, built from x and the y differences
+{
+  //  Fills grad_x, grad_y and grad_z (8 entries each) from the 8 entries of
+  //  x, y and z.  Rij = c[i-1] - c[j-1] (1-based labels); first with c = x.
+  Scalar R42=(x[3] - x[1]);
+  Scalar R52=(x[4] - x[1]);
+  Scalar R54=(x[4] - x[3]);
+
+  Scalar R63=(x[5] - x[2]);
+  Scalar R83=(x[7] - x[2]);
+  Scalar R86=(x[7] - x[5]);
+
+  Scalar R31=(x[2] - x[0]);
+  Scalar R61=(x[5] - x[0]);
+  Scalar R74=(x[6] - x[3]);
+
+  Scalar R72=(x[6] - x[1]);
+  Scalar R75=(x[6] - x[4]);
+  Scalar R81=(x[7] - x[0]);
+
+  Scalar t1=(R63 + R54);
+  Scalar t2=(R61 + R74);
+  Scalar t3=(R72 + R81);
+
+  Scalar t4 =(R86 + R42);
+  Scalar t5 =(R83 + R52);
+  Scalar t6 =(R75 + R31);
+
+  //  Calculate Y gradient from X and Z data
+  //  (z values weighted by the x differences computed above)
+  grad_y[0] = (z[1] *  t1) - (z[2] * R42) - (z[3] *  t5)  + (z[4] *  t4) + (z[5] * R52) - (z[7] * R54);
+  grad_y[1] = (z[2] *  t2) + (z[3] * R31) - (z[0] *  t1)  - (z[5] *  t6) + (z[6] * R63) - (z[4] * R61);
+  grad_y[2] = (z[3] *  t3) + (z[0] * R42) - (z[1] *  t2)  - (z[6] *  t4) + (z[7] * R74) - (z[5] * R72);
+  grad_y[3] = (z[0] *  t5) - (z[1] * R31) - (z[2] *  t3)  + (z[7] *  t6) + (z[4] * R81) - (z[6] * R83);
+  grad_y[4] = (z[5] *  t3) + (z[6] * R86) - (z[7] *  t2)  - (z[0] *  t4) - (z[3] * R81) + (z[1] * R61);
+  grad_y[5] = (z[6] *  t5) - (z[4] *  t3)  - (z[7] * R75) + (z[1] *  t6) - (z[0] * R52) + (z[2] * R72);
+  grad_y[6] = (z[7] *  t1) - (z[5] *  t5)  - (z[4] * R86) + (z[2] *  t4) - (z[1] * R63) + (z[3] * R83);
+  grad_y[7] = (z[4] *  t2) - (z[6] *  t1)  + (z[5] * R75) - (z[3] *  t6) - (z[2] * R74) + (z[0] * R54);
+
+  //   calc Z difference vectors
+  //   (same formulas with z in place of x; the R* and t* values are overwritten)
+  R42=(z[3] - z[1]);
+  R52=(z[4] - z[1]);
+  R54=(z[4] - z[3]);
+
+  R63=(z[5] - z[2]);
+  R83=(z[7] - z[2]);
+  R86=(z[7] - z[5]);
+
+  R31=(z[2] - z[0]);
+  R61=(z[5] - z[0]);
+  R74=(z[6] - z[3]);
+
+  R72=(z[6] - z[1]);
+  R75=(z[6] - z[4]);
+  R81=(z[7] - z[0]);
+
+  t1=(R63 + R54);
+  t2=(R61 + R74);
+  t3=(R72 + R81);
+
+  t4 =(R86 + R42);
+  t5 =(R83 + R52);
+  t6 =(R75 + R31);
+
+  //  Calculate X gradient from Y and Z data
+  //  (y values weighted by the z differences)
+  grad_x[0] = (y[1] *  t1) - (y[2] * R42) - (y[3] *  t5) + (y[4] *  t4) + (y[5] * R52) - (y[7] * R54);
+  grad_x[1] = (y[2] *  t2) + (y[3] * R31) - (y[0] *  t1) - (y[5] *  t6) + (y[6] * R63) - (y[4] * R61);
+  grad_x[2] = (y[3] *  t3) + (y[0] * R42) - (y[1] *  t2) - (y[6] *  t4) + (y[7] * R74) - (y[5] * R72);
+  grad_x[3] = (y[0] *  t5) - (y[1] * R31) - (y[2] *  t3) + (y[7] *  t6) + (y[4] * R81) - (y[6] * R83);
+  grad_x[4] = (y[5] *  t3) + (y[6] * R86) - (y[7] *  t2) - (y[0] *  t4) - (y[3] * R81) + (y[1] * R61);
+  grad_x[5] = (y[6] *  t5) - (y[4] *  t3) - (y[7] * R75) + (y[1] *  t6) - (y[0] * R52) + (y[2] * R72);
+  grad_x[6] = (y[7] *  t1) - (y[5] *  t5) - (y[4] * R86) + (y[2] *  t4) - (y[1] * R63) + (y[3] * R83);
+  grad_x[7] = (y[4] *  t2) - (y[6] *  t1) + (y[5] * R75) - (y[3] *  t6) - (y[2] * R74) + (y[0] * R54);
+
+  //  calc Y difference vectors
+  //  (x and y are still read below, so the outputs must not alias the inputs)
+  R42=(y[3] - y[1]);
+  R52=(y[4] - y[1]);
+  R54=(y[4] - y[3]);
+
+  R63=(y[5] - y[2]);
+  R83=(y[7] - y[2]);
+  R86=(y[7] - y[5]);
+
+  R31=(y[2] - y[0]);
+  R61=(y[5] - y[0]);
+  R74=(y[6] - y[3]);
+
+  R72=(y[6] - y[1]);
+  R75=(y[6] - y[4]);
+  R81=(y[7] - y[0]);
+
+  t1=(R63 + R54);
+  t2=(R61 + R74);
+  t3=(R72 + R81);
+
+  t4 =(R86 + R42);
+  t5 =(R83 + R52);
+  t6 =(R75 + R31);
+
+  //  Calculate Z gradient from X and Y data
+  //  (x values weighted by the y differences)
+  grad_z[0] = (x[1] *  t1) - (x[2] * R42) - (x[3] *  t5)  + (x[4] *  t4) + (x[5] * R52) - (x[7] * R54);
+  grad_z[1] = (x[2] *  t2) + (x[3] * R31) - (x[0] *  t1)  - (x[5] *  t6) + (x[6] * R63) - (x[4] * R61);
+  grad_z[2] = (x[3] *  t3) + (x[0] * R42) - (x[1] *  t2)  - (x[6] *  t4) + (x[7] * R74) - (x[5] * R72);
+  grad_z[3] = (x[0] *  t5) - (x[1] * R31) - (x[2] *  t3)  + (x[7] *  t6) + (x[4] * R81) - (x[6] * R83);
+  grad_z[4] = (x[5] *  t3) + (x[6] * R86) - (x[7] *  t2)  - (x[0] *  t4) - (x[3] * R81) + (x[1] * R61);
+  grad_z[5] = (x[6] *  t5) - (x[4] *  t3)  - (x[7] * R75) + (x[1] *  t6) - (x[0] * R52) + (x[2] * R72);
+  grad_z[6] = (x[7] *  t1) - (x[5] *  t5)  - (x[4] * R86) + (x[2] *  t4) - (x[1] * R63) + (x[3] * R83);
+  grad_z[7] = (x[4] *  t2) - (x[6] *  t1)  + (x[5] * R75) - (x[3] *  t6) - (x[2] * R74) + (x[0] * R54);
+}
+
+//----------------------------------------------------------------------------
+
+template< typename Scalar , class DeviceType >
+struct initialize_element
+{
+  typedef DeviceType     execution_space ;
+
+  typedef Explicit::Fields< Scalar , execution_space > Fields ;
+  // Views and material data copied out of the Fields object:
+  typename Fields::elem_node_ids_type      elem_node_connectivity ;
+  typename Fields::node_coords_type        model_coords ;
+  typename Fields::elem_sym_tensor_type    stretch ;
+  typename Fields::elem_tensor_state_type  rotation ;
+  typename Fields::array_type              elem_mass ;
+
+  const Scalar density ;
+
+  initialize_element( const Fields & fields )
+    : elem_node_connectivity( fields.elem_node_connectivity ) ,
+      model_coords( fields.model_coords ) ,
+      stretch(      fields.stretch ) ,
+      rotation(     fields.rotation ) ,
+      elem_mass(    fields.elem_mass ) ,
+      density(      fields.density )
+  { }
+
+  KOKKOS_INLINE_FUNCTION
+  void operator()( int ielem )const
+  {
+    // Sets the diagonal stretch and rotation entries of element 'ielem' to
+    // one and computes its mass from the model coordinates.
+    const int K_XX = 0 , K_YY = 1 , K_ZZ = 2 ;  // diagonal tensor offsets
+    const Scalar one_twelfth = 1.0 / 12.0 ;
+
+    Scalar px[ Fields::ElemNodeCount ];
+    Scalar py[ Fields::ElemNodeCount ];
+    Scalar pz[ Fields::ElemNodeCount ];
+    Scalar gx[ Fields::ElemNodeCount ];
+    Scalar gy[ Fields::ElemNodeCount ];
+    Scalar gz[ Fields::ElemNodeCount ];
+
+    for ( int k = 0 ; k < Fields::ElemNodeCount ; ++k ) {
+      const int node = elem_node_connectivity( ielem , k );
+      px[k] = model_coords( node , 0 );
+      py[k] = model_coords( node , 1 );
+      pz[k] = model_coords( node , 2 );
+    }
+
+    comp_grad<Scalar,execution_space>( px, py, pz, gx, gy, gz );
+
+    stretch(ielem,K_XX) = 1 ;
+    stretch(ielem,K_YY) = 1 ;
+    stretch(ielem,K_ZZ) = 1 ;
+
+    // Same diagonal for both stored states (indices 0 and 1).
+    for ( int state = 0 ; state < 2 ; ++state ) {
+      rotation(ielem,K_XX,state) = 1 ;
+      rotation(ielem,K_YY,state) = 1 ;
+      rotation(ielem,K_ZZ,state) = 1 ;
+    }
+
+    // dot8( x , grad_x ) / 12 is used as the element volume here.
+    const Scalar twelve_vol = dot8<Scalar,execution_space>( px , gx );
+    elem_mass(ielem) = one_twelfth * density * twelve_vol ;
+  }
+
+  static void apply( const Fields & mesh_fields )
+  {
+    // One kernel invocation per element.
+    initialize_element functor( mesh_fields );
+    Kokkos::parallel_for( mesh_fields.num_elements , functor );
+  }
+};
+
+
+template<typename Scalar , class DeviceType >
+struct initialize_node
+{
+  typedef DeviceType     execution_space ;
+
+  typedef Explicit::Fields< Scalar , execution_space > Fields ;
+
+  typename Fields::node_elem_ids_type      node_elem_connectivity ;
+  typename Fields::array_type              nodal_mass ;
+  typename Fields::array_type              elem_mass ;
+
+  static const int ElemNodeCount = Fields::ElemNodeCount ;
+
+  initialize_node( const Fields & fields )
+    : node_elem_connectivity( fields.node_elem_connectivity ) ,
+      nodal_mass( fields.nodal_mass ) ,
+      elem_mass(  fields.elem_mass )
+  { }
+
+  // nodal_mass(inode) = ( sum of elem_mass over the elements listed for
+  // the node in node_elem_connectivity ) / ElemNodeCount.
+  KOKKOS_INLINE_FUNCTION
+  void operator()( int inode )const
+  {
+    const int row_end = node_elem_connectivity.row_map[inode+1];
+    int       entry   = node_elem_connectivity.row_map[inode];
+
+    Scalar mass_sum = 0;
+    while ( entry != row_end ) {
+      const int elem_id = node_elem_connectivity.entries( entry , 0 );
+      mass_sum += elem_mass(elem_id);
+      ++entry ;
+    }
+    nodal_mass(inode) = mass_sum / ElemNodeCount ;
+  }
+
+  static void apply( const Fields & mesh_fields )
+  {
+    initialize_node functor( mesh_fields );
+    Kokkos::parallel_for( mesh_fields.num_nodes_owned , functor );
+  }
+};
+
+//----------------------------------------------------------------------------
+
+
+template<typename Scalar, class DeviceType >
+struct grad
+{
+  typedef DeviceType execution_space ;
+
+  typedef Explicit::Fields< Scalar , execution_space >  Fields ;
+
+  static const int ElemNodeCount = Fields::ElemNodeCount ;
+
+  static const int K_F_XX = Fields::K_F_XX ;
+  static const int K_F_YY = Fields::K_F_YY ;
+  static const int K_F_ZZ = Fields::K_F_ZZ ;
+  static const int K_F_XY = Fields::K_F_XY ;
+  static const int K_F_YZ = Fields::K_F_YZ ;
+  static const int K_F_ZX = Fields::K_F_ZX ;
+  static const int K_F_YX = Fields::K_F_YX ;
+  static const int K_F_ZY = Fields::K_F_ZY ;
+  static const int K_F_XZ = Fields::K_F_XZ ;
+
+  // Views held by the functor (copied from Fields in the constructor).
+
+  const typename Fields::elem_node_ids_type     elem_node_connectivity ;
+  const typename Fields::node_coords_type       model_coords ;
+  const typename Fields::geom_state_array_type  displacement ;
+  const typename Fields::geom_state_array_type  velocity ;
+  const typename Fields::elem_tensor_type       vel_grad ;
+  const typename Fields::scalar_type            dt ;
+
+  const int  current_state;
+  const int  previous_state;
+
+  // Built on the host before the kernel is launched.
+  // Copying a view is shallow, so no array data is duplicated.
+  grad( const Fields &  fields,
+        const int arg_current_state,
+        const int arg_previous_state)
+    : elem_node_connectivity( fields.elem_node_connectivity ) ,
+      model_coords(   fields.model_coords ) ,
+      displacement(   fields.displacement ) ,
+      velocity(       fields.velocity ) ,
+      vel_grad(       fields.vel_grad ) ,
+      dt(             fields.dt ) ,
+      current_state(  arg_current_state ) ,
+      previous_state( arg_previous_state )
+  { }
+
+  //--------------------------------------------------------------------------
+  // Velocity gradient of element 'ielem': each of the nine entries is
+  // inv_vol * dot8( velocity component , gradient direction ).
+  KOKKOS_INLINE_FUNCTION
+  void v_grad(  int ielem,
+      Scalar * vx,       Scalar * vy,       Scalar * vz,
+      Scalar * grad_x,     Scalar * grad_y,     Scalar * grad_z,
+      Scalar inv_vol) const
+  {
+    // Function-local copies of the tensor offsets; they shadow the class-level
+    // constants (presumably so the view indices are plain locals - confirm).
+    const int K_F_XX = Fields::K_F_XX ;
+    const int K_F_YY = Fields::K_F_YY ;
+    const int K_F_ZZ = Fields::K_F_ZZ ;
+    const int K_F_XY = Fields::K_F_XY ;
+    const int K_F_YZ = Fields::K_F_YZ ;
+    const int K_F_ZX = Fields::K_F_ZX ;
+    const int K_F_YX = Fields::K_F_YX ;
+    const int K_F_ZY = Fields::K_F_ZY ;
+    const int K_F_XZ = Fields::K_F_XZ ;
+
+    vel_grad(ielem, K_F_XX) = inv_vol * dot8<Scalar,execution_space>( vx , grad_x );
+    vel_grad(ielem, K_F_XY) = inv_vol * dot8<Scalar,execution_space>( vx , grad_y );
+    vel_grad(ielem, K_F_XZ) = inv_vol * dot8<Scalar,execution_space>( vx , grad_z );
+    vel_grad(ielem, K_F_YX) = inv_vol * dot8<Scalar,execution_space>( vy , grad_x );
+    vel_grad(ielem, K_F_YY) = inv_vol * dot8<Scalar,execution_space>( vy , grad_y );
+    vel_grad(ielem, K_F_YZ) = inv_vol * dot8<Scalar,execution_space>( vy , grad_z );
+    vel_grad(ielem, K_F_ZX) = inv_vol * dot8<Scalar,execution_space>( vz , grad_x );
+    vel_grad(ielem, K_F_ZY) = inv_vol * dot8<Scalar,execution_space>( vz , grad_y );
+    vel_grad(ielem, K_F_ZZ) = inv_vol * dot8<Scalar,execution_space>( vz , grad_z );
+  }
+
+  //--------------------------------------------------------------------------
+  // Per-element kernel: gathers nodal data, calls comp_grad on it and
+  // stores the velocity gradient through v_grad.
+
+  KOKKOS_INLINE_FUNCTION
+  void operator()( int ielem )const
+  {
+    // Spatial component indices.
+    const int X = 0 ;
+    const int Y = 1 ;
+    const int Z = 2 ;
+
+    // Factor that moves the coordinates back by half a time step
+    // along the current velocity.
+    const Scalar half_step_back = -0.5 * *dt;
+
+    // Element-local copies: every global value is read once here and
+    // then reused from local storage.
+    Scalar      x[ElemNodeCount],      y[ElemNodeCount],      z[ElemNodeCount];
+    Scalar     vx[ElemNodeCount],     vy[ElemNodeCount],     vz[ElemNodeCount];
+    Scalar grad_x[ElemNodeCount], grad_y[ElemNodeCount], grad_z[ElemNodeCount];
+
+    // Gather the nodes of this element.
+    for ( int k = 0 ; k < ElemNodeCount ; ++k ) {
+      const int node = elem_node_connectivity( ielem , k );
+
+      // Velocity of the current state.
+      vx[k] = velocity( node , X , current_state );
+      vy[k] = velocity( node , Y , current_state );
+      vz[k] = velocity( node , Z , current_state );
+
+      // Model coordinate + current displacement + half-step correction.
+      x[k]  = model_coords( node , X ) +
+              displacement( node , X , current_state ) +
+              half_step_back * vx[k];
+
+      y[k]  = model_coords( node , Y ) +
+              displacement( node , Y , current_state ) +
+              half_step_back * vy[k];
+
+      z[k]  = model_coords( node , Z ) +
+              displacement( node , Z , current_state ) +
+              half_step_back * vz[k];
+    }
+
+    // comp_grad fills grad_x/grad_y/grad_z from the half-step coordinates.
+    comp_grad<Scalar,execution_space>( x, y, z, grad_x, grad_y, grad_z);
+
+    // dot8( x , grad_x ) serves as the element volume measure; only its
+    // reciprocal is needed below.
+
+    const Scalar inv_vol = 1.0 / dot8<Scalar,execution_space>( x , grad_x );
+
+    // Scale and store the nine velocity-gradient components.
+    v_grad(ielem, vx, vy, vz, grad_x, grad_y, grad_z, inv_vol);
+  }
+
+  static void apply( const Fields & fields ,
+                     const int arg_current_state ,
+                     const int arg_previous_state )
+  {
+    grad functor( fields, arg_current_state , arg_previous_state );
+    Kokkos::parallel_for( fields.num_elements , functor );
+  }
+};
+
+//----------------------------------------------------------------------------
+
+template<typename Scalar, class DeviceType >
+struct decomp_rotate
+{
+  typedef DeviceType execution_space ;
+
+  typedef Explicit::Fields< Scalar , execution_space >  Fields ;
+
+  static const int ElemNodeCount = Fields::ElemNodeCount ;
+
+  static const int K_F_XX = Fields::K_F_XX ;
+  static const int K_F_YY = Fields::K_F_YY ;
+  static const int K_F_ZZ = Fields::K_F_ZZ ;
+  static const int K_F_XY = Fields::K_F_XY ;
+  static const int K_F_YZ = Fields::K_F_YZ ;
+  static const int K_F_ZX = Fields::K_F_ZX ;
+  static const int K_F_YX = Fields::K_F_YX ;
+  static const int K_F_ZY = Fields::K_F_ZY ;
+  static const int K_F_XZ = Fields::K_F_XZ ;
+
+  static const int K_S_XX = Fields::K_S_XX ;
+  static const int K_S_YY = Fields::K_S_YY ;
+  static const int K_S_ZZ = Fields::K_S_ZZ ;
+  static const int K_S_XY = Fields::K_S_XY ;
+  static const int K_S_YZ = Fields::K_S_YZ ;
+  static const int K_S_ZX = Fields::K_S_ZX ;
+  static const int K_S_YX = Fields::K_S_YX ;
+  static const int K_S_ZY = Fields::K_S_ZY ;
+  static const int K_S_XZ = Fields::K_S_XZ ;
+
+  static const int K_V_XY = Fields::K_V_XY ;
+  static const int K_V_YZ = Fields::K_V_YZ ;
+  static const int K_V_ZX = Fields::K_V_ZX ;
+
+  // Global arrays used by this functor.
+
+  const typename Fields::elem_tensor_state_type     rotation ;
+  const typename Fields::elem_tensor_type           vel_grad ;
+  const typename Fields::elem_sym_tensor_type       stretch ;
+  const typename Fields::elem_sym_tensor_type       rot_stretch ;
+  const typename Fields::scalar_type                dt_value ;
+
+  const int  current_state;
+  const int  previous_state;
+
+  decomp_rotate( const Fields & mesh_fields ,
+                 const int arg_current_state,
+                 const int arg_previous_state)
+    : rotation(    mesh_fields.rotation )
+    , vel_grad(    mesh_fields.vel_grad )
+    , stretch(     mesh_fields.stretch )
+    , rot_stretch( mesh_fields.rot_stretch )
+    , dt_value(    mesh_fields.dt)
+    , current_state( arg_current_state)
+    , previous_state(arg_previous_state)
+    {}
+
+  static void apply( const Fields & mesh_fields ,
+                     const int arg_current_state ,
+                     const int arg_previous_state )
+  {
+    decomp_rotate op( mesh_fields , arg_current_state , arg_previous_state );
+    Kokkos::parallel_for( mesh_fields.num_elements , op );
+  }
+
+
+  KOKKOS_INLINE_FUNCTION
+  void additive_decomp(int ielem, Scalar * v_gr, Scalar * str_ten) const
+  {
+    //  In addition to calculating stretching_tensor,
+    //  use this as an opportunity to load global
+    //  variables into a local space
+
+    for ( int i = 0 ; i < 9 ; ++i ) {
+      v_gr[i] = vel_grad( ielem , i );
+    }
+
+    //
+    //  Symmetric part
+    //
+    str_ten[K_S_XX] = v_gr[K_F_XX];
+    str_ten[K_S_YY] = v_gr[K_F_YY];
+    str_ten[K_S_ZZ] = v_gr[K_F_ZZ];
+    str_ten[K_S_XY] = 0.5*(v_gr[K_F_XY] + v_gr[K_F_YX]);
+    str_ten[K_S_YZ] = 0.5*(v_gr[K_F_YZ] + v_gr[K_F_ZY]);
+    str_ten[K_S_ZX] = 0.5*(v_gr[K_F_ZX] + v_gr[K_F_XZ]);
+  }
+
+  KOKKOS_INLINE_FUNCTION // Per-element update of the rotation and stretch tensors over one time step.
+  void polar_decomp(int ielem, Scalar * v_gr, Scalar * str_ten, Scalar * str, Scalar * vort, Scalar * rot_old, Scalar * rot_new)const
+  { // Inputs: v_gr, str_ten. Outputs: str[6], vort[3], rot_old[9], rot_new[9]; also writes rotation(ielem, :, current_state).
+    const Scalar dt = *dt_value; // time step, read through the dt_value handle
+    const Scalar dt_half = 0.5 * dt;
+
+    //  Skew-symmetric part of the velocity gradient (three independent components).
+    vort[K_V_XY] = 0.5*(v_gr[K_F_XY] - v_gr[K_F_YX]);
+    vort[K_V_YZ] = 0.5*(v_gr[K_F_YZ] - v_gr[K_F_ZY]);
+    vort[K_V_ZX] = 0.5*(v_gr[K_F_ZX] - v_gr[K_F_XZ]);
+
+    //   Load the stored stretch, then calculate the rates of rotation via Gauss elimination.
+    for ( int i = 0 ; i < 6 ; ++i ) {
+      str[i] = stretch(ielem, i);
+    }
+    //   z1..z3 start as the right-hand side and are overwritten in place with the solution.
+    Scalar z1 = str_ten[K_S_XY] * str[K_S_ZX] -
+                str_ten[K_S_ZX] * str[K_S_XY] +
+                str_ten[K_S_YY] * str[K_S_YZ] -
+                str_ten[K_S_YZ] * str[K_S_YY] +
+                str_ten[K_S_YZ] * str[K_S_ZZ] -
+                str_ten[K_S_ZZ] * str[K_S_YZ];
+
+    Scalar z2 = str_ten[K_S_ZX] * str[K_S_XX] -
+                str_ten[K_S_XX] * str[K_S_ZX] +
+                str_ten[K_S_YZ] * str[K_S_XY] -
+                str_ten[K_S_XY] * str[K_S_YZ] +
+                str_ten[K_S_ZZ] * str[K_S_ZX] -
+                str_ten[K_S_ZX] * str[K_S_ZZ];
+
+    Scalar z3 = str_ten[K_S_XX] * str[K_S_XY] -
+                str_ten[K_S_XY] * str[K_S_XX] +
+                str_ten[K_S_XY] * str[K_S_YY] -
+                str_ten[K_S_YY] * str[K_S_XY] +
+                str_ten[K_S_ZX] * str[K_S_YZ] -
+                str_ten[K_S_YZ] * str[K_S_ZX];
+
+  //   forward elimination (NOTE(review): the reciprocals below are taken without a zero-denominator guard)
+    const Scalar a1inv = 1.0 / (str[K_S_YY] + str[K_S_ZZ]);
+
+    const Scalar a4BYa1 = -1 * str[K_S_XY] * a1inv;
+
+    const Scalar a2inv = 1.0 / (str[K_S_ZZ] + str[K_S_XX] + str[K_S_XY] * a4BYa1);
+
+    const Scalar a5 =  -str[K_S_YZ] + str[K_S_ZX] * a4BYa1;
+
+    z2 -= z1 * a4BYa1;
+    Scalar a6BYa1 = -1 * str[K_S_ZX] * a1inv;
+    const Scalar a5BYa2 = a5 * a2inv;
+    z3 -= z1 * a6BYa1 - z2 * a5BYa2;
+
+  //   backward substitution - solves for z3 first, then z2, then z1
+    z3 /= (str[K_S_XX] + str[K_S_YY] + str[K_S_ZX] * a6BYa1 + a5 * a5BYa2);
+    z2 = (z2 - a5 * z3) * a2inv;
+    z1 = (z1*a1inv - a6BYa1 * z3 -a4BYa1 * z2);
+
+  //   calculate rotation rates - recall that spin_rate is an antisymmetric tensor,
+  //   so compute spin rate vector as dual of spin rate tensor,
+  //   i.e   w_i = e_ijk * spin_rate_jk
+    z1 += vort[K_V_YZ];
+    z2 += vort[K_V_ZX];
+    z3 += vort[K_V_XY];
+
+  //   update rotation tensor:
+  //  1) premultiply old rotation tensor to get right-hand side.
+  //     The old rotation is read from the previous_state slot of the rotation field.
+    for ( int i = 0 ; i < 9 ; ++i ) {
+      rot_old[i] = rotation(ielem, i, previous_state);
+    }
+
+    Scalar r_XX = rot_old[K_F_XX] + dt_half*( z3 * rot_old[K_F_YX] - z2 * rot_old[K_F_ZX] );
+    Scalar r_YX = rot_old[K_F_YX] + dt_half*( z1 * rot_old[K_F_ZX] - z3 * rot_old[K_F_XX] );
+    Scalar r_ZX = rot_old[K_F_ZX] + dt_half*( z2 * rot_old[K_F_XX] - z1 * rot_old[K_F_YX] );
+    Scalar r_XY = rot_old[K_F_XY] + dt_half*( z3 * rot_old[K_F_YY] - z2 * rot_old[K_F_ZY] );
+    Scalar r_YY = rot_old[K_F_YY] + dt_half*( z1 * rot_old[K_F_ZY] - z3 * rot_old[K_F_XY] );
+    Scalar r_ZY = rot_old[K_F_ZY] + dt_half*( z2 * rot_old[K_F_XY] - z1 * rot_old[K_F_YY] );
+    Scalar r_XZ = rot_old[K_F_XZ] + dt_half*( z3 * rot_old[K_F_YZ] - z2 * rot_old[K_F_ZZ] );
+    Scalar r_YZ = rot_old[K_F_YZ] + dt_half*( z1 * rot_old[K_F_ZZ] - z3 * rot_old[K_F_XZ] );
+    Scalar r_ZZ = rot_old[K_F_ZZ] + dt_half*( z2 * rot_old[K_F_XZ] - z1 * rot_old[K_F_YZ] );
+
+
+  //  2) solve for new rotation tensor via gauss elimination.
+  //   forward elimination - the same row operations are applied to all three right-hand-side columns
+    Scalar a12 = - dt_half * z3;
+    Scalar a13 =   dt_half * z2;
+    Scalar b32 = - dt_half * z1;
+    Scalar a22inv = 1.0 / (1.0 + a12 * a12);
+
+    Scalar a13a12 = a13*a12;
+    Scalar a23 = b32 + a13a12;
+    r_YX += r_XX * a12;
+    r_YY += r_XY * a12;
+    r_YZ += r_XZ * a12;
+
+
+    b32 = (b32 - a13a12) * a22inv;
+    r_ZX += r_XX * a13 + r_YX * b32;
+    r_ZY += r_XY * a13 + r_YY * b32;
+    r_ZZ += r_XZ * a13 + r_YZ * b32;
+
+
+  //   backward substitution - third row first, then second, then first
+    const Scalar a33inv = 1.0 / (1.0 + a13 * a13 + a23 * b32);
+
+    rot_new[K_F_ZX] = r_ZX * a33inv;
+    rot_new[K_F_ZY] = r_ZY * a33inv;
+    rot_new[K_F_ZZ] = r_ZZ * a33inv;
+    rot_new[K_F_YX] = ( r_YX - rot_new[K_F_ZX] * a23 ) * a22inv;
+    rot_new[K_F_YY] = ( r_YY - rot_new[K_F_ZY] * a23 ) * a22inv;
+    rot_new[K_F_YZ] = ( r_YZ - rot_new[K_F_ZZ] * a23 ) * a22inv;
+    rot_new[K_F_XX] = r_XX - rot_new[K_F_ZX] * a13 - rot_new[K_F_YX] * a12;
+    rot_new[K_F_XY] = r_XY - rot_new[K_F_ZY] * a13 - rot_new[K_F_YY] * a12;
+    rot_new[K_F_XZ] = r_XZ - rot_new[K_F_ZZ] * a13 - rot_new[K_F_YZ] * a12;
+    // Store the new rotation in the current_state slot of the rotation field.
+    for ( int i = 0 ; i < 9 ; ++i ) {
+      rotation(ielem, i, current_state) = rot_new[i] ;
+    }
+
+  //   update stretch tensor in the new configuration -
+    const Scalar a1 = str_ten[K_S_XY] + vort[K_V_XY];
+    const Scalar a2 = str_ten[K_S_YZ] + vort[K_V_YZ];
+    const Scalar a3 = str_ten[K_S_ZX] + vort[K_V_ZX];
+    const Scalar b1 = str_ten[K_S_ZX] - vort[K_V_ZX];
+    const Scalar b2 = str_ten[K_S_XY] - vort[K_V_XY];
+    const Scalar b3 = str_ten[K_S_YZ] - vort[K_V_YZ];
+    // Snapshot the old stretch so every in-place update below uses pre-update values.
+    const Scalar s_XX = str[K_S_XX];
+    const Scalar s_YY = str[K_S_YY];
+    const Scalar s_ZZ = str[K_S_ZZ];
+    const Scalar s_XY = str[K_S_XY];
+    const Scalar s_YZ = str[K_S_YZ];
+    const Scalar s_ZX = str[K_S_ZX];
+
+    str[K_S_XX] += dt * (str_ten[K_S_XX] * s_XX + ( a1 + z3 ) * s_XY + ( b1 - z2 ) * s_ZX);
+    str[K_S_YY] += dt * (str_ten[K_S_YY] * s_YY + ( a2 + z1 ) * s_YZ + ( b2 - z3 ) * s_XY);
+    str[K_S_ZZ] += dt * (str_ten[K_S_ZZ] * s_ZZ + ( a3 + z2 ) * s_ZX + ( b3 - z1 ) * s_YZ);
+    str[K_S_XY] += dt * (str_ten[K_S_XX] * s_XY + ( a1 )      * s_YY + ( b1      ) * s_YZ - z3 * s_XX + z1 * s_ZX);
+    str[K_S_YZ] += dt * (str_ten[K_S_YY] * s_YZ + ( a2 )      * s_ZZ + ( b2      ) * s_ZX - z1 * s_YY + z2 * s_XY);
+    str[K_S_ZX] += dt * (str_ten[K_S_ZZ] * s_ZX + ( a3 )      * s_XX + ( b3      ) * s_XY - z2 * s_ZZ + z3 * s_YZ);
+
+  }
+
+
+  KOKKOS_INLINE_FUNCTION // Rotates str_ten by rot_new and stores both the rotated tensor and the stretch for this element.
+  void rotate_tensor(int ielem, Scalar * str_ten, Scalar * str, Scalar * rot_new)const {
+    // NOTE(review): comments here read K_F_AB as (row A, column B) and assume K_S_YX, K_S_ZY, K_S_XZ alias the symmetric slots - confirm in Fields.
+    Scalar t[9];       // t = str_ten * rot_new, stored three consecutive entries per column of the product
+    Scalar rot_str[6]; // Rotated stretching tensor: transpose(rot_new) * t, symmetric components only
+
+    t[0] = str_ten[K_S_XX]*rot_new[K_F_XX] +
+           str_ten[K_S_XY]*rot_new[K_F_YX] +
+           str_ten[K_S_XZ]*rot_new[K_F_ZX];
+
+    t[1] = str_ten[K_S_YX]*rot_new[K_F_XX] +
+           str_ten[K_S_YY]*rot_new[K_F_YX] +
+           str_ten[K_S_YZ]*rot_new[K_F_ZX];
+
+    t[2] = str_ten[K_S_ZX]*rot_new[K_F_XX] +
+           str_ten[K_S_ZY]*rot_new[K_F_YX] +
+           str_ten[K_S_ZZ]*rot_new[K_F_ZX];
+
+    t[3] = str_ten[K_S_XX]*rot_new[K_F_XY] +
+           str_ten[K_S_XY]*rot_new[K_F_YY] +
+           str_ten[K_S_XZ]*rot_new[K_F_ZY];
+
+    t[4] = str_ten[K_S_YX]*rot_new[K_F_XY] +
+           str_ten[K_S_YY]*rot_new[K_F_YY] +
+           str_ten[K_S_YZ]*rot_new[K_F_ZY];
+
+    t[5] = str_ten[K_S_ZX]*rot_new[K_F_XY] +
+           str_ten[K_S_ZY]*rot_new[K_F_YY] +
+           str_ten[K_S_ZZ]*rot_new[K_F_ZY];
+
+    t[6] = str_ten[K_S_XX]*rot_new[K_F_XZ] +
+           str_ten[K_S_XY]*rot_new[K_F_YZ] +
+           str_ten[K_S_XZ]*rot_new[K_F_ZZ];
+
+    t[7] = str_ten[K_S_YX]*rot_new[K_F_XZ] +
+           str_ten[K_S_YY]*rot_new[K_F_YZ] +
+           str_ten[K_S_YZ]*rot_new[K_F_ZZ];
+
+    t[8] = str_ten[K_S_ZX]*rot_new[K_F_XZ] +
+           str_ten[K_S_ZY]*rot_new[K_F_YZ] +
+           str_ten[K_S_ZZ]*rot_new[K_F_ZZ];
+
+    // Second product: diagonal components first, then the three off-diagonal ones.
+    rot_str[ K_S_XX ] = rot_new[K_F_XX] * t[0] +
+                        rot_new[K_F_YX] * t[1] +
+                        rot_new[K_F_ZX] * t[2];
+    rot_str[ K_S_YY ] = rot_new[K_F_XY] * t[3] +
+                        rot_new[K_F_YY] * t[4] +
+                        rot_new[K_F_ZY] * t[5];
+    rot_str[ K_S_ZZ ] = rot_new[K_F_XZ] * t[6] +
+                        rot_new[K_F_YZ] * t[7] +
+                        rot_new[K_F_ZZ] * t[8];
+
+    rot_str[ K_S_XY ] = rot_new[K_F_XX] * t[3] +
+                        rot_new[K_F_YX] * t[4] +
+                        rot_new[K_F_ZX] * t[5];
+    rot_str[ K_S_YZ ] = rot_new[K_F_XY] * t[6] +
+                        rot_new[K_F_YY] * t[7] +
+                        rot_new[K_F_ZY] * t[8];
+    rot_str[ K_S_ZX ] = rot_new[K_F_XZ] * t[0] +
+                        rot_new[K_F_YZ] * t[1] +
+                        rot_new[K_F_ZZ] * t[2];
+    // Publish the rotated tensor to the rot_stretch field.
+    for ( int i = 0 ; i < 6 ; ++i ) {
+      rot_stretch(ielem, i) = rot_str[i] ;
+    }
+    // Write the stretch values passed in str back to the stretch field.
+    for ( int i = 0 ; i < 6 ; ++i ) {
+      stretch(ielem, i) = str[i] ;
+    }
+  }
+
+  KOKKOS_INLINE_FUNCTION
+  void operator()( int ielem )const
+  {
+    // Per-element work arrays, filled by the helper stages below so the
+    // mesh fields are accessed as few times as possible.
+    Scalar vel_gradient[9];   // velocity gradient
+    Scalar stretching[6];     // stretching tensor
+    Scalar vorticity[3];      // vorticity
+    Scalar stretch_local[6];  // stretch
+    Scalar rot_prev[9];       // rotation, old
+    Scalar rot_next[9];       // rotation, new
+
+    // Stage order matters: each stage consumes arrays filled by the one before.
+    additive_decomp( ielem , vel_gradient , stretching );
+    polar_decomp( ielem , vel_gradient , stretching , stretch_local ,
+                  vorticity , rot_prev , rot_next );
+    rotate_tensor( ielem , stretching , stretch_local , rot_next );
+  }
+};
+
+//----------------------------------------------------------------------------
+
+template<typename Scalar, class DeviceType >
+struct internal_force // Reduction functor: writes per-element nodal forces and min-reduces a candidate time step.
+{
+  typedef DeviceType execution_space ;
+
+  typedef Explicit::Fields< Scalar , execution_space >  Fields ;
+
+  static const int ElemNodeCount = Fields::ElemNodeCount ;
+
+  static const int K_F_XX = Fields::K_F_XX ; // K_F_*: slots of a 9-component tensor (values are defined in Fields)
+  static const int K_F_YY = Fields::K_F_YY ;
+  static const int K_F_ZZ = Fields::K_F_ZZ ;
+  static const int K_F_XY = Fields::K_F_XY ;
+  static const int K_F_YZ = Fields::K_F_YZ ;
+  static const int K_F_ZX = Fields::K_F_ZX ;
+  static const int K_F_YX = Fields::K_F_YX ;
+  static const int K_F_ZY = Fields::K_F_ZY ;
+  static const int K_F_XZ = Fields::K_F_XZ ;
+
+  static const int K_S_XX = Fields::K_S_XX ; // K_S_*: nine names used to index 6-element arrays below, so mirrored names presumably alias - confirm in Fields
+  static const int K_S_YY = Fields::K_S_YY ;
+  static const int K_S_ZZ = Fields::K_S_ZZ ;
+  static const int K_S_XY = Fields::K_S_XY ;
+  static const int K_S_YZ = Fields::K_S_YZ ;
+  static const int K_S_ZX = Fields::K_S_ZX ;
+  static const int K_S_YX = Fields::K_S_YX ;
+  static const int K_S_ZY = Fields::K_S_ZY ;
+  static const int K_S_XZ = Fields::K_S_XZ ;
+
+  //--------------------------------------------------------------------------
+  // Reduction (minimum of the per-element candidate time steps):
+
+  typedef Scalar value_type;
+
+  KOKKOS_INLINE_FUNCTION
+  static void init(value_type &update) {
+    update = 1.0e32; // large starting value; operator() and join() only ever lower it
+  }
+
+  KOKKOS_INLINE_FUNCTION
+  static void join( volatile value_type & update,
+                    const volatile value_type & source )
+  {
+    update = update < source ? update : source; // keep the smaller of the two partial results
+  }
+
+  // Final serial processing of reduction value: the old dt moves to prev_dt, the reduced value becomes dt.
+  KOKKOS_INLINE_FUNCTION
+  void final( value_type & result ) const
+  {
+    *prev_dt = *dt ;
+    *dt = result ;
+  };
+
+  //--------------------------------------------------------------------------
+
+  // Global arrays used by this functor.
+
+  const typename Fields::elem_node_ids_type      elem_node_connectivity ;
+  const typename Fields::node_coords_type        model_coords ;
+  const typename Fields::scalar_type             dt ;
+  const typename Fields::scalar_type             prev_dt ;
+  const typename Fields::geom_state_array_type   displacement ;
+  const typename Fields::geom_state_array_type   velocity ;
+  const typename Fields::array_type              elem_mass ;
+  const typename Fields::array_type              internal_energy ;
+  const typename Fields::elem_sym_tensor_type    stress_new ;
+  const typename Fields::elem_node_geom_type     element_force ;
+  const typename Fields::elem_tensor_state_type  rotation ;
+  const typename Fields::elem_sym_tensor_type    rot_stretch ;
+
+  const Scalar     two_mu;
+  const Scalar     bulk_modulus;
+  const Scalar     lin_bulk_visc;
+  const Scalar     quad_bulk_visc;
+  const Scalar     user_dt;        // when > 0, replaces the computed candidate time step
+  const int        current_state;  // state slot read from displacement, velocity and rotation
+
+  internal_force( const Fields & mesh_fields,
+                  const Scalar arg_user_dt,
+                  const int arg_current_state )
+    : elem_node_connectivity( mesh_fields.elem_node_connectivity )
+    , model_coords(           mesh_fields.model_coords )
+    , dt(                     mesh_fields.dt )
+    , prev_dt(                mesh_fields.prev_dt )
+    , displacement(           mesh_fields.displacement )
+    , velocity(               mesh_fields.velocity )
+    , elem_mass(              mesh_fields.elem_mass )
+    , internal_energy(        mesh_fields.internal_energy )
+    , stress_new(             mesh_fields.stress_new )
+    , element_force(          mesh_fields.element_force )
+    , rotation(               mesh_fields.rotation )
+    , rot_stretch(            mesh_fields.rot_stretch )
+    , two_mu(                 mesh_fields.two_mu )
+    , bulk_modulus(           mesh_fields.bulk_modulus )
+    , lin_bulk_visc(          mesh_fields.lin_bulk_visc )
+    , quad_bulk_visc(         mesh_fields.quad_bulk_visc )
+    , user_dt(       arg_user_dt )
+    , current_state( arg_current_state )
+  {}
+
+  static void apply( const Fields & mesh_fields ,
+                     const Scalar arg_user_dt,
+                     const int arg_current_state )
+  {
+    internal_force  op_force( mesh_fields , arg_user_dt , arg_current_state );
+    // No result argument is passed; the reduced value is presumably delivered to final() - confirm against the Kokkos version in use.
+    Kokkos::parallel_reduce( mesh_fields.num_elements, op_force );
+  }
+
+  //--------------------------------------------------------------------------
+
+  KOKKOS_INLINE_FUNCTION // Fills rot_stress (6 components) with r_n * s_n * transpose(r_n) for this element.
+  void rotate_tensor_backward(int ielem ,
+    const Scalar * const s_n ,
+    Scalar * const rot_stress )const
+  {
+    const int rot_state = current_state ; // the rotation is read at the current state
+
+    //   t   : temporary, s_n * transpose(r_n), reading K_F_AB as (row A, column B) - TODO confirm
+    //   s_n : stress_new values held in a local array by the caller
+    //   r_n : rotation at rot_state, copied into a local array
+    Scalar t[9], r_n[9];
+
+    r_n[0] = rotation(ielem, 0, rot_state );
+    r_n[1] = rotation(ielem, 1, rot_state );
+    r_n[2] = rotation(ielem, 2, rot_state );
+    r_n[3] = rotation(ielem, 3, rot_state );
+    r_n[4] = rotation(ielem, 4, rot_state );
+    r_n[5] = rotation(ielem, 5, rot_state );
+    r_n[6] = rotation(ielem, 6, rot_state );
+    r_n[7] = rotation(ielem, 7, rot_state );
+    r_n[8] = rotation(ielem, 8, rot_state );
+
+    t[0] = s_n[K_S_XX]*r_n[K_F_XX]+ s_n[K_S_XY]*r_n[K_F_XY]+ s_n[K_S_XZ]*r_n[K_F_XZ];
+    t[1] = s_n[K_S_YX]*r_n[K_F_XX]+ s_n[K_S_YY]*r_n[K_F_XY]+ s_n[K_S_YZ]*r_n[K_F_XZ];
+    t[2] = s_n[K_S_ZX]*r_n[K_F_XX]+ s_n[K_S_ZY]*r_n[K_F_XY]+ s_n[K_S_ZZ]*r_n[K_F_XZ];
+    t[3] = s_n[K_S_XX]*r_n[K_F_YX]+ s_n[K_S_XY]*r_n[K_F_YY]+ s_n[K_S_XZ]*r_n[K_F_YZ];
+    t[4] = s_n[K_S_YX]*r_n[K_F_YX]+ s_n[K_S_YY]*r_n[K_F_YY]+ s_n[K_S_YZ]*r_n[K_F_YZ];
+    t[5] = s_n[K_S_ZX]*r_n[K_F_YX]+ s_n[K_S_ZY]*r_n[K_F_YY]+ s_n[K_S_ZZ]*r_n[K_F_YZ];
+    t[6] = s_n[K_S_XX]*r_n[K_F_ZX]+ s_n[K_S_XY]*r_n[K_F_ZY]+ s_n[K_S_XZ]*r_n[K_F_ZZ];
+    t[7] = s_n[K_S_YX]*r_n[K_F_ZX]+ s_n[K_S_YY]*r_n[K_F_ZY]+ s_n[K_S_YZ]*r_n[K_F_ZZ];
+    t[8] = s_n[K_S_ZX]*r_n[K_F_ZX]+ s_n[K_S_ZY]*r_n[K_F_ZY]+ s_n[K_S_ZZ]*r_n[K_F_ZZ];
+    // Second product: r_n * t, diagonal components first, then the off-diagonal ones.
+    rot_stress[ K_S_XX ] = r_n[K_F_XX]*t[0] + r_n[K_F_XY]*t[1] + r_n[K_F_XZ]*t[2];
+    rot_stress[ K_S_YY ] = r_n[K_F_YX]*t[3] + r_n[K_F_YY]*t[4] + r_n[K_F_YZ]*t[5];
+    rot_stress[ K_S_ZZ ] = r_n[K_F_ZX]*t[6] + r_n[K_F_ZY]*t[7] + r_n[K_F_ZZ]*t[8];
+
+    rot_stress[ K_S_XY ] = r_n[K_F_XX]*t[3] + r_n[K_F_XY]*t[4] + r_n[K_F_XZ]*t[5];
+    rot_stress[ K_S_YZ ] = r_n[K_F_YX]*t[6] + r_n[K_F_YY]*t[7] + r_n[K_F_YZ]*t[8];
+    rot_stress[ K_S_ZX ] = r_n[K_F_ZX]*t[0] + r_n[K_F_ZY]*t[1] + r_n[K_F_ZZ]*t[2];
+  }
+
+  //--------------------------------------------------------------------------
+
+  KOKKOS_INLINE_FUNCTION // Stores the nodal forces of one element and the force-velocity sum as its internal energy.
+  void comp_force(int ielem,
+     const Scalar * const vx ,
+     const Scalar * const vy ,
+     const Scalar * const vz ,
+     const Scalar * const grad_x ,
+     const Scalar * const grad_y ,
+     const Scalar * const grad_z ,
+     Scalar * total_stress12th ) const
+  {
+    Scalar internal_energy_inc = 0 ; // running sum over nodes of force dotted with velocity
+
+    for(int inode = 0; inode < 8; ++inode) { // literal 8 matches the size of the caller's arrays
+
+      const Scalar fx =
+        total_stress12th[K_S_XX] * grad_x[inode] +
+        total_stress12th[K_S_XY] * grad_y[inode] +
+        total_stress12th[K_S_XZ] * grad_z[inode] ;
+
+      element_force(ielem, 0, inode) = fx ;
+
+      const Scalar fy =
+        total_stress12th[K_S_YX] * grad_x[inode] +
+        total_stress12th[K_S_YY] * grad_y[inode] +
+        total_stress12th[K_S_YZ] * grad_z[inode] ;
+
+      element_force(ielem, 1, inode) = fy ;
+
+      const Scalar fz =
+        total_stress12th[K_S_ZX] * grad_x[inode] +
+        total_stress12th[K_S_ZY] * grad_y[inode] +
+        total_stress12th[K_S_ZZ] * grad_z[inode] ;
+
+      element_force(ielem, 2, inode) = fz ;
+
+      internal_energy_inc +=
+        fx * vx[inode] +
+        fy * vy[inode] +
+        fz * vz[inode] ;
+    }
+
+    internal_energy(ielem) = internal_energy_inc ; // overwrites the stored value rather than adding to it
+  }
+
+  //----------------------------------------------------------------------------
+
+  KOKKOS_INLINE_FUNCTION // Advances stress_new(ielem,:) in place by one dt step and copies the updated values into s_n.
+  void get_stress(int ielem , Scalar * const s_n ) const
+    {
+      const int kxx = 0;
+      const int kyy = 1;
+      const int kzz = 2;
+      const int kxy = 3;
+      const int kyz = 4;
+      const int kzx = 5;
+      // e: mean of the three diagonal rot_stretch components.
+      const Scalar e = (rot_stretch(ielem,kxx)+rot_stretch(ielem,kyy)+rot_stretch(ielem,kzz))/3.0;
+      // Diagonal: deviatoric part scaled by two_mu plus 3*bulk_modulus*e; each line updates the field, then copies it.
+      s_n[kxx] = stress_new(ielem,kxx) += *dt * (two_mu * (rot_stretch(ielem,kxx)-e)+3*bulk_modulus*e);
+      s_n[kyy] = stress_new(ielem,kyy) += *dt * (two_mu * (rot_stretch(ielem,kyy)-e)+3*bulk_modulus*e);
+      s_n[kzz] = stress_new(ielem,kzz) += *dt * (two_mu * (rot_stretch(ielem,kzz)-e)+3*bulk_modulus*e);
+      // Off-diagonal: two_mu times the corresponding rot_stretch component.
+      s_n[kxy] = stress_new(ielem,kxy) += *dt * two_mu * rot_stretch(ielem,kxy);
+      s_n[kyz] = stress_new(ielem,kyz) += *dt * two_mu * rot_stretch(ielem,kyz);
+      s_n[kzx] = stress_new(ielem,kzx) += *dt * two_mu * rot_stretch(ielem,kzx);
+    }
+
+  //----------------------------------------------------------------------------
+
+  // Work item for one element: candidate time step into update, then stress update and nodal forces.
+  KOKKOS_INLINE_FUNCTION
+  void operator()( int ielem, value_type & update )const
+  {
+    const Scalar ONE12TH = 1.0 / 12.0 ;
+
+    Scalar x[8], y[8], z[8] ;
+    Scalar vx[8], vy[8], vz[8];
+    Scalar grad_x[8], grad_y[8], grad_z[8];
+
+    // Current position (model coordinates + displacement) and velocity of each element node:
+
+    for ( int i = 0 ; i < ElemNodeCount ; ++i ) {
+      const int n = elem_node_connectivity(ielem,i);
+
+      x[i] = model_coords(n, 0) + displacement(n, 0, current_state) ;
+      y[i] = model_coords(n, 1) + displacement(n, 1, current_state) ;
+      z[i] = model_coords(n, 2) + displacement(n, 2, current_state) ;
+
+      vx[i] = velocity(n, 0, current_state);
+      vy[i] = velocity(n, 1, current_state);
+      vz[i] = velocity(n, 2, current_state);
+    }
+
+    // Gradient: (comp_grad and dot8 are defined outside this struct)
+
+    comp_grad<Scalar,execution_space>( x , y , z , grad_x , grad_y , grad_z );
+
+
+    const Scalar mid_vol = dot8<Scalar,execution_space>( x , grad_x );
+
+    const Scalar shr = two_mu ;
+    const Scalar dil = bulk_modulus + ((2.0*shr)/3.0);
+
+    const Scalar aspect = 6.0 * mid_vol /
+                          ( dot8<Scalar,execution_space>( grad_x , grad_x ) +
+                            dot8<Scalar,execution_space>( grad_y , grad_y ) +
+                            dot8<Scalar,execution_space>( grad_z , grad_z ) );
+
+    const Scalar dtrial = sqrt(elem_mass(ielem) * aspect / dil);
+    const Scalar traced = (rot_stretch(ielem, 0) + rot_stretch(ielem, 1) + rot_stretch(ielem, 2));
+    // The quadratic viscosity term is added only when traced is negative.
+    const Scalar eps = traced < 0 ? (lin_bulk_visc - quad_bulk_visc * traced * dtrial) : lin_bulk_visc ;
+
+    const Scalar bulkq = eps * dil * dtrial * traced;
+
+    Scalar cur_time_step = dtrial * ( sqrt( 1.0 + eps * eps) - eps);
+
+    // force the fixed time step if user_dt > 0 was supplied
+
+    cur_time_step = user_dt > 0 ? user_dt : cur_time_step;
+
+    update = update < cur_time_step ? update : cur_time_step; // running minimum for the reduction
+
+
+    Scalar s_n[ 6 ];
+
+    get_stress( ielem, s_n );
+
+    Scalar total_stress12th[6];
+
+    // Get rotated stress:
+
+    rotate_tensor_backward(ielem, s_n , total_stress12th );
+    // Scale by 1/12; bulkq is added to slots 0..2 only (the diagonal, if K_S_XX..K_S_ZZ are 0..2 as in get_stress).
+    total_stress12th[0] = ONE12TH*( total_stress12th[ 0 ] + bulkq );
+    total_stress12th[1] = ONE12TH*( total_stress12th[ 1 ] + bulkq );
+    total_stress12th[2] = ONE12TH*( total_stress12th[ 2 ] + bulkq );
+    total_stress12th[3] = ONE12TH*( total_stress12th[ 3 ] );
+    total_stress12th[4] = ONE12TH*( total_stress12th[ 4 ] );
+    total_stress12th[5] = ONE12TH*( total_stress12th[ 5 ] );
+
+    comp_force(ielem, vx, vy, vz,
+                      grad_x, grad_y, grad_z, total_stress12th);
+  }
+};
+
+//----------------------------------------------------------------------------
+
+template<typename Scalar, class DeviceType >
+struct nodal_step
+{
+  // Per-node functor: sums the element forces attached to a node, derives
+  // the node's acceleration, and advances its velocity and displacement
+  // from current_state to next_state.  All field members below are copied
+  // from the Fields object handed to the constructor.
+
+  typedef DeviceType                                    execution_space ;
+  typedef typename execution_space::size_type           size_type ;
+  typedef Explicit::Fields< Scalar , execution_space >  Fields ;
+
+  // Field handles.
+  const typename Fields::scalar_type            dt ;
+  const typename Fields::scalar_type            prev_dt ;
+  const typename Fields::node_elem_ids_type     node_elem_connectivity ;
+  const typename Fields::node_coords_type       model_coords ;
+  const typename Fields::array_type             nodal_mass ;
+  const typename Fields::geom_state_array_type  displacement ;
+  const typename Fields::geom_state_array_type  velocity ;
+  const typename Fields::geom_array_type        acceleration ;
+  const typename Fields::geom_array_type        internal_force ;
+  const typename Fields::elem_node_geom_type    element_force ;
+
+  // Run parameters.
+  const Scalar   x_bc;           // first model coordinate at which nodes are held fixed
+  const int      current_state;  // state slot that is read
+  const int      next_state;     // state slot that is written
+
+  // Captures the field handles and the run parameters.
+  nodal_step( const Fields  & mesh_fields ,
+              const Scalar    arg_x_bc,
+              const int       arg_current_state,
+              const int       arg_next_state)
+    : dt( mesh_fields.dt ),
+      prev_dt( mesh_fields.prev_dt ),
+      node_elem_connectivity( mesh_fields.node_elem_connectivity ),
+      model_coords( mesh_fields.model_coords ),
+      nodal_mass( mesh_fields.nodal_mass ),
+      displacement( mesh_fields.displacement ),
+      velocity( mesh_fields.velocity ),
+      acceleration( mesh_fields.acceleration ),
+      internal_force( mesh_fields.internal_force ),
+      element_force( mesh_fields.element_force ),
+      x_bc( arg_x_bc ),
+      current_state( arg_current_state ),
+      next_state( arg_next_state )
+  {}
+
+  // Runs the functor over mesh_fields.num_nodes_owned work items, so only
+  // the owned nodes are updated.
+  static void apply( const Fields  & mesh_fields ,
+                     const Scalar    arg_x_bc ,
+                     const int       arg_current_state ,
+                     const int       arg_next_state )
+  {
+    nodal_step stepper( mesh_fields, arg_x_bc, arg_current_state, arg_next_state );
+
+    Kokkos::parallel_for( mesh_fields.num_nodes_owned , stepper );
+  }
+
+  // Work item for node inode: gather the force, compute the acceleration,
+  // then integrate velocity and displacement.
+  KOKKOS_INLINE_FUNCTION
+  void operator()(int inode) const
+  {
+    // Entries [row_begin,row_end) of the CSR-like node-to-element graph
+    // belong to this node.
+    const int row_begin = node_elem_connectivity.row_map[inode];
+    const int row_end   = node_elem_connectivity.row_map[inode+1];
+
+    // Gather-sum the internal force from every element attached to the
+    // node.  The accumulator is double regardless of Scalar.
+    double force_sum[3] = { 0.0 , 0.0 , 0.0 };
+
+    for ( int entry = row_begin ; entry < row_end ; ++entry ) {
+
+      // Column 0: element id.  Column 1: index of this node within that
+      // element, used to pick the matching element_force slot.
+      const int elem = node_elem_connectivity.entries( entry , 0 );
+      const int slot = node_elem_connectivity.entries( entry , 1 );
+
+      for ( int d = 0 ; d < 3 ; ++d ) {
+        force_sum[d] += element_force( elem , d , slot );
+      }
+    }
+
+    // Publish the gathered force.
+    for ( int d = 0 ; d < 3 ; ++d ) {
+      internal_force( inode , d ) = force_sum[d];
+    }
+
+    // Acceleration:
+
+    Scalar accel[3];
+
+    const Scalar tol = 1.0e-7;
+
+    if ( tol < fabs(model_coords(inode,0)-x_bc) ) {
+
+      // Not on the boundary: a = -F / m
+      const Scalar m = nodal_mass( inode );
+
+      for ( int d = 0 ; d < 3 ; ++d ) {
+        acceleration(inode,d) = accel[d] = -force_sum[d] / m ;
+      }
+    }
+    else {
+
+      // On the boundary: enforce the fixed boundary condition.
+      for ( int d = 0 ; d < 3 ; ++d ) {
+        acceleration(inode,d) = accel[d] = 0;
+      }
+    }
+
+    // Central difference time integration:
+    //   velocity uses the mean of the current and previous time steps,
+    //   displacement uses the current time step and the new velocity.
+
+    const Scalar dt_disp = *dt ;
+    const Scalar dt_vel = ( *dt + *prev_dt ) / 2.0 ;
+
+    // New velocity, kept locally for the displacement update.
+    Scalar vel_next[3];
+
+    for ( int d = 0 ; d < 3 ; ++d ) {
+      velocity(inode,d,next_state) = vel_next[d] =
+        velocity(inode,d,current_state) + dt_vel * accel[d];
+    }
+
+    for ( int d = 0 ; d < 3 ; ++d ) {
+      displacement(inode,d,next_state) =
+        displacement(inode,d,current_state) + dt_disp * vel_next[d];
+    }
+  }
+};
+
+//----------------------------------------------------------------------------
+
+template< typename Scalar , class DeviceType >
+struct pack_state
+{
+  // Copies displacement and velocity of a contiguous range of nodes, at one
+  // state index, into a flat buffer: six values per node.
+
+  typedef DeviceType                                    execution_space ;
+  typedef typename execution_space::size_type           size_type ;
+  typedef Explicit::Fields< Scalar , execution_space >  Fields ;
+
+  typedef typename Fields::geom_state_array_type::value_type  value_type ;
+  typedef Kokkos::View< value_type* , execution_space >       buffer_type ;
+
+  static const unsigned value_count = 6 ;  // 3 displacement + 3 velocity values
+
+  const typename Fields::geom_state_array_type  displacement ;
+  const typename Fields::geom_state_array_type  velocity ;
+  const buffer_type  output ;
+  const size_type    inode_base ;  // first node of the packed range
+  const size_type    state_next ;  // state index that is packed
+
+  pack_state( const buffer_type & arg_output ,
+              const Fields      & mesh_fields ,
+              const size_type     arg_begin ,
+              const size_type     arg_state )
+    : displacement( mesh_fields.displacement ),
+      velocity( mesh_fields.velocity ),
+      output( arg_output ),
+      inode_base( arg_begin ),
+      state_next( arg_state )
+  {}
+
+  // Packs arg_count nodes, starting at node arg_begin, into arg_output.
+  static void apply( const buffer_type & arg_output ,
+                     const size_type     arg_begin ,
+                     const size_type     arg_count ,
+                     const Fields      & mesh_fields ,
+                     const size_type     arg_state )
+  {
+    pack_state packer( arg_output , mesh_fields , arg_begin , arg_state );
+    Kokkos::parallel_for( arg_count , packer );
+  }
+
+  KOKKOS_INLINE_FUNCTION
+  void operator()( const size_type i ) const
+  {
+    const size_type node   = inode_base + i ;
+    const size_type offset = i * value_count ;  // start of this node's six slots
+    for ( int d = 0 ; d < 3 ; ++d ) {
+      output[ offset + d ]     = displacement( node , d , state_next );
+    }
+    for ( int d = 0 ; d < 3 ; ++d ) {
+      output[ offset + 3 + d ] = velocity( node , d , state_next );
+    }
+  }
+};
+
+template< typename Scalar , class DeviceType >
+struct unpack_state
+{
+  // Copies six values per node from a flat buffer into the displacement and
+  // velocity fields of a contiguous range of nodes, at one state index.
+
+  typedef DeviceType                                    execution_space ;
+  typedef typename execution_space::size_type           size_type ;
+  typedef Explicit::Fields< Scalar , execution_space >  Fields ;
+
+  typedef typename Fields::geom_state_array_type::value_type  value_type ;
+  typedef Kokkos::View< value_type* , execution_space >       buffer_type ;
+
+  static const unsigned value_count = 6 ;  // 3 displacement + 3 velocity values
+
+  const typename Fields::geom_state_array_type  displacement ;
+  const typename Fields::geom_state_array_type  velocity ;
+  const buffer_type  input ;
+  const size_type    inode_base ;  // first node of the unpacked range
+  const size_type    state_next ;  // state index that is overwritten
+
+  unpack_state( const buffer_type & arg_input ,
+                const Fields      & mesh_fields ,
+                const size_type     arg_begin ,
+                const size_type     arg_state )
+    : displacement( mesh_fields.displacement ),
+      velocity( mesh_fields.velocity ),
+      input( arg_input ),
+      inode_base( arg_begin ),
+      state_next( arg_state )
+  {}
+
+  // Unpacks arg_count nodes, starting at node arg_begin, from arg_input.
+  static void apply( const Fields      & mesh_fields ,
+                     const size_type     arg_state ,
+                     const buffer_type & arg_input ,
+                     const size_type     arg_begin ,
+                     const size_type     arg_count )
+  {
+    unpack_state unpacker( arg_input , mesh_fields , arg_begin , arg_state );
+    Kokkos::parallel_for( arg_count , unpacker );
+  }
+
+  KOKKOS_INLINE_FUNCTION
+  void operator()( const size_type i ) const
+  {
+    const size_type node   = inode_base + i ;
+    const size_type offset = i * value_count ;  // start of this node's six slots
+    for ( int d = 0 ; d < 3 ; ++d ) {
+      displacement( node , d , state_next ) = input[ offset + d ] ;
+    }
+    for ( int d = 0 ; d < 3 ; ++d ) {
+      velocity( node , d , state_next ) = input[ offset + 3 + d ] ;
+    }
+  }
+};
+
+} /* namespace Explicit */
+
+#endif /* #ifndef KOKKOS_EXPLICITFUNCTORS_HPP */
+
+
diff --git a/lib/kokkos/example/multi_fem/FEMesh.hpp b/lib/kokkos/example/multi_fem/FEMesh.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..33468e2fbd7a3739f92bdb9473b0ae01b60b1311
--- /dev/null
+++ b/lib/kokkos/example/multi_fem/FEMesh.hpp
@@ -0,0 +1,86 @@
+/*
+//@HEADER
+// ************************************************************************
+// 
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+// 
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+// 
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact  H. Carter Edwards (hcedwar@sandia.gov)
+// 
+// ************************************************************************
+//@HEADER
+*/
+
+#ifndef KOKKOS_FEMESH_HPP
+#define KOKKOS_FEMESH_HPP
+
+#include <utility>
+#include <limits>
+#include <iostream>
+#include <sstream>
+#include <stdexcept>
+
+#include <Kokkos_Core.hpp>
+#include <Kokkos_StaticCrsGraph.hpp>
+
+#include <ParallelComm.hpp>
+#include <ParallelDataMap.hpp>
+
+namespace HybridFEM {
+
+//----------------------------------------------------------------------------
+/** \brief  Finite element mesh fixture for hybrid parallel performance tests.
+ */
+template< typename CoordScalarType , unsigned ElemNodeCount , class Device >
+struct FEMesh {
+  // Plain aggregate: bundles the mesh arrays and the parallel data map; it declares no member functions.
+  typedef typename Device::size_type size_type ;
+
+  static const size_type element_node_count = ElemNodeCount ; // nodes per element, taken from the template argument
+
+  typedef Kokkos::View< CoordScalarType*[3] , Device >       node_coords_type ;   // three coordinate values per node
+  typedef Kokkos::View< size_type*[ElemNodeCount], Device >  elem_node_ids_type ; // ElemNodeCount node ids per element
+  typedef Kokkos::StaticCrsGraph< size_type[2] ,  Device >   node_elem_ids_type ; // two size_type values per graph entry; presumably (element, local node) - TODO confirm
+
+  node_coords_type         node_coords ;
+  elem_node_ids_type       elem_node_ids ;
+  node_elem_ids_type       node_elem_ids ;
+  Kokkos::ParallelDataMap  parallel_data_map ;
+};
+
+//----------------------------------------------------------------------------
+
+} /* namespace HybridFEM */
+
+#endif /* #ifndef KOKKOS_FEMESH_HPP */
+
diff --git a/lib/kokkos/example/multi_fem/HexElement.hpp b/lib/kokkos/example/multi_fem/HexElement.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..7dec087cbdad3334d2dc264ca8560cc5e3b5ea3b
--- /dev/null
+++ b/lib/kokkos/example/multi_fem/HexElement.hpp
@@ -0,0 +1,268 @@
+/*
+//@HEADER
+// ************************************************************************
+// 
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+// 
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+// 
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact  H. Carter Edwards (hcedwar@sandia.gov)
+// 
+// ************************************************************************
+//@HEADER
+*/
+
+#ifndef ELEMENTHEX_HPP
+#define ELEMENTHEX_HPP
+
+namespace HybridFEM {
+
+template< unsigned NodeCount >
+class HexElement_TensorData ;
+
+template< unsigned NodeCount , class Device >
+class HexElement_TensorEval ;
+
+//----------------------------------------------------------------------------
+/** \brief  Evaluate Hex element on interval [-1,1]^3 */
+template<>
+class HexElement_TensorData< 8 > {
+public:
+  // 8-node hex: tensor product of 2 linear functions and 2 integration points per axis.
+  static const unsigned element_node_count    = 8 ;
+  static const unsigned spatial_dimension     = 3 ;
+  static const unsigned integration_count_1d  = 2 ;
+  static const unsigned function_count_1d     = 2 ;
+  // 1D tables, indexed [integration point][function]: the order used to fill and read them.
+  float values_1d [ integration_count_1d ][ function_count_1d ];
+  float derivs_1d [ integration_count_1d ][ function_count_1d ];
+  float weights_1d[ integration_count_1d ];
+  // Per-node 1D indices along x, y, z; the 4th byte is padding and is set to zero.
+  unsigned char eval_map[ element_node_count ][4] ;
+  // Value at x of 1D linear function jf on [-1,1]; 0 for any jf other than 0 or 1.
+  static float eval_value_1d( const unsigned jf , const float x )
+  {
+    return 0 == jf ? 0.5 * ( 1.0 - x ) : (
+           1 == jf ? 0.5 * ( 1.0 + x ) : 0 );
+  }
+  // Derivative of 1D linear function jf (independent of x); 0 for any other jf.
+  static float eval_deriv_1d( const unsigned jf , const float )
+  {
+    return 0 == jf ? -0.5 : (
+           1 == jf ?  0.5 : 0 );
+  }
+  // Fills the node-index map, the integration weights and the 1D value/derivative tables.
+  HexElement_TensorData()
+  {
+    const unsigned char tmp_map[ element_node_count ][ spatial_dimension ] =
+      { { 0 , 0 , 0 },
+        { 1 , 0 , 0 },
+        { 1 , 1 , 0 },
+        { 0 , 1 , 0 },
+        { 0 , 0 , 1 },
+        { 1 , 0 , 1 },
+        { 1 , 1 , 1 },
+        { 0 , 1 , 1 } };
+    // Two-point rule on [-1,1]: unit weights at -/+ 0.577350269 (approximately 1/sqrt(3)).
+    weights_1d[0] = 1 ;
+    weights_1d[1] = 1 ;
+
+    const float points_1d[ integration_count_1d ] =
+      { -0.577350269 , 0.577350269 };
+
+    for ( unsigned i = 0 ; i < element_node_count ; ++i ) {
+      eval_map[i][0] = tmp_map[i][0];
+      eval_map[i][1] = tmp_map[i][1];
+      eval_map[i][2] = tmp_map[i][2];
+      eval_map[i][3] = 0 ; // padding byte: zeroed rather than left uninitialized
+    }
+    for ( unsigned xp = 0 ; xp < integration_count_1d ; ++xp ) {
+    for ( unsigned xf = 0 ; xf < function_count_1d ; ++xf ) {
+      values_1d[xp][xf] = eval_value_1d( xf , points_1d[xp] );
+      derivs_1d[xp][xf] = eval_deriv_1d( xf , points_1d[xp] );
+    }}
+  }
+};
+
+//----------------------------------------------------------------------------
+
+template<>
+class HexElement_TensorData< 27 > {
+public:
+  // 27-node hex: tensor product of 3 quadratic functions and 3 integration points per axis.
+  static const unsigned element_node_count    = 27 ;
+  static const unsigned spatial_dimension     = 3 ;
+  static const unsigned integration_count_1d  = 3 ;
+  static const unsigned function_count_1d     = 3 ;
+  // 1D tables, indexed [integration point][function]: the order used to fill and read them.
+  float values_1d [ integration_count_1d ][ function_count_1d ];
+  float derivs_1d [ integration_count_1d ][ function_count_1d ];
+  float weights_1d[ integration_count_1d ];
+  // Per-node 1D indices along x, y, z; the 4th byte is padding and is set to zero.
+  unsigned char eval_map[ element_node_count ][4] ;
+
+  // Storage declared by the data members above:
+  //   sizeof(float) * 9       (values_1d)
+  //   sizeof(float) * 9       (derivs_1d)
+  //   sizeof(float) * 3       (weights_1d)
+  //   sizeof(char)  * 27 * 4  (eval_map, including the padding column)
+  // Value at p of 1D quadratic function jf on [-1,1]; 0 for any jf other than 0, 1, 2.
+  static float eval_value_1d( const unsigned jf , const float p )
+  {
+    return 0 == jf ? 0.5 * p * ( p - 1 ) : (
+           1 == jf ? 1.0 - p * p : (
+           2 == jf ? 0.5 * p * ( p + 1 ) : 0 ));
+  }
+  // Derivative at p of 1D quadratic function jf; 0 for any jf other than 0, 1, 2.
+  static float eval_deriv_1d( const unsigned jf , const float p )
+  {
+    return 0 == jf ? p - 0.5 : (
+           1 == jf ? -2.0 * p : (
+           2 == jf ? p + 0.5 : 0 ));
+  }
+  // Fills the node-index map, the integration weights and the 1D value/derivative tables.
+  HexElement_TensorData()
+  {
+    const unsigned char tmp_map[ element_node_count ][ spatial_dimension ] =
+      { { 0 , 0 , 0 },
+        { 2 , 0 , 0 },
+        { 2 , 2 , 0 },
+        { 0 , 2 , 0 },
+        { 0 , 0 , 2 },
+        { 2 , 0 , 2 },
+        { 2 , 2 , 2 },
+        { 0 , 2 , 2 },
+        { 1 , 0 , 0 },
+        { 2 , 1 , 0 },
+        { 1 , 2 , 0 },
+        { 0 , 1 , 0 },
+        { 0 , 0 , 1 },
+        { 2 , 0 , 1 },
+        { 2 , 2 , 1 },
+        { 0 , 2 , 1 },
+        { 1 , 0 , 2 },
+        { 2 , 1 , 2 },
+        { 1 , 2 , 2 },
+        { 0 , 1 , 2 },
+        { 1 , 1 , 1 },
+        { 1 , 1 , 0 },
+        { 1 , 1 , 2 },
+        { 0 , 1 , 1 },
+        { 2 , 1 , 1 },
+        { 1 , 0 , 1 },
+        { 1 , 2 , 1 } };
+
+    // Three-point rule on the interval [-1,1]: weights 5/9, 8/9, 5/9 at -sqrt(3/5), 0, +sqrt(3/5).
+
+    weights_1d[0] = 0.555555556 ;
+    weights_1d[1] = 0.888888889 ;
+    weights_1d[2] = 0.555555556 ;
+
+    const float points_1d[3] = { -0.774596669 ,
+                                  0.000000000 ,
+                                  0.774596669 };
+
+    for ( unsigned i = 0 ; i < element_node_count ; ++i ) {
+      eval_map[i][0] = tmp_map[i][0];
+      eval_map[i][1] = tmp_map[i][1];
+      eval_map[i][2] = tmp_map[i][2];
+      eval_map[i][3] = 0 ; // padding byte: zeroed rather than left uninitialized
+    }
+    for ( unsigned xp = 0 ; xp < integration_count_1d ; ++xp ) {
+    for ( unsigned xf = 0 ; xf < function_count_1d ; ++xf ) {
+      values_1d[xp][xf] = eval_value_1d( xf , points_1d[xp] );
+      derivs_1d[xp][xf] = eval_deriv_1d( xf , points_1d[xp] );
+    }}
+  }
+};
+
+//----------------------------------------------------------------------------
+
+template< unsigned NodeCount >
+class HexElement_Data {
+public:
+  static const unsigned spatial_dimension   = 3 ;
+  static const unsigned element_node_count  = NodeCount ;
+  static const unsigned integration_count   = NodeCount ;
+  static const unsigned function_count      = NodeCount ;
+
+  float weights[   integration_count ] ;
+  float values[    integration_count ][ function_count ];
+  float gradients[ integration_count ][ spatial_dimension ][ function_count ];
+
+  /** \brief  Expand the 1D tables of HexElement_TensorData into 3D tables.
+   *
+   *  Each 3D weight or value is the product of three 1D entries, one per
+   *  axis.  Gradient component d uses the 1D derivative along axis d and
+   *  the 1D values along the other two axes.
+   */
+  HexElement_Data()
+  {
+    const HexElement_TensorData< NodeCount > tensor ;
+
+    for ( unsigned point = 0 ; point < integration_count ; ++point ) {
+
+      // 1D integration-point indices of this 3D point along x, y, z.
+      const unsigned char * const at = tensor.eval_map[ point ] ;
+
+      weights[point] = tensor.weights_1d[ at[0] ] *
+                       tensor.weights_1d[ at[1] ] *
+                       tensor.weights_1d[ at[2] ] ;
+
+      for ( unsigned func = 0 ; func < function_count ; ++func ) {
+
+        // 1D function indices of this 3D basis function along x, y, z.
+        const unsigned char * const fn = tensor.eval_map[ func ] ;
+
+        const float vx = tensor.values_1d[ at[0] ][ fn[0] ] ;
+        const float vy = tensor.values_1d[ at[1] ][ fn[1] ] ;
+        const float vz = tensor.values_1d[ at[2] ][ fn[2] ] ;
+        const float dx = tensor.derivs_1d[ at[0] ][ fn[0] ] ;
+        const float dy = tensor.derivs_1d[ at[1] ][ fn[1] ] ;
+        const float dz = tensor.derivs_1d[ at[2] ][ fn[2] ] ;
+
+        values[point][func]       = vx * vy * vz ;
+        gradients[point][0][func] = dx * vy * vz ;
+        gradients[point][1][func] = vx * dy * vz ;
+        gradients[point][2][func] = vx * vy * dz ;
+      }
+    }
+  }
+};
+
+//----------------------------------------------------------------------------
+
+} /* namespace HybridFEM */
+
+#endif /* #ifndef ELEMENTHEX_HPP */
+
+
diff --git a/lib/kokkos/example/multi_fem/HexExplicitFunctions.hpp b/lib/kokkos/example/multi_fem/HexExplicitFunctions.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..3c4ca582245f687c360eb8812d899ff33a6a1644
--- /dev/null
+++ b/lib/kokkos/example/multi_fem/HexExplicitFunctions.hpp
@@ -0,0 +1,443 @@
+/*
+//@HEADER
+// ************************************************************************
+// 
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+// 
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+// 
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact  H. Carter Edwards (hcedwar@sandia.gov)
+// 
+// ************************************************************************
+//@HEADER
+*/
+
+#ifndef KOKKOS_HEXEXPLICITFUNCTIONS_HPP
+#define KOKKOS_HEXEXPLICITFUNCTIONS_HPP
+
+#include <math.h>
+
+namespace Explicit {
+
+struct Hex8Functions
+{
+  static const unsigned SpatialDim    = 3 ;
+  static const unsigned ElemNodeCount = 8 ;
+
+  // Indices for full 3x3 tensor:
+
+  static const unsigned K_F_XX = 0 ;
+  static const unsigned K_F_YY = 1 ;
+  static const unsigned K_F_ZZ = 2 ;
+  static const unsigned K_F_XY = 3 ;
+  static const unsigned K_F_YZ = 4 ;
+  static const unsigned K_F_ZX = 5 ;
+  static const unsigned K_F_YX = 6 ;
+  static const unsigned K_F_ZY = 7 ;
+  static const unsigned K_F_XZ = 8 ;
+  static const unsigned K_F_SIZE = 9 ;
+
+  //  Indexes into a 3 by 3 symmetric tensor stored as a length 6 vector
+
+  static const unsigned K_S_XX = 0 ;
+  static const unsigned K_S_YY = 1 ;
+  static const unsigned K_S_ZZ = 2 ;
+  static const unsigned K_S_XY = 3 ;
+  static const unsigned K_S_YZ = 4 ;
+  static const unsigned K_S_ZX = 5 ;
+  static const unsigned K_S_YX = 3 ;
+  static const unsigned K_S_ZY = 4 ;
+  static const unsigned K_S_XZ = 5 ;
+  static const unsigned K_S_SIZE = 6 ;
+
+  //  Indexes into a 3 by 3 skew symmetric tensor stored as a length 3 vector
+
+  static const unsigned K_V_XY = 0 ;
+  static const unsigned K_V_YZ = 1 ;
+  static const unsigned K_V_ZX = 2 ;
+  static const unsigned K_V_SIZE = 3 ;
+
+  //--------------------------------------------------------------------------
+
+  template< typename ScalarA , typename ScalarB >  // Dot product of a[0..7] and b[0..7].
+  KOKKOS_INLINE_FUNCTION static
+  double dot8( const ScalarA * const a , const ScalarB * const b )
+  { return a[0] * b[0] + a[1] * b[1] + a[2] * b[2] + a[3] * b[3] +    // summed left to right in the type of
+           a[4] * b[4] + a[5] * b[5] + a[6] * b[6] + a[7] * b[7] ; }  // ScalarA * ScalarB, returned as double
+
+  //--------------------------------------------------------------------------
+
+  template< class ScalarPrecise ,
+            class ScalarCompact >
+  KOKKOS_INLINE_FUNCTION static
+  void grad( const ScalarPrecise x[] ,
+             const ScalarPrecise z[] ,
+                   ScalarCompact grad_y[] )
+  {
+    // Rij is x[i-1] - x[j-1] (1-based node numbers), converted to ScalarCompact.
+    const ScalarCompact R42=(x[3] - x[1]);
+    const ScalarCompact R52=(x[4] - x[1]);
+    const ScalarCompact R54=(x[4] - x[3]);
+
+    const ScalarCompact R63=(x[5] - x[2]);
+    const ScalarCompact R83=(x[7] - x[2]);
+    const ScalarCompact R86=(x[7] - x[5]);
+
+    const ScalarCompact R31=(x[2] - x[0]);
+    const ScalarCompact R61=(x[5] - x[0]);
+    const ScalarCompact R74=(x[6] - x[3]);
+
+    const ScalarCompact R72=(x[6] - x[1]);
+    const ScalarCompact R75=(x[6] - x[4]);
+    const ScalarCompact R81=(x[7] - x[0]);
+
+    const ScalarCompact t1=(R63 + R54);
+    const ScalarCompact t2=(R61 + R74);
+    const ScalarCompact t3=(R72 + R81);
+
+    const ScalarCompact t4 =(R86 + R42);
+    const ScalarCompact t5 =(R83 + R52);
+    const ScalarCompact t6 =(R75 + R31);
+
+    //  Calculate Y gradient from X and Z data: writes grad_y[0..7], reads x[0..7] and z[0..7].
+    grad_y[0] = (z[1] *  t1) - (z[2] * R42) - (z[3] *  t5)  + (z[4] *  t4) + (z[5] * R52) - (z[7] * R54);
+    grad_y[1] = (z[2] *  t2) + (z[3] * R31) - (z[0] *  t1)  - (z[5] *  t6) + (z[6] * R63) - (z[4] * R61);
+    grad_y[2] = (z[3] *  t3) + (z[0] * R42) - (z[1] *  t2)  - (z[6] *  t4) + (z[7] * R74) - (z[5] * R72);
+    grad_y[3] = (z[0] *  t5) - (z[1] * R31) - (z[2] *  t3)  + (z[7] *  t6) + (z[4] * R81) - (z[6] * R83);
+    grad_y[4] = (z[5] *  t3) + (z[6] * R86) - (z[7] *  t2)  - (z[0] *  t4) - (z[3] * R81) + (z[1] * R61);
+    grad_y[5] = (z[6] *  t5) - (z[4] *  t3)  - (z[7] * R75) + (z[1] *  t6) - (z[0] * R52) + (z[2] * R72);
+    grad_y[6] = (z[7] *  t1) - (z[5] *  t5)  - (z[4] * R86) + (z[2] *  t4) - (z[1] * R63) + (z[3] * R83);
+    grad_y[7] = (z[4] *  t2) - (z[6] *  t1)  + (z[5] * R75) - (z[3] *  t6) - (z[2] * R74) + (z[0] * R54);
+  }
+
+  template< class ScalarPrecise ,
+            class ScalarCompact >
+  static KOKKOS_INLINE_FUNCTION
+  void grad( const ScalarPrecise x[] ,
+             const ScalarPrecise y[] ,
+             const ScalarPrecise z[] ,
+                   ScalarCompact grad_x[] ,
+                   ScalarCompact grad_y[] ,
+                   ScalarCompact grad_z[] )
+  { // Each gradient component is built by the two-coordinate overload from the other two coordinates.
+    grad( x , z , grad_y );
+    grad( z , y , grad_x );
+    grad( y , x , grad_z );
+  }
+
+  //--------------------------------------------------------------------------
+
+  template< class ScalarPrecise ,
+            class ScalarCompact >
+  KOKKOS_INLINE_FUNCTION static
+  void polar_decomp( const float dt ,
+                     const ScalarCompact v_gr[] /* IN: full tensor, K_F_* indices */ ,
+                           ScalarPrecise stretch[] /* INOUT: symmetric tensor, K_S_* indices */ ,
+                           ScalarCompact str_ten[] /* OUT: symmetric tensor, K_S_* indices */ ,
+                           ScalarCompact rot[]     /* INOUT: full tensor, K_F_* indices; read, then overwritten */ )
+  {
+    const float dt_half = 0.5 * dt;
+
+    ScalarCompact vort[ K_V_SIZE ];  // Vorticity
+
+    //  Symmetric part of v_gr
+    str_ten[K_S_XX] = v_gr[K_F_XX];
+    str_ten[K_S_YY] = v_gr[K_F_YY];
+    str_ten[K_S_ZZ] = v_gr[K_F_ZZ];
+    str_ten[K_S_XY] = 0.5 * ( v_gr[K_F_XY] + v_gr[K_F_YX] );
+    str_ten[K_S_YZ] = 0.5 * ( v_gr[K_F_YZ] + v_gr[K_F_ZY] );
+    str_ten[K_S_ZX] = 0.5 * ( v_gr[K_F_ZX] + v_gr[K_F_XZ] );
+
+    //  Skew symmetric part of v_gr
+    vort[K_V_XY] = 0.5 * ( v_gr[K_F_XY] - v_gr[K_F_YX] );
+    vort[K_V_YZ] = 0.5 * ( v_gr[K_F_YZ] - v_gr[K_F_ZY] );
+    vort[K_V_ZX] = 0.5 * ( v_gr[K_F_ZX] - v_gr[K_F_XZ] );
+
+    //   Calculate the rates of rotation via Gauss elimination; z1, z2, z3 start as the right-hand side.
+
+    ScalarCompact z1 = str_ten[K_S_XY] * stretch[K_S_ZX] -
+                       str_ten[K_S_ZX] * stretch[K_S_XY] +
+                       str_ten[K_S_YY] * stretch[K_S_YZ] -
+                       str_ten[K_S_YZ] * stretch[K_S_YY] +
+                       str_ten[K_S_YZ] * stretch[K_S_ZZ] -
+                       str_ten[K_S_ZZ] * stretch[K_S_YZ];
+
+    ScalarCompact z2 = str_ten[K_S_ZX] * stretch[K_S_XX] -
+                       str_ten[K_S_XX] * stretch[K_S_ZX] +
+                       str_ten[K_S_YZ] * stretch[K_S_XY] -
+                       str_ten[K_S_XY] * stretch[K_S_YZ] +
+                       str_ten[K_S_ZZ] * stretch[K_S_ZX] -
+                       str_ten[K_S_ZX] * stretch[K_S_ZZ];
+
+    ScalarCompact z3 = str_ten[K_S_XX] * stretch[K_S_XY] -
+                       str_ten[K_S_XY] * stretch[K_S_XX] +
+                       str_ten[K_S_XY] * stretch[K_S_YY] -
+                       str_ten[K_S_YY] * stretch[K_S_XY] +
+                       str_ten[K_S_ZX] * stretch[K_S_YZ] -
+                       str_ten[K_S_YZ] * stretch[K_S_ZX];
+
+    {
+      //   forward elimination (no guard against a zero pivot: the reciprocals are taken as is)
+
+      const ScalarCompact a1inv  = 1.0 / (stretch[K_S_YY] + stretch[K_S_ZZ]);
+      const ScalarCompact a4BYa1 = -1 * stretch[K_S_XY] * a1inv;
+      const ScalarCompact a2inv  = 1.0 / (stretch[K_S_ZZ] + stretch[K_S_XX] + stretch[K_S_XY] * a4BYa1);
+
+     const ScalarCompact a5 =  -stretch[K_S_YZ] + stretch[K_S_ZX] * a4BYa1;
+
+      z2 -= z1 * a4BYa1;
+      const ScalarCompact a6BYa1 = -1 * stretch[K_S_ZX] * a1inv;
+      const ScalarCompact a5BYa2 = a5 * a2inv;
+      z3 -= z1 * a6BYa1 - z2 * a5BYa2;
+
+      //   backward substitution - z3 is solved first, then z2, then z1
+
+      z3 /= (stretch[K_S_XX] + stretch[K_S_YY] + stretch[K_S_ZX] * a6BYa1 + a5 * a5BYa2);
+      z2 = (z2 - a5 * z3) * a2inv;
+      z1 = (z1*a1inv - a6BYa1 * z3 -a4BYa1 * z2);
+    }
+
+    //   calculate rotation rates - recall that spin_rate is an antisymmetric (skew symmetric) tensor,
+    //   so compute spin rate vector as dual of spin rate tensor,
+    //   i.e   w_i = e_ijk * spin_rate_jk
+
+    z1 += vort[K_V_YZ];
+    z2 += vort[K_V_ZX];
+    z3 += vort[K_V_XY];
+
+    {
+      //   update rotation tensor:
+      //  1) premultiply old rotation tensor to get right-hand side.
+
+      ScalarCompact r_XX = rot[K_F_XX] + dt_half*( z3 * rot[K_F_YX] - z2 * rot[K_F_ZX] );
+      ScalarCompact r_YX = rot[K_F_YX] + dt_half*( z1 * rot[K_F_ZX] - z3 * rot[K_F_XX] );
+      ScalarCompact r_ZX = rot[K_F_ZX] + dt_half*( z2 * rot[K_F_XX] - z1 * rot[K_F_YX] );
+      ScalarCompact r_XY = rot[K_F_XY] + dt_half*( z3 * rot[K_F_YY] - z2 * rot[K_F_ZY] );
+      ScalarCompact r_YY = rot[K_F_YY] + dt_half*( z1 * rot[K_F_ZY] - z3 * rot[K_F_XY] );
+      ScalarCompact r_ZY = rot[K_F_ZY] + dt_half*( z2 * rot[K_F_XY] - z1 * rot[K_F_YY] );
+      ScalarCompact r_XZ = rot[K_F_XZ] + dt_half*( z3 * rot[K_F_YZ] - z2 * rot[K_F_ZZ] );
+      ScalarCompact r_YZ = rot[K_F_YZ] + dt_half*( z1 * rot[K_F_ZZ] - z3 * rot[K_F_XZ] );
+      ScalarCompact r_ZZ = rot[K_F_ZZ] + dt_half*( z2 * rot[K_F_XZ] - z1 * rot[K_F_YZ] );
+
+
+      //  2) solve for new rotation tensor via Gauss elimination.
+      //   forward elimination -
+
+      const ScalarCompact a12 = - dt_half * z3;
+      const ScalarCompact a13 =   dt_half * z2;
+            ScalarCompact b32 = - dt_half * z1;
+      const ScalarCompact a22inv = 1.0 / (1.0 + a12 * a12);
+
+      const ScalarCompact a13a12 = a13*a12;
+      const ScalarCompact a23 = b32 + a13a12;
+
+      r_YX += r_XX * a12;
+      r_YY += r_XY * a12;
+      r_YZ += r_XZ * a12;
+
+      b32 = (b32 - a13a12) * a22inv;
+
+      r_ZX += r_XX * a13 + r_YX * b32;
+      r_ZY += r_XY * a13 + r_YY * b32;
+      r_ZZ += r_XZ * a13 + r_YZ * b32;
+
+      //   backward substitution - the Z row of rot is written first, then Y, then X
+
+      const ScalarCompact a33inv = 1.0 / (1.0 + a13 * a13 + a23 * b32);
+
+      rot[K_F_ZX] = r_ZX * a33inv;
+      rot[K_F_ZY] = r_ZY * a33inv;
+      rot[K_F_ZZ] = r_ZZ * a33inv;
+      rot[K_F_YX] = ( r_YX - rot[K_F_ZX] * a23 ) * a22inv;
+      rot[K_F_YY] = ( r_YY - rot[K_F_ZY] * a23 ) * a22inv;
+      rot[K_F_YZ] = ( r_YZ - rot[K_F_ZZ] * a23 ) * a22inv;
+      rot[K_F_XX] = r_XX - rot[K_F_ZX] * a13 - rot[K_F_YX] * a12;
+      rot[K_F_XY] = r_XY - rot[K_F_ZY] * a13 - rot[K_F_YY] * a12;
+      rot[K_F_XZ] = r_XZ - rot[K_F_ZZ] * a13 - rot[K_F_YZ] * a12;
+    }
+
+    //   update stretch tensor in the new configuration -
+
+    const ScalarCompact a1 = str_ten[K_S_XY] + vort[K_V_XY];
+    const ScalarCompact a2 = str_ten[K_S_YZ] + vort[K_V_YZ];
+    const ScalarCompact a3 = str_ten[K_S_ZX] + vort[K_V_ZX];
+    const ScalarCompact b1 = str_ten[K_S_ZX] - vort[K_V_ZX];
+    const ScalarCompact b2 = str_ten[K_S_XY] - vort[K_V_XY];
+    const ScalarCompact b3 = str_ten[K_S_YZ] - vort[K_V_YZ];
+    // Copy the old stretch so that every update below uses the pre-update values.
+    const ScalarCompact s_XX = stretch[K_S_XX];
+    const ScalarCompact s_YY = stretch[K_S_YY];
+    const ScalarCompact s_ZZ = stretch[K_S_ZZ];
+    const ScalarCompact s_XY = stretch[K_S_XY];
+    const ScalarCompact s_YZ = stretch[K_S_YZ];
+    const ScalarCompact s_ZX = stretch[K_S_ZX];
+
+    stretch[K_S_XX] += dt * (str_ten[K_S_XX] * s_XX + ( a1 + z3 ) * s_XY + ( b1 - z2 ) * s_ZX);
+    stretch[K_S_YY] += dt * (str_ten[K_S_YY] * s_YY + ( a2 + z1 ) * s_YZ + ( b2 - z3 ) * s_XY);
+    stretch[K_S_ZZ] += dt * (str_ten[K_S_ZZ] * s_ZZ + ( a3 + z2 ) * s_ZX + ( b3 - z1 ) * s_YZ);
+    stretch[K_S_XY] += dt * (str_ten[K_S_XX] * s_XY + ( a1 )      * s_YY + ( b1      ) * s_YZ - z3 * s_XX + z1 * s_ZX);
+    stretch[K_S_YZ] += dt * (str_ten[K_S_YY] * s_YZ + ( a2 )      * s_ZZ + ( b2      ) * s_ZX - z1 * s_YY + z2 * s_XY);
+    stretch[K_S_ZX] += dt * (str_ten[K_S_ZZ] * s_ZX + ( a3 )      * s_XX + ( b3      ) * s_XY - z2 * s_ZZ + z3 * s_YZ);
+  }
+
+  //--------------------------------------------------------------------------
+
+  template< typename ScalarCompact >
+  static KOKKOS_INLINE_FUNCTION
+  void rotate_tensor( const ScalarCompact str_ten[] ,  // in:  symmetric tensor, K_S_* indices
+                      const ScalarCompact rot[] ,      // in:  full tensor, K_F_* indices
+                            ScalarCompact rot_str[] )  // out: symmetric part of transpose(rot) * str_ten * rot
+  {
+    ScalarCompact t[9];
+    // t = str_ten * rot, stored one column at a time: t[0..2], t[3..5], t[6..8].
+    t[0] = str_ten[K_S_XX]*rot[K_F_XX] + str_ten[K_S_XY]*rot[K_F_YX] + str_ten[K_S_XZ]*rot[K_F_ZX];
+    t[1] = str_ten[K_S_YX]*rot[K_F_XX] + str_ten[K_S_YY]*rot[K_F_YX] + str_ten[K_S_YZ]*rot[K_F_ZX];
+    t[2] = str_ten[K_S_ZX]*rot[K_F_XX] + str_ten[K_S_ZY]*rot[K_F_YX] + str_ten[K_S_ZZ]*rot[K_F_ZX];
+
+    t[3] = str_ten[K_S_XX]*rot[K_F_XY] + str_ten[K_S_XY]*rot[K_F_YY] + str_ten[K_S_XZ]*rot[K_F_ZY];
+    t[4] = str_ten[K_S_YX]*rot[K_F_XY] + str_ten[K_S_YY]*rot[K_F_YY] + str_ten[K_S_YZ]*rot[K_F_ZY];
+    t[5] = str_ten[K_S_ZX]*rot[K_F_XY] + str_ten[K_S_ZY]*rot[K_F_YY] + str_ten[K_S_ZZ]*rot[K_F_ZY];
+
+    t[6] = str_ten[K_S_XX]*rot[K_F_XZ] + str_ten[K_S_XY]*rot[K_F_YZ] + str_ten[K_S_XZ]*rot[K_F_ZZ];
+    t[7] = str_ten[K_S_YX]*rot[K_F_XZ] + str_ten[K_S_YY]*rot[K_F_YZ] + str_ten[K_S_YZ]*rot[K_F_ZZ];
+    t[8] = str_ten[K_S_ZX]*rot[K_F_XZ] + str_ten[K_S_ZY]*rot[K_F_YZ] + str_ten[K_S_ZZ]*rot[K_F_ZZ];
+
+    // rot_str = transpose(rot) * t; only the six K_S_* entries are computed.
+    rot_str[ K_S_XX ] = rot[K_F_XX] * t[0] + rot[K_F_YX] * t[1] + rot[K_F_ZX] * t[2];
+    rot_str[ K_S_YY ] = rot[K_F_XY] * t[3] + rot[K_F_YY] * t[4] + rot[K_F_ZY] * t[5];
+    rot_str[ K_S_ZZ ] = rot[K_F_XZ] * t[6] + rot[K_F_YZ] * t[7] + rot[K_F_ZZ] * t[8];
+
+    rot_str[ K_S_XY ] = rot[K_F_XX] * t[3] + rot[K_F_YX] * t[4] + rot[K_F_ZX] * t[5];
+    rot_str[ K_S_YZ ] = rot[K_F_XY] * t[6] + rot[K_F_YY] * t[7] + rot[K_F_ZY] * t[8];
+    rot_str[ K_S_ZX ] = rot[K_F_XZ] * t[0] + rot[K_F_YZ] * t[1] + rot[K_F_ZZ] * t[2];
+  }
+
+  //--------------------------------------------------------------------------
+
+  template< class ScalarPrecise ,
+            class ScalarCompact >
+  static KOKKOS_INLINE_FUNCTION
+  void rotate_tensor_backward( const ScalarPrecise stress[] ,      // in:  symmetric tensor, K_S_* indices
+                               const ScalarCompact rot[] ,         // in:  full tensor, K_F_* indices
+                                     ScalarCompact rot_stress[] )  // out: symmetric part of rot * stress * transpose(rot)
+  {
+    ScalarCompact t[9] ;
+    // t = stress * transpose(rot), stored one column at a time: t[0..2], t[3..5], t[6..8].
+    t[0] = stress[K_S_XX]*rot[K_F_XX]+ stress[K_S_XY]*rot[K_F_XY]+ stress[K_S_XZ]*rot[K_F_XZ];
+    t[1] = stress[K_S_YX]*rot[K_F_XX]+ stress[K_S_YY]*rot[K_F_XY]+ stress[K_S_YZ]*rot[K_F_XZ];
+    t[2] = stress[K_S_ZX]*rot[K_F_XX]+ stress[K_S_ZY]*rot[K_F_XY]+ stress[K_S_ZZ]*rot[K_F_XZ];
+    t[3] = stress[K_S_XX]*rot[K_F_YX]+ stress[K_S_XY]*rot[K_F_YY]+ stress[K_S_XZ]*rot[K_F_YZ];
+    t[4] = stress[K_S_YX]*rot[K_F_YX]+ stress[K_S_YY]*rot[K_F_YY]+ stress[K_S_YZ]*rot[K_F_YZ];
+    t[5] = stress[K_S_ZX]*rot[K_F_YX]+ stress[K_S_ZY]*rot[K_F_YY]+ stress[K_S_ZZ]*rot[K_F_YZ];
+    t[6] = stress[K_S_XX]*rot[K_F_ZX]+ stress[K_S_XY]*rot[K_F_ZY]+ stress[K_S_XZ]*rot[K_F_ZZ];
+    t[7] = stress[K_S_YX]*rot[K_F_ZX]+ stress[K_S_YY]*rot[K_F_ZY]+ stress[K_S_YZ]*rot[K_F_ZZ];
+    t[8] = stress[K_S_ZX]*rot[K_F_ZX]+ stress[K_S_ZY]*rot[K_F_ZY]+ stress[K_S_ZZ]*rot[K_F_ZZ];
+    // rot_stress = rot * t; only the six K_S_* entries are computed.
+    rot_stress[ K_S_XX ] = rot[K_F_XX]*t[0] + rot[K_F_XY]*t[1] + rot[K_F_XZ]*t[2];
+    rot_stress[ K_S_YY ] = rot[K_F_YX]*t[3] + rot[K_F_YY]*t[4] + rot[K_F_YZ]*t[5];
+    rot_stress[ K_S_ZZ ] = rot[K_F_ZX]*t[6] + rot[K_F_ZY]*t[7] + rot[K_F_ZZ]*t[8];
+
+    rot_stress[ K_S_XY ] = rot[K_F_XX]*t[3] + rot[K_F_XY]*t[4] + rot[K_F_XZ]*t[5];
+    rot_stress[ K_S_YZ ] = rot[K_F_YX]*t[6] + rot[K_F_YY]*t[7] + rot[K_F_YZ]*t[8];
+    rot_stress[ K_S_ZX ] = rot[K_F_ZX]*t[0] + rot[K_F_ZY]*t[1] + rot[K_F_ZZ]*t[2];
+  }
+
+  //--------------------------------------------------------------------------
+
+  template< class ScalarPrecise ,
+            class ScalarCompact >
+  KOKKOS_INLINE_FUNCTION static
+  void update_stress( const float dt ,
+                      const float two_mu ,
+                      const float bulk_modulus ,
+                      const ScalarCompact rot_str[] ,
+                            ScalarPrecise stress[] )
+  {
+    const ScalarCompact trace       = rot_str[ K_S_XX ] + rot_str[ K_S_YY ] + rot_str[ K_S_ZZ ] ;
+    const ScalarCompact bulk_term   = trace * bulk_modulus ;
+    const ScalarCompact third_trace = trace / 3.0 ;
+    const float         dt_two_mu   = dt * two_mu ;  // common factor of the off-diagonal updates
+    // Diagonal entries add dt * ( two_mu * ( entry - trace / 3 ) + trace * bulk_modulus ).
+    stress[K_S_XX] += dt * ( two_mu * ( rot_str[K_S_XX] - third_trace ) + bulk_term );
+    stress[K_S_YY] += dt * ( two_mu * ( rot_str[K_S_YY] - third_trace ) + bulk_term );
+    stress[K_S_ZZ] += dt * ( two_mu * ( rot_str[K_S_ZZ] - third_trace ) + bulk_term );
+    stress[K_S_XY] += dt_two_mu * rot_str[K_S_XY];  // off-diagonal entries add
+    stress[K_S_YZ] += dt_two_mu * rot_str[K_S_YZ];  // dt * two_mu * entry
+    stress[K_S_ZX] += dt_two_mu * rot_str[K_S_ZX];
+  }
+
+  //--------------------------------------------------------------------------
+
+  template< class ScalarPrecise ,
+            class ScalarCompact >
+  static KOKKOS_INLINE_FUNCTION
+  void comp_force( const ScalarPrecise vx[] ,
+                   const ScalarPrecise vy[] ,
+                   const ScalarPrecise vz[] ,
+                   const ScalarCompact grad_x[] ,
+                   const ScalarCompact grad_y[] ,
+                   const ScalarCompact grad_z[] ,
+                   const ScalarCompact total_stress12th[] ,
+                         ScalarCompact force[][ SpatialDim ] ,
+                         ScalarCompact & energy )
+  {
+    ScalarPrecise energy_sum = 0 ;
+
+    for ( unsigned node = 0 ; node < ElemNodeCount ; ++node ) {
+      // Row of the output force array that belongs to this node.
+      ScalarCompact * const f = force[node] ;
+      // Each force component is one row of the symmetric stress times the nodal gradient.
+      f[0] = total_stress12th[K_S_XX] * grad_x[node] +
+             total_stress12th[K_S_XY] * grad_y[node] +
+             total_stress12th[K_S_XZ] * grad_z[node] ;
+
+      f[1] = total_stress12th[K_S_YX] * grad_x[node] +
+             total_stress12th[K_S_YY] * grad_y[node] +
+             total_stress12th[K_S_YZ] * grad_z[node] ;
+
+      f[2] = total_stress12th[K_S_ZX] * grad_x[node] +
+             total_stress12th[K_S_ZY] * grad_y[node] +
+             total_stress12th[K_S_ZZ] * grad_z[node] ;
+      // Accumulate the stored force dotted with the nodal velocity.
+      energy_sum += f[0] * vx[node] + f[1] * vy[node] + f[2] * vz[node] ;
+    }
+
+    energy = energy_sum ;
+  }
+
+  //--------------------------------------------------------------------------
+};
+
+} // namespace Explicit
+
+#endif /* #ifndef KOKKOS_HEXEXPLICITFUNCTIONS_HPP */
+
diff --git a/lib/kokkos/example/multi_fem/Implicit.hpp b/lib/kokkos/example/multi_fem/Implicit.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..53f602f11ae3fe6e0a61bf29ded6ad8464f653b0
--- /dev/null
+++ b/lib/kokkos/example/multi_fem/Implicit.hpp
@@ -0,0 +1,341 @@
+/*
+//@HEADER
+// ************************************************************************
+// 
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+// 
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+// 
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact  H. Carter Edwards (hcedwar@sandia.gov)
+// 
+// ************************************************************************
+//@HEADER
+*/
+
+#ifndef HYBRIDFEM_IMPLICIT_HPP
+#define HYBRIDFEM_IMPLICIT_HPP
+
+#include <utility>
+#include <iostream>
+#include <iomanip>
+
+#include <Kokkos_Core.hpp>
+#include <SparseLinearSystem.hpp>
+#include <SparseLinearSystemFill.hpp>
+#include <ImplicitFunctors.hpp>
+#include <FEMesh.hpp>
+
+//----------------------------------------------------------------------------
+//----------------------------------------------------------------------------
+
+namespace HybridFEM {
+namespace Implicit {
+
+// Timings, in seconds, of the phases of one implicit solve.
+struct PerformanceData {
+  double mesh_time ;
+  double graph_time ;
+  double elem_time ;
+  double matrix_gather_fill_time ;
+  double matrix_boundary_condition_time ;
+  double cg_iteration_time ;
+
+  PerformanceData() // every timing starts at zero
+    : mesh_time(0) , graph_time(0) , elem_time(0)
+    , matrix_gather_fill_time(0) , matrix_boundary_condition_time(0)
+    , cg_iteration_time(0) {}
+
+  // Field by field, keep whichever of this object's and rhs's timing is smaller.
+  void best( const PerformanceData & rhs )
+  {
+    if ( rhs.mesh_time  < mesh_time  ) mesh_time  = rhs.mesh_time ;
+    if ( rhs.graph_time < graph_time ) graph_time = rhs.graph_time ;
+    if ( rhs.elem_time  < elem_time  ) elem_time  = rhs.elem_time ;
+    if ( rhs.matrix_gather_fill_time < matrix_gather_fill_time )
+      matrix_gather_fill_time = rhs.matrix_gather_fill_time ;
+    if ( rhs.matrix_boundary_condition_time < matrix_boundary_condition_time )
+      matrix_boundary_condition_time = rhs.matrix_boundary_condition_time ;
+    if ( rhs.cg_iteration_time < cg_iteration_time ) cg_iteration_time = rhs.cg_iteration_time ;
+  }
+};
+
+//----------------------------------------------------------------------------
+
+// Assemble and solve one implicit finite-element linear system on 'mesh'.
+// Phases: build the sparse graph, compute element matrices and vectors,
+// gather-fill the global system, apply Dirichlet conditions, call cgsolve.
+// The first two int arguments are unnamed and unused; global_max_z serves
+// as both the upper boundary plane and the upper boundary value.  With
+// print_sample, owned nodes with x <= 0 and y <= 0 are printed.
+template< typename Scalar , class FixtureType >
+PerformanceData run( const typename FixtureType::FEMeshType & mesh ,
+                     const int , // global_max_x ,
+                     const int , // global_max_y ,
+                     const int global_max_z ,
+                     const bool print_sample )
+{
+  typedef Scalar                              scalar_type ;
+  typedef FixtureType                         fixture_type ;
+  typedef typename fixture_type::execution_space  execution_space;
+
+  typedef typename fixture_type::FEMeshType mesh_type ;
+  typedef typename fixture_type::coordinate_scalar_type coordinate_scalar_type ;
+
+  enum { ElementNodeCount = fixture_type::element_node_count }; // not used below
+  const comm::Machine machine = mesh.parallel_data_map.machine ;
+  const size_t element_count = mesh.elem_node_ids.dimension_0();
+
+  const size_t iteration_limit = 200 ;       // passed to cgsolve
+  const double residual_tolerance = 1e-14 ;  // passed to cgsolve
+
+  size_t iteration_count = 0 ; // passed to cgsolve (presumably an output)
+  double residual_norm = 0 ;   // passed to cgsolve (presumably an output)
+
+  PerformanceData perf_data ;
+
+  //------------------------------------
+  // Sparse linear system types:
+  typedef Kokkos::View< scalar_type* , execution_space >   vector_type ;
+  typedef Kokkos::CrsMatrix< scalar_type , execution_space >     matrix_type ;
+  typedef typename matrix_type::graph_type         matrix_graph_type ;
+  typedef typename matrix_type::coefficients_type  matrix_coefficients_type ;
+
+  typedef GraphFactory< matrix_graph_type , mesh_type > graph_factory ;
+
+  //------------------------------------
+  // Problem setup types:
+  typedef ElementComputation< scalar_type , scalar_type , execution_space > ElementFunctor ;
+  typedef DirichletBoundary< scalar_type , scalar_type , execution_space > BoundaryFunctor ;
+
+  typedef typename ElementFunctor::elem_matrices_type elem_matrices_type ;
+  typedef typename ElementFunctor::elem_vectors_type  elem_vectors_type ;
+
+  typedef GatherFill< matrix_type ,
+                      mesh_type ,
+                      elem_matrices_type ,
+                      elem_vectors_type > GatherFillFunctor ;
+
+  // Coefficients handed to ElementFunctor::apply, and the system storage:
+  const scalar_type elem_coeff_K = 2 ;
+  const scalar_type elem_load_Q  = 1 ;
+
+  matrix_type linsys_matrix ;
+  vector_type linsys_rhs ;
+  vector_type linsys_solution ;
+
+  typename graph_factory::element_map_type element_map ;
+
+  Kokkos::Timer wall_clock ; // read for graph_time below without a reset()
+
+  //------------------------------------
+  // Generate sparse matrix graph and element->graph map.
+
+  graph_factory::create( mesh , linsys_matrix.graph , element_map );
+
+  execution_space::fence();
+  perf_data.graph_time = comm::max( machine , wall_clock.seconds() );
+
+  //------------------------------------
+  // Allocate linear system coefficients and rhs:
+
+  const size_t local_owned_length =
+    linsys_matrix.graph.row_map.dimension_0() - 1 ; // row_map length minus one
+
+  linsys_matrix.coefficients =
+    matrix_coefficients_type( "coeff" , linsys_matrix.graph.entries.dimension_0() );
+
+  linsys_rhs      = vector_type( "rhs" , local_owned_length );
+  linsys_solution = vector_type( "solution" , local_owned_length );
+
+  //------------------------------------
+  // Fill linear system
+  {
+    elem_matrices_type elem_matrices ;
+    elem_vectors_type  elem_vectors ;
+
+    if ( element_count ) { // leave both views default-constructed when there are no elements
+      elem_matrices = elem_matrices_type( std::string("elem_matrices"), element_count );
+      elem_vectors  = elem_vectors_type ( std::string("elem_vectors"), element_count );
+    }
+
+    //------------------------------------
+    // Compute element matrices and vectors:
+
+    wall_clock.reset();
+
+    ElementFunctor::apply( mesh ,
+                           elem_matrices , elem_vectors ,
+                           elem_coeff_K , elem_load_Q );
+
+    execution_space::fence();
+    perf_data.elem_time = comm::max( machine , wall_clock.seconds() );
+
+    //------------------------------------
+    // Fill linear system coefficients:
+
+    wall_clock.reset();
+
+    GatherFillFunctor::apply( linsys_matrix , linsys_rhs ,
+               mesh , element_map , elem_matrices , elem_vectors );
+
+    execution_space::fence();
+    perf_data.matrix_gather_fill_time = comm::max( machine , wall_clock.seconds() );
+
+    // Apply boundary conditions:
+    //   z <= 0 gets value 0 ; z >= global_max_z gets value global_max_z.
+    wall_clock.reset();
+
+    BoundaryFunctor::apply( linsys_matrix , linsys_rhs , mesh ,
+                            0 , global_max_z , 0 , global_max_z );
+
+    execution_space::fence();
+    perf_data.matrix_boundary_condition_time = comm::max( machine , wall_clock.seconds() );
+  }
+
+  //------------------------------------
+  // Solve linear system
+
+  cgsolve( mesh.parallel_data_map ,
+           linsys_matrix , linsys_rhs , linsys_solution ,
+           iteration_count , residual_norm ,
+           perf_data.cg_iteration_time ,
+           iteration_limit , residual_tolerance );
+
+  //------------------------------------
+  // Optional sample output: copy coordinates and solution to host mirrors.
+  if ( print_sample ) {
+
+    typename mesh_type::node_coords_type::HostMirror coords_h =
+      Kokkos::create_mirror( mesh.node_coords );
+
+    typename vector_type::HostMirror X_h =
+      Kokkos::create_mirror( linsys_solution );
+
+    Kokkos::deep_copy( coords_h , mesh.node_coords );
+    Kokkos::deep_copy( X_h , linsys_solution );
+
+    for ( size_t i = 0 ; i < mesh.parallel_data_map.count_owned ; ++i ) {
+      const coordinate_scalar_type x = coords_h(i,0);
+      const coordinate_scalar_type y = coords_h(i,1);
+      const coordinate_scalar_type z = coords_h(i,2);
+
+      if ( x <= 0 && y <= 0 ) {
+        std::cout << "  node( " << x << " " << y << " " << z << " ) = "
+                  << X_h(i) << std::endl ;
+      }
+    }
+  }
+
+  return perf_data ;
+}
+
+//----------------------------------------------------------------------------
+
+// Benchmark driver.  For i = elem_count_beg , 2*i , ... while i < elem_count_end
+// a box mesh fixture is created with ix = max( 1 , (int) cbrt( i / 2 ) ) ,
+// iy = ix + 1 , iz = 2 * iy ; run() is called 'runs' times on it and rank 0
+// prints n = ix*iy*iz and the per-phase minimum timings in milliseconds.
+// Returns immediately if elem_count_beg , elem_count_end or runs is <= 0.
+template< typename Scalar , class Device >
+void driver( const char * const label ,
+             comm::Machine machine ,
+             const int gang_count ,
+             const int elem_count_beg ,
+             const int elem_count_end ,
+             const int runs )
+{
+  typedef Scalar              scalar_type ;
+  typedef Device              execution_space ;
+  typedef double              coordinate_scalar_type ;
+  typedef FixtureElementHex8  fixture_element_type ;
+
+  typedef BoxMeshFixture< coordinate_scalar_type ,
+                          execution_space ,
+                          fixture_element_type > fixture_type ;
+
+  typedef typename fixture_type::FEMeshType mesh_type ;
+
+  const size_t proc_count = comm::size( machine );
+  const size_t proc_rank  = comm::rank( machine );
+
+  // A negative elem_count_beg can never reach elem_count_end by doubling
+  // (it ends in signed overflow); a negative 'runs' would report untimed zeros.
+  if ( elem_count_beg <= 0 || elem_count_end <= 0 || runs <= 0 ) return ;
+
+  if ( proc_rank == 0 ) {
+    std::cout << std::endl ;
+    std::cout << "\"Kokkos::HybridFE::Implicit " << label << "\"" << std::endl;
+    std::cout << "\"Size\" ,  \"Graphing\" , \"Element\" , \"Fill\" ,   \"Boundary\" ,  \"CG-Iter\"" << std::endl
+              << "\"elems\" , \"millisec\" , \"millisec\" , \"millisec\" , \"millisec\" , \"millisec\"" << std::endl ;
+  }
+
+  for(int i = elem_count_beg ; i < elem_count_end ; i *= 2 )
+  {
+    const int ix = std::max( 1 , (int) cbrt( ((double) i) / 2.0 ) );
+    const int iy = ix + 1 ;
+    const int iz = 2 * iy ;
+    const int n  = ix * iy * iz ;
+
+    mesh_type mesh =
+      fixture_type::create( proc_count , proc_rank , gang_count ,
+                            ix , iy , iz );
+
+    mesh.parallel_data_map.machine = machine ;
+
+    PerformanceData perf_data , perf_best ;
+
+    for(int j = 0; j < runs; j++){
+      perf_data = run<scalar_type,fixture_type>(mesh,ix,iy,iz, false );
+      if( j == 0 ) { perf_best = perf_data ; } // first run seeds the minimum
+      else         { perf_best.best( perf_data ); }
+    }
+
+    if ( proc_rank == 0 ) {
+      std::cout << std::setw(8) << n << " , "
+                << std::setw(10) << perf_best.graph_time * 1000 << " , "
+                << std::setw(10) << perf_best.elem_time * 1000 << " , "
+                << std::setw(10) << perf_best.matrix_gather_fill_time * 1000 << " , "
+                << std::setw(10) << perf_best.matrix_boundary_condition_time * 1000 << " , "
+                << std::setw(10) << perf_best.cg_iteration_time * 1000
+                << std::endl ;
+    }
+  }
+}
+
+//----------------------------------------------------------------------------
+
+} /* namespace Implicit */
+} /* namespace HybridFEM */
+
+
+#endif /* #ifndef HYBRIDFEM_IMPLICIT_HPP */
+
diff --git a/lib/kokkos/example/multi_fem/ImplicitFunctors.hpp b/lib/kokkos/example/multi_fem/ImplicitFunctors.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..9d9aa771636c7ec9af064b9346ffed7f01344a2f
--- /dev/null
+++ b/lib/kokkos/example/multi_fem/ImplicitFunctors.hpp
@@ -0,0 +1,585 @@
+/*
+//@HEADER
+// ************************************************************************
+// 
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+// 
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+// 
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact  H. Carter Edwards (hcedwar@sandia.gov)
+// 
+// ************************************************************************
+//@HEADER
+*/
+
+#include <iostream>
+#include <fstream>
+#include <iomanip>
+#include <cstdlib>
+#include <cmath>
+
+namespace HybridFEM {
+namespace Implicit {
+
+//----------------------------------------------------------------------------
+
+// Primary template, declared only: each usable ( Dim , N ) pair is a specialization.
+template< typename Scalar , unsigned Dim , unsigned N > struct TensorIntegration ;
+
+// One-point Gauss-Legendre rule on [-1,1]: abscissa 0 , weight 2.
+template< typename Scalar >
+struct TensorIntegration< Scalar , 1 , 1 > {
+  Scalar pts[1] ;
+  Scalar wts[1] ;
+  TensorIntegration() { wts[0] = 2 ; pts[0] = 0 ; }
+};
+
+// Two-point Gauss-Legendre rule on [-1,1]: abscissas -/+ 1/sqrt(3) , unit weights.
+template<typename Scalar >
+struct TensorIntegration<Scalar,1,2>
+{
+  Scalar pts[2] ;
+  Scalar wts[2] ;
+  TensorIntegration()
+  {
+    const Scalar x2 = 0.57735026918962576451 ; // 1/sqrt(3) , full double precision
+    pts[0] = -x2; wts[0] = 1.0;
+    pts[1] =  x2; wts[1] = 1.0;
+  }
+};
+
+// Three-point Gauss-Legendre rule on [-1,1]: nodes 0 , -/+ sqrt(3/5) ; weights 8/9 , 5/9.
+template<typename Scalar >
+struct TensorIntegration<Scalar,1,3>
+{
+  Scalar pts[3] ;
+  Scalar wts[3] ;
+  TensorIntegration()
+  {
+    const Scalar x3 = 0.77459666924148337704 ; // sqrt(3/5) , full double precision
+    const Scalar w1 = 5.0 / 9.0 ;              // outer weight ; 2*w1 + w2 == 2
+    const Scalar w2 = 8.0 / 9.0 ;              // centre weight
+    pts[0] =  -x3 ;  wts[0] = w1 ;
+    pts[1] =    0 ;  wts[1] = w2 ;
+    pts[2] =   x3 ;  wts[2] = w1 ;
+  }
+};
+
+// 3-D tensor product of the 1-D rule of the same Order ( N = Order^3 points ).
+template< typename Scalar , unsigned Order >
+struct TensorIntegration<Scalar,3,Order>
+{
+  static const unsigned N = Order * Order * Order ;
+
+  Scalar pts[N][3] ;
+  Scalar wts[N];
+
+  TensorIntegration()
+  {
+    TensorIntegration<Scalar,1,Order> rule ;
+    for ( unsigned n = 0 ; n < N ; ++n ) { // n = i + Order * ( j + Order * k )
+      const unsigned i = n % Order ;
+      const unsigned j = ( n / Order ) % Order ;
+      const unsigned k = n / ( Order * Order ) ;
+      pts[n][0] = rule.pts[i] ;
+      pts[n][1] = rule.pts[j] ;
+      pts[n][2] = rule.pts[k] ;
+      wts[n] = rule.wts[i] * rule.wts[j] * rule.wts[k] ;
+    }
+  }
+};
+
+//----------------------------------------------------------------------------
+
+// Trilinear shape functions of the 8-node hexahedron on [-1,1]^3 , tabulated
+// at the PointCount points of the tensor integration rule:
+//   value[p][f] , gradient[p][3*f + d] ( d = reference direction ) , weight[p].
+template< typename Scalar >
+struct ShapeFunctionEvaluation {
+  static const unsigned FunctionCount = 8 ;
+  static const unsigned SpatialDimension = 3 ;
+  static const unsigned IntegrationOrder = 2 ;
+
+  typedef TensorIntegration< Scalar , SpatialDimension , IntegrationOrder >
+    TensorIntegrationType ;
+
+  static const unsigned PointCount = TensorIntegrationType::N ;
+  Scalar value   [ PointCount ][ FunctionCount ] ;
+  Scalar gradient[ PointCount ][ FunctionCount * SpatialDimension ];
+  Scalar weight  [ PointCount ];
+
+  ShapeFunctionEvaluation()
+  {
+    const TensorIntegration< Scalar , SpatialDimension , IntegrationOrder >
+      integration ;
+
+    const Scalar ONE8TH = 0.125 ;
+
+    for ( unsigned i = 0 ; i < PointCount ; ++i ) {
+      const Scalar u = 1.0 - integration.pts[i][0];
+      const Scalar v = 1.0 - integration.pts[i][1];
+      const Scalar w = 1.0 - integration.pts[i][2];
+
+      const Scalar up1 = 1.0 + integration.pts[i][0];
+      const Scalar vp1 = 1.0 + integration.pts[i][1];
+      const Scalar wp1 = 1.0 + integration.pts[i][2];
+
+      weight[i] = integration.wts[i] ;
+
+      // Values: ONE8TH * ( u or up1 ) * ( v or vp1 ) * ( w or wp1 )
+      value[i][0] = ONE8TH *   u *   v *  w ;
+      value[i][1] = ONE8TH * up1 *   v *  w ;
+      value[i][2] = ONE8TH * up1 * vp1 *  w ;
+      value[i][3] = ONE8TH *   u * vp1 *  w ;
+
+      value[i][4] = ONE8TH *   u *   v *  wp1 ;
+      value[i][5] = ONE8TH * up1 *   v *  wp1 ;
+      value[i][6] = ONE8TH * up1 * vp1 *  wp1 ;
+      value[i][7] = ONE8TH *   u * vp1 *  wp1 ;
+
+      //fn 0 = u * v * w   ( each factor differentiates to -1 for u,v,w and +1 for up1,vp1,wp1 )
+      gradient[i][ 0] = ONE8TH * -1  *  v  *  w  ;
+      gradient[i][ 1] = ONE8TH *  u  * -1  *  w  ;
+      gradient[i][ 2] = ONE8TH *  u  *  v  * -1  ;
+
+      //fn 1 = up1 * v * w
+      gradient[i][ 3] = ONE8TH *  1  *  v  *  w  ;
+      gradient[i][ 4] = ONE8TH * up1 * -1  *  w  ;
+      gradient[i][ 5] = ONE8TH * up1 *  v  * -1  ;
+
+      //fn 2 = up1 * vp1 * w
+      gradient[i][ 6] = ONE8TH *  1  * vp1 *  w ;
+      gradient[i][ 7] = ONE8TH * up1 *  1  *  w ;
+      gradient[i][ 8] = ONE8TH * up1 * vp1 * -1 ;
+
+      //fn 3 = u * vp1 * w
+      gradient[i][ 9] = ONE8TH * -1 * vp1 *  w ;
+      gradient[i][10] = ONE8TH *  u *  1  *  w ;
+      gradient[i][11] = ONE8TH *  u * vp1 * -1 ;
+
+      //fn 4 = u * v * wp1
+      gradient[i][12] = ONE8TH * -1  *  v  * wp1 ;
+      gradient[i][13] = ONE8TH *  u  * -1  * wp1 ;
+      gradient[i][14] = ONE8TH *  u  *  v  *  1  ;
+
+      //fn 5 = up1 * v * wp1
+      gradient[i][15] = ONE8TH *  1  *  v  * wp1 ;
+      gradient[i][16] = ONE8TH * up1 * -1  * wp1 ;
+      gradient[i][17] = ONE8TH * up1 *  v  *  1  ;
+
+      //fn 6 = up1 * vp1 * wp1
+      gradient[i][18] = ONE8TH *  1  * vp1 * wp1 ;
+      gradient[i][19] = ONE8TH * up1 *  1  * wp1 ;
+      gradient[i][20] = ONE8TH * up1 * vp1 *  1 ;
+
+      //fn 7 = u * vp1 * wp1
+      gradient[i][21] = ONE8TH * -1 * vp1 * wp1 ;
+      gradient[i][22] = ONE8TH *  u *  1  * wp1 ;
+      gradient[i][23] = ONE8TH *  u * vp1 *  1 ;
+    }
+  }
+};
+
+//----------------------------------------------------------------------------
+
+// Functor computing, for every mesh element, the 8x8 element matrix of the
+// coeff_K-weighted gradient (diffusion) term and the 8-entry element vector
+// of the constant source coeff_Q, by quadrature over ShapeFunctionEvaluation.
+template< typename ScalarType , typename ScalarCoordType , class DeviceType >
+struct ElementComputation
+{
+  typedef DeviceType     execution_space;
+  typedef ScalarType              scalar_type ;
+  typedef typename execution_space::size_type  size_type ;
+
+  static const size_type ElementNodeCount = 8 ;
+
+  typedef FEMesh< ScalarCoordType , ElementNodeCount , execution_space > mesh_type ;
+  typedef Kokkos::View< scalar_type[][ElementNodeCount][ElementNodeCount] , execution_space > elem_matrices_type ;
+  typedef Kokkos::View< scalar_type[][ElementNodeCount] , execution_space > elem_vectors_type ;
+
+  typedef ShapeFunctionEvaluation< scalar_type > shape_function_data ;
+
+  static const unsigned SpatialDim    = shape_function_data::SpatialDimension ;
+  static const unsigned FunctionCount = shape_function_data::FunctionCount ;
+
+private:
+
+  const shape_function_data               shape_eval ;        // tabulated values, gradients, weights
+  typename mesh_type::elem_node_ids_type  elem_node_ids ;     // ( element , local node ) -> node index
+  typename mesh_type::node_coords_type    node_coords ;       // ( node , 0..2 ) -> coordinate
+  elem_matrices_type                      element_matrices ;  // output, one 8x8 block per element
+  elem_vectors_type                       element_vectors ;   // output, one 8-vector per element
+  scalar_type                             coeff_K ;           // scales the matrix contribution
+  scalar_type                             coeff_Q ;           // scales the vector contribution
+  // Private constructor: within this struct only apply() creates an instance.
+  ElementComputation( const mesh_type   & arg_mesh ,
+                      const elem_matrices_type  & arg_element_matrices ,
+                      const elem_vectors_type   & arg_element_vectors ,
+                      const scalar_type   arg_coeff_K ,
+                      const scalar_type   arg_coeff_Q )
+  : shape_eval()
+  , elem_node_ids( arg_mesh.elem_node_ids )
+  , node_coords(   arg_mesh.node_coords )
+  , element_matrices( arg_element_matrices )
+  , element_vectors( arg_element_vectors )
+  , coeff_K( arg_coeff_K )
+  , coeff_Q( arg_coeff_Q )
+  {}
+
+public:
+  // Entry point: run this functor once per element of 'mesh' via parallel_for.
+  static void apply( const mesh_type  & mesh ,
+                     const elem_matrices_type & elem_matrices ,
+                     const elem_vectors_type  & elem_vectors ,
+                     const scalar_type  elem_coeff_K ,
+                     const scalar_type  elem_coeff_Q )
+  {
+    ElementComputation comp( mesh , elem_matrices , elem_vectors , elem_coeff_K , elem_coeff_Q );
+    const size_t elem_count = mesh.elem_node_ids.dimension_0();
+
+    parallel_for( elem_count , comp );
+  }
+
+  // jacobian(): J[3*a + b] += sum over nodes i of grad_vals[3*i + a] * c_b[i] ,
+  // with c_0 = x , c_1 = y , c_2 = z.  J is added to, not reset.
+  static const unsigned FLOPS_jacobian =
+    FunctionCount * SpatialDim * SpatialDim * 2 ;
+
+  KOKKOS_INLINE_FUNCTION
+  void jacobian( const ScalarCoordType * x,
+                 const ScalarCoordType * y,
+                 const ScalarCoordType * z,
+                 const scalar_type * grad_vals,
+                 scalar_type * J) const
+  {
+    int i_grad = 0 ;
+
+    for( unsigned i = 0; i < ElementNodeCount ; ++i , i_grad += SpatialDim ) {
+      const scalar_type g0 = grad_vals[ i_grad ];
+      const scalar_type g1 = grad_vals[ i_grad + 1 ];
+      const scalar_type g2 = grad_vals[ i_grad + 2 ];
+      const scalar_type x0 = x[i] ; // coordinates converted to scalar_type
+      const scalar_type x1 = y[i] ;
+      const scalar_type x2 = z[i] ;
+
+      J[0] += g0 * x0 ;
+      J[1] += g0 * x1 ;
+      J[2] += g0 * x2 ;
+
+      J[3] += g1 * x0 ;
+      J[4] += g1 * x1 ;
+      J[5] += g1 * x2 ;
+
+      J[6] += g2 * x0 ;
+      J[7] += g2 * x1 ;
+      J[8] += g2 * x2 ;
+    }
+  }
+
+  // inverse_and_determinant3x3(): overwrite the row-major 3x3 matrix J with
+  // its inverse and return the determinant of the input; detJ == 0 is not checked.
+  static const unsigned FLOPS_inverse_and_det = 46 ;
+
+  KOKKOS_INLINE_FUNCTION
+  scalar_type inverse_and_determinant3x3( scalar_type * const J ) const
+  {
+    const scalar_type J00 = J[0];
+    const scalar_type J01 = J[1];
+    const scalar_type J02 = J[2];
+
+    const scalar_type J10 = J[3];
+    const scalar_type J11 = J[4];
+    const scalar_type J12 = J[5];
+
+    const scalar_type J20 = J[6];
+    const scalar_type J21 = J[7];
+    const scalar_type J22 = J[8];
+
+    const scalar_type term0 = J22*J11 - J21*J12; // minors of the first column
+    const scalar_type term1 = J22*J01 - J21*J02;
+    const scalar_type term2 = J12*J01 - J11*J02;
+
+    const scalar_type detJ = J00*term0 - J10*term1 + J20*term2; // expansion along the first column
+    const scalar_type inv_detJ = 1.0/detJ;
+
+    J[0] =  term0*inv_detJ;
+    J[1] = -term1*inv_detJ;
+    J[2] =  term2*inv_detJ;
+
+    J[3] = -(J22*J10 - J20*J12)*inv_detJ;
+    J[4] =  (J22*J00 - J20*J02)*inv_detJ;
+    J[5] = -(J12*J00 - J10*J02)*inv_detJ;
+
+    J[6] =  (J21*J10 - J20*J11)*inv_detJ;
+    J[7] = -(J21*J00 - J20*J01)*inv_detJ;
+    J[8] =  (J11*J00 - J10*J01)*inv_detJ;
+
+    return detJ ;
+  }
+
+  // matTransMat3x3_X_3xn(): for each of the n consecutive triples B_j of B,
+  // C_j[r] = A[3*r]*B_j[0] + A[3*r+1]*B_j[1] + A[3*r+2]*B_j[2] , r = 0,1,2.
+  KOKKOS_INLINE_FUNCTION
+  void matTransMat3x3_X_3xn( const scalar_type * A, int n,
+                             const scalar_type * B,
+                             scalar_type * C ) const
+  {
+    //A is 3x3, B is 3xn. So C is also 3xn.
+    //A,B,C are all assumed to be ordered such that columns are contiguous.
+
+    scalar_type * Cj = C;
+    const scalar_type * Bj = B;
+
+    for(int j=0; j<n; ++j) {
+      Cj[0] = A[0]*Bj[0] + A[1]*Bj[1] + A[2]*Bj[2];
+      Cj[1] = A[3]*Bj[0] + A[4]*Bj[1] + A[5]*Bj[2];
+      Cj[2] = A[6]*Bj[0] + A[7]*Bj[1] + A[8]*Bj[2];
+      Bj += 3;
+      Cj += 3;
+    }
+
+  }
+  // contributeDiffusionMatrix(): with d_i[r] = sum_c invJ[3*r + c] * grad_vals[3*i + c] ,
+  // elem_mat[m][n] += weight * ( d_m[0]*d_n[0] + d_m[1]*d_n[1] + d_m[2]*d_n[2] ).
+  static const unsigned FLOPS_contributeDiffusionMatrix = FunctionCount * ( 3 * 5 + FunctionCount * 7 ) ;
+
+  KOKKOS_INLINE_FUNCTION
+  void contributeDiffusionMatrix(
+    const scalar_type weight ,
+    const scalar_type grad_vals[] ,
+    const scalar_type invJ[] ,
+    scalar_type elem_mat[][8] ) const
+  {
+    scalar_type dpsidx[8], dpsidy[8], dpsidz[8];
+
+    int i_grad = 0 ;
+    for( unsigned i = 0; i < FunctionCount ; ++i , i_grad += 3 ) {
+      const scalar_type g0 = grad_vals[i_grad+0];
+      const scalar_type g1 = grad_vals[i_grad+1];
+      const scalar_type g2 = grad_vals[i_grad+2];
+
+      dpsidx[i] = g0 * invJ[0] + g1 * invJ[1] + g2 * invJ[2];
+      dpsidy[i] = g0 * invJ[3] + g1 * invJ[4] + g2 * invJ[5];
+      dpsidz[i] = g0 * invJ[6] + g1 * invJ[7] + g2 * invJ[8];
+    }
+
+    for( unsigned m = 0; m < FunctionCount; m++) {
+      for( unsigned n = 0; n < FunctionCount; n++) {
+
+        elem_mat[m][n] += weight *
+          ((dpsidx[m] * dpsidx[n]) +
+           (dpsidy[m] * dpsidy[n]) +
+           (dpsidz[m] * dpsidz[n]));
+      }
+    }
+  }
+
+  // contributeSourceVector(): elem_vec[i] += psi[i] * term
+  // for each of the FunctionCount shape functions.
+  static const unsigned FLOPS_contributeSourceVector = FunctionCount * 2 ;
+
+  KOKKOS_INLINE_FUNCTION
+  void contributeSourceVector( const scalar_type term ,
+                               const scalar_type psi[] ,
+                               scalar_type elem_vec[] ) const
+  {
+     for( unsigned i=0; i< FunctionCount ; ++i) {
+       elem_vec[i] += psi[i] * term ;
+     }
+  }
+
+  static const unsigned FLOPS_operator =
+           shape_function_data::PointCount * ( 3
+             + FLOPS_jacobian
+             + FLOPS_inverse_and_det
+             + FLOPS_contributeDiffusionMatrix
+             + FLOPS_contributeSourceVector ) ;
+  // operator(): integrate element 'ielem' over all points; store its matrix and vector.
+  KOKKOS_INLINE_FUNCTION
+  void operator()( int ielem )const {
+    scalar_type elem_vec[8] = { 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 };
+    scalar_type elem_mat[8][8] =
+      { { 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 } ,
+        { 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 } ,
+        { 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 } ,
+        { 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 } ,
+        { 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 } ,
+        { 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 } ,
+        { 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 } ,
+        { 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 } };
+
+    ScalarCoordType x[8], y[8], z[8];
+
+    for ( int i = 0 ; i < 8 ; ++i ) { // gather the coordinates of the element's 8 nodes
+      const int node_index = elem_node_ids( ielem , i );
+      x[i] = node_coords( node_index , 0 );
+      y[i] = node_coords( node_index , 1 );
+      z[i] = node_coords( node_index , 2 );
+    }
+
+    // This loop could be parallelized; however,
+    // it would require additional per-thread temporaries
+    // of 'elem_vec' and 'elem_mat' which would
+    // consume more local memory and have to be reduced.
+
+    for ( unsigned i = 0 ; i < shape_function_data::PointCount ; ++i ) {
+      scalar_type J[SpatialDim*SpatialDim] = { 0, 0, 0,  0, 0, 0,  0, 0, 0 };
+
+      jacobian( x, y, z, shape_eval.gradient[i] , J );
+
+      // Overwrite J with its inverse to save scratch memory space.
+      const scalar_type detJ_w   = shape_eval.weight[i] * inverse_and_determinant3x3(J);
+      const scalar_type k_detJ_w = coeff_K * detJ_w ;
+      const scalar_type Q_detJ_w = coeff_Q * detJ_w ;
+
+      contributeDiffusionMatrix( k_detJ_w , shape_eval.gradient[i] , J , elem_mat );
+
+      contributeSourceVector( Q_detJ_w , shape_eval.value[i] , elem_vec );
+    }
+
+    for( size_type i=0; i< ElementNodeCount ; ++i) {
+      element_vectors(ielem, i) = elem_vec[i] ;
+    }
+
+    for( size_type i = 0; i < ElementNodeCount ; i++){
+      for( size_type j = 0; j < ElementNodeCount ; j++){
+        element_matrices(ielem, i, j) = elem_mat[i][j] ;
+      }
+    }
+  }
+}; /* ElementComputation */
+
+//----------------------------------------------------------------------------
+
+// Functor imposing Dirichlet conditions on a CRS linear system.  A node is
+// constrained when its z coordinate is <= bc_lower_z (value bc_lower_value)
+// or >= bc_upper_z (value bc_upper_value); the lower test takes precedence.
+template< typename ScalarType , typename ScalarCoordType , class DeviceType >
+struct DirichletBoundary
+{
+  typedef DeviceType     execution_space;
+  typedef typename execution_space::size_type  size_type ;
+
+  static const size_type ElementNodeCount = 8 ;
+
+  typedef Kokkos::CrsMatrix< ScalarType , execution_space >    matrix_type ;
+  typedef Kokkos::View< ScalarType[] , execution_space >  vector_type ;
+  typedef FEMesh< ScalarCoordType , ElementNodeCount , execution_space > mesh_type ;
+
+  typename mesh_type::node_coords_type node_coords ;
+  matrix_type     matrix ;
+  vector_type     rhs ;
+  ScalarCoordType bc_lower_z ;
+  ScalarCoordType bc_upper_z ;
+  ScalarType      bc_lower_value ;
+  ScalarType      bc_upper_value ;
+
+  // Process matrix row 'inode'.
+  //  - Constrained row: rhs(inode) becomes the boundary value and the row
+  //    is replaced by an identity row (1 on the diagonal, 0 elsewhere).
+  //  - Other rows: every entry whose column node is constrained is moved
+  //    to the right-hand side (rhs -= value * coefficient) and zeroed,
+  //    which maintains the symmetry of the original matrix.
+  KOKKOS_INLINE_FUNCTION
+  void operator()( size_type inode ) const
+  {
+    const size_type row_begin = matrix.graph.row_map[inode];
+    const size_type row_end   = matrix.graph.row_map[inode+1];
+
+    const ScalarCoordType z_row = node_coords(inode,2);
+    const bool row_on_lower = z_row <= bc_lower_z ;
+    const bool row_on_upper = bc_upper_z <= z_row ;
+
+    if ( ! ( row_on_lower || row_on_upper ) ) {
+      // Unconstrained row: eliminate its couplings to constrained columns.
+      for ( size_type k = row_begin ; k < row_end ; ++k ) {
+        const size_type col = matrix.graph.entries(k) ;
+        const ScalarCoordType z_col = node_coords(col,2);
+        const bool col_on_lower = z_col <= bc_lower_z ;
+        const bool col_on_upper = bc_upper_z <= z_col ;
+
+        if ( ! col_on_lower && ! col_on_upper ) continue ;
+
+        const ScalarType col_value =
+          col_on_lower ? bc_lower_value : bc_upper_value ;
+
+        rhs( inode ) -= col_value * matrix.coefficients(k);
+        matrix.coefficients(k) = 0 ;
+      }
+      return ;
+    }
+
+    // Constrained row: prescribe the value and install the identity row.
+    const ScalarType row_value =
+      row_on_lower ? bc_lower_value : bc_upper_value ;
+
+    rhs(inode) = row_value ;
+
+    for ( size_type k = row_begin ; k < row_end ; ++k ) {
+      if ( (int) inode == matrix.graph.entries(k) ) {
+        matrix.coefficients(k) = 1 ;
+      }
+      else {
+        matrix.coefficients(k) = 0 ;
+      }
+    }
+  }
+
+  // Fill a functor from the arguments and run it with parallel_for over
+  // every matrix row ( row_map length minus one ).
+  static void apply( const matrix_type & linsys_matrix ,
+                     const vector_type & linsys_rhs ,
+                     const mesh_type   & mesh ,
+                     const ScalarCoordType  bc_lower_z ,
+                     const ScalarCoordType  bc_upper_z ,
+                     const ScalarType       bc_lower_value ,
+                     const ScalarType       bc_upper_value )
+  {
+    const size_t row_count = linsys_matrix.graph.row_map.dimension_0() - 1 ;
+    DirichletBoundary op ;
+    op.bc_lower_z     = bc_lower_z ;
+    op.bc_upper_z     = bc_upper_z ;
+    op.bc_lower_value = bc_lower_value ;
+    op.bc_upper_value = bc_upper_value ;
+    op.node_coords    = mesh.node_coords ;
+    op.matrix         = linsys_matrix ;
+    op.rhs            = linsys_rhs ;
+    parallel_for( row_count , op );
+  }
+};
+
+//----------------------------------------------------------------------------
+
+} /* namespace Implicit */
+} /* namespace HybridFEM */
+
diff --git a/lib/kokkos/example/multi_fem/LinAlgBLAS.hpp b/lib/kokkos/example/multi_fem/LinAlgBLAS.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..2478fa9aede034ec286e34911847e1eaf4eb11e3
--- /dev/null
+++ b/lib/kokkos/example/multi_fem/LinAlgBLAS.hpp
@@ -0,0 +1,567 @@
+/*
+//@HEADER
+// ************************************************************************
+// 
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+// 
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+// 
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact  H. Carter Edwards (hcedwar@sandia.gov)
+// 
+// ************************************************************************
+//@HEADER
+*/
+
+#ifndef USESCASES_LINALG_BLAS_HPP
+#define USESCASES_LINALG_BLAS_HPP
+
+#include <cmath>
+#include <utility>
+#include <ParallelComm.hpp>
+#include <Kokkos_Core.hpp>
+
+//----------------------------------------------------------------------------
+//----------------------------------------------------------------------------
+
+namespace Kokkos {
+namespace Impl {
+
+template< class Scalar , class Layout , class DeviceType > struct Dot ;  // reduction: sum of x(i) * y(i)
+
+template< class Scalar , class Layout , class DeviceType > struct Dot1 ; // reduction: sum of x(i) * x(i)
+
+template< typename ScalarA ,
+          typename ScalarY ,
+          class Layout , class Device >
+struct Scale ; // multiplies every entry of a vector by a constant, in place
+
+template< typename ScalarA ,
+          typename ScalarY ,
+          class Layout , class Device >
+struct Fill ; // assigns a constant to every entry of a vector
+
+template< typename ScalarA ,
+          typename ScalarX ,
+          typename ScalarY ,
+          class Layout , class Device >
+struct AXPY ; // y(i) += a * x(i)
+
+template< typename ScalarX ,
+          typename ScalarB ,
+          typename ScalarY ,
+          class Layout , class Device >
+struct XPBY ; // y(i) = x(i) + b * y(i)
+
+template< typename ScalarA ,
+          typename ScalarX ,
+          typename ScalarB ,
+          typename ScalarY ,
+          typename ScalarW ,
+          class Layout , class Device >
+struct WAXPBY ; // w(i) = a * x(i) + b * y(i)
+
+} // namespace Impl
+} // namespace Kokkos
+
+//----------------------------------------------------------------------------
+//----------------------------------------------------------------------------
+
+namespace Kokkos {
+
+//----------------------------------------------------------------------------
+
+// Dot product of the first 'n' entries of x and y, returned as a double.
+//
+// x and y may differ in const-ness and in memory management, but they
+// share the layout 'L' and the device 'D'.
+//
+// The local sum is always computed by the Impl::Dot reduction functor.
+// When KOKKOS_HAVE_MPI is defined that local sum is then combined over
+// 'machine.mpi_comm' with MPI_Allreduce( MPI_SUM ); otherwise the local
+// sum is returned unchanged and 'machine' is not used.
+
+template< typename ScalarX /* Allow mix of const and non-const */ ,
+          typename ScalarY /* Allow mix of const and non-const */ ,
+          class L , class D ,
+          class MX /* Allow any management type */ ,
+          class MY /* Allow any management type */ >
+inline
+double dot( const size_t n ,
+            const View< ScalarX * , L , D , MX > & x ,
+            const View< ScalarY * , L , D , MY > & y ,
+            comm::Machine machine )
+{
+  // Sum over the entries visible to this process.
+  double partial_sum = 0 ;
+
+  Impl::Dot< ScalarX , L , D >( n , x , y , partial_sum );
+
+#if defined( KOKKOS_HAVE_MPI )
+
+  // Combine the partial sums of all ranks in the communicator.
+  double total_sum = 0 ;
+
+  MPI_Allreduce( & partial_sum , & total_sum , 1 ,
+                 MPI_DOUBLE , MPI_SUM , machine.mpi_comm );
+
+  return total_sum ;
+
+#else
+
+  // No MPI: there is nothing to combine.
+  (void) machine ;
+
+  return partial_sum ;
+
+#endif
+}
+
+//----------------------------------------------------------------------------
+
+// Dot product of x with itself over its first 'n' entries.
+//
+// The local sum comes from the Impl::Dot1 reduction functor.  When
+// KOKKOS_HAVE_MPI is defined it is combined over 'machine.mpi_comm'
+// with MPI_Allreduce( MPI_SUM ); otherwise the local sum is returned
+// unchanged and 'machine' is not used.
+
+template< typename ScalarX /* Allow mix of const and non-const */ ,
+          class L , class D ,
+          class MX /* Allow any management type */ >
+inline
+double dot( const size_t n ,
+            const View< ScalarX * , L , D , MX > & x ,
+            comm::Machine machine )
+{
+  // Sum of squares over the entries visible to this process.
+  double partial_sum = 0 ;
+
+  Impl::Dot1< ScalarX , L , D >( n , x , partial_sum );
+
+#if defined( KOKKOS_HAVE_MPI )
+
+  // Combine the partial sums of all ranks in the communicator.
+  double total_sum = 0 ;
+
+  MPI_Allreduce( & partial_sum , & total_sum , 1 ,
+                 MPI_DOUBLE , MPI_SUM , machine.mpi_comm );
+
+  return total_sum ;
+
+#else
+
+  // No MPI: there is nothing to combine.
+  (void) machine ;
+
+  return partial_sum ;
+
+#endif
+}
+
+//----------------------------------------------------------------------------
+
+// Euclidean norm of the first 'n' entries of x: sqrt( dot( x , x ) ).
+template< typename ScalarX /* Allow mix of const and non-const */ ,
+          class L , class D , class MX /* Allow any management type */ >
+inline double norm2( const size_t n ,
+                     const View< ScalarX * , L , D , MX > & x ,
+                     comm::Machine machine )
+{
+  const double sum_of_squares = dot( n , x , machine );
+  return std::sqrt( sum_of_squares );
+}
+
+//----------------------------------------------------------------------------
+
+// In-place scaling of the first 'n' entries: x(i) *= alpha.
+template< typename ScalarA , typename ScalarX ,
+          class L , class D , class MX >
+void scale( const size_t n ,
+            const ScalarA & alpha ,
+            const View< ScalarX * , L , D , MX > & x )
+{
+  // The functor's constructor runs the parallel loop.
+  typedef Impl::Scale< ScalarA , ScalarX , L , D > functor_type ;
+  functor_type( n , alpha , x );
+}
+
+// Assign 'alpha' to the first 'n' entries: x(i) = alpha.
+template< typename ScalarA , typename ScalarX ,
+          class L , class D , class MX >
+void fill( const size_t n ,
+           const ScalarA & alpha ,
+           const View< ScalarX * , L , D , MX > & x )
+{
+  // The functor's constructor runs the parallel loop.
+  typedef Impl::Fill< ScalarA , ScalarX , L , D > functor_type ;
+  functor_type( n , alpha , x );
+}
+
+//----------------------------------------------------------------------------
+
+// Scaled accumulation over the first 'n' entries:
+//   y(i) += alpha * x(i)
+template< typename ScalarA , typename ScalarX , typename ScalarY ,
+          class L , class D ,
+          class MX , class MY >
+void axpy( const size_t n ,
+           const ScalarA & alpha ,
+           const View< ScalarX *, L , D , MX > & x ,
+           const View< ScalarY *, L , D , MY > & y )
+{
+  // The functor's constructor runs the parallel loop.
+  typedef Impl::AXPY< ScalarA , ScalarX , ScalarY , L , D > functor_type ;
+  functor_type( n , alpha , x , y );
+}
+
+//----------------------------------------------------------------------------
+
+// Scale-then-add over the first 'n' entries:
+//   y(i) = x(i) + beta * y(i)
+template< typename ScalarX , typename ScalarB , typename ScalarY ,
+          class L , class D ,
+          class MX , class MY >
+void xpby( const size_t n ,
+           const View< ScalarX *, L , D , MX > & x ,
+           const ScalarB & beta ,
+           const View< ScalarY *, L , D , MY > & y )
+{
+  // The functor's constructor runs the parallel loop.
+  typedef Impl::XPBY< ScalarX , ScalarB , ScalarY , L , D > functor_type ;
+  functor_type( n , x , beta , y );
+}
+
+//----------------------------------------------------------------------------
+// w = alpha * x + beta * y
+
+template< typename ScalarA , typename ScalarX ,
+          typename ScalarB , typename ScalarY ,
+          typename ScalarW ,
+          class L , class D ,
+          class MX , class MY , class MW >
+void waxpby( const size_t n ,
+             const ScalarA & alpha ,
+             const View< ScalarX * , L , D , MX > & x ,
+             const ScalarB & beta ,
+             const View< ScalarY * , L , D , MY > & y ,
+             const View< ScalarW * , L , D , MW > & w )
+{
+  // The functor's constructor runs the parallel loop over 'n' entries.
+  typedef Impl::WAXPBY< ScalarA , ScalarX , ScalarB , ScalarY , ScalarW , L , D >
+    functor_type ;
+  functor_type( n , alpha , x , beta , y , w );
+}
+
+}
+
+//----------------------------------------------------------------------------
+//----------------------------------------------------------------------------
+
+namespace Kokkos {
+namespace Impl {
+
+// Reduction functor: sum of x(i) * y(i); the constructor runs the reduction.
+template< typename Scalar , class L , class D >
+struct Dot
+{
+private:
+  typedef View< const Scalar*, L, D, MemoryUnmanaged >  const_view_type ;
+
+  const const_view_type x ;
+  const const_view_type y ;
+
+public:
+  typedef typename const_view_type::execution_space  execution_space ;
+  typedef double  value_type ;  // type of the accumulated sum
+
+  // View both operands as unmanaged const data; reduce 'n' entries into 'result'.
+  template< class ArgX , class ArgY >
+  Dot( const size_t n , const ArgX & in_x , const ArgY & in_y , double & result )
+    : x( in_x ), y( in_y )
+  { parallel_reduce( n , *this , result ); }
+
+  // Add the contribution of entry 'i' to the running sum.
+  template< typename IndexType >
+  KOKKOS_INLINE_FUNCTION
+  void operator()( const IndexType & i , value_type & update ) const
+  { update += x(i) * y(i); }
+
+  // Fold one partial sum into another.
+  KOKKOS_INLINE_FUNCTION
+  static void join( volatile value_type & update ,
+                    const volatile value_type & source )
+  { update += source ; }
+
+  // Starting value of a partial sum.
+  KOKKOS_INLINE_FUNCTION
+  static void init( value_type & update )
+  { update = 0 ; }
+}; // Dot
+
+//----------------------------------------------------------------------------
+
+// Reduction functor: sum of x(i) * x(i); the constructor runs the reduction.
+template< typename Scalar , class L , class D >
+struct Dot1
+{
+private:
+  typedef View< const Scalar*, L, D , MemoryUnmanaged >  const_view_type ;
+
+  const const_view_type x ;
+
+public:
+  typedef typename const_view_type::execution_space  execution_space ;
+  typedef double  value_type ;  // type of the accumulated sum
+
+  // View the operand as unmanaged const data, then reduce 'n' entries.
+  template< class ArgX >
+  Dot1( const size_t n , const ArgX & in_x , double & result )
+    : x( in_x )
+  { parallel_reduce( n , *this , result ); }
+
+  // Add the square of entry 'i' to the running sum.
+  template< typename IndexType >
+  KOKKOS_INLINE_FUNCTION
+  void operator()( const IndexType & i , value_type & update ) const
+  { update += x(i) * x(i) ; }
+
+  // Fold one partial sum into another.
+  KOKKOS_INLINE_FUNCTION
+  static void join( volatile value_type & update ,
+                    const volatile value_type & source )
+  { update += source ; }
+
+  // Starting value of a partial sum.
+  KOKKOS_INLINE_FUNCTION
+  static void init( value_type & update )
+  { update = 0 ; }
+}; // Dot1
+
+//----------------------------------------------------------------------------
+//----------------------------------------------------------------------------
+
+// Parallel functor for w(i) = alpha * x(i) + beta * y(i).
+// Constructing it runs the loop over 'n' entries.
+template < typename ScalarA ,
+           typename ScalarX ,
+           typename ScalarB ,
+           typename ScalarY ,
+           typename ScalarW ,
+           class L , class D >
+struct WAXPBY
+{
+private:
+
+  // Unmanaged views: the output is writable, the inputs are read-only.
+  typedef View<       ScalarW *, L , D , MemoryUnmanaged > output_view ;
+  typedef View< const ScalarX *, L , D , MemoryUnmanaged > x_view ;
+  typedef View< const ScalarY *, L , D , MemoryUnmanaged > y_view ;
+
+  const output_view  w ;
+  const x_view       x ;
+  const y_view       y ;
+  const ScalarA      alpha ;
+  const ScalarB      beta ;
+
+public:
+
+  typedef typename output_view::execution_space  execution_space ;
+
+  // Capture the operands and coefficients, then launch the loop.
+  template< class ArgX , class ArgY , class ArgW >
+  WAXPBY( const size_t  n ,
+          const ScalarA & in_alpha ,
+          const ArgX    & in_x ,
+          const ScalarB & in_beta ,
+          const ArgY    & in_y ,
+          const ArgW    & in_w )
+    : w( in_w ), x( in_x ), y( in_y )
+    , alpha( in_alpha ), beta( in_beta )
+  { parallel_for( n , *this ); }
+
+  // Update a single entry.
+  template< typename IndexType >
+  KOKKOS_INLINE_FUNCTION
+  void operator()( const IndexType inode ) const
+  { w(inode) = alpha * x(inode) + beta * y(inode); }
+}; // WAXPBY
+
+//----------------------------------------------------------------------------
+
+// Parallel functor for w(i) *= beta; constructing it runs the loop.
+template < typename ScalarB ,
+           typename ScalarW ,
+           class L , class D >
+struct Scale
+{
+private:
+  typedef View< ScalarW *, L , D , MemoryUnmanaged >  output_view ;
+
+  const output_view  w ;
+  const ScalarB      beta ;
+
+public:
+
+  typedef typename output_view::execution_space  execution_space ;
+
+  // Capture the vector and the factor, then launch over 'n' entries.
+  template< class ArgW >
+  Scale( const size_t n , const ScalarB & in_beta , const ArgW & in_w )
+    : w( in_w )
+    , beta( in_beta )
+  { parallel_for( n , *this ); }
+
+  // Scale a single entry.
+  template< typename IndexType >
+  KOKKOS_INLINE_FUNCTION
+  void operator()( const IndexType & i ) const
+  { w(i) *= beta ; }
+};
+
+// Parallel functor for w(i) = beta; constructing it runs the loop.
+template < typename ScalarB ,
+           typename ScalarW ,
+           class L , class D >
+struct Fill
+{
+private:
+  typedef View< ScalarW *, L , D , MemoryUnmanaged >  output_view ;
+
+  const output_view  w ;
+  const ScalarB      beta ;
+
+public:
+
+  typedef typename output_view::execution_space  execution_space ;
+
+  // Capture the vector and the value, then launch over 'n' entries.
+  template< class ArgW >
+  Fill( const size_t n , const ScalarB & in_beta , const ArgW & in_w )
+    : w( in_w )
+    , beta( in_beta )
+  { parallel_for( n , *this ); }
+
+  // Overwrite a single entry.
+  template< typename IndexType >
+  KOKKOS_INLINE_FUNCTION
+  void operator()( const IndexType & i ) const
+  { w(i) = beta ; }
+};
+
+//----------------------------------------------------------------------------
+
+// Parallel functor for w(i) += alpha * x(i); constructing it runs the loop.
+template < typename ScalarA ,
+           typename ScalarX ,
+           typename ScalarW ,
+           class L , class D >
+struct AXPY
+{
+private:
+
+  typedef View<       ScalarW *, L , D , MemoryUnmanaged >  output_view ;
+  typedef View< const ScalarX *, L , D , MemoryUnmanaged >  input_view ;
+
+  const output_view  w ;
+  const input_view   x ;
+  const ScalarA      alpha ;
+
+public:
+
+  typedef typename output_view::execution_space  execution_space ;
+
+  // Capture the operands and the factor, then launch over 'n' entries.
+  template< class ArgX , class ArgW >
+  AXPY( const size_t  n ,
+        const ScalarA & in_alpha ,
+        const ArgX    & in_x ,
+        const ArgW    & in_w )
+    : w( in_w ), x( in_x )
+    , alpha( in_alpha )
+  { parallel_for( n , *this ); }
+
+  // Accumulate into a single entry.
+  template< typename IndexType >
+  KOKKOS_INLINE_FUNCTION
+  void operator()( const IndexType & i ) const
+  { w(i) += alpha * x(i); }
+}; // AXPY
+
+// Parallel functor for w(i) = x(i) + beta * w(i); constructing it runs the loop.
+template< typename ScalarX ,
+          typename ScalarB ,
+          typename ScalarW ,
+          class L , class D >
+struct XPBY
+{
+private:
+
+  typedef View<       ScalarW *, L , D , MemoryUnmanaged >  output_view ;
+  typedef View< const ScalarX *, L , D , MemoryUnmanaged >  input_view ;
+
+  const output_view  w ;
+  const input_view   x ;
+  const ScalarB      beta ;
+
+public:
+
+  typedef typename output_view::execution_space  execution_space ;
+
+  // Capture the operands and the factor, then launch over 'n' entries.
+  template< class ArgX , class ArgW >
+  XPBY( const size_t  n ,
+        const ArgX    & in_x ,
+        const ScalarB & in_beta ,
+        const ArgW    & in_w )
+    : w( in_w ), x( in_x )
+    , beta( in_beta )
+  { parallel_for( n , *this ); }
+
+  // Rescale a single entry of w and add the matching entry of x.
+  template< typename IndexType >
+  KOKKOS_INLINE_FUNCTION
+  void operator()( const IndexType & i ) const
+  { w(i) = x(i) + beta * w(i); }
+}; // XPBY
+
+//----------------------------------------------------------------------------
+//----------------------------------------------------------------------------
+
+} // namespace Impl
+} // namespace Kokkos
+
+//----------------------------------------------------------------------------
+//----------------------------------------------------------------------------
+
+#endif /* #ifndef USESCASES_LINALG_BLAS_HPP */
+
+
diff --git a/lib/kokkos/example/multi_fem/Makefile b/lib/kokkos/example/multi_fem/Makefile
new file mode 100644
index 0000000000000000000000000000000000000000..72e1768fcb9b446f94400a3e783767923779f6bf
--- /dev/null
+++ b/lib/kokkos/example/multi_fem/Makefile
@@ -0,0 +1,53 @@
+# Builds the multi_fem example; point KOKKOS_PATH at the Kokkos tree.
+KOKKOS_PATH ?= ../..
+
+# Sources are looked up beside this Makefile, not in the current directory.
+MAKEFILE_PATH := $(abspath $(lastword $(MAKEFILE_LIST)))
+SRC_DIR := $(dir $(MAKEFILE_PATH))
+
+SRC = $(wildcard $(SRC_DIR)/*.cpp)
+OBJ = $(SRC:$(SRC_DIR)/%.cpp=%.o)
+
+# These targets name actions rather than files, so always run them.
+.PHONY: default build clean
+
+default: build
+	echo "Start Build"
+
+# use installed Makefile.kokkos
+include $(KOKKOS_PATH)/Makefile.kokkos
+
+ifneq (,$(findstring Cuda,$(KOKKOS_DEVICES)))
+CXX = $(NVCC_WRAPPER)
+CXXFLAGS = -I$(SRC_DIR) -I$(CUDA_PATH) -O3
+LINK = $(CXX)
+LINKFLAGS = -L$(CUDA_PATH)/lib64 -lcusparse
+EXE = $(addsuffix .cuda, $(shell basename $(SRC_DIR)))
+#KOKKOS_DEVICES = "Cuda,OpenMP"
+#KOKKOS_ARCH = "SNB,Kepler35"
+else
+CXX = g++
+CXXFLAGS = -I$(SRC_DIR) -O3
+LINK = $(CXX)
+LINKFLAGS =  
+EXE = $(addsuffix .host, $(shell basename $(SRC_DIR)))
+#KOKKOS_DEVICES = "OpenMP"
+#KOKKOS_ARCH = "SNB"
+endif
+
+DEPFLAGS = -M
+
+LIB =
+
+build: $(EXE)
+
+$(EXE): $(OBJ) $(KOKKOS_LINK_DEPENDS)
+	$(LINK) $(KOKKOS_LDFLAGS) $(LINKFLAGS) $(EXTRA_PATH) $(OBJ) $(KOKKOS_LIBS) $(LIB) -o $(EXE)
+
+clean: 
+	rm -f *.a *.o *.cuda *.host
+
+# Compilation rules
+%.o:$(SRC_DIR)/%.cpp $(KOKKOS_CPP_DEPENDS)
+	$(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) $(EXTRA_INC) -c $<
+
diff --git a/lib/kokkos/example/multi_fem/Nonlinear.hpp b/lib/kokkos/example/multi_fem/Nonlinear.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..1d243395c25bcb4396dd0c6ed656c10aad1bad3e
--- /dev/null
+++ b/lib/kokkos/example/multi_fem/Nonlinear.hpp
@@ -0,0 +1,573 @@
+/*
+//@HEADER
+// ************************************************************************
+// 
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+// 
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+// 
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact  H. Carter Edwards (hcedwar@sandia.gov)
+// 
+// ************************************************************************
+//@HEADER
+*/
+
+#ifndef HYBRIDFEM_NONLINEAR_HPP
+#define HYBRIDFEM_NONLINEAR_HPP
+
+#include <algorithm>
+#include <cmath>
+#include <iomanip>
+#include <iostream>
+#include <utility>
+
+#include <Kokkos_Core.hpp>
+#include <SparseLinearSystem.hpp>
+#include <SparseLinearSystemFill.hpp>
+#include <NonlinearFunctors.hpp>
+#include <FEMesh.hpp>
+#include <HexElement.hpp>
+//----------------------------------------------------------------------------
+//----------------------------------------------------------------------------
+
+namespace HybridFEM {
+namespace Nonlinear {
+
+// Timings and iteration counts gathered from one run of the solver.
+struct PerformanceData {
+  double mesh_time ;
+  double graph_time ;
+  double elem_time ;
+  double matrix_gather_fill_time ;
+  double matrix_boundary_condition_time ;
+  double cg_iteration_time ;
+  size_t cg_iteration_count ;
+  size_t newton_iteration_count ;
+  double error_max ;
+
+  // Every field starts at zero.
+  PerformanceData()
+    : mesh_time(0) , graph_time(0) , elem_time(0)
+    , matrix_gather_fill_time(0)
+    , matrix_boundary_condition_time(0)
+    , cg_iteration_time(0)
+    , cg_iteration_count(0) , newton_iteration_count(0)
+    , error_max(0)
+    {}
+
+  // Field-by-field minimum of *this and 'rhs', stored in *this.
+  void best( const PerformanceData & rhs )
+  {
+    if ( rhs.mesh_time < mesh_time ) { mesh_time = rhs.mesh_time ; }
+    if ( rhs.graph_time < graph_time ) { graph_time = rhs.graph_time ; }
+    if ( rhs.elem_time < elem_time ) { elem_time = rhs.elem_time ; }
+    if ( rhs.matrix_gather_fill_time < matrix_gather_fill_time ) { matrix_gather_fill_time = rhs.matrix_gather_fill_time ; }
+    if ( rhs.matrix_boundary_condition_time < matrix_boundary_condition_time ) { matrix_boundary_condition_time = rhs.matrix_boundary_condition_time ; }
+    if ( rhs.cg_iteration_time < cg_iteration_time ) { cg_iteration_time = rhs.cg_iteration_time ; }
+    if ( rhs.cg_iteration_count < cg_iteration_count ) { cg_iteration_count = rhs.cg_iteration_count ; }
+    if ( rhs.newton_iteration_count < newton_iteration_count ) { newton_iteration_count = rhs.newton_iteration_count ; }
+    if ( rhs.error_max < error_max ) { error_max = rhs.error_max ; }
+  }
+};
+
+//----------------------------------------------------------------------------
+//----------------------------------------------------------------------------
+
+class ManufacturedSolution {
+public:
+  // Manufactured solution for one dimensional nonlinear PDE
+  //
+  //  -K T_zz + T^2 = 0 ; T(zmin) = T_zmin ; T(zmax) = T_zmax
+  //
+  //  Has an analytic solution of the form:
+  //
+  //    T(z) = ( a ( z - zmin ) + b )^(-2) where K = 1 / ( 6 a^2 )
+  //
+  //  Given T_zmin and T_zmax compute K for this analytic solution.
+  //
+  //  Two analytic solutions:
+  //
+  //    Solution with singularity:
+  //    , a( ( 1.0 / sqrt(T_zmax) + 1.0 / sqrt(T_zmin) ) / ( zmax - zmin ) )
+  //    , b( -1.0 / sqrt(T_zmin) )
+  //
+  //    Solution without singularity (the one implemented here):
+  //    , a( ( 1.0 / sqrt(T_zmax) - 1.0 / sqrt(T_zmin) ) / ( zmax - zmin ) )
+  //    , b( 1.0 / sqrt(T_zmin) )
+
+  const double zmin ;
+  const double zmax ;
+  const double T_zmin ;
+  const double T_zmax ;
+  const double a ;
+  const double b ;
+  const double K ;
+
+  // a, b and K are computed from the members initialized before them, so
+  // the member declaration order above matters.
+  ManufacturedSolution( const double arg_zmin ,
+                        const double arg_zmax ,
+                        const double arg_T_zmin ,
+                        const double arg_T_zmax )
+    : zmin( arg_zmin ) , zmax( arg_zmax )
+    , T_zmin( arg_T_zmin ) , T_zmax( arg_T_zmax )
+    , a( ( 1.0 / std::sqrt(T_zmax) - 1.0 / std::sqrt(T_zmin) ) / ( zmax - zmin ) )
+    , b( 1.0 / std::sqrt(T_zmin) )
+    , K( 1.0 / ( 6.0 * a * a ) )
+    {}
+
+  // Evaluate T(z) = ( a ( z - zmin ) + b )^(-2).
+  double operator()( const double z ) const
+  {
+    const double tmp = a * ( z - zmin ) + b ;
+    return 1.0 / ( tmp * tmp );
+  }
+};
+
+//----------------------------------------------------------------------------
+//----------------------------------------------------------------------------
+
+template< typename Scalar , class FixtureType >
+PerformanceData run( const typename FixtureType::FEMeshType & mesh ,
+                     const int , // global_max_x ,
+                     const int , // global_max_y ,
+                     const int global_max_z ,
+                     const bool print_error )
+{
+  typedef Scalar                              scalar_type ;
+  typedef FixtureType                         fixture_type ;
+  typedef typename fixture_type::execution_space  execution_space;
+  //typedef typename execution_space::size_type     size_type ; // unused
+
+  typedef typename fixture_type::FEMeshType mesh_type ;
+  typedef typename fixture_type::coordinate_scalar_type coordinate_scalar_type ;
+
+  enum { ElementNodeCount = fixture_type::element_node_count };
+
+  const comm::Machine machine = mesh.parallel_data_map.machine ;
+
+  const size_t element_count = mesh.elem_node_ids.dimension_0();
+
+  //------------------------------------
+  // The amount of nonlinearity is proportional to the ratio
+  // between T(zmax) and T(zmin).  For the manufactured solution
+  // 0 < T(zmin) and 0 < T(zmax)
+
+  const ManufacturedSolution
+    exact_solution( /* zmin */ 0 ,
+                    /* zmax */ global_max_z ,
+                    /* T(zmin) */ 1 ,
+                    /* T(zmax) */ 20 );
+
+  //-----------------------------------
+  // Convergence Criteria and perf data:
+
+  const size_t cg_iteration_limit = 200 ;
+  const double cg_tolerance = 1e-14 ;
+
+  const size_t newton_iteration_limit = 150 ;
+  const double newton_tolerance = 1e-14 ;
+
+  size_t cg_iteration_count_total = 0 ;
+  double cg_iteration_time = 0 ;
+
+  size_t newton_iteration_count = 0 ;
+  double residual_norm_init = 0 ;
+  double residual_norm = 0 ;
+
+  PerformanceData perf_data ;
+
+  //------------------------------------
+  // Sparse linear system types:
+
+  typedef Kokkos::View< scalar_type* , execution_space >     vector_type ;
+  typedef Kokkos::CrsMatrix< scalar_type , execution_space >  matrix_type ;
+  typedef typename matrix_type::graph_type                matrix_graph_type ;
+  typedef typename matrix_type::coefficients_type         matrix_coefficients_type ;
+
+  typedef GraphFactory< matrix_graph_type , mesh_type > graph_factory ;
+
+  //------------------------------------
+  // Problem setup types:
+
+  typedef ElementComputation < mesh_type , scalar_type > ElementFunctor ;
+  typedef DirichletSolution  < mesh_type , scalar_type > DirichletSolutionFunctor ;
+  typedef DirichletResidual  < mesh_type , scalar_type > DirichletResidualFunctor ;
+
+  typedef typename ElementFunctor::elem_matrices_type elem_matrices_type ;
+  typedef typename ElementFunctor::elem_vectors_type  elem_vectors_type ;
+
+  typedef GatherFill< matrix_type ,
+                      mesh_type ,
+                      elem_matrices_type ,
+                      elem_vectors_type > GatherFillFunctor ;
+
+  //------------------------------------
+
+  matrix_type jacobian ;
+  vector_type residual ;
+  vector_type delta ;
+  vector_type nodal_solution ;
+
+  typename graph_factory::element_map_type element_map ;
+
+  //------------------------------------
+  // Generate mesh and corresponding sparse matrix graph
+
+  Kokkos::Timer wall_clock ;
+
+  //------------------------------------
+  // Generate sparse matrix graph and element->graph map.
+
+  wall_clock.reset();
+
+  graph_factory::create( mesh , jacobian.graph , element_map );
+
+  execution_space::fence();
+
+  perf_data.graph_time = comm::max( machine , wall_clock.seconds() );
+
+  //------------------------------------
+  // Allocate linear system coefficients and rhs:
+
+  const size_t local_owned_length = jacobian.graph.row_map.dimension_0() - 1 ;
+  const size_t local_total_length = mesh.node_coords.dimension_0();
+
+  jacobian.coefficients =
+    matrix_coefficients_type( "jacobian_coeff" , jacobian.graph.entries.dimension_0() );
+
+  // Nonlinear residual for owned nodes:
+  residual = vector_type( "residual" , local_owned_length );
+
+  // Nonlinear solution for owned and ghosted nodes:
+  nodal_solution = vector_type( "solution" , local_total_length );
+
+  // Nonlinear solution update for owned nodes:
+  delta = vector_type( "delta" , local_owned_length );
+
+  //------------------------------------
+  // Allocation of arrays to fill the linear system
+
+  elem_matrices_type elem_matrices ; // Jacobian matrices
+  elem_vectors_type  elem_vectors ;  // Residual vectors
+
+  if ( element_count ) {
+    elem_matrices = elem_matrices_type( std::string("elem_matrices"), element_count );
+    elem_vectors = elem_vectors_type( std::string("elem_vectors"), element_count );
+  }
+
+  //------------------------------------
+  // For boundary condition set the correct values in the solution vector
+  //   The 'zmin' face is assigned to 'T_zmin'.
+  //   The 'zmax' face is assigned to 'T_zmax'.
+  //   The resulting solution is one dimensional along the 'Z' axis.
+
+  DirichletSolutionFunctor::apply( nodal_solution , mesh ,
+                                   exact_solution.zmin ,
+                                   exact_solution.zmax ,
+                                   exact_solution.T_zmin ,
+                                   exact_solution.T_zmax );
+
+  for(;;) { // Nonlinear loop
+
+#if defined( KOKKOS_HAVE_MPI )
+
+    { //------------------------------------
+      // Import off-processor nodal solution values
+      // for residual and jacobian computations
+
+      Kokkos::AsyncExchange< typename vector_type::value_type , execution_space ,
+                                  Kokkos::ParallelDataMap >
+        exchange( mesh.parallel_data_map , 1 );
+
+      Kokkos::PackArray< vector_type >
+        ::pack( exchange.buffer() ,
+                mesh.parallel_data_map.count_interior ,
+                mesh.parallel_data_map.count_send ,
+                nodal_solution );
+
+      exchange.setup();
+
+      exchange.send_receive();
+
+      Kokkos::UnpackArray< vector_type >
+        ::unpack( nodal_solution , exchange.buffer() ,
+                  mesh.parallel_data_map.count_owned ,
+                  mesh.parallel_data_map.count_receive );
+    }
+
+#endif
+
+    //------------------------------------
+    // Compute element matrices and vectors:
+
+    wall_clock.reset();
+
+    ElementFunctor( mesh ,
+                    elem_matrices ,
+                    elem_vectors ,
+                    nodal_solution ,
+                    exact_solution.K );
+
+    execution_space::fence();
+    perf_data.elem_time += comm::max( machine , wall_clock.seconds() );
+
+    //------------------------------------
+    // Fill linear system coefficients:
+
+    wall_clock.reset();
+
+    fill( jacobian.coefficients.dimension_0(), 0 , jacobian.coefficients );
+    fill( residual.dimension_0() , 0 , residual );
+
+    GatherFillFunctor::apply( jacobian ,
+                              residual ,
+                              mesh ,
+                              element_map ,
+                              elem_matrices ,
+                              elem_vectors );
+
+    execution_space::fence();
+    perf_data.matrix_gather_fill_time += comm::max( machine , wall_clock.seconds() );
+
+    // Apply boundary conditions:
+
+    wall_clock.reset();
+
+    // Updates jacobian matrix to 1 on the diagonal, zero elsewhere,
+    // and 0 in the residual due to the solution vector having the correct value
+    DirichletResidualFunctor::apply( jacobian, residual, mesh ,
+                                     exact_solution.zmin ,
+                                     exact_solution.zmax );
+
+    execution_space::fence();
+    perf_data.matrix_boundary_condition_time +=
+      comm::max( machine , wall_clock.seconds() );
+
+    //------------------------------------
+    // Has the residual converged?
+
+    residual_norm = norm2( mesh.parallel_data_map.count_owned,
+                           residual,
+                           mesh.parallel_data_map.machine );
+
+    if ( 0 == newton_iteration_count ) {
+      residual_norm_init = residual_norm ;
+    }
+
+    if ( residual_norm / residual_norm_init < newton_tolerance ) {
+      break ;
+    }
+
+    //------------------------------------
+    // Solve linear sytem
+
+    size_t cg_iteration_count = 0 ;
+    double cg_residual_norm = 0 ;
+
+    cgsolve( mesh.parallel_data_map ,
+             jacobian , residual , delta ,
+             cg_iteration_count ,
+             cg_residual_norm ,
+             cg_iteration_time ,
+             cg_iteration_limit , cg_tolerance ) ;
+
+    perf_data.cg_iteration_time += cg_iteration_time ;
+    cg_iteration_count_total += cg_iteration_count ;
+
+    // Update non-linear solution with delta...
+    // delta is : - Dx = [Jacobian]^-1 * Residual which is the negative update
+    // LaTeX:
+    // \vec {x}_{n+1} = \vec {x}_{n} - ( - \Delta \vec{x}_{n} )
+    // text:
+    // x[n+1] = x[n] + Dx
+
+    axpy( mesh.parallel_data_map.count_owned ,
+          -1.0, delta, nodal_solution);
+
+    ++newton_iteration_count ;
+
+    if ( newton_iteration_limit < newton_iteration_count ) {
+      break ;
+    }
+  };
+
+  if ( newton_iteration_count ) {
+    perf_data.elem_time /= newton_iteration_count ;
+    perf_data.matrix_gather_fill_time /= newton_iteration_count ;
+    perf_data.matrix_boundary_condition_time /= newton_iteration_count ;
+  }
+
+  if ( cg_iteration_count_total ) {
+    perf_data.cg_iteration_time /= cg_iteration_count_total ;
+  }
+
+  perf_data.newton_iteration_count = newton_iteration_count ;
+  perf_data.cg_iteration_count = cg_iteration_count_total ;
+
+  //------------------------------------
+
+  {
+    // For extracting the nodal solution and its coordinates:
+
+    typename mesh_type::node_coords_type::HostMirror node_coords_host =
+      Kokkos::create_mirror( mesh.node_coords );
+
+    typename vector_type::HostMirror nodal_solution_host =
+      Kokkos::create_mirror( nodal_solution );
+
+    Kokkos::deep_copy( node_coords_host , mesh.node_coords );
+    Kokkos::deep_copy( nodal_solution_host , nodal_solution );
+
+    double tmp = 0 ;
+
+    for ( size_t i = 0 ; i < mesh.parallel_data_map.count_owned ; ++i ) {
+      const coordinate_scalar_type x = node_coords_host(i,0);
+      const coordinate_scalar_type y = node_coords_host(i,1);
+      const coordinate_scalar_type z = node_coords_host(i,2);
+
+      const double Tx = exact_solution(z);
+      const double Ts = nodal_solution_host(i);
+      const double Te = std::abs( Tx - Ts ) / std::abs( Tx );
+
+      tmp = std::max( tmp , Te );
+
+      if ( print_error && 0.02 < Te ) {
+        std::cout << "  node( " << x << " " << y << " " << z << " ) = "
+                  << Ts << " != exact_solution " << Tx
+                  << std::endl ;
+      }
+    }
+    perf_data.error_max = comm::max( machine , tmp );
+  }
+
+  return perf_data ;
+}
+
+//----------------------------------------------------------------------------
+// Benchmark driver: for each mesh size in the sweep, call run() 'runs' times and print the best timings on rank 0.
+template< typename Scalar , class Device , class FixtureElement >
+void driver( const char * const label ,      // text appended to the report title line
+             comm::Machine machine ,         // handle given to comm::size / comm::rank and stored in the mesh
+             const int gang_count ,          // forwarded unchanged to fixture_type::create
+             const int elem_count_beg ,      // first requested element count; doubled on every sweep step
+             const int elem_count_end ,      // exclusive upper bound on the requested element count
+             const int runs )                // number of run() repetitions per mesh size
+{
+  typedef Scalar          scalar_type ;
+  typedef Device          execution_space ;
+  typedef double          coordinate_scalar_type ;
+  typedef FixtureElement  fixture_element_type ;
+
+  typedef BoxMeshFixture< coordinate_scalar_type ,
+                          execution_space ,
+                          fixture_element_type > fixture_type ;
+
+  typedef typename fixture_type::FEMeshType mesh_type ;
+
+  const size_t proc_count = comm::size( machine );
+  const size_t proc_rank  = comm::rank( machine );
+  // Reject empty or negative requests: doubling a negative start ( i *= 2 ) only moves away from the bound.
+  if ( elem_count_beg <= 0 || elem_count_end <= 0 || runs <= 0 ) return ;
+  // Only rank 0 prints the title and the two CSV header rows (column names, then units).
+  if ( 0 == proc_rank ) {
+    std::cout << std::endl ;
+    std::cout << "\"Kokkos::HybridFE::Nonlinear " << label << "\"" << std::endl;
+    std::cout
+      << "\"Size\" ,  \"Size\" ,  \"Graphing\" , \"Element\" ,  \"Fill\" ,     \"Boundary\" , \"CG-Iter\" , \"CG-Iter\" ,      \"Newton-Iter\" , \"Max-node-error\""
+      << std::endl
+      << "\"elems\" , \"nodes\" , \"millisec\" , \"millisec\" , \"millisec\" , \"millisec\" , \"millisec\" , \"total-count\" , \"total-count\" , \"ratio\""
+      << std::endl ;
+  }
+
+  const bool print_sample = false ;
+  const double x_curve = 1.0 ;
+  const double y_curve = 1.0 ;
+  const double z_curve = 0.8 ;
+  // Sweep: the requested element count doubles each pass until it reaches elem_count_end.
+  for(int i = elem_count_beg ; i < elem_count_end ; i *= 2 )
+  { // Pick ix so that ix * iy * iz = 2 * ix * (ix+1)^2 is roughly the requested count i.
+    const int ix = std::max( 1 , (int) cbrt( ((double) i) / 2.0 ) );
+    const int iy = 1 + ix ;
+    const int iz = 2 * iy ;
+    const int global_elem_count = ix * iy * iz ;
+    const int global_node_count = ( 2 * ix + 1 ) *
+                                  ( 2 * iy + 1 ) *
+                                  ( 2 * iz + 1 );
+
+    mesh_type mesh =
+      fixture_type::create( proc_count , proc_rank , gang_count ,
+                            ix , iy , iz ,
+                            x_curve , y_curve , z_curve );
+
+    mesh.parallel_data_map.machine = machine ;
+
+    // The first run seeds perf_best; later runs are merged through PerformanceData::best.
+    PerformanceData perf_data , perf_best ;
+
+    for(int j = 0; j < runs; j++){
+
+      perf_data = run<scalar_type,fixture_type>(mesh,ix,iy,iz, print_sample );
+
+      if( j == 0 ) {
+        perf_best = perf_data ;
+      }
+      else {
+        perf_best.best( perf_data );
+      }
+    }
+
+    if ( 0 == proc_rank ) {
+
+      std::cout << std::setw(8) << global_elem_count << " , "
+                << std::setw(8) << global_node_count << " , "
+                << std::setw(10) << perf_best.graph_time * 1000 << " , "
+                << std::setw(10) << perf_best.elem_time * 1000 << " , "
+                << std::setw(10) << perf_best.matrix_gather_fill_time * 1000 << " , "
+                << std::setw(10) << perf_best.matrix_boundary_condition_time * 1000 << " , "
+                << std::setw(10) << perf_best.cg_iteration_time * 1000 << " , "
+                << std::setw(7) << perf_best.cg_iteration_count << " , "
+                << std::setw(3) << perf_best.newton_iteration_count << " , "
+                << std::setw(10) << perf_best.error_max
+                << std::endl ;
+    }
+  }
+}
+
+//----------------------------------------------------------------------------
+
+} /* namespace Nonlinear */
+} /* namespace HybridFEM */
+
+
+#endif /* #ifndef HYBRIDFEM_IMPLICIT_HPP */
+
diff --git a/lib/kokkos/example/multi_fem/NonlinearElement_Cuda.hpp b/lib/kokkos/example/multi_fem/NonlinearElement_Cuda.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..b2adc2adab302ec05f4ca2218e0321583f52a044
--- /dev/null
+++ b/lib/kokkos/example/multi_fem/NonlinearElement_Cuda.hpp
@@ -0,0 +1,390 @@
+//@HEADER
+// ************************************************************************
+// 
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+// 
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+// 
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact  H. Carter Edwards (hcedwar@sandia.gov)
+// 
+// ************************************************************************
+//@HEADER
+
+#include <stdio.h>
+
+#include <iostream>
+#include <fstream>
+#include <iomanip>
+#include <cstdlib>
+#include <cmath>
+
+#include <Kokkos_Core.hpp>
+#include <HexElement.hpp>
+#include <FEMesh.hpp>
+
+namespace HybridFEM {
+namespace Nonlinear {
+
+template< class MeshType , typename ScalarType > struct ElementComputation ;
+
+//----------------------------------------------------------------------------
+// Cuda / double / 27-node specialization: the constructor itself launches the kernel that fills the element matrices and vectors.
+template<>
+struct ElementComputation< FEMesh< double , 27 , Kokkos::Cuda > , double >
+{
+  typedef Kokkos::Cuda    execution_space ;
+
+  static const unsigned ElementNodeCount = 27 ;
+
+  typedef HexElement_Data< ElementNodeCount >                element_data_type ;
+  typedef FEMesh< double , ElementNodeCount , execution_space >  mesh_type ;
+
+  static const unsigned SpatialDim       = element_data_type::spatial_dimension ;
+  static const unsigned FunctionCount    = element_data_type::function_count ;
+  static const unsigned IntegrationCount = element_data_type::integration_count ;
+  static const unsigned TensorDim        = SpatialDim * SpatialDim ;
+
+  typedef Kokkos::View< double[][FunctionCount][FunctionCount] , execution_space > elem_matrices_type ;
+  typedef Kokkos::View< double[][FunctionCount] , execution_space > elem_vectors_type ;
+  typedef Kokkos::View< double[] , execution_space > value_vector_type ;
+
+private:
+  // State captured by the constructor and read by the device methods below:
+  const element_data_type                       elem_data ;
+  const typename mesh_type::elem_node_ids_type  elem_node_ids ;
+  const typename mesh_type::node_coords_type    node_coords ;
+  const value_vector_type                       nodal_values ;
+  const elem_matrices_type                      element_matrices ;
+  const elem_vectors_type                       element_vectors ;
+  const float                                   coeff_K ;
+  const unsigned                                elem_count ;
+        unsigned                                invJacIndex[9][4] ;
+  // Positions of the 3x3 tensor entries inside a flat 9-element array:
+  static const unsigned j11 = 0 , j12 = 1 , j13 = 2 ,
+                        j21 = 3 , j22 = 4 , j23 = 5 ,
+                        j31 = 6 , j32 = 7 , j33 = 8 ;
+
+  // Thread-block shape used for the launch (original note: "Can only handle up to 16 warps"):
+  static const unsigned BlockDimX = 32 ;
+  static const unsigned BlockDimY = 7 ;
+  // Layout of the dynamic shared memory; sizeof(WorkSpace) is the shared-memory size passed to the launch.
+  struct WorkSpace {
+    double sum[ BlockDimY ][ BlockDimX ];  // scratch indexed [threadIdx.y][threadIdx.x]: gather staging and sum_x reductions
+    // Nodal field value and its x/y/z gradient at each integration point (written by evaluateFunctions):
+    double  value_at_integ[ IntegrationCount ];
+    double  gradx_at_integ[ IntegrationCount ];
+    double  grady_at_integ[ IntegrationCount ];
+    double  gradz_at_integ[ IntegrationCount ];
+    // One 3x3 Jacobian and one inverse per threadIdx.y:
+    float  spaceJac[    BlockDimY ][ 9 ];
+    float  spaceInvJac[ BlockDimY ][ 9 ];
+    // detJ * integration weight, per integration point:
+    float  detJweight[ IntegrationCount ];
+    // Transformed basis gradients, indexed [function][integration point]:
+    float  dpsidx[ FunctionCount ][ IntegrationCount ];
+    float  dpsidy[ FunctionCount ][ IntegrationCount ];
+    float  dpsidz[ FunctionCount ][ IntegrationCount ];
+  };
+
+public:
+  // Stores the views and coefficient, fills invJacIndex, then launches the kernel over the mesh elements.
+  ElementComputation ( const mesh_type          & arg_mesh ,
+                       const elem_matrices_type & arg_element_matrices ,
+                       const elem_vectors_type  & arg_element_vectors ,
+                       const value_vector_type  & arg_nodal_values ,
+                       const float                arg_coeff_K )
+  : elem_data()
+  , elem_node_ids(    arg_mesh.elem_node_ids )
+  , node_coords(      arg_mesh.node_coords )
+  , nodal_values(     arg_nodal_values )
+  , element_matrices( arg_element_matrices )
+  , element_vectors(  arg_element_vectors )
+  , coeff_K(          arg_coeff_K )
+  , elem_count(       arg_mesh.elem_node_ids.dimension_0() )
+  {
+    const unsigned jInvJ[9][4] =  // row r = { a , b , c , d } means invJ[r] = J[a]*J[b] - J[c]*J[d] (before dividing by detJ)
+     { { j22 , j33 , j23 , j32 } ,
+       { j13 , j32 , j12 , j33 } ,
+       { j12 , j23 , j13 , j22 } ,
+
+       { j23 , j31 , j21 , j33 } ,
+       { j11 , j33 , j13 , j31 } ,
+       { j13 , j21 , j11 , j23 } ,
+
+       { j21 , j32 , j22 , j31 } ,
+       { j12 , j31 , j11 , j32 } ,
+       { j11 , j22 , j12 , j21 } };
+    // Copy the table into the member array so the device code can read it from the functor.
+    for ( unsigned i = 0 ; i < 9 ; ++i ) {
+    for ( unsigned j = 0 ; j < 4 ; ++j ) {
+      invJacIndex[i][j] = jInvJ[i][j] ;
+    }
+    }
+
+    const unsigned shmem = sizeof(WorkSpace);
+    const unsigned grid_max = 65535 ;
+    const unsigned grid_count = std::min( grid_max , elem_count );
+
+    // For compute capability 2.x up to 1024 threads per block
+    const dim3 block( BlockDimX , BlockDimY , 1 );
+    const dim3 grid( grid_count , 1 , 1 );
+    // When elem_count exceeds grid_count, operator() lets each block loop over several elements.
+    Kokkos::Impl::CudaParallelLaunch< ElementComputation >( *this , grid , block , shmem );
+  }
+
+public:
+
+  //------------------------------------
+  // Sum 'value' across threadIdx.x within one threadIdx.y row; the thread with threadIdx.x == 0 stores the total in 'result'.
+  // No barrier is issued here. NOTE(review): appears to rely on the 32 x-threads of a row executing in lockstep - confirm.
+  template< typename Type >
+  __device__ inline static
+  void sum_x( Type & result , const double value )
+  {
+    extern __shared__ WorkSpace work_data[] ;
+
+    volatile double * const base_sum =
+      & work_data->sum[ threadIdx.y ][ threadIdx.x ] ;
+
+    base_sum[ 0] = value ;
+    // The lower 16 threads fold in the slots at offsets 16, 8, 4, 2, 1; only slot 0's total is read afterwards.
+    if ( threadIdx.x < 16 ) {
+      base_sum[0] += base_sum[16];
+      base_sum[0] += base_sum[ 8];
+      base_sum[0] += base_sum[ 4];
+      base_sum[0] += base_sum[ 2];
+      base_sum[0] += base_sum[ 1];
+    }
+
+    if ( 0 == threadIdx.x ) {
+      result = base_sum[0] ;
+    }
+  }
+  // Zero the calling thread's slot of the reduction scratch.
+  __device__ inline static
+  void sum_x_clear()
+  {
+    extern __shared__ WorkSpace work_data[] ;
+
+    work_data->sum[ threadIdx.y ][ threadIdx.x ] = 0 ;
+  }
+
+  //------------------------------------
+  //------------------------------------
+  // For element 'ielem': fill detJweight, dpsidx/y/z and the field value and gradient at every integration point in shared memory.
+  __device__ inline
+  void evaluateFunctions( const unsigned ielem ) const
+  {
+    extern __shared__ WorkSpace work_data[] ;
+
+    // Each warp (threadIdx.y) computes an integration point
+    // Each thread is responsible for a node / function.
+
+    const unsigned iFunc = threadIdx.x ;
+    const bool     hasFunc = iFunc < FunctionCount ;
+
+    //------------------------------------
+    // Rows with threadIdx.y = 0..3 gather x, y, z and the nodal value into work_data->sum[0..3].
+
+    if ( hasFunc ) {
+
+      const unsigned node = elem_node_ids( ielem , iFunc );
+
+      for ( unsigned iy = threadIdx.y ; iy < 4 ; iy += blockDim.y ) {
+      switch( iy ) {
+      case 0 : work_data->sum[0][iFunc] = node_coords(node,0); break ;
+      case 1 : work_data->sum[1][iFunc] = node_coords(node,1); break ;
+      case 2 : work_data->sum[2][iFunc] = node_coords(node,2); break ;
+      case 3 : work_data->sum[3][iFunc] = nodal_values(node); break ;
+      default: break ;
+      }
+      }
+    }
+
+    __syncthreads(); // Wait for all warps to finish gathering
+
+    // now get local 'const' copies in register space:
+
+    const double x       = work_data->sum[0][ iFunc ];
+    const double y       = work_data->sum[1][ iFunc ];
+    const double z       = work_data->sum[2][ iFunc ];
+    const double dof_val = work_data->sum[3][ iFunc ];
+
+    __syncthreads(); // Wait for all warps to finish extracting
+
+    sum_x_clear(); // Make sure summation scratch is zero
+
+    //------------------------------------
+    // Each warp is now on its own computing an integration point
+    // so no further explicit synchronizations are required.
+
+    if ( hasFunc ) {
+
+      float * const J    = work_data->spaceJac[    threadIdx.y ];
+      float * const invJ = work_data->spaceInvJac[ threadIdx.y ];
+
+      for ( unsigned iInt = threadIdx.y ;
+                     iInt < IntegrationCount ; iInt += blockDim.y ) {
+
+        const float val = elem_data.values[iInt][iFunc] ;
+        const float gx  = elem_data.gradients[iInt][0][iFunc] ;
+        const float gy  = elem_data.gradients[iInt][1][iFunc] ;
+        const float gz  = elem_data.gradients[iInt][2][iFunc] ;
+        // Jacobian entry J[a][b] = sum over functions of ( gradient component a ) * ( coordinate b ):
+        sum_x( J[j11], gx * x );
+        sum_x( J[j12], gx * y );
+        sum_x( J[j13], gx * z );
+
+        sum_x( J[j21], gy * x );
+        sum_x( J[j22], gy * y );
+        sum_x( J[j23], gy * z );
+
+        sum_x( J[j31], gz * x );
+        sum_x( J[j32], gz * y );
+        sum_x( J[j33], gz * z );
+
+        // Inverse jacobian, only enough parallel work for 9 threads in the warp
+
+        if ( iFunc < TensorDim ) {
+
+          invJ[ iFunc ] =
+            J[ invJacIndex[iFunc][0] ] * J[ invJacIndex[iFunc][1] ] -
+            J[ invJacIndex[iFunc][2] ] * J[ invJacIndex[iFunc][3] ] ;
+
+          // Each of these ( iFunc < TensorDim ) threads computes the determinant into a register
+
+          const float detJ = J[j11] * invJ[j11] +
+                             J[j21] * invJ[j12] +
+                             J[j31] * invJ[j13] ;
+
+          invJ[ iFunc ] /= detJ ;
+
+          if ( 0 == iFunc ) {
+            work_data->detJweight[ iInt ] = detJ * elem_data.weights[ iInt ] ;
+          }
+        }
+
+        // Transform bases gradients and compute value and gradient
+
+        const float dx = gx * invJ[j11] + gy * invJ[j12] + gz * invJ[j13];
+        const float dy = gx * invJ[j21] + gy * invJ[j22] + gz * invJ[j23];
+        const float dz = gx * invJ[j31] + gy * invJ[j32] + gz * invJ[j33];
+
+        work_data->dpsidx[iFunc][iInt] = dx ;
+        work_data->dpsidy[iFunc][iInt] = dy ;
+        work_data->dpsidz[iFunc][iInt] = dz ;
+
+        sum_x( work_data->gradx_at_integ[iInt] , dof_val * dx );
+        sum_x( work_data->grady_at_integ[iInt] , dof_val * dy );
+        sum_x( work_data->gradz_at_integ[iInt] , dof_val * dz );
+        sum_x( work_data->value_at_integ[iInt] , dof_val * val );
+      }
+    }
+
+    __syncthreads(); // All shared data must be populated at return.
+  }
+  // For element 'ielem': sum the integration-point contributions over threadIdx.x into element_vectors and element_matrices.
+  __device__ inline
+  void contributeResidualJacobian( const unsigned ielem ) const
+  {
+    extern __shared__ WorkSpace work_data[] ;
+
+    sum_x_clear(); // Make sure summation scratch is zero
+
+    // $$ R_i = \int_{\Omega} \nabla \phi_i \cdot (k \nabla T) + \phi_i T^2 d \Omega $$ 
+    // $$ J_{i,j} = \frac{\partial R_i}{\partial T_j} = \int_{\Omega} k \nabla \phi_i \cdot \nabla \phi_j + 2 \phi_i \phi_j T d \Omega $$ 
+
+    const unsigned iInt = threadIdx.x ;
+
+    if ( iInt < IntegrationCount ) {
+
+      const double value_at_integ = work_data->value_at_integ[ iInt ] ;
+      const double gradx_at_integ = work_data->gradx_at_integ[ iInt ] ;
+      const double grady_at_integ = work_data->grady_at_integ[ iInt ] ;
+      const double gradz_at_integ = work_data->gradz_at_integ[ iInt ] ;
+
+      const float detJweight     = work_data->detJweight[ iInt ] ;
+      const float coeff_K_detJweight = coeff_K * detJweight ;
+      // Rows are dealt out over threadIdx.y; within a row, threadIdx.x selects the integration point.
+      for ( unsigned iRow = threadIdx.y ;
+                     iRow < FunctionCount ; iRow += blockDim.y ) {
+
+        const float value_row  = elem_data.values[ iInt ][ iRow ] * detJweight ;
+        const float dpsidx_row = work_data->dpsidx[ iRow ][ iInt ] * coeff_K_detJweight ;
+        const float dpsidy_row = work_data->dpsidy[ iRow ][ iInt ] * coeff_K_detJweight ;
+        const float dpsidz_row = work_data->dpsidz[ iRow ][ iInt ] * coeff_K_detJweight ;
+
+        const double res_del = dpsidx_row * gradx_at_integ +
+                               dpsidy_row * grady_at_integ +
+                               dpsidz_row * gradz_at_integ ;
+
+        const double res_val = value_at_integ * value_at_integ * value_row ;
+        const double jac_val_row = 2 * value_at_integ * value_row ;
+        // Residual entry for this row: sum_x adds the per-integration-point terms and stores the total.
+        sum_x( element_vectors( ielem , iRow ) , res_del + res_val );
+
+        for ( unsigned iCol = 0 ; iCol < FunctionCount ; ++iCol ) {
+
+          const float jac_del = 
+            dpsidx_row * work_data->dpsidx[iCol][iInt] +
+            dpsidy_row * work_data->dpsidy[iCol][iInt] +
+            dpsidz_row * work_data->dpsidz[iCol][iInt] ;
+
+          const double jac_val =
+            jac_val_row * elem_data.values[ iInt ][ iCol ] ;
+
+          sum_x( element_matrices( ielem , iRow , iCol ) , jac_del + jac_val );
+        }
+      }
+    }
+
+    __syncthreads(); // All warps finish before refilling shared data 
+  }
+  // Kernel body: each block handles elements blockIdx.x, blockIdx.x + gridDim.x, ... while below elem_count.
+  __device__ inline
+  void operator()(void) const
+  {
+    extern __shared__ WorkSpace work_data[] ;  // declared but not referenced directly in this method
+
+    for ( unsigned ielem = blockIdx.x ; ielem < elem_count ; ielem += gridDim.x ) {
+
+      evaluateFunctions( ielem );
+
+      contributeResidualJacobian( ielem );
+    }
+  }
+
+}; /* ElementComputation */
+
+} /* namespace Nonlinear */
+} /* namespace HybridFEM */
+
diff --git a/lib/kokkos/example/multi_fem/NonlinearFunctors.hpp b/lib/kokkos/example/multi_fem/NonlinearFunctors.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..9628236181f034f242ce11c2f56783ba9b934797
--- /dev/null
+++ b/lib/kokkos/example/multi_fem/NonlinearFunctors.hpp
@@ -0,0 +1,482 @@
+/*
+//@HEADER
+// ************************************************************************
+// 
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+// 
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+// 
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact  H. Carter Edwards (hcedwar@sandia.gov)
+// 
+// ************************************************************************
+//@HEADER
+*/
+
+#ifndef KOKKOS_NONLINEARFUNCTORS_HPP
+#define KOKKOS_NONLINEARFUNCTORS_HPP
+
+#include <iostream>
+#include <fstream>
+#include <iomanip>
+#include <cstdlib>
+#include <cmath>
+
+namespace HybridFEM {
+namespace Nonlinear {
+
+template< class MeshType , typename ScalarType > struct ElementComputation ;
+template< class MeshType , typename ScalarType > struct DirichletSolution ;
+template< class MeshType , typename ScalarType > struct DirichletResidual ;
+
+}
+}
+
+/* A Cuda-specific specialization for the element computation functor. */
+#if defined( __CUDACC__ )
+#include <NonlinearElement_Cuda.hpp>
+#endif
+
+//----------------------------------------------------------------------------
+//----------------------------------------------------------------------------
+
+namespace HybridFEM {
+namespace Nonlinear {
+// Generic element functor: one parallel work item per element computes that element's residual vector and Jacobian matrix.
+template< typename ScalarCoordType , unsigned ElemNode , class DeviceType ,
+          typename ScalarType >
+struct ElementComputation<
+  FEMesh< ScalarCoordType , ElemNode , DeviceType > , ScalarType >
+{
+  typedef DeviceType  execution_space;
+  typedef ScalarType  scalar_type ;
+
+  static const unsigned ElementNodeCount = ElemNode ;
+
+  typedef FEMesh< ScalarCoordType , ElementNodeCount , execution_space > mesh_type ;
+
+  typedef HexElement_Data< ElementNodeCount > element_data_type ;
+
+  static const unsigned SpatialDim       = element_data_type::spatial_dimension ;
+  static const unsigned FunctionCount    = element_data_type::function_count ;
+  static const unsigned IntegrationCount = element_data_type::integration_count ;
+  static const unsigned TensorDim        = SpatialDim * SpatialDim ;
+
+  typedef Kokkos::View< scalar_type[][FunctionCount][FunctionCount] , execution_space > elem_matrices_type ;
+  typedef Kokkos::View< scalar_type[][FunctionCount] , execution_space > elem_vectors_type ;
+  typedef Kokkos::View< scalar_type[] , execution_space > value_vector_type ;
+
+
+private:
+  // Reference-element data, mesh views, input solution, output views and the gradient-term coefficient:
+  const element_data_type                 elem_data ;
+  typename mesh_type::elem_node_ids_type  elem_node_ids ;
+  typename mesh_type::node_coords_type    node_coords ;
+  value_vector_type                       nodal_values ;
+  elem_matrices_type                      element_matrices ;
+  elem_vectors_type                       element_vectors ;
+  scalar_type                             coeff_K ;
+
+public:
+  // Captures the mesh views, outputs and coefficient, then immediately runs the functor over every element.
+  ElementComputation( const mesh_type          & arg_mesh ,
+                      const elem_matrices_type & arg_element_matrices ,
+                      const elem_vectors_type  & arg_element_vectors ,
+                      const value_vector_type  & arg_nodal_values ,
+                      const scalar_type          arg_coeff_K )
+  : elem_data()
+  , elem_node_ids(    arg_mesh.elem_node_ids )
+  , node_coords(      arg_mesh.node_coords )
+  , nodal_values(     arg_nodal_values )
+  , element_matrices( arg_element_matrices )
+  , element_vectors(  arg_element_vectors )
+  , coeff_K(          arg_coeff_K )
+  {
+    const size_t work_count = arg_mesh.elem_node_ids.dimension_0();
+    // Each work item invokes operator()( ielem ) for one element.
+    parallel_for( work_count , *this );
+  }
+
+  //------------------------------------
+  // Operation-count estimate for one call to transform_gradients:
+  static const unsigned FLOPS_transform_gradients =
+     /* Jacobian */           FunctionCount * TensorDim * 2 +
+     /* Inverse jacobian */   TensorDim * 6 + 6 +
+     /* Gradient transform */ FunctionCount * 15 ;
+  // Maps the reference-element basis gradients to physical space for one integration point; returns det(J) as float.
+  KOKKOS_INLINE_FUNCTION
+  float transform_gradients(
+    const float grad[][ FunctionCount ] , // reference-element basis gradients, indexed [direction][function]
+    const double x[] ,                    // nodal x coordinates of the element
+    const double y[] ,                    // nodal y coordinates of the element
+    const double z[] ,                    // nodal z coordinates of the element
+    float dpsidx[] ,                      // out: d(basis)/dx per function
+    float dpsidy[] ,                      // out: d(basis)/dy per function
+    float dpsidz[] ) const                // out: d(basis)/dz per function
+  {
+    enum { j11 = 0 , j12 = 1 , j13 = 2 ,
+           j21 = 3 , j22 = 4 , j23 = 5 ,
+           j31 = 6 , j32 = 7 , j33 = 8 };
+
+    // Accumulate the 3x3 Jacobian in double precision:
+
+    double J[ TensorDim ] = { 0, 0, 0,  0, 0, 0,  0, 0, 0 };
+
+    for( unsigned n = 0 ; n < FunctionCount ; n++ ) {
+      const double cx = x[n] ;
+      const double cy = y[n] ;
+      const double cz = z[n] ;
+
+      const float ga = grad[0][n] ;
+      const float gb = grad[1][n] ;
+      const float gc = grad[2][n] ;
+
+      J[j11] += ga * cx ;
+      J[j12] += ga * cy ;
+      J[j13] += ga * cz ;
+
+      J[j21] += gb * cx ;
+      J[j22] += gb * cy ;
+      J[j23] += gb * cz ;
+
+      J[j31] += gc * cx ;
+      J[j32] += gc * cy ;
+      J[j33] += gc * cz ;
+    }
+
+    // Unscaled inverse (2x2 cofactor products), narrowed to float:
+
+    float invJ[ TensorDim ] = {
+      static_cast<float>( J[j22] * J[j33] - J[j23] * J[j32] ) ,
+      static_cast<float>( J[j13] * J[j32] - J[j12] * J[j33] ) ,
+      static_cast<float>( J[j12] * J[j23] - J[j13] * J[j22] ) ,
+
+      static_cast<float>( J[j23] * J[j31] - J[j21] * J[j33] ) ,
+      static_cast<float>( J[j11] * J[j33] - J[j13] * J[j31] ) ,
+      static_cast<float>( J[j13] * J[j21] - J[j11] * J[j23] ) ,
+
+      static_cast<float>( J[j21] * J[j32] - J[j22] * J[j31] ) ,
+      static_cast<float>( J[j12] * J[j31] - J[j11] * J[j32] ) ,
+      static_cast<float>( J[j11] * J[j22] - J[j12] * J[j21] ) };
+    // Determinant by expansion along the first column, using the unscaled inverse entries:
+    const float detJ = J[j11] * invJ[j11] +
+                       J[j21] * invJ[j12] +
+                       J[j31] * invJ[j13] ;
+
+    const float inv_det = 1.0 / detJ ;
+
+    for ( unsigned t = 0 ; t < TensorDim ; t++ ) { invJ[t] *= inv_det ; }
+
+    // Apply the scaled inverse to every basis gradient:
+
+    for( unsigned n = 0 ; n < FunctionCount ; n++ ) {
+      const float r0 = grad[0][n];
+      const float r1 = grad[1][n];
+      const float r2 = grad[2][n];
+
+      dpsidx[n] = r0 * invJ[j11] + r1 * invJ[j12] + r2 * invJ[j13];
+      dpsidy[n] = r0 * invJ[j21] + r1 * invJ[j22] + r2 * invJ[j23];
+      dpsidz[n] = r0 * invJ[j31] + r1 * invJ[j32] + r2 * invJ[j33];
+    }
+
+    return detJ ;
+  }
+  // Adds one integration point's contribution to the element residual 'elem_res' and Jacobian 'elem_mat'.
+  KOKKOS_INLINE_FUNCTION
+  void contributeResidualJacobian(
+    const float coeff_k ,
+    const double dof_values[] ,
+    const float dpsidx[] ,
+    const float dpsidy[] ,
+    const float dpsidz[] ,
+    const float detJ ,
+    const float integ_weight ,
+    const float bases_vals[] ,
+    double elem_res[] ,
+    double elem_mat[][ FunctionCount ] ) const
+  {
+    double val_pt = 0 ;
+    double gx_pt  = 0 ;
+    double gy_pt  = 0 ;
+    double gz_pt  = 0 ;
+    // Interpolate the field value and its gradient at this integration point:
+    for ( unsigned a = 0 ; a < FunctionCount ; ++a ) {
+      val_pt += dof_values[a] * bases_vals[a] ;
+      gx_pt  += dof_values[a] * dpsidx[a] ;
+      gy_pt  += dof_values[a] * dpsidy[a] ;
+      gz_pt  += dof_values[a] * dpsidz[a] ;
+    }
+
+    const scalar_type k_scale = coeff_k     * detJ * integ_weight ;
+    const double res_scale = val_pt * val_pt * detJ * integ_weight ;
+    const double jac_scale = 2.0 * val_pt    * detJ * integ_weight ;
+
+    // $$ R_i = \int_{\Omega} \nabla \phi_i \cdot (k \nabla T) + \phi_i T^2 d \Omega $$
+    // $$ J_{i,j} = \frac{\partial R_i}{\partial T_j} = \int_{\Omega} k \nabla \phi_i \cdot \nabla \phi_j + 2 \phi_i \phi_j T d \Omega $$
+
+    for ( unsigned a = 0 ; a < FunctionCount ; ++a ) {
+      double * const jac_row = elem_mat[a] ;
+      const float phi_a = bases_vals[a];
+      const float dx_a  = dpsidx[a] ;
+      const float dy_a  = dpsidy[a] ;
+      const float dz_a  = dpsidz[a] ;
+
+      elem_res[a] += k_scale * ( dx_a * gx_pt +
+                                 dy_a * gy_pt +
+                                 dz_a * gz_pt ) +
+                     res_scale * phi_a ;
+
+      for ( unsigned b = 0 ; b < FunctionCount ; ++b ) {
+
+        jac_row[b] += k_scale * ( dx_a * dpsidx[b] +
+                                  dy_a * dpsidy[b] +
+                                  dz_a * dpsidz[b] ) +
+                      jac_scale * phi_a * bases_vals[b];
+      }
+    }
+  }
+  // Work item for element 'ielem': gather nodal data, integrate, then store the results in the output views.
+  KOKKOS_INLINE_FUNCTION
+  void operator()( const unsigned ielem ) const
+  {
+    // Local copies of the element's node coordinates and nodal solution values:
+
+    double node_x[ FunctionCount ] ;
+    double node_y[ FunctionCount ] ;
+    double node_z[ FunctionCount ] ;
+    double node_t[ FunctionCount ] ;
+
+    for ( unsigned k = 0 ; k < ElementNodeCount ; k++ ) {
+      const unsigned nid = elem_node_ids( ielem , k );
+
+      node_x[k] = node_coords( nid , 0 );
+      node_y[k] = node_coords( nid , 1 );
+      node_z[k] = node_coords( nid , 2 );
+
+      node_t[k] = nodal_values( nid );
+    }
+
+    double res_e[ FunctionCount ] ;
+    double jac_e[ FunctionCount ][ FunctionCount ] ;
+    // Start both accumulators from zero:
+    for ( unsigned row = 0 ; row < FunctionCount ; ++row ) {
+      res_e[row] = 0 ;
+      for ( unsigned col = 0 ; col < FunctionCount ; ++col ) {
+        jac_e[row][col] = 0 ;
+      }
+    }
+    // Accumulate the contribution of every integration point:
+    for ( unsigned ip = 0 ; ip < IntegrationCount ; ip++ ) {
+      float dpsidx[ FunctionCount ] ;
+      float dpsidy[ FunctionCount ] ;
+      float dpsidz[ FunctionCount ] ;
+
+      const float detJ =
+        transform_gradients( elem_data.gradients[ip] , node_x , node_y , node_z ,
+                             dpsidx , dpsidy , dpsidz );
+
+      contributeResidualJacobian( coeff_K ,
+                                  node_t , dpsidx , dpsidy , dpsidz ,
+                                  detJ ,
+                                  elem_data.weights[ip] ,
+                                  elem_data.values[ip] ,
+                                  res_e , jac_e );
+    }
+    // Write the local results into the per-element output views:
+    for ( unsigned row = 0 ; row < FunctionCount ; ++row ) {
+      element_vectors( ielem , row ) = res_e[row] ;
+      for ( unsigned col = 0 ; col < FunctionCount ; ++col ) {
+        element_matrices( ielem , row , col ) = jac_e[row][col] ;
+      }
+    }
+  }
+
+}; /* ElementComputation */
+
+//----------------------------------------------------------------------------
+
+// Functor that imposes Dirichlet boundary values on a solution vector.
+template< typename ScalarCoordType , unsigned ElemNode ,
+          class DeviceType , typename ScalarType >
+struct DirichletSolution<
+  FEMesh< ScalarCoordType , ElemNode , DeviceType > , ScalarType >
+{
+  typedef DeviceType  execution_space;
+
+  static const unsigned ElementNodeCount = ElemNode ;
+
+  typedef Kokkos::View< ScalarType[] , execution_space >  vector_type ;
+  typedef FEMesh< ScalarCoordType , ElementNodeCount , execution_space > mesh_type ;
+
+  // Functor state: node coordinates, the vector to modify, and the
+  // z-planes and values that define the lower and upper boundary faces.
+  typename mesh_type::node_coords_type node_coords ;
+  vector_type     solution ;
+  ScalarCoordType bc_lower_z ;
+  ScalarCoordType bc_upper_z ;
+  ScalarType      bc_lower_value ;
+  ScalarType      bc_upper_value ;
+
+  // Assign the boundary value to solution(inode) when node 'inode' lies on
+  // the lower face (z <= bc_lower_z) or the upper face (bc_upper_z <= z).
+  // The lower face takes precedence; all other nodes keep their value.
+  KOKKOS_INLINE_FUNCTION
+  void operator()( const unsigned inode ) const
+  {
+    const ScalarCoordType node_z = node_coords(inode,2);
+
+    const bool on_lower_face = node_z <= bc_lower_z ;
+    const bool on_upper_face = bc_upper_z <= node_z ;
+
+    if ( on_lower_face ) {
+      solution(inode) = bc_lower_value ;
+    }
+    else if ( on_upper_face ) {
+      solution(inode) = bc_upper_value ;
+    }
+  }
+
+  // Build the functor from its arguments and run it over every entry of 'solution'.
+  static void apply( const vector_type    & solution ,
+                     const mesh_type      & mesh ,
+                     const ScalarCoordType  bc_lower_z ,
+                     const ScalarCoordType  bc_upper_z ,
+                     const ScalarType       bc_lower_value ,
+                     const ScalarType       bc_upper_value )
+  {
+    DirichletSolution functor ;
+    functor.solution       = solution ;
+    functor.node_coords    = mesh.node_coords ;
+    functor.bc_lower_z     = bc_lower_z ;
+    functor.bc_lower_value = bc_lower_value ;
+    functor.bc_upper_z     = bc_upper_z ;
+    functor.bc_upper_value = bc_upper_value ;
+
+    parallel_for( solution.dimension_0() , functor );
+  }
+};
+
+//----------------------------------------------------------------------------
+
+template< typename ScalarCoordType , unsigned ElemNode , class DeviceType ,
+          typename ScalarType >
+struct DirichletResidual<
+  FEMesh< ScalarCoordType , ElemNode , DeviceType > , ScalarType >
+{ // Functor that applies Dirichlet conditions to a linear system (matrix and residual).
+  typedef DeviceType     execution_space;
+  typedef typename execution_space::size_type  size_type ;
+
+  static const unsigned ElementNodeCount = ElemNode ;
+
+  typedef Kokkos::CrsMatrix< ScalarType , execution_space >    matrix_type ;
+  typedef Kokkos::View< ScalarType[] , execution_space >  vector_type ;
+
+  typedef FEMesh< ScalarCoordType , ElementNodeCount , execution_space > mesh_type ;
+
+  typename mesh_type::node_coords_type node_coords ;
+  matrix_type     matrix ;
+  vector_type     rhs ;
+  ScalarCoordType bc_lower_z ;
+  ScalarCoordType bc_upper_z ;
+
+  KOKKOS_INLINE_FUNCTION
+  void operator()( const unsigned inode ) const
+  {
+    //  Impose the Dirichlet condition on matrix row 'inode'.
+    //  Boundary rows become identity rows with a zero residual.
+    //  In every other row, the columns that couple to boundary
+    //  nodes are zeroed to maintain the symmetry of the matrix;
+    //  the residual vector is not modified for those rows.
+
+    const size_type iBeg = matrix.graph.row_map[inode];
+    const size_type iEnd = matrix.graph.row_map[inode+1];
+
+    const ScalarCoordType z = node_coords(inode,2);
+    const bool bc_lower = z <= bc_lower_z ;
+    const bool bc_upper = bc_upper_z <= z ;
+
+    if ( bc_lower || bc_upper ) {
+      rhs(inode) = 0 ; //  set the residual vector
+
+      //  zero each value on the row, and leave a one
+      //  on the diagonal
+
+      for( size_type i = iBeg ; i < iEnd ; i++) {
+        matrix.coefficients(i) =
+          (int) inode == matrix.graph.entries(i) ? 1 : 0 ;
+      }
+    }
+    else {
+
+      //  Find any columns that are boundary conditions
+      //  and clear them; rhs is left untouched here.
+
+      for( size_type i = iBeg ; i < iEnd ; i++ ) {
+        const size_type cnode = matrix.graph.entries(i) ;
+
+        const ScalarCoordType zc = node_coords(cnode,2);
+        const bool c_bc_lower = zc <= bc_lower_z ;
+        const bool c_bc_upper = bc_upper_z <= zc ;
+
+        if ( c_bc_lower || c_bc_upper ) {
+
+          matrix.coefficients(i) = 0 ;
+        }
+      }
+    }
+  }
+
+  // Run the functor over every matrix row; the row count is the row-map length minus one.
+  static void apply( const matrix_type & linsys_matrix ,
+                     const vector_type & linsys_rhs ,
+                     const mesh_type   & mesh ,
+                     const ScalarCoordType  bc_lower_z ,
+                     const ScalarCoordType  bc_upper_z)
+  {
+    const size_t row_map_length = linsys_matrix.graph.row_map.dimension_0();
+    if ( row_map_length < 2 ) { return ; } // no rows; 'length - 1' would wrap around for an empty row map
+    DirichletResidual op ;
+    op.node_coords    = mesh.node_coords ;
+    op.matrix         = linsys_matrix ;
+    op.rhs            = linsys_rhs ;
+    op.bc_lower_z     = bc_lower_z ;
+    op.bc_upper_z     = bc_upper_z ;
+    parallel_for( row_map_length - 1 , op );
+  }
+};
+
+//----------------------------------------------------------------------------
+
+} /* namespace Nonlinear */
+} /* namespace HybridFEM */
+
+#endif /* #ifndef KOKKOS_NONLINEARFUNCTORS_HPP */
+
diff --git a/lib/kokkos/example/multi_fem/ParallelComm.hpp b/lib/kokkos/example/multi_fem/ParallelComm.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..163e84a958625cbef857e94a42aa7bb51c0b2e1f
--- /dev/null
+++ b/lib/kokkos/example/multi_fem/ParallelComm.hpp
@@ -0,0 +1,167 @@
+/*
+//@HEADER
+// ************************************************************************
+// 
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+// 
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+// 
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact  H. Carter Edwards (hcedwar@sandia.gov)
+// 
+// ************************************************************************
+//@HEADER
+*/
+
+#ifndef PARALLELCOMM_HPP
+#define PARALLELCOMM_HPP
+
+//------------------------------------------------------------------------
+
+#include <Kokkos_Macros.hpp>
+
+//------------------------------------------------------------------------
+
+#if defined( KOKKOS_HAVE_MPI )
+
+#include <mpi.h>
+#include <string>
+
+namespace comm {
+// Copyable wrapper around an MPI communicator handle.
+struct Machine {
+  MPI_Comm mpi_comm ;
+
+  Machine() : mpi_comm( MPI_COMM_NULL ) {}
+
+  Machine( const Machine & rhs )
+    : mpi_comm( rhs.mpi_comm ) {}
+
+  Machine( MPI_Comm c ) : mpi_comm( c ) {}
+  // Initialize MPI and return a Machine bound to MPI_COMM_WORLD.
+  static Machine init( int * argc , char *** argv )
+  {
+    MPI_Init( argc , argv );
+    return Machine( MPI_COMM_WORLD );
+  }
+  // Finalize MPI.
+  static void finalize() { MPI_Finalize(); }
+};
+
+// Number of processes in the machine's communicator.
+inline unsigned  size( Machine machine )
+{
+  int np ; MPI_Comm_size( machine.mpi_comm , & np ); return np ;
+}
+
+// Rank of the calling process in the machine's communicator.
+inline unsigned  rank( Machine machine )
+{
+  int ip ; MPI_Comm_rank( machine.mpi_comm , & ip ); return ip ;
+}
+
+// Maximum of 'local' over all processes in the machine's communicator.
+inline double max( Machine machine , double local )
+{
+  double global = 0;
+  MPI_Allreduce( & local , & global , 1 , MPI_DOUBLE , MPI_MAX , machine.mpi_comm );
+  return global ;
+}
+
+// Rank 0 joins argv[1..argc), each preceded by a space; every rank returns that text.
+inline std::string command_line( Machine machine , const int argc , const char * const * const argv )
+{
+  std::string argline ;
+
+  if ( 0 == rank( machine ) ) {
+    for ( int i = 1 ; i < argc ; ++i ) {
+      argline.append(" ").append( argv[i] );
+    }
+  }
+  // Broadcast the length first so the other ranks can size their receive string.
+  int length = static_cast<int>( argline.length() );
+  MPI_Bcast( & length , 1 , MPI_INT , 0 , machine.mpi_comm );
+  argline.resize( length , ' ' );
+  // Receive through mutable element access; writing via data() is not allowed before C++17.
+  if ( 0 < length ) { MPI_Bcast( & argline[0] , length , MPI_CHAR , 0 , machine.mpi_comm ); }
+  return argline ;
+}
+
+}
+
+#else /* ! defined( KOKKOS_HAVE_MPI ) */
+
+#include <string>
+
+namespace comm {
+
+// Serial stand-ins used when KOKKOS_HAVE_MPI is not defined: one process, rank 0.
+
+struct Machine {
+  // Nothing to start up; both arguments are ignored.
+  static Machine init( int * , char *** ) { return Machine(); }
+
+  static void finalize() {}
+};
+
+// A serial run always consists of exactly one process ...
+inline unsigned  size( Machine ) { return 1 ; }
+
+// ... and that process is rank 0.
+inline unsigned  rank( Machine ) { return 0 ; }
+
+// The maximum over a single process is the local value itself.
+inline double max( Machine , double local )
+{ return local ; }
+
+// Join argv[1..argc) into one string, each argument preceded by a space.
+inline std::string command_line( Machine machine , const int argc , const char * const * const argv )
+{
+  std::string joined ;
+
+  if ( 0 != rank( machine ) ) { return joined ; }
+  for ( int iarg = 1 ; iarg < argc ; ++iarg ) {
+    joined += " " ;
+    joined += argv[iarg] ;
+  }
+
+  return joined ;
+}
+
+}
+
+#endif /* ! defined( KOKKOS_HAVE_MPI ) */
+
+//------------------------------------------------------------------------
+
+#endif /* #ifndef PARALLELCOMM_HPP */
+
+
diff --git a/lib/kokkos/example/multi_fem/ParallelDataMap.hpp b/lib/kokkos/example/multi_fem/ParallelDataMap.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..22a1cfefa1df9aab3cc0ad2823510f4d98e50fa2
--- /dev/null
+++ b/lib/kokkos/example/multi_fem/ParallelDataMap.hpp
@@ -0,0 +1,517 @@
+/*
+//@HEADER
+// ************************************************************************
+// 
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+// 
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+// 
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact  H. Carter Edwards (hcedwar@sandia.gov)
+// 
+// ************************************************************************
+//@HEADER
+*/
+
+#ifndef KOKKOS_PARALLELDATAMAP_HPP
+#define KOKKOS_PARALLELDATAMAP_HPP
+
+#include <utility>
+#include <limits>
+#include <iostream>
+#include <sstream>
+#include <stdexcept>
+
+#include <Kokkos_Core.hpp>
+#include <ParallelComm.hpp>
+
+namespace Kokkos {
+
+//----------------------------------------------------------------------------
+/** \brief  Parallel distributed data mapping
+ *
+ *  ordering { interior : { owned items not sent elsewhere }
+ *             send     : { owned items sent }
+ *             receive  : { not-owned items received } }
+ *
+ *  recv { { N ghosted items from process P : ( P , N ) } }
+ *
+ *  send { { N send items to process P : ( P , N ) } }
+ *
+ *  send_item { send item offsets within 'send' range }
+ */
+struct ParallelDataMap {
+  typedef View< unsigned*[2], HostSpace >  host_recv_type ;
+  typedef View< unsigned*[2], HostSpace >  host_send_type ;
+  typedef View< unsigned* ,   HostSpace >  host_send_item_type ;
+
+  comm::Machine        machine ;
+  host_recv_type       host_recv ;
+  host_send_type       host_send ;
+  host_send_item_type  host_send_item ;
+  unsigned             count_interior ;
+  unsigned             count_send ;
+  unsigned             count_owned ; // = count_interior + count_send
+  unsigned             count_receive ;
+
+  // Record the item counts and allocate the host-side message tables.
+  void assign( const unsigned arg_count_interior ,
+               const unsigned arg_count_owned ,
+               const unsigned arg_count_total ,
+               const unsigned arg_recv_msg ,
+               const unsigned arg_send_msg ,
+               const unsigned arg_send_count )
+  {
+    count_interior = arg_count_interior ;
+    count_send     = arg_count_owned - arg_count_interior ;
+    count_owned    = arg_count_owned ;
+    count_receive  = arg_count_total - arg_count_owned ;
+
+    const std::string buffer_label("Kokkos::ParallelDataMap buffer");
+    host_recv      = host_recv_type( buffer_label , arg_recv_msg );
+    host_send      = host_send_type( buffer_label , arg_send_msg );
+    host_send_item = host_send_item_type( buffer_label , arg_send_count );
+  }
+};
+
+//----------------------------------------------------------------------------
+//PackArray
+//----------------------------------------------------------------------------
+template< class ArrayType , class Rank = void >
+struct PackArray ;
+
+template< typename DeviceType, typename ValueType >
+struct PackArray< View< ValueType* , DeviceType > , void >
+{ // Packs a contiguous range of a rank-1 View into a buffer View.
+  typedef DeviceType                         execution_space ;
+  typedef typename DeviceType::size_type     size_type ;
+  typedef View< ValueType* , execution_space >  array_type ;
+  typedef View< ValueType* , execution_space >  buffer_type ;
+
+private:
+
+  buffer_type  output ;
+  array_type   input ;
+  size_type    base ;
+
+public:
+  // Copy input(base+i) into buffer slot i.
+  KOKKOS_INLINE_FUNCTION
+  void operator()( const size_type i ) const
+  { output[i] = input(base+i); }
+  // Pack arg_count entries of arg_input, starting at arg_begin, into arg_output.
+  inline static
+  void pack( const buffer_type & arg_output ,
+             const size_type     arg_begin ,
+             const size_type     arg_count ,
+             const array_type  & arg_input )
+  {
+    if ( ! arg_count ) { return ; } // empty range: skip the launch, as the rank-2 specialization does
+    PackArray op ;
+    op.output = arg_output ;
+    op.input  = arg_input ;
+    op.base   = arg_begin ;
+    parallel_for( arg_count , op );
+  }
+};
+
+template< typename DeviceType, typename ValueType , unsigned N1 >
+struct PackArray< View< ValueType*[N1] , DeviceType > , void >
+{
+  typedef DeviceType                                  execution_space ;
+  typedef typename DeviceType::size_type              size_type ;
+  typedef View< ValueType*[N1] , execution_space >       array_type ;
+  typedef View< ValueType* , execution_space >           buffer_type ;
+
+private:
+
+  buffer_type  output ;
+  array_type   input ;
+  size_type    base ;
+
+public:
+  // Flatten row (base+i) of the input into buffer entries [ i*N1 , (i+1)*N1 ).
+  KOKKOS_INLINE_FUNCTION
+  void operator()( const size_type i ) const
+  {
+    for ( size_type col = 0 ; col < N1 ; ++col ) {
+      output[ i * N1 + col ] = input( base + i , col );
+    }
+  }
+
+  // Pack arg_count rows of arg_input, starting at row arg_begin, into arg_output.
+  inline static
+  void pack( const buffer_type & arg_output ,
+             const size_type     arg_begin ,
+             const size_type     arg_count ,
+             const array_type  & arg_input )
+  {
+    if ( ! arg_count ) { return ; }
+    PackArray functor ;
+    functor.output = arg_output ;
+    functor.input  = arg_input ;
+    functor.base   = arg_begin ;
+    parallel_for( arg_count , functor );
+  }
+};
+
+//----------------------------------------------------------------------------
+//UnpackArray
+//----------------------------------------------------------------------------
+template< class ArrayType , class Rank = void > struct UnpackArray ;
+
+template< typename DeviceType, typename ValueType >
+struct UnpackArray< View< ValueType* , DeviceType > , void >
+{ // Unpacks a buffer View into a contiguous range of a rank-1 View.
+  typedef DeviceType                         execution_space ;
+  typedef typename DeviceType::size_type     size_type ;
+  typedef View< ValueType* , execution_space >  array_type ;
+  typedef View< ValueType* , execution_space >  buffer_type ;
+
+private:
+
+  array_type   output ;
+  buffer_type  input ;
+  size_type    base ;
+
+public:
+  // Copy buffer slot i into output(base+i).
+  KOKKOS_INLINE_FUNCTION
+  void operator()( const size_type i ) const
+  { output(base+i) = input[i]; }
+  // Unpack arg_count entries of arg_input into arg_output, starting at arg_begin.
+  inline static
+  void unpack( const array_type  & arg_output ,
+               const buffer_type & arg_input ,
+               const size_type     arg_begin ,
+               const size_type     arg_count )
+  {
+    if ( ! arg_count ) { return ; } // empty range: skip the launch, as the rank-2 specialization does
+    UnpackArray op ;
+    op.output = arg_output ;
+    op.input  = arg_input ;
+    op.base   = arg_begin ;
+    parallel_for( arg_count , op );
+  }
+};
+
+template< typename DeviceType, typename ValueType , unsigned N1 >
+struct UnpackArray< View< ValueType*[N1] , DeviceType > , void >
+{
+  typedef DeviceType                                  execution_space ;
+  typedef typename DeviceType::size_type              size_type ;
+  typedef View< ValueType* , execution_space >           buffer_type ;
+  typedef View< ValueType*[N1] , execution_space >       array_type ;
+
+private:
+
+  array_type   output ;
+  buffer_type  input ;
+  size_type    base ;
+
+public:
+  // Scatter buffer entries [ i*N1 , (i+1)*N1 ) into row (base+i) of the output.
+  KOKKOS_INLINE_FUNCTION
+  void operator()( const size_type i ) const
+  {
+    for ( size_type col = 0 ; col < N1 ; ++col ) {
+      output( base + i , col ) = input( i * N1 + col );
+    }
+  }
+
+  // Unpack arg_count rows of arg_input into arg_output, starting at row arg_begin.
+  inline static
+  void unpack( const array_type  & arg_output ,
+               const buffer_type & arg_input ,
+               const size_type     arg_begin ,
+               const size_type     arg_count )
+  {
+    if ( ! arg_count ) { return ; }
+
+    UnpackArray functor ;
+    functor.output = arg_output ;
+    functor.input  = arg_input ;
+    functor.base   = arg_begin ;
+    parallel_for( arg_count , functor );
+  }
+};
+//----------------------------------------------------------------------------
+template< class ValueType , class Device , class DataMap >
+class AsyncExchange ;
+
+} // namespace Kokkos
+
+//----------------------------------------------------------------------------
+// Application call procedure:
+//
+// construct: AsyncExchange object
+// * pack send buffer on device
+// initiate (setup()): post receives, copy send buffer from device to host
+// * dispatch asynchronous local work
+// complete (send_receive()): send/receive on host, copy receive buffer to device
+// * unpack receive buffer on device
+// destroy: AsyncExchange object
+//
+//----------------------------------------------------------------------------
+
+#ifdef KOKKOS_HAVE_MPI
+
+namespace Kokkos {
+
+template< class ValueType , class Device >
+class AsyncExchange< ValueType, Device , Kokkos::ParallelDataMap > {
+public:
+  // Host-staged MPI exchange: device buffer <-> host buffers <-> MPI messages.
+  typedef Device                                    execution_space ;
+  typedef Kokkos::ParallelDataMap                   data_map_type ;
+  typedef Kokkos::View< ValueType* , execution_space >  buffer_dev_type ;
+  typedef typename buffer_dev_type::HostMirror      buffer_host_type ;
+
+private:
+
+  static const int mpi_tag = 11 ;
+
+  const data_map_type  data_map ;
+  unsigned             chunk_size ;
+  unsigned             send_count_max ;
+  buffer_host_type     host_recv_buffer ;
+  buffer_host_type     host_send_buffer ;
+  buffer_host_type     send_msg_buffer ;
+  buffer_dev_type      dev_buffer ;
+  buffer_dev_type      dev_send_buffer ; // Subview for send
+  buffer_dev_type      dev_recv_buffer ; // Subview for receive
+  std::vector< MPI_Request > recv_request ;
+
+public:
+
+  const buffer_dev_type & buffer() const { return dev_buffer ; }
+  // Size every buffer from the data map; 'arg_chunk' scales each item count.
+  AsyncExchange( const data_map_type & arg_data_map ,
+                 const size_t          arg_chunk )
+  : data_map( arg_data_map )
+  , chunk_size( arg_chunk )
+  , send_count_max( 0 )
+  , host_recv_buffer()
+  , host_send_buffer()
+  , send_msg_buffer()
+  , dev_buffer()
+  , dev_send_buffer()
+  , dev_recv_buffer()
+  , recv_request()
+  {
+    const size_t send_msg_count = arg_data_map.host_send.dimension_0();
+    const size_t recv_msg_count = arg_data_map.host_recv.dimension_0();
+
+    const size_t send_msg_length = arg_chunk * arg_data_map.count_send ;
+    const size_t recv_msg_length = arg_chunk * arg_data_map.count_receive ;
+    // Largest item count of any single send message (sizes send_msg_buffer below).
+    for ( size_t i = 0 ; i < send_msg_count ; ++i ) {
+      send_count_max = std::max( send_count_max ,
+                                 (unsigned) arg_data_map.host_send(i,1) );
+    }
+
+    // A single shared buffer on the device can be used for
+    // send and receive message buffers.
+    dev_buffer = buffer_dev_type(
+                     std::string("AsyncExchange dev_buffer") ,
+                     std::max( send_msg_length , recv_msg_length ) );
+
+    // Total send subview of the device buffer
+    dev_send_buffer =
+      Kokkos::subview( dev_buffer , std::pair<size_t,size_t>( 0 , send_msg_length ) );
+
+    // Total receive subview of the device buffer
+    dev_recv_buffer =
+      Kokkos::subview( dev_buffer , std::pair<size_t,size_t>( 0 , recv_msg_length ) );
+
+    // Total receive message buffer on the host:
+    host_recv_buffer = buffer_host_type(
+                           std::string("AsyncExchange host_recv_buffer") ,
+                           recv_msg_length );
+
+    // Total send message buffer on the host:
+    host_send_buffer = buffer_host_type(
+                           std::string("AsyncExchange host_send_buffer") ,
+                           send_msg_length );
+
+    // Individual send message buffer on the host:
+    send_msg_buffer = buffer_host_type(
+                          std::string("AsyncExchange send_msg_buffer") ,
+                          arg_chunk * send_count_max );
+
+    // MPI asynchronous receive request handles:
+    recv_request.assign( recv_msg_count , MPI_REQUEST_NULL );
+  }
+
+  //------------------------------------------------------------------------
+  // Post one nonblocking receive per expected message, then stage the send data on the host.
+  void setup()
+  {
+    { // Post receives:
+      const size_t recv_msg_count = data_map.host_recv.dimension_0();
+
+      ValueType * ptr = host_recv_buffer.ptr_on_device();
+
+      for ( size_t i = 0 ; i < recv_msg_count ; ++i ) {
+        const int proc  = data_map.host_recv(i,0);
+        const int count = data_map.host_recv(i,1) * chunk_size ;
+        // NOTE(review): 'count * sizeof(ValueType)' is narrowed to MPI's int count - confirm sizes stay in range.
+        MPI_Irecv( ptr , count * sizeof(ValueType) , MPI_BYTE ,
+                   proc , mpi_tag , data_map.machine.mpi_comm ,
+                   & recv_request[i] );
+
+        ptr += count ;
+      }
+    }
+
+    // Copy send buffer from the device to host memory for sending
+
+    Kokkos::deep_copy( host_send_buffer , dev_send_buffer );
+
+    // Done with the device until communication is complete.
+    // Application can dispatch asynchronous work on the device.
+  }
+
+  // Application can dispatch local work to device ...
+  // No communication progress until main thread calls 'send_receive'
+  // Pack and synchronously send each message, wait for every receive, then copy to the device.
+  void send_receive()
+  {
+    const size_t recv_msg_count = data_map.host_recv.dimension_0();
+    const size_t send_msg_count = data_map.host_send.dimension_0();
+
+    // Pack and send:
+
+    for ( size_t i = 0 , j = 0 ; i < send_msg_count ; ++i ) {
+      const int proc  = data_map.host_send(i,0);
+      const int count = data_map.host_send(i,1);
+
+      for ( int k = 0 , km = 0 ; k < count ; ++k , ++j ) {
+        const int km_end = km + chunk_size ;
+        int ki = chunk_size * data_map.host_send_item(j);
+
+        for ( ; km < km_end ; ++km , ++ki ) {
+          send_msg_buffer[km] = host_send_buffer[ki];
+        }
+      }
+
+      // MPI_Ssend blocks until
+      // (1) a receive is matched for the message and
+      // (2) the send buffer can be re-used.
+      //
+      // It is suggested that MPI_Ssend will have the best performance:
+      // http://www.mcs.anl.gov/research/projects/mpi/sendmode.html .
+
+      MPI_Ssend( send_msg_buffer.ptr_on_device(),
+                 count * chunk_size * sizeof(ValueType) , MPI_BYTE ,
+                 proc , mpi_tag , data_map.machine.mpi_comm );
+    }
+
+    // Wait for receives and verify:
+
+    for ( size_t i = 0 ; i < recv_msg_count ; ++i ) {
+      MPI_Status recv_status ;
+      int recv_which = 0 ;
+      int recv_size  = 0 ;
+
+      MPI_Waitany( recv_msg_count , & recv_request[0] ,
+                   & recv_which , & recv_status );
+
+      const int recv_proc = recv_status.MPI_SOURCE ;
+
+      MPI_Get_count( & recv_status , MPI_BYTE , & recv_size );
+
+      // Verify message properly received:
+
+      const int  expected_proc = data_map.host_recv(recv_which,0);
+      const int  expected_size = data_map.host_recv(recv_which,1) *
+                                 chunk_size * sizeof(ValueType);
+
+      if ( ( expected_proc != recv_proc ) ||
+           ( expected_size != recv_size ) ) {
+        std::ostringstream msg ;
+        msg << "AsyncExchange error:"
+            << " P" << comm::rank( data_map.machine )
+            << " received from P" << recv_proc
+            << " size "     << recv_size
+            << " expected " << expected_size
+            << " from P"    << expected_proc ;
+        throw std::runtime_error( msg.str() );
+      }
+    }
+
+    // Copy received data to device memory.
+
+    Kokkos::deep_copy( dev_recv_buffer , host_recv_buffer );
+  }
+};
+
+} // namespace Kokkos
+
+#else /* ! #ifdef KOKKOS_HAVE_MPI */
+
+namespace Kokkos {
+
+template< class ValueType , class Device >
+class AsyncExchange< ValueType, Device , Kokkos::ParallelDataMap > {
+public:
+  // Serial stand-in: no messages are exchanged, so setup() and
+  // send_receive() do nothing and the buffer is never allocated here.
+
+  typedef Device                                    execution_space ;
+  typedef Kokkos::ParallelDataMap                   data_map_type ;
+  typedef Kokkos::View< ValueType* , execution_space >  buffer_dev_type ;
+  typedef typename buffer_dev_type::HostMirror      buffer_host_type ;
+
+  buffer_dev_type      dev_buffer ;
+
+  // Accessor for the default-constructed device buffer.
+  const buffer_dev_type & buffer() const { return dev_buffer ; }
+
+  AsyncExchange( const data_map_type & , const size_t )
+    : dev_buffer() {}
+
+  //------------------------------------------------------------------------
+
+  void setup() { }
+
+  void send_receive() { }
+};
+
+} // namespace Kokkos
+
+#endif /* ! #ifdef KOKKOS_HAVE_MPI */
+
+//----------------------------------------------------------------------------
+
+#endif /* #ifndef KOKKOS_PARALLELDATAMAP_HPP */
+
+
diff --git a/lib/kokkos/example/multi_fem/ParallelMachine.cpp b/lib/kokkos/example/multi_fem/ParallelMachine.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..0953cab760b42adfa75f8aac3186bd8401f997ec
--- /dev/null
+++ b/lib/kokkos/example/multi_fem/ParallelMachine.cpp
@@ -0,0 +1,178 @@
+/*
+//@HEADER
+// ************************************************************************
+// 
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+// 
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+// 
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact  H. Carter Edwards (hcedwar@sandia.gov)
+// 
+// ************************************************************************
+//@HEADER
+*/
+
+#if 0
+
+#include <stdlib.h>
+#include <string.h>
+
+#include <ParallelMachine.hpp>
+
+#include <Kokkos_Core.hpp>
+
+#if ! defined( KOKKOS_HAVE_MPI )
+#define MPI_COMM_NULL 0
+#endif
+
+//------------------------------------------------------------------------
+
+namespace Parallel {
+
+Machine::Machine( int * argc , char *** argv )
+  : m_mpi_comm( MPI_COMM_NULL )
+  , m_mpi_size(0)
+  , m_mpi_rank(0)
+  , m_mpi_gpu(0)
+{
+  // Sequence: optional pre-MPI Cuda device selection, MPI_Init, Threads setup, then late Cuda selection.
+#if defined( KOKKOS_HAVE_CUDA )
+  //------------------------------------
+  // Might be using a Cuda aware version of MPI.
+  // Must select Cuda device before initializing MPI.
+  {
+    int i = 1 ;
+    for ( ; i < *argc && strcmp((*argv)[i],"mpi_cuda") ; ++i );
+
+    if ( i < *argc ) {
+      // Determine, if possible, what will be the node-local
+      // rank of the MPI process once MPI has been initialized.
+      // This rank is needed to set the Cuda device before 'mvapich'
+      // is initialized.
+
+      const char * const mvapich_local_rank = getenv("MV2_COMM_WORLD_LOCAL_RANK");
+      const char * const slurm_local_rank   = getenv("SLURM_LOCALID");
+
+      const int pre_mpi_local_rank =
+        0 != mvapich_local_rank ? atoi( mvapich_local_rank ) : (
+        0 != slurm_local_rank   ? atoi( slurm_local_rank ) : (
+        -1 ) );
+
+      if ( 0 <= pre_mpi_local_rank ) {
+
+        const int ngpu = Kokkos::Cuda::detect_device_count();
+        // NOTE(review): '% ngpu' divides by zero if no device is detected - confirm a device is guaranteed.
+        const int cuda_device_rank = pre_mpi_local_rank % ngpu ;
+
+        Kokkos::Cuda::initialize( Kokkos::Cuda::SelectDevice( cuda_device_rank ) );
+
+        m_mpi_gpu = 1 ;
+      }
+    }
+  }
+#endif
+
+  //------------------------------------
+
+#if defined( KOKKOS_HAVE_MPI )
+  MPI_Init( argc , argv );
+  m_mpi_comm = MPI_COMM_WORLD ;
+  MPI_Comm_size( m_mpi_comm , & m_mpi_size );
+  MPI_Comm_rank( m_mpi_comm , & m_mpi_rank );
+#endif
+
+  // Query hwloc after MPI initialization to allow MPI binding:
+  //------------------------------------
+  // Request to use host device:
+  {
+    int i = 1 ;
+    for ( ; i < *argc && strcmp((*argv)[i],"host") ; ++i );
+
+    if ( i < *argc ) {
+
+      unsigned team_count       = Kokkos::hwloc::get_available_numa_count();
+      unsigned threads_per_team = Kokkos::hwloc::get_available_cores_per_numa() *
+                                  Kokkos::hwloc::get_available_threads_per_core();
+ 
+      if ( i + 2 < *argc ) {
+        team_count       = atoi( (*argv)[i+1] );
+        threads_per_team = atoi( (*argv)[i+2] );
+      }
+
+      Kokkos::Threads::initialize( team_count * threads_per_team );
+    }
+  }
+
+#if defined( KOKKOS_HAVE_CUDA )
+  //------------------------------------
+  // Request to use Cuda device and not already initialized.
+  if ( ! m_mpi_gpu ) {
+    int i = 1 ;
+    for ( ; i < *argc && strcmp((*argv)[i],"mpi_cuda") && strcmp((*argv)[i],"cuda") ; ++i );
+
+    if ( i < *argc ) {
+
+      const int ngpu = Kokkos::Cuda::detect_device_count();
+      // NOTE(review): same '% ngpu' division-by-zero concern as in the pre-MPI selection above.
+      const int cuda_device_rank = m_mpi_rank % ngpu ;
+
+      Kokkos::Cuda::initialize( Kokkos::Cuda::SelectDevice( cuda_device_rank ) );
+    }
+  }
+#endif
+
+}
+
+Machine::~Machine() // Finalizes the Kokkos back ends first, then MPI.
+{
+  Kokkos::Threads::finalize(); // NOTE(review): runs even when the constructor did not initialize Threads - confirm this is safe.
+#if defined( KOKKOS_HAVE_CUDA )
+  Kokkos::Cuda::finalize();
+#endif
+#if defined( KOKKOS_HAVE_MPI )
+  MPI_Finalize();
+#endif
+}
+
+void Machine::print_configuration( std::ostream & msg ) const // Writes the MPI rank/size, then the back-end configurations, to 'msg'.
+{
+  msg << "MPI [ " << m_mpi_rank << " / " << m_mpi_size << " ]" << std::endl ;
+  Kokkos::Threads::print_configuration( msg );
+#if defined( KOKKOS_HAVE_CUDA )
+  Kokkos::Cuda::print_configuration( msg );
+#endif
+}
+
+}
+
+#endif /* #if 0 */
+
diff --git a/lib/kokkos/example/multi_fem/ParallelMachine.hpp b/lib/kokkos/example/multi_fem/ParallelMachine.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..1ddf50ab3b5e9fe28ce6c6b84e7d7d4877f588bd
--- /dev/null
+++ b/lib/kokkos/example/multi_fem/ParallelMachine.hpp
@@ -0,0 +1,118 @@
+/*
+//@HEADER
+// ************************************************************************
+// 
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+// 
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+// 
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact  H. Carter Edwards (hcedwar@sandia.gov)
+// 
+// ************************************************************************
+//@HEADER
+*/
+
+#error "ParallelMachine"
+
+#ifndef PARALLELMACHINE_HPP
+#define PARALLELMACHINE_HPP
+
+//------------------------------------------------------------------------
+
+#include <iosfwd>
+
+#include <Kokkos_Core.hpp>
+
+//------------------------------------------------------------------------
+
+#if defined( KOKKOS_HAVE_MPI )
+#include <mpi.h>
+#else
+  typedef int MPI_Comm ;
+#endif
+
+//------------------------------------------------------------------------
+//------------------------------------------------------------------------
+
+namespace Parallel {
+
+/** \brief  Hybrid parallel machine: MPI + Kokkos::Threads or MPI + Kokkos::Cuda.
+ *
+ *  MPI initialization and Kokkos device initialization depend on one another;
+ *  this class owns that coordination.  The command line and environment
+ *  variables are queried to decide which device is initialized:
+ *
+ *    1)  cuda               : initializes the Cuda device
+ *    2)  host               : initializes the Threads device with all hwloc
+ *                             detected cores.
+ *    3)  host #gang #worker : initializes the Threads device with the
+ *                             specified gang and worker counts.
+ */
+class Machine {
+public:
+
+  /** \brief  Coordinated initialization of the MPI, Cuda, or Threads devices;
+   *          intended to be given the arguments of 'main'.
+   */
+  Machine( int * argc , char *** argv );
+
+  ~Machine();
+
+  /** \brief  MPI communicator, size, and rank recorded by this machine. */
+  MPI_Comm mpi_comm() const { return m_mpi_comm ; }
+  int      mpi_size() const { return m_mpi_size ; }
+  int      mpi_rank() const { return m_mpi_rank ; }
+
+  /** \brief  If using MPI that can directly operate on GPU memory */
+  bool mpi_gpu() const { return m_mpi_gpu != 0 ; }
+
+  /** \brief  If using GPU then what architecture */
+  unsigned gpu_arch() const { return m_gpu_arch ; }
+
+  /** \brief  Print the MPI and device configuration to the given stream. */
+  void print_configuration( std::ostream & ) const ;
+
+private:
+
+  // Private so that client code can neither default-construct, copy, nor
+  // assign a Machine.
+  Machine();
+  Machine( const Machine & );
+  Machine & operator = ( const Machine & );
+
+  MPI_Comm m_mpi_comm ;
+  int      m_mpi_size ;
+  int      m_mpi_rank ;
+  unsigned m_mpi_gpu ;   // non-zero when MPI can operate on GPU memory
+  unsigned m_gpu_arch ;
+};
+
+} // namespace Parallel
+
+//------------------------------------------------------------------------
+
+#endif /* #ifndef PARALLELMACHINE_HPP */
+
+
diff --git a/lib/kokkos/example/multi_fem/SparseLinearSystem.hpp b/lib/kokkos/example/multi_fem/SparseLinearSystem.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..8d140b6d257af15bc7d0980624e8a20178f29a19
--- /dev/null
+++ b/lib/kokkos/example/multi_fem/SparseLinearSystem.hpp
@@ -0,0 +1,400 @@
+/*
+//@HEADER
+// ************************************************************************
+// 
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+// 
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+// 
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact  H. Carter Edwards (hcedwar@sandia.gov)
+// 
+// ************************************************************************
+//@HEADER
+*/
+
+#ifndef SPARSELINEARSYSTEM_HPP
+#define SPARSELINEARSYSTEM_HPP
+
+#include <cmath>
+#include <impl/Kokkos_Timer.hpp>
+
+#include <Kokkos_Core.hpp>
+#include <Kokkos_StaticCrsGraph.hpp>
+
+#include <LinAlgBLAS.hpp>
+
+namespace Kokkos {
+
+/** \brief  Sparse matrix stored as a static CRS graph plus a flat array of
+ *          coefficients; the multiply kernels index 'coefficients' with the
+ *          same entry index they use for 'graph.entries'.
+ */
+template< typename Scalar , class Space >
+struct CrsMatrix {
+  typedef Space   execution_space ;
+  typedef Scalar  value_type ;
+
+  typedef View< value_type* , execution_space >                 coefficients_type ;
+  typedef StaticCrsGraph< int , execution_space , void , int >  graph_type ;
+
+  graph_type         graph ;
+  coefficients_type  coefficients ;
+};
+
+//----------------------------------------------------------------------------
+
+namespace Impl {
+
+/** \brief  Matrix-vector multiply; only declared here, every usable
+ *          combination of types is provided by a specialization.
+ */
+template< class Matrix , class OutputVector , class InputVector >
+struct Multiply ;
+
+} // namespace Impl
+} // namespace Kokkos
+
+//----------------------------------------------------------------------------
+//----------------------------------------------------------------------------
+
+namespace Kokkos {
+namespace Impl {
+
+/** \brief  Sparse matrix-vector product  y = A * x  for a CrsMatrix and rank-one
+ *          views on the same device.
+ *
+ *  Constructing the object performs the product: the constructor stores the
+ *  operands and dispatches 'parallel_for' over 'nrow' rows with this object
+ *  as the functor.  Each row is accumulated in a 'double' and then assigned
+ *  to 'y'.
+ */
+template< typename AScalarType ,
+          typename VScalarType ,
+          class DeviceType >
+struct Multiply< CrsMatrix<AScalarType,DeviceType> ,
+                 View<VScalarType*,DeviceType > ,
+                 View<VScalarType*,DeviceType > >
+{
+  typedef DeviceType                           execution_space ;
+  typedef typename execution_space::size_type  size_type ;
+
+  typedef CrsMatrix< AScalarType , execution_space >  matrix_type ;
+
+  typedef View<       VScalarType*, execution_space, MemoryUnmanaged >  vector_type ;
+  typedef View< const VScalarType*, execution_space, MemoryUnmanaged >  vector_const_type ;
+
+private:
+
+  matrix_type        m_A ;
+  vector_const_type  m_x ;
+  vector_type        m_y ;
+
+public:
+
+  //--------------------------------------------------------------------------
+
+  /** \brief  Capture the operands and run the multiply over rows [0,nrow).
+   *          The column count argument is accepted but not used.
+   */
+  Multiply( const matrix_type & A ,
+            const size_type nrow ,
+            const size_type , // ncol ,
+            const vector_type & x ,
+            const vector_type & y )
+    : m_A( A ), m_x( x ), m_y( y )
+  {
+    parallel_for( nrow , *this );
+  }
+
+  //--------------------------------------------------------------------------
+
+  /** \brief  Row kernel:  y(iRow) = sum over the row's entries of
+   *          coefficient * x( column ).
+   */
+  KOKKOS_INLINE_FUNCTION
+  void operator()( const size_type iRow ) const
+  {
+    // Entries of row 'iRow' occupy [ row_map[iRow] , row_map[iRow+1] ).
+    const size_type row_begin = m_A.graph.row_map[iRow];
+    const size_type row_end   = m_A.graph.row_map[iRow+1];
+
+    double row_sum = 0 ;
+
+    // The Intel compiler gets explicit vectorization hints; the loop itself
+    // is the same for every compiler.
+#if defined( __INTEL_COMPILER )
+#pragma simd reduction(+:row_sum)
+#pragma ivdep
+#endif
+    for ( size_type k = row_begin ; k < row_end ; ++k ) {
+      row_sum += m_A.coefficients(k) * m_x( m_A.graph.entries(k) );
+    }
+
+    m_y(iRow) = row_sum ;
+  }
+};
+
+//----------------------------------------------------------------------------
+
+} // namespace Impl
+} // namespace Kokkos
+
+//----------------------------------------------------------------------------
+//----------------------------------------------------------------------------
+
+namespace Kokkos {
+
+//----------------------------------------------------------------------------
+
+/** \brief  Distributed sparse matrix operator:  y = A * x  where the
+ *          off-processor part of 'x' is gathered through an AsyncExchange
+ *          before the local multiply.
+ */
+template< typename AScalarType ,
+          typename VScalarType ,
+          class Device >
+class Operator {
+  typedef CrsMatrix<AScalarType,Device>  matrix_type ;
+  typedef View<VScalarType*,Device>      vector_type ;
+  typedef typename Device::size_type     size_type ;
+
+private:
+  const matrix_type  A ;
+
+  ParallelDataMap                                         data_map ;
+  AsyncExchange< VScalarType , Device , ParallelDataMap > exchange ;
+
+public:
+
+  /** \brief  Keep copies of the data map and the matrix and set up an
+   *          exchange on the data map (constructed with second argument 1).
+   */
+  Operator( const ParallelDataMap  & arg_data_map ,
+            const matrix_type      & arg_A )
+    : A( arg_A )
+    , data_map( arg_data_map )
+    , exchange( arg_data_map , 1 )
+    {}
+
+  /** \brief  Compute  y = A * x .
+   *
+   *  'x' is passed to UnpackArray together with the exchange buffer, the
+   *  owned count and the receive count; presumably this fills the received
+   *  range of 'x', so 'x' must be sized for owned + received entries --
+   *  confirm against UnpackArray.
+   */
+  void apply( const vector_type & x ,
+              const vector_type & y )
+  {
+    // Pack the entries of 'x' that have to be sent into the exchange buffer.
+    PackArray< vector_type >::pack( exchange.buffer() ,
+                                    data_map.count_interior ,
+                                    data_map.count_send , x );
+
+    exchange.setup();
+
+    // If interior & boundary matrices then could launch interior multiply
+
+    exchange.send_receive();
+
+    // Move what was received out of the exchange buffer and into 'x'.
+    UnpackArray< vector_type >::unpack( x , exchange.buffer() ,
+                                        data_map.count_owned ,
+                                        data_map.count_receive );
+
+    // Local product: owned rows, owned + received columns.
+    const size_type nrow = data_map.count_owned ;
+    const size_type ncol = data_map.count_owned +
+                           data_map.count_receive ;
+
+    Impl::Multiply<matrix_type,vector_type,vector_type>( A, nrow, ncol, x, y);
+  }
+};
+
+//----------------------------------------------------------------------------
+
+/** \brief  Conjugate gradient iteration for  A * x = b .
+ *
+ *  \param data_map           provides the owned / received counts and the
+ *                            machine handed to the dot products
+ *  \param A                  sparse matrix
+ *  \param b                  right-hand side
+ *  \param x                  on entry the starting vector, updated in place
+ *  \param iteration          [out] number of iterations performed
+ *  \param normr              [out] square root of the last residual dot product
+ *  \param iter_time          [out] seconds spent in the iteration loop; the
+ *                            initial residual computation is not timed
+ *  \param maximum_iteration  iteration limit
+ *  \param tolerance          iterate while  tolerance < normr
+ */
+template< typename AScalarType , typename VScalarType , class Device >
+void cgsolve(
+  const ParallelDataMap                 data_map ,
+  const CrsMatrix<AScalarType,Device>   A ,
+  const View<VScalarType*,Device> b ,
+  const View<VScalarType*,Device> x ,
+  size_t & iteration ,
+  double & normr ,
+  double & iter_time ,
+  const size_t maximum_iteration = 200 ,
+  const double tolerance = std::numeric_limits<VScalarType>::epsilon() )
+{
+  typedef View<VScalarType*,Device> vector_type ;
+
+  const size_t count_owned = data_map.count_owned ;
+  const size_t count_total = data_map.count_owned + data_map.count_receive ;
+
+  Operator<AScalarType,VScalarType,Device> matrix_operator( data_map , A );
+
+  // The operator's input vector must hold owned + received entries, so the
+  // search direction is allocated at full length and 'p' is a subview of
+  // its first 'count_owned' entries.
+  vector_type p_all( "cg::p" , count_total );
+  vector_type p = Kokkos::subview( p_all , std::pair<size_t,size_t>(0,count_owned) );
+
+  vector_type r ( "cg::r" , count_owned );
+  vector_type Ap( "cg::Ap", count_owned );
+
+  // Initial residual  r = b - A * x , computed through 'p' because the
+  // operator needs the full-length vector as its input.
+  deep_copy( p , x );                                // p  = x
+  matrix_operator.apply( p_all , Ap );               // Ap = A * p
+  waxpby( count_owned , 1.0 , b , -1.0 , Ap , r );   // r  = b - Ap
+  deep_copy( p , r );                                // p  = r
+
+  double rdot_previous = dot( count_owned , r , data_map.machine );
+
+  normr = sqrt( rdot_previous );
+
+  Kokkos::Timer wall_clock ;
+
+  for ( iteration = 0 ;
+        iteration < maximum_iteration && tolerance < normr ;
+        ++iteration ) {
+
+    // Ap = A * p
+    matrix_operator.apply( p_all , Ap );
+
+    const double pAp_dot = dot( count_owned , p , Ap , data_map.machine );
+    const double alpha   = rdot_previous / pAp_dot ;
+
+    axpy( count_owned ,  alpha , p  , x );   // x += alpha * p
+    axpy( count_owned , -alpha , Ap , r );   // r -= alpha * Ap
+
+    const double rdot_current = dot( count_owned , r , data_map.machine );
+    const double beta         = rdot_current / rdot_previous ;
+
+    xpby( count_owned , r , beta , p );      // p = r + beta * p
+
+    rdot_previous = rdot_current ;
+    normr         = sqrt( rdot_current );
+  }
+
+  iter_time = wall_clock.seconds();
+}
+
+//----------------------------------------------------------------------------
+
+} // namespace Kokkos
+
+//----------------------------------------------------------------------------
+//----------------------------------------------------------------------------
+
+#if defined( KOKKOS_HAVE_CUDA )
+
+#if ( CUDA_VERSION < 6000 )
+#pragma message "cusparse_v2.h"
+#include <cusparse_v2.h>
+#else
+#pragma message "cusparse.h"
+#include <cusparse.h>
+#endif
+
+namespace Kokkos {
+namespace Impl {
+
+/** \brief  cuSPARSE library handle together with a matrix descriptor set up
+ *          for general matrices with zero-based indices.  Obtained through
+ *          singleton(), which is only declared here.
+ */
+struct CudaSparseSingleton {
+  cusparseHandle_t   handle;
+  cusparseMatDescr_t descra;
+
+  /** \brief  Create the handle and the descriptor.
+   *
+   *  Every cuSPARSE call is checked: continuing after a failed creation
+   *  would leave 'handle' / 'descra' unusable for the multiplies.
+   *
+   *  \throws std::runtime_error  naming the cuSPARSE call that failed
+   */
+  CudaSparseSingleton()
+  {
+    check( cusparseCreate( & handle ) , "cusparseCreate" );
+    check( cusparseCreateMatDescr( & descra ) , "cusparseCreateMatDescr" );
+    check( cusparseSetMatType( descra , CUSPARSE_MATRIX_TYPE_GENERAL ) ,
+           "cusparseSetMatType" );
+    check( cusparseSetMatIndexBase( descra , CUSPARSE_INDEX_BASE_ZERO ) ,
+           "cusparseSetMatIndexBase" );
+  }
+
+  static CudaSparseSingleton & singleton();
+
+private:
+
+  /** \brief  Throw if 'status' is anything other than success. */
+  static void check( const cusparseStatus_t status , const char * const call )
+  {
+    if ( CUSPARSE_STATUS_SUCCESS != status ) {
+      throw std::runtime_error( std::string("ERROR - ") + call );
+    }
+  }
+};
+
+/** \brief  y = A * x  for double precision on Cuda, delegated to the
+ *          cuSPARSE routine cusparseDcsrmv (non-transposed, alpha = 1,
+ *          beta = 0).  Constructing the object performs the multiply.
+ *
+ *  \throws std::runtime_error  if cuSPARSE does not report success
+ */
+template<>
+struct Multiply< CrsMatrix<double,Cuda> ,
+                 View<double*,Cuda > ,
+                 View<double*,Cuda > >
+{
+  typedef Cuda                                        execution_space ;
+  typedef execution_space::size_type                  size_type ;
+  typedef double                                      scalar_type ;
+  typedef View< scalar_type* , execution_space >      vector_type ;
+  typedef CrsMatrix< scalar_type , execution_space >  matrix_type ;
+
+public:
+
+  Multiply( const matrix_type & A ,
+            const size_type nrow ,
+            const size_type ncol ,
+            const vector_type & x ,
+            const vector_type & y )
+  {
+    CudaSparseSingleton & sparse = CudaSparseSingleton::singleton();
+
+    // Scaling factors of  y = one * A * x + zero * y
+    const scalar_type one  = 1 ;
+    const scalar_type zero = 0 ;
+
+    const cusparseStatus_t status =
+      cusparseDcsrmv( sparse.handle ,
+                      CUSPARSE_OPERATION_NON_TRANSPOSE ,
+                      nrow ,
+                      ncol ,
+                      A.coefficients.dimension_0() , // number of stored entries
+                      &one ,
+                      sparse.descra ,
+                      A.coefficients.ptr_on_device() ,
+                      A.graph.row_map.ptr_on_device() ,
+                      A.graph.entries.ptr_on_device() ,
+                      x.ptr_on_device() ,
+                      &zero ,
+                      y.ptr_on_device() );
+
+    if ( status != CUSPARSE_STATUS_SUCCESS ) {
+      throw std::runtime_error( std::string("ERROR - cusparseDcsrmv " ) );
+    }
+  }
+};
+
+
+/** \brief  y = A * x  for single precision on Cuda, delegated to the
+ *          cuSPARSE routine cusparseScsrmv (non-transposed, alpha = 1,
+ *          beta = 0).  Constructing the object performs the multiply.
+ *
+ *  \throws std::runtime_error  if cuSPARSE does not report success
+ */
+template<>
+struct Multiply< CrsMatrix<float,Cuda> ,
+                 View<float*,Cuda > ,
+                 View<float*,Cuda > >
+{
+  typedef Cuda                                      execution_space ;
+  typedef execution_space::size_type                    size_type ;
+  typedef float                                     scalar_type ;
+  typedef View< scalar_type* , execution_space >        vector_type ;
+  typedef CrsMatrix< scalar_type , execution_space >    matrix_type ;
+
+public:
+
+  Multiply( const matrix_type & A ,
+            const size_type nrow ,
+            const size_type ncol ,
+            const vector_type & x ,
+            const vector_type & y )
+  {
+    CudaSparseSingleton & s = CudaSparseSingleton::singleton();
+    const scalar_type alpha = 1 , beta = 0 ;
+
+    cusparseStatus_t status =
+      cusparseScsrmv( s.handle ,
+                      CUSPARSE_OPERATION_NON_TRANSPOSE ,
+                      nrow , ncol , A.coefficients.dimension_0() ,
+                      &alpha ,
+                      s.descra ,
+                      A.coefficients.ptr_on_device() ,
+                      A.graph.row_map.ptr_on_device() ,
+                      A.graph.entries.ptr_on_device() ,
+                      x.ptr_on_device() ,
+                      &beta ,
+                      y.ptr_on_device() );
+
+    if ( CUSPARSE_STATUS_SUCCESS != status ) {
+      // Name the routine that was actually called (single precision).
+      throw std::runtime_error( std::string("ERROR - cusparseScsrmv " ) );
+    }
+  }
+};
+
+} /* namespace Impl */
+} /* namespace Kokkos */
+
+#endif /* #if defined( KOKKOS_HAVE_CUDA ) */
+
+//----------------------------------------------------------------------------
+//----------------------------------------------------------------------------
+
+#endif /* #ifndef SPARSELINEARSYSTEM_HPP */
+
diff --git a/lib/kokkos/example/multi_fem/SparseLinearSystemFill.hpp b/lib/kokkos/example/multi_fem/SparseLinearSystemFill.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..78680cfb6a1ea00e514c2a17cc8ed162542ee732
--- /dev/null
+++ b/lib/kokkos/example/multi_fem/SparseLinearSystemFill.hpp
@@ -0,0 +1,276 @@
+/*
+//@HEADER
+// ************************************************************************
+// 
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+// 
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+// 
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact  H. Carter Edwards (hcedwar@sandia.gov)
+// 
+// ************************************************************************
+//@HEADER
+*/
+
+#ifndef SPARSELINEARSYSTEMFILL_HPP
+#define SPARSELINEARSYSTEMFILL_HPP
+
+#include <vector>
+#include <algorithm>
+#include <limits>
+
+#include <FEMesh.hpp>
+#include <SparseLinearSystem.hpp>
+
+//----------------------------------------------------------------------------
+
+namespace HybridFEM {
+
+/** \brief  Gather element matrices and element vectors into a sparse linear
+ *          system.  Only declared here; see the CrsMatrix / FEMesh
+ *          specialization.
+ */
+template< class MatrixType , class MeshType ,
+          class elem_matrices_type ,
+          class elem_vectors_type > struct GatherFill ;
+
+
+/** \brief  Gather-fill for a Kokkos::CrsMatrix over an FEMesh.
+ *
+ *  The functor is run once per matrix row.  For every ( element , local node )
+ *  pair listed for that row in the mesh's node-to-element map it adds the
+ *  element vector entry to the right-hand side and the matching row of the
+ *  element matrix to the matrix coefficients; 'elem_graph' supplies the
+ *  coefficient index for each ( element , row node , column node ).
+ */
+template< typename ScalarType ,
+          class    DeviceType ,
+          unsigned ElemNode ,
+          typename CoordScalarType ,
+          class elem_matrices_type ,
+          class elem_vectors_type >
+struct GatherFill< 
+  Kokkos::CrsMatrix< ScalarType , DeviceType > ,
+  FEMesh< CoordScalarType , ElemNode , DeviceType > ,
+  elem_matrices_type , elem_vectors_type >
+{
+  typedef DeviceType     execution_space ;
+  typedef typename execution_space::size_type  size_type ;
+
+  static const size_type ElemNodeCount = ElemNode ;
+
+  typedef Kokkos::CrsMatrix< ScalarType , execution_space >    matrix_type ;
+  typedef typename matrix_type::coefficients_type   coefficients_type ;
+  typedef Kokkos::View< ScalarType[] , execution_space >  vector_type ;
+  typedef Kokkos::View< size_type[][ElemNodeCount][ElemNodeCount] , execution_space >       elem_graph_type ;
+
+  typedef FEMesh< CoordScalarType , ElemNodeCount , execution_space > mesh_type ;
+  typedef typename mesh_type::node_elem_ids_type node_elem_ids_type ;
+
+private:
+
+  node_elem_ids_type  node_elem_ids ;
+  elem_graph_type     elem_graph ;
+  elem_matrices_type  elem_matrices ;
+  elem_vectors_type   elem_vectors ;
+  coefficients_type   system_coeff ;
+  vector_type         system_rhs ;
+
+public:
+
+  /** \brief  Accumulate all element contributions that belong to row 'irow'. */
+  KOKKOS_INLINE_FUNCTION
+  void operator()( size_type irow ) const
+  {
+    const size_type node_elem_begin = node_elem_ids.row_map[irow];
+    const size_type node_elem_end   = node_elem_ids.row_map[irow+1];
+
+    //  for each element that a node belongs to 
+
+    for ( size_type i = node_elem_begin ; i < node_elem_end ; i++ ) {
+
+      // Entry 0 is the element, entry 1 the node's local index within it.
+      const size_type elem_id   = node_elem_ids.entries( i, 0);
+      const size_type row_index = node_elem_ids.entries( i, 1);
+
+      system_rhs(irow) += elem_vectors(elem_id, row_index);
+
+      //  for each node in a particular related element  
+      //  gather the contents of the element stiffness
+      //  matrix that belong in irow
+
+      for ( size_type j = 0 ; j < ElemNodeCount ; ++j ){
+        const size_type A_index = elem_graph( elem_id , row_index , j );
+
+        system_coeff( A_index ) += elem_matrices( elem_id, row_index, j );
+      }
+    }
+  }
+
+
+  /** \brief  Run the gather-fill over every row of 'matrix'.
+   *
+   *  Contributions are added to 'matrix.coefficients' and 'rhs'; neither is
+   *  cleared here.
+   */
+  static void apply( const matrix_type & matrix ,
+                     const vector_type & rhs ,
+                     const mesh_type   & mesh ,
+                     const elem_graph_type    & elem_graph ,
+                     const elem_matrices_type & elem_matrices ,
+                     const elem_vectors_type  & elem_vectors )
+  {
+    // The row map has one more entry than the matrix has rows.  An empty
+    // row map therefore means "no rows"; without this guard the unsigned
+    // subtraction would wrap around to a huge row count.
+    const size_t row_map_length = matrix.graph.row_map.dimension_0();
+    const size_t row_count      = row_map_length ? row_map_length - 1 : 0 ;
+
+    GatherFill op ;
+    op.node_elem_ids = mesh.node_elem_ids ;
+    op.elem_graph    = elem_graph ;
+    op.elem_matrices = elem_matrices ;
+    op.elem_vectors  = elem_vectors ;
+    op.system_coeff  = matrix.coefficients ;
+    op.system_rhs    = rhs ;
+
+    parallel_for( row_count , op );
+  }
+};
+
+} /* namespace HybridFEM */
+
+//----------------------------------------------------------------------------
+//----------------------------------------------------------------------------
+
+namespace HybridFEM {
+
+/** \brief  Builds the node-to-node graph of the sparse matrix for a mesh, and
+ *          the map from ( element , row node , column node ) to the offset
+ *          of the matching entry in that graph.
+ */
+template< class GraphType , class MeshType >
+struct GraphFactory {
+  typedef GraphType                         graph_type ;
+  typedef MeshType                          mesh_type ;
+  typedef typename graph_type::execution_space  execution_space ;
+  typedef typename execution_space::size_type   size_type  ;
+
+  static const unsigned ElemNodeCount = mesh_type::element_node_count ;
+
+  typedef Kokkos::View< size_type[][ElemNodeCount][ElemNodeCount] , execution_space >         element_map_type ;
+
+  /** \brief  Create 'graph' and 'elem_map' from 'mesh'.
+   *
+   *  \param mesh      mesh providing the element->node and node->element
+   *                   maps and the count of owned nodes
+   *  \param graph     [out] graph with one row per owned node; a row lists,
+   *                   sorted and without duplicates, every node of every
+   *                   element the row's node belongs to
+   *  \param elem_map  [out] for an owned row node: offset of the
+   *                   ( row node , column node ) entry in 'graph';
+   *                   for a row node that is not owned: the maximum value
+   *                   of 'size_type'
+   *
+   *  \throws std::runtime_error  if a column node of an element is missing
+   *                              from the graph row of an owned row node
+   */
+  static
+  void
+  create( const mesh_type & mesh ,
+          graph_type & graph ,
+          element_map_type & elem_map )
+  {
+    typename mesh_type::node_elem_ids_type::HostMirror
+      node_elem_ids = create_mirror( mesh.node_elem_ids );
+
+    typename mesh_type::elem_node_ids_type::HostMirror
+      elem_node_ids = create_mirror( mesh.elem_node_ids );
+
+    typedef typename element_map_type::HostMirror element_map_host_type ;
+
+    deep_copy( elem_node_ids , mesh.elem_node_ids );
+    deep_copy( node_elem_ids.entries , mesh.node_elem_ids.entries );
+
+    const size_t owned_node = mesh.parallel_data_map.count_owned ;
+    const size_t total_elem = mesh.elem_node_ids.dimension_0();
+
+    if ( total_elem ) {
+      elem_map = element_map_type( std::string("element_map"), total_elem );
+    }
+
+    element_map_host_type elem_map_host = create_mirror( elem_map );
+
+    //------------------------------------
+    //  Node->node mapping for the CrsMatrix graph
+
+    std::vector< std::vector< unsigned > > node_node_ids( owned_node );
+    std::vector< unsigned > node_node_begin( owned_node );
+
+    // 'offset' is the running total of the row lengths, i.e. where row 'i'
+    // begins once all rows are stored back to back.
+    size_t offset = 0 ;
+    for ( size_t i = 0 ; i < owned_node ; ++i ) {
+      const size_t j_end = node_elem_ids.row_map[i+1];
+            size_t j     = node_elem_ids.row_map[i];
+
+      node_node_begin[i] = offset ;
+
+      std::vector< unsigned > & work = node_node_ids[i] ;
+
+      for ( ; j < j_end ; ++j ) {
+        const size_t elem_id = node_elem_ids.entries(j,0);
+        for ( size_t k = 0 ; k < ElemNodeCount ; ++k ) {
+          work.push_back( elem_node_ids( elem_id , k ) );
+        }
+      }
+
+      // Sorted and unique, so that the rows can be binary-searched below.
+      std::sort( work.begin() , work.end() );
+
+      work.erase( std::unique( work.begin() , work.end() ) , work.end() );
+
+      offset += work.size();
+    }
+
+    graph = Kokkos::create_staticcrsgraph< graph_type >( "node_node_ids" , node_node_ids );
+
+    //------------------------------------
+    // ( element , node_row , node_column ) -> matrix_crs_column
+
+    for ( size_t elem_id = 0 ; elem_id < total_elem ; ++elem_id ) {
+      for ( size_t i = 0 ; i < ElemNodeCount ; ++i ) {
+
+        const size_t node_row = elem_node_ids( elem_id , i );
+
+        if ( owned_node <= node_row ) {
+          // The row node is not owned: it has no graph row, so mark every
+          // column of this element row as invalid.  This test must come
+          // before any use of 'node_row' as an index, because the per-row
+          // vectors only have 'owned_node' entries.
+          for ( unsigned j = 0 ; j < ElemNodeCount ; ++j ) {
+            elem_map_host( elem_id , i , j ) = std::numeric_limits<size_type>::max();
+          }
+          continue ;
+        }
+
+        const size_t node_row_begin = node_node_begin[ node_row ];
+        const std::vector< unsigned > & column = node_node_ids[ node_row ] ;
+
+        for ( unsigned j = 0 ; j < ElemNodeCount ; ++j ) {
+          const size_type node_col = elem_node_ids( elem_id , j );
+
+          // First position in the sorted row that is not less than
+          // 'node_col'; this may be the end of the row.
+          const std::vector< unsigned >::const_iterator found =
+            std::lower_bound( column.begin() , column.end() , node_col );
+
+          if ( found == column.end() || node_col != *found ) {
+            throw std::runtime_error(std::string("Failed"));
+          }
+
+          elem_map_host( elem_id , i , j ) =
+            static_cast< size_t >( found - column.begin() ) + node_row_begin ;
+        }
+      }
+    }
+
+    deep_copy( elem_map , elem_map_host );
+  }
+};
+
+} // namespace HybridFEM
+
+
+//----------------------------------------------------------------------------
+//----------------------------------------------------------------------------
+
+#endif /* #ifndef SPARSELINEARSYSTEMFILL_HPP */
+
diff --git a/lib/kokkos/example/multi_fem/SparseLinearSystem_Cuda.hpp b/lib/kokkos/example/multi_fem/SparseLinearSystem_Cuda.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..3b22d4c5d0e73d9c5c39e92ad043debf796993a5
--- /dev/null
+++ b/lib/kokkos/example/multi_fem/SparseLinearSystem_Cuda.hpp
@@ -0,0 +1,164 @@
+/*
+//@HEADER
+// ************************************************************************
+// 
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+// 
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+// 
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact  H. Carter Edwards (hcedwar@sandia.gov)
+// 
+// ************************************************************************
+//@HEADER
+*/
+
+#ifndef SPARSELINEARSYSTEM_CUDA_HPP
+#define SPARSELINEARSYSTEM_CUDA_HPP
+
+#if defined( BUILD_FROM_CU_FILE )
+
+#include <cusparse_v2.h>
+#include <Kokkos_Core.hpp>
+
+namespace Kokkos {
+namespace Impl {
+
+
+/** \brief  cuSPARSE library handle together with a matrix descriptor set up
+ *          for general matrices with zero-based indices.  Obtained through
+ *          singleton().
+ */
+struct CudaSparseSingleton {
+  cusparseHandle_t   handle;
+  cusparseMatDescr_t descra;
+
+  /** \brief  Create the handle and the descriptor.
+   *
+   *  Every cuSPARSE call is checked: continuing after a failed creation
+   *  would leave 'handle' / 'descra' unusable for the multiplies.
+   *
+   *  \throws std::runtime_error  naming the cuSPARSE call that failed
+   */
+  CudaSparseSingleton()
+  {
+    check( cusparseCreate( & handle ) , "cusparseCreate" );
+    check( cusparseCreateMatDescr( & descra ) , "cusparseCreateMatDescr" );
+    check( cusparseSetMatType( descra , CUSPARSE_MATRIX_TYPE_GENERAL ) ,
+           "cusparseSetMatType" );
+    check( cusparseSetMatIndexBase( descra , CUSPARSE_INDEX_BASE_ZERO ) ,
+           "cusparseSetMatIndexBase" );
+  }
+
+  static CudaSparseSingleton & singleton();
+
+private:
+
+  /** \brief  Throw if 'status' is anything other than success. */
+  static void check( const cusparseStatus_t status , const char * const call )
+  {
+    if ( CUSPARSE_STATUS_SUCCESS != status ) {
+      throw std::runtime_error( std::string("ERROR - ") + call );
+    }
+  }
+};
+
+/** \brief  Access the one shared instance; it is a function-local static and
+ *          is therefore constructed the first time this function is called.
+ */
+CudaSparseSingleton & CudaSparseSingleton::singleton()
+{
+  static CudaSparseSingleton instance ;
+  return instance ;
+}
+
+
+/** \brief  y = A * x  for double precision on Cuda, delegated to the
+ *          cuSPARSE routine cusparseDcsrmv (non-transposed, alpha = 1,
+ *          beta = 0).  Constructing the object performs the multiply.
+ *
+ *  \throws std::runtime_error  if cuSPARSE does not report success
+ */
+template<>
+struct Multiply< CrsMatrix<double,Cuda> ,
+                 View<double*,Cuda > ,
+                 View<double*,Cuda > >
+{
+  typedef Cuda                                        execution_space ;
+  typedef execution_space::size_type                  size_type ;
+  typedef double                                      scalar_type ;
+  typedef View< scalar_type* , execution_space >      vector_type ;
+  typedef CrsMatrix< scalar_type , execution_space >  matrix_type ;
+
+public:
+
+  Multiply( const matrix_type & A ,
+            const size_type nrow ,
+            const size_type ncol ,
+            const vector_type & x ,
+            const vector_type & y )
+  {
+    CudaSparseSingleton & sparse = CudaSparseSingleton::singleton();
+
+    // Scaling factors of  y = one * A * x + zero * y
+    const scalar_type one  = 1 ;
+    const scalar_type zero = 0 ;
+
+    const cusparseStatus_t status =
+      cusparseDcsrmv( sparse.handle ,
+                      CUSPARSE_OPERATION_NON_TRANSPOSE ,
+                      nrow ,
+                      ncol ,
+                      A.coefficients.dimension_0() , // number of stored entries
+                      &one ,
+                      sparse.descra ,
+                      A.coefficients.ptr_on_device() ,
+                      A.graph.row_map.ptr_on_device() ,
+                      A.graph.entries.ptr_on_device() ,
+                      x.ptr_on_device() ,
+                      &zero ,
+                      y.ptr_on_device() );
+
+    if ( status != CUSPARSE_STATUS_SUCCESS ) {
+      throw std::runtime_error( std::string("ERROR - cusparseDcsrmv " ) );
+    }
+  }
+};
+
+
+/** \brief  y = A * x  for single precision on Cuda, delegated to the
+ *          cuSPARSE routine cusparseScsrmv (non-transposed, alpha = 1,
+ *          beta = 0).  Constructing the object performs the multiply.
+ *
+ *  \throws std::runtime_error  if cuSPARSE does not report success
+ */
+template<>
+struct Multiply< CrsMatrix<float,Cuda> ,
+                 View<float*,Cuda > ,
+                 View<float*,Cuda > >
+{
+  typedef Cuda                                      execution_space ;
+  typedef execution_space::size_type                    size_type ;
+  typedef float                                     scalar_type ;
+  typedef View< scalar_type* , execution_space >        vector_type ;
+  typedef CrsMatrix< scalar_type , execution_space >    matrix_type ;
+
+public:
+
+  Multiply( const matrix_type & A ,
+            const size_type nrow ,
+            const size_type ncol ,
+            const vector_type & x ,
+            const vector_type & y )
+  {
+    CudaSparseSingleton & s = CudaSparseSingleton::singleton();
+    const scalar_type alpha = 1 , beta = 0 ;
+
+    cusparseStatus_t status =
+      cusparseScsrmv( s.handle ,
+                      CUSPARSE_OPERATION_NON_TRANSPOSE ,
+                      nrow , ncol , A.coefficients.dimension_0() ,
+                      &alpha ,
+                      s.descra ,
+                      A.coefficients.ptr_on_device() ,
+                      A.graph.row_map.ptr_on_device() ,
+                      A.graph.entries.ptr_on_device() ,
+                      x.ptr_on_device() ,
+                      &beta ,
+                      y.ptr_on_device() );
+
+    if ( CUSPARSE_STATUS_SUCCESS != status ) {
+      // Name the routine that was actually called (single precision).
+      throw std::runtime_error( std::string("ERROR - cusparseScsrmv " ) );
+    }
+  }
+};
+
+} /* namespace Impl */
+} /* namespace Kokkos */
+
+#endif /* #if defined( BUILD_FROM_CU_FILE ) */
+#endif /* #ifndef SPARSELINEARSYSTEM_CUDA_HPP */
+
diff --git a/lib/kokkos/example/multi_fem/TestBoxMeshFixture.hpp b/lib/kokkos/example/multi_fem/TestBoxMeshFixture.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..9cc32b6b104f5d161d05cb3ced0f8412dbfbefbc
--- /dev/null
+++ b/lib/kokkos/example/multi_fem/TestBoxMeshFixture.hpp
@@ -0,0 +1,242 @@
+/*
+//@HEADER
+// ************************************************************************
+// 
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+// 
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+// 
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact  H. Carter Edwards (hcedwar@sandia.gov)
+// 
+// ************************************************************************
+//@HEADER
+*/
+
+#ifndef TESTFEMESHBOXFIXTURE_HPP
+#define TESTFEMESHBOXFIXTURE_HPP
+
+#include <stdio.h>
+#include <iostream>
+#include <stdexcept>
+#include <limits>
+#include <utility>
+#include <BoxMeshFixture.hpp>
+
+#include <ParallelComm.hpp>
+
+//----------------------------------------------------------------------------
+
+namespace TestFEMesh {
+
+template< class ViewType >
+struct VerifyUnpack  ;
+
+// Reduction functor: compares coordinates held in a flat buffer (three
+// consecutive entries per node) against the node coordinate array and
+// counts the nodes whose values differ.
+template< typename DeviceType, typename T >
+struct VerifyUnpack< Kokkos::View< T*[3] , DeviceType > >
+{
+  typedef DeviceType                              execution_space ;
+  typedef typename execution_space::size_type     size_type ;
+  typedef size_type                               value_type ;
+
+  typedef Kokkos::View< T* ,    execution_space > buffer_type ;
+  typedef Kokkos::View< T*[3] , execution_space > array_type ;
+
+private:
+
+  array_type  node_coords ;
+  buffer_type buffer ;
+  size_type   node_begin ;
+
+public:
+
+  // Start a partial result with zero mismatches.
+  KOKKOS_INLINE_FUNCTION
+  static void init( value_type & update )
+  { update = 0 ; }
+
+  // Fold one partial mismatch count into another.
+  KOKKOS_INLINE_FUNCTION
+  static void join( volatile value_type & update ,
+                    const volatile value_type & source )
+  { update += source ; }
+
+  // Compare buffer entries [3*i , 3*i+2] with node ( i + node_begin ).
+  // A mismatch is reported with printf and adds one to 'update'.
+  KOKKOS_INLINE_FUNCTION
+  void operator()( const size_type i , value_type & update ) const
+  {
+    const size_type node_id = node_begin + i ;
+    const size_type offset  = 3 * i ;
+
+    const long xb = buffer[offset];
+    const long yb = buffer[offset+1];
+    const long zb = buffer[offset+2];
+    const long xn = node_coords(node_id,0);
+    const long yn = node_coords(node_id,1);
+    const long zn = node_coords(node_id,2);
+
+    const bool matches = ( xb == xn ) && ( yb == yn ) && ( zb == zn );
+
+    if ( ! matches ) {
+      printf("TestFEMesh::VerifyUnpack failed at %d : node %d : { %ld %ld %ld } != { %ld %ld %ld }\n",
+             (int)i,(int)node_id, xb,yb,zb, xn, yn, zn );
+      ++update ;
+    }
+  }
+
+  // Run the comparison over 'arg_node_count' nodes starting at
+  // 'arg_node_begin' and return the number of mismatching nodes.
+  static inline
+  size_type unpack( const array_type  & arg_node_coords ,
+                    const size_type     arg_node_begin ,
+                    const size_type     arg_node_count ,
+                    const buffer_type & arg_buffer )
+  {
+    VerifyUnpack functor ;
+    functor.node_coords = arg_node_coords ;
+    functor.buffer      = arg_buffer ;
+    functor.node_begin  = arg_node_begin ;
+
+    size_type mismatches = 0 ;
+    Kokkos::parallel_reduce( arg_node_count , functor , mismatches );
+    return mismatches ;
+  }
+};
+
+}
+
+//----------------------------------------------------------------------------
+
+#ifdef KOKKOS_HAVE_MPI
+
+namespace TestFEMesh {
+
+// Exchange node coordinates between processes, compare what was received
+// with the local node coordinates, sum the counts over all processes and
+// let rank 0 print a PASSED / FAILED summary line.
+template< typename coordinate_scalar_type ,
+          unsigned ElemNodeCount ,
+          class Device >
+void verify_parallel(
+  const HybridFEM::FEMesh< coordinate_scalar_type ,
+                           ElemNodeCount ,
+                           Device > & mesh )
+{
+  typedef HybridFEM::FEMesh< coordinate_scalar_type, ElemNodeCount, Device > femesh_type ;
+  typedef typename femesh_type::node_coords_type node_coords_type ;
+  typedef Kokkos::PackArray< node_coords_type > pack_type ;
+
+  comm::Machine machine = mesh.parallel_data_map.machine ;
+
+  // Communicate node coordinates to verify communication and setup.
+  // Three values are exchanged per node.
+
+  const size_t chunk_size = 3 ;
+
+  Kokkos::AsyncExchange< coordinate_scalar_type, Device, Kokkos::ParallelDataMap >
+    exchange( mesh.parallel_data_map , chunk_size );
+
+  // The send range begins at count_interior, the receive range at count_owned.
+  const size_t first_sent     = mesh.parallel_data_map.count_interior ;
+  const size_t num_sent       = mesh.parallel_data_map.count_send ;
+
+  const size_t first_received = mesh.parallel_data_map.count_owned ;
+  const size_t num_received   = mesh.parallel_data_map.count_receive ;
+
+  pack_type::pack( exchange.buffer(), first_sent, num_sent, mesh.node_coords );
+
+  exchange.setup();
+
+  // Launch local-action device kernels
+
+  exchange.send_receive();
+
+  // [0] owned nodes , [1] received nodes , [2] nodes that failed verification
+  unsigned long local_counts[3] ;
+  local_counts[0] = mesh.parallel_data_map.count_owned ;
+  local_counts[1] = mesh.parallel_data_map.count_receive ;
+  local_counts[2] = TestFEMesh::VerifyUnpack< node_coords_type >::unpack( mesh.node_coords, first_received, num_received, exchange.buffer() );
+
+  unsigned long global_counts[3] = { 0 , 0 , 0 };
+
+  MPI_Allreduce( local_counts , global_counts ,
+                 3 , MPI_UNSIGNED_LONG , MPI_SUM , machine.mpi_comm );
+
+  if ( 0 == comm::rank( machine ) ) {
+    const char * const verdict = global_counts[2] ? "FAILED" : "PASSED" ;
+
+    std::cout << verdict
+              << ": TestFEMesh::verify_parallel "
+              << "NP(" << comm::size( machine )
+              << ") total_node(" << global_counts[0]
+              << ") verified_nodes(" << global_counts[1]
+              << ") failed_nodes(" << global_counts[2]
+              << ")" << std::endl ;
+  }
+}
+
+} // namespace TestFEMesh
+
+#else /* ! #ifdef KOKKOS_HAVE_MPI */
+
+namespace TestFEMesh {
+
+// Variant compiled when KOKKOS_HAVE_MPI is not defined: the mesh argument
+// is ignored and nothing is verified.
+template< typename coordinate_scalar_type , unsigned ElemNodeCount , class Device >
+void verify_parallel(
+  const HybridFEM::FEMesh< coordinate_scalar_type , ElemNodeCount , Device > & )
+{
+}
+
+} // namespace TestFEMesh
+
+#endif /* ! #ifdef KOKKOS_HAVE_MPI */
+
+//----------------------------------------------------------------------------
+
+// Build a box mesh of Hex8 elements with 'long' coordinates for this
+// process, attach the communication machine to it and run
+// TestFEMesh::verify_parallel on the result.
+// The node counts are converted to element counts by subtracting one
+// in each direction.
+template< class Device >
+void test_box_fixture( comm::Machine machine ,
+                       const size_t gang_count ,
+                       const size_t nodes_nx ,
+                       const size_t nodes_ny ,
+                       const size_t nodes_nz )
+{
+  typedef BoxMeshFixture< long , Device , FixtureElementHex8 > box_fixture ;
+  typedef typename box_fixture::FEMeshType                     femesh ;
+
+  const size_t num_procs = comm::size( machine );
+  const size_t my_proc   = comm::rank( machine ) ;
+
+  const size_t elems_x = nodes_nx - 1 ;
+  const size_t elems_y = nodes_ny - 1 ;
+  const size_t elems_z = nodes_nz - 1 ;
+
+  femesh mesh =
+    box_fixture::create( num_procs , my_proc , gang_count ,
+                         elems_x , elems_y , elems_z );
+
+  mesh.parallel_data_map.machine = machine ;
+
+  TestFEMesh::verify_parallel( mesh );
+}
+
+#endif /* #ifndef TESTFEMESHBOXFIXTURE_HPP */
+
+
diff --git a/lib/kokkos/example/multi_fem/TestBoxMeshPartition.cpp b/lib/kokkos/example/multi_fem/TestBoxMeshPartition.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..ffaeeb6af5e1403b0b57b72af909905f7811ccf8
--- /dev/null
+++ b/lib/kokkos/example/multi_fem/TestBoxMeshPartition.cpp
@@ -0,0 +1,172 @@
+//@HEADER
+// ************************************************************************
+// 
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+// 
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+// 
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact  H. Carter Edwards (hcedwar@sandia.gov)
+// 
+// ************************************************************************
+//@HEADER
+
+#include <iostream>
+#include <stdexcept>
+#include <limits>
+#include <utility>
+#include <BoxMeshPartition.hpp>
+
+//----------------------------------------------------------------------------
+
+// Partition a [0,100) x [0,200) x [0,300) root box with box_partition_rcb
+// for a growing sequence of part counts, then build the partition maps for
+// a few sampled parts and cross-check their receive counts against the
+// send maps of the corresponding partner parts.
+//
+// print : when true, a summary of every partition and sampled part is
+//         written to std::cout.
+// Throws std::runtime_error when the maps are inconsistent.
+void test_box_partition( bool print )
+{
+  const size_t np_max = 10000 ;
+
+  const BoxBoundsLinear use_box ;
+
+  BoxType root_box ;
+
+  root_box[0][0] = 0 ; root_box[0][1] = 100 ;
+  root_box[1][0] = 0 ; root_box[1][1] = 200 ;
+  root_box[2][0] = 0 ; root_box[2][1] = 300 ;
+
+  const size_t cell_total =
+    ( root_box[0][1] - root_box[0][0] ) *
+    ( root_box[1][1] - root_box[1][0] ) *
+    ( root_box[2][1] - root_box[2][0] );
+
+  // Part counts visited: 2, 6, 14, 30, ... while below np_max.
+  for ( size_t np = 2 ; np < np_max ; np = 2 * ( np + 1 ) ) {
+
+    std::vector<BoxType> part_boxes( np );
+
+    box_partition_rcb( root_box , part_boxes );
+
+    // Ideal (rounded up) number of cells per part versus the largest part.
+    const size_t cell_goal = ( cell_total + np - 1 ) / np ;
+    size_t largest_part = 0 ;
+
+    for ( size_t p = 0 ; p < np ; ++p ) {
+      largest_part = std::max( largest_part , count( part_boxes[p] ) );
+    }
+
+    if ( print ) {
+      std::cout << std::endl
+                << "box_part( " << np 
+                << " ) max( " << largest_part
+                << " ) goal( " << cell_goal
+                << " ) ratio( " << double(largest_part) / double(cell_goal)
+                << " )" << std::endl ;
+    }
+
+    // Only up to four evenly strided parts are examined in detail.
+    const size_t nsample = std::min(np,(size_t)4);
+    const size_t stride = ( np + nsample - 1 ) / nsample ;
+
+    for ( size_t my_part = 0 ; my_part < np ; my_part += stride ) {
+      BoxType             my_use_box ;
+      std::vector<size_t> my_use_id_map ;
+      size_t              my_count_interior ;
+      size_t              my_count_owned ;
+      size_t              my_count_uses ;
+      std::vector<size_t> my_recv_counts ;
+      std::vector<std::vector<size_t> > my_send_map ;
+
+      box_partition_maps( root_box , part_boxes ,
+                          use_box , my_part ,
+                          my_use_box , my_use_id_map ,
+                          my_count_interior ,
+                          my_count_owned ,
+                          my_count_uses ,
+                          my_recv_counts ,
+                          my_send_map );
+
+      // Owned count plus all received counts must add up to the use count.
+      size_t uses_tally = my_count_owned ;
+
+      if ( print ) {
+        std::cout << "  my_part(" << my_part << ") layout { "
+                  << "P" << my_part
+                  << "(" << my_count_interior
+                  << "," << ( my_count_owned - my_count_interior )
+                  << ")" ;
+      }
+
+      for ( size_t i = 1 ; i < np ; ++i ) {
+
+        // Nothing is received at this offset: nothing to check.
+        if ( ! my_recv_counts[i] ) { continue ; }
+
+        uses_tally += my_recv_counts[i] ;
+
+        // Part located 'i' positions after my_part (wrapping around).
+        const size_t ip = ( my_part + i ) % np ;
+
+        if ( print ) {
+          std::cout << " P" << ip << "(" << my_recv_counts[i] << ")" ;
+        }
+
+        // Compare recv & send lists
+
+        BoxType             ip_use_box ;
+        std::vector<size_t> ip_use_id_map ;
+        size_t              ip_count_interior ;
+        size_t              ip_count_owned ;
+        size_t              ip_count_uses ;
+        std::vector<size_t> ip_recv_counts ;
+        std::vector<std::vector<size_t> > ip_send_map ;
+
+        box_partition_maps( root_box , part_boxes ,
+                            use_box , ip ,
+                            ip_use_box , ip_use_id_map ,
+                            ip_count_interior ,
+                            ip_count_owned ,
+                            ip_count_uses ,
+                            ip_recv_counts ,
+                            ip_send_map );
+
+        // Sent by ip, received by my_part:
+
+        const BoxType shared_box = intersect( part_boxes[ip] , my_use_box );
+        const size_t  shared_count = count( shared_box );
+
+        // Offset of my_part as seen from ip.
+        const size_t j = ( my_part + np - ip ) % np ;
+
+        const bool consistent =
+          shared_count == my_recv_counts[i] &&
+          shared_count == ip_send_map[j].size() ;
+
+        if ( ! consistent ) {
+          throw std::runtime_error( std::string("bad recv/send map") );
+        }
+      }
+      if ( print ) { std::cout << " }" << std::endl ; }
+
+      if ( uses_tally != my_count_uses ) {
+        throw std::runtime_error( std::string("bad partition map") );
+      }
+    }
+  }
+}
+
+
diff --git a/lib/kokkos/example/multi_fem/TestCuda.cpp b/lib/kokkos/example/multi_fem/TestCuda.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..11370a53504589c1f06e19caa5cf0110a7e9a909
--- /dev/null
+++ b/lib/kokkos/example/multi_fem/TestCuda.cpp
@@ -0,0 +1,192 @@
+/*
+//@HEADER
+// ************************************************************************
+// 
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+// 
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+// 
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact  H. Carter Edwards (hcedwar@sandia.gov)
+// 
+// ************************************************************************
+//@HEADER
+*/
+
+#include <Kokkos_Core.hpp>
+
+#include <TestBoxMeshFixture.hpp>
+#include <Implicit.hpp>
+#include <Nonlinear.hpp>
+#include <Explicit.hpp>
+
+#include <SparseLinearSystem.hpp>
+
+#if defined( KOKKOS_HAVE_CUDA )
+
+//----------------------------------------------------------------------------
+
+namespace Kokkos {
+namespace Impl {
+
+// Return the single CudaSparseSingleton object, a function-local static
+// that is constructed on the first call.
+CudaSparseSingleton & CudaSparseSingleton::singleton()
+{
+  static CudaSparseSingleton instance ;
+  return instance ;
+}
+
+}
+}
+
+//----------------------------------------------------------------------------
+
+// Print this process' rank together with the number of Cuda devices
+// reported by Kokkos::Cuda::detect_device_count().
+void test_cuda_query( comm::Machine machine )
+{
+  const size_t my_rank = comm::rank( machine );
+
+  std::cout << "P" << my_rank << ": Cuda device_count = "
+            << Kokkos::Cuda::detect_device_count() << std::endl ;
+}
+
+//----------------------------------------------------------------------------
+
+// Initialize the host execution space and one Cuda device, run
+// test_box_fixture on Kokkos::Cuda with nx * ny * nz nodes, then finalize
+// both execution spaces.
+void test_cuda_fixture( comm::Machine machine ,
+                        size_t nx , size_t ny , size_t nz )
+{
+  const size_t my_rank    = comm::rank( machine );
+  const size_t rank_count = comm::size( machine );
+  const size_t gpu_count  = Kokkos::Cuda::detect_device_count();
+
+  // Device index is ( rank modulo device count ) when at least one device
+  // exists and there are no more devices than ranks; otherwise device 0.
+  size_t gpu_id = 0 ;
+  if ( gpu_count && gpu_count <= rank_count ) {
+    gpu_id = my_rank % gpu_count ;
+  }
+
+  const size_t gang_count = 0 ;
+
+  Kokkos::HostSpace::execution_space::initialize();
+  Kokkos::Cuda::SelectDevice select_device( gpu_id );
+  Kokkos::Cuda::initialize( select_device );
+
+  test_box_fixture<Kokkos::Cuda>( machine , gang_count , nx , ny , nz );
+
+  Kokkos::Cuda::finalize();
+  Kokkos::HostSpace::execution_space::finalize();
+}
+
+//----------------------------------------------------------------------------
+
+// Initialize the host execution space and one Cuda device, run the
+// HybridFEM::Implicit driver in double precision on Kokkos::Cuda, then
+// finalize both execution spaces.
+void test_cuda_implicit( comm::Machine machine , 
+                         size_t elem_count_begin ,
+                         size_t elem_count_end ,
+                         size_t count_run )
+{
+  const size_t my_rank    = comm::rank( machine );
+  const size_t rank_count = comm::size( machine );
+  const size_t gpu_count  = Kokkos::Cuda::detect_device_count();
+
+  // Device index is ( rank modulo device count ) when at least one device
+  // exists and there are no more devices than ranks; otherwise device 0.
+  size_t gpu_id = 0 ;
+  if ( gpu_count && gpu_count <= rank_count ) {
+    gpu_id = my_rank % gpu_count ;
+  }
+
+  const size_t gang_count = 0 ;
+
+  Kokkos::HostSpace::execution_space::initialize();
+  Kokkos::Cuda::SelectDevice select_device( gpu_id );
+  Kokkos::Cuda::initialize( select_device );
+
+  HybridFEM::Implicit::driver<double,Kokkos::Cuda>( "Cuda" , machine , gang_count , elem_count_begin , elem_count_end , count_run );
+
+  Kokkos::Cuda::finalize();
+  Kokkos::HostSpace::execution_space::finalize();
+}
+
+//----------------------------------------------------------------------------
+
+// Initialize the host execution space and one Cuda device, run the
+// Explicit driver in double precision on Kokkos::Cuda, then finalize
+// both execution spaces.
+void test_cuda_explicit( comm::Machine machine , 
+                         size_t elem_count_begin ,
+                         size_t elem_count_end ,
+                         size_t count_run )
+{
+  const size_t my_rank    = comm::rank( machine );
+  const size_t rank_count = comm::size( machine );
+  const size_t gpu_count  = Kokkos::Cuda::detect_device_count();
+
+  // Device index is ( rank modulo device count ) when at least one device
+  // exists and there are no more devices than ranks; otherwise device 0.
+  size_t gpu_id = 0 ;
+  if ( gpu_count && gpu_count <= rank_count ) {
+    gpu_id = my_rank % gpu_count ;
+  }
+
+  const size_t gang_count = 0 ;
+
+  Kokkos::HostSpace::execution_space::initialize();
+  Kokkos::Cuda::SelectDevice select_device( gpu_id );
+  Kokkos::Cuda::initialize( select_device );
+
+  Explicit::driver<double,Kokkos::Cuda>( "Cuda" , machine , gang_count , elem_count_begin , elem_count_end , count_run );
+
+  Kokkos::Cuda::finalize();
+  Kokkos::HostSpace::execution_space::finalize();
+}
+
+//----------------------------------------------------------------------------
+
+// Initialize the host execution space and one Cuda device, run the
+// HybridFEM::Nonlinear driver in double precision with FixtureElementHex8
+// elements on Kokkos::Cuda, then finalize both execution spaces.
+void test_cuda_nonlinear( comm::Machine machine , 
+                          size_t elem_count_begin ,
+                          size_t elem_count_end ,
+                          size_t count_run )
+{
+  typedef Kokkos::Cuda       device ;
+  typedef FixtureElementHex8 hex8 ;
+
+  const size_t my_rank    = comm::rank( machine );
+  const size_t rank_count = comm::size( machine );
+  const size_t gpu_count  = Kokkos::Cuda::detect_device_count();
+
+  // Device index is ( rank modulo device count ) when at least one device
+  // exists and there are no more devices than ranks; otherwise device 0.
+  size_t gpu_id = 0 ;
+  if ( gpu_count && gpu_count <= rank_count ) {
+    gpu_id = my_rank % gpu_count ;
+  }
+
+  const size_t gang_count = 0 ;
+
+  Kokkos::HostSpace::execution_space::initialize();
+  Kokkos::Cuda::SelectDevice select_device( gpu_id );
+  Kokkos::Cuda::initialize( select_device );
+
+  HybridFEM::Nonlinear::driver<double,device,hex8>( "Cuda" , machine , gang_count , elem_count_begin , elem_count_end , count_run );
+
+  Kokkos::Cuda::finalize();
+  Kokkos::HostSpace::execution_space::finalize();
+}
+
+// Initialize the host execution space and one Cuda device, run the
+// HybridFEM::Nonlinear driver in double precision with FixtureElementHex27
+// elements on Kokkos::Cuda, then finalize both execution spaces.
+void test_cuda_nonlinear_quadratic( comm::Machine machine , 
+                                    size_t elem_count_begin ,
+                                    size_t elem_count_end ,
+                                    size_t count_run )
+{
+  typedef Kokkos::Cuda        device ;
+  typedef FixtureElementHex27 hex27 ;
+
+  const size_t my_rank    = comm::rank( machine );
+  const size_t rank_count = comm::size( machine );
+  const size_t gpu_count  = Kokkos::Cuda::detect_device_count();
+
+  // Device index is ( rank modulo device count ) when at least one device
+  // exists and there are no more devices than ranks; otherwise device 0.
+  size_t gpu_id = 0 ;
+  if ( gpu_count && gpu_count <= rank_count ) {
+    gpu_id = my_rank % gpu_count ;
+  }
+
+  const size_t gang_count = 0 ;
+
+  Kokkos::HostSpace::execution_space::initialize();
+  Kokkos::Cuda::SelectDevice select_device( gpu_id );
+  Kokkos::Cuda::initialize( select_device );
+
+  HybridFEM::Nonlinear::driver<double,device,hex27>( "Cuda" , machine , gang_count , elem_count_begin , elem_count_end , count_run );
+
+  Kokkos::Cuda::finalize();
+  Kokkos::HostSpace::execution_space::finalize();
+}
+
+//----------------------------------------------------------------------------
+
+#endif  /* #if defined( KOKKOS_HAVE_CUDA ) */
+
diff --git a/lib/kokkos/example/multi_fem/TestHost.cpp b/lib/kokkos/example/multi_fem/TestHost.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..facdd0f28a2e0f5d19f42cfefaccb643c01becdd
--- /dev/null
+++ b/lib/kokkos/example/multi_fem/TestHost.cpp
@@ -0,0 +1,137 @@
+//@HEADER
+// ************************************************************************
+// 
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+// 
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+// 
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact  H. Carter Edwards (hcedwar@sandia.gov)
+// 
+// ************************************************************************
+//@HEADER
+
+// Must be included first on Intel-Phi systems due to
+// redefinition of SEEK_SET in <mpi.h>.
+
+#include <ParallelComm.hpp>
+
+#include <iostream>
+#include <stdexcept>
+#include <limits>
+#include <utility>
+
+//----------------------------------------------------------------------------
+
+#include <Kokkos_Core.hpp>
+
+#include <BoxMeshFixture.hpp>
+#include <TestBoxMeshFixture.hpp>
+#include <Implicit.hpp>
+#include <Nonlinear.hpp>
+#include <Explicit.hpp>
+#include <SparseLinearSystem.hpp>
+
+//----------------------------------------------------------------------------
+//----------------------------------------------------------------------------
+
+// Initialize the host execution space with gang_count * gang_worker_count
+// as its argument, run test_box_fixture on it with nx * ny * nz nodes,
+// then finalize the execution space.
+void test_host_fixture( comm::Machine machine ,
+                        size_t gang_count ,
+                        size_t gang_worker_count ,
+                        size_t nx , size_t ny , size_t nz )
+{
+  typedef Kokkos::HostSpace::execution_space host_space ;
+
+  host_space::initialize( gang_count * gang_worker_count );
+
+  test_box_fixture< host_space >( machine , gang_count , nx , ny , nz );
+
+  host_space::finalize();
+}
+
+//----------------------------------------------------------------------------
+
+// Initialize the host execution space with gang_count * gang_worker_count
+// as its argument, run the HybridFEM::Implicit driver in double precision
+// on it, then finalize the execution space.
+void test_host_implicit( comm::Machine machine ,
+                         size_t gang_count ,
+                         size_t gang_worker_count ,
+                         size_t elem_count_begin ,
+                         size_t elem_count_end ,
+                         size_t count_run )
+{
+  typedef Kokkos::HostSpace::execution_space host_space ;
+
+  host_space::initialize( gang_count * gang_worker_count );
+
+  HybridFEM::Implicit::driver<double,host_space>( "Threads" , machine , gang_count , elem_count_begin , elem_count_end , count_run );
+
+  host_space::finalize();
+}
+
+//----------------------------------------------------------------------------
+
+// Initialize the host execution space with gang_count * gang_worker_count
+// as its argument, run the Explicit driver in double precision on it,
+// then finalize the execution space.
+void test_host_explicit( comm::Machine machine ,
+                         size_t gang_count ,
+                         size_t gang_worker_count ,
+                         size_t elem_count_begin ,
+                         size_t elem_count_end ,
+                         size_t count_run )
+{
+  typedef Kokkos::HostSpace::execution_space host_space ;
+
+  host_space::initialize( gang_count * gang_worker_count );
+
+  Explicit::driver<double,host_space>( "Threads" , machine , gang_count , elem_count_begin , elem_count_end , count_run );
+
+  host_space::finalize();
+}
+
+//----------------------------------------------------------------------------
+//----------------------------------------------------------------------------
+
+// Initialize the host execution space with gang_count * gang_worker_count
+// as its argument, run the HybridFEM::Nonlinear driver in double precision
+// with FixtureElementHex8 elements, then finalize the execution space.
+void test_host_nonlinear( comm::Machine machine ,
+                          size_t gang_count ,
+                          size_t gang_worker_count ,
+                          size_t elem_count_begin ,
+                          size_t elem_count_end ,
+                          size_t count_run )
+{
+  typedef Kokkos::HostSpace::execution_space device ;
+  typedef FixtureElementHex8                 hex8 ;
+
+  device::initialize( gang_count * gang_worker_count );
+
+  HybridFEM::Nonlinear::driver<double,device,hex8>( "Threads" , machine , gang_count , elem_count_begin , elem_count_end , count_run );
+
+  device::finalize();
+}
+
+// Initialize the host execution space with gang_count * gang_worker_count
+// as its argument, run the HybridFEM::Nonlinear driver in double precision
+// with FixtureElementHex27 elements, then finalize the execution space.
+void test_host_nonlinear_quadratic( comm::Machine machine ,
+                                    size_t gang_count ,
+                                    size_t gang_worker_count ,
+                                    size_t elem_count_begin ,
+                                    size_t elem_count_end ,
+                                    size_t count_run )
+{
+  typedef Kokkos::HostSpace::execution_space device ;
+  typedef FixtureElementHex27                hex27 ;
+
+  device::initialize( gang_count * gang_worker_count );
+
+  HybridFEM::Nonlinear::driver<double,device,hex27>( "Threads" , machine , gang_count , elem_count_begin , elem_count_end , count_run );
+
+  device::finalize();
+}
+
+//----------------------------------------------------------------------------
+
+
diff --git a/lib/kokkos/example/multi_fem/TestHybridFEM.cpp b/lib/kokkos/example/multi_fem/TestHybridFEM.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..1bbd74e4d7f5818712850ef2294849e01d078b92
--- /dev/null
+++ b/lib/kokkos/example/multi_fem/TestHybridFEM.cpp
@@ -0,0 +1,348 @@
+//@HEADER
+// ************************************************************************
+// 
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+// 
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+// 
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact  H. Carter Edwards (hcedwar@sandia.gov)
+// 
+// ************************************************************************
+//@HEADER
+
+// Must be included first on Intel-Phi systems due to
+// redefinition of SEEK_SET in <mpi.h>.
+
+#include <ParallelComm.hpp>
+
+#include <string>
+#include <sstream>
+#include <iostream>
+#include <Kokkos_hwloc.hpp>
+
+//----------------------------------------------------------------------------
+// Declared here, defined elsewhere; run by the "partition" command in run().
+void test_box_partition( bool print );
+
+//----------------------------------------------------------------------------
+// Host tests selected by run_host() below; declared here, defined elsewhere.
+void test_host_fixture( comm::Machine machine ,
+                        size_t gang_count ,
+                        size_t gang_worker_count ,
+                        size_t nx , size_t ny , size_t nz );
+
+void test_host_implicit( comm::Machine machine ,
+                         size_t gang_count ,
+                         size_t gang_worker_count ,
+                         size_t elem_count_begin ,
+                         size_t elem_count_end ,
+                         size_t count_run );
+
+void test_host_explicit( comm::Machine machine ,
+                         size_t gang_count ,
+                         size_t gang_worker_count ,
+                         size_t elem_count_begin ,
+                         size_t elem_count_end ,
+                         size_t count_run );
+
+void test_host_nonlinear( comm::Machine machine ,
+                          size_t gang_count ,
+                          size_t gang_worker_count ,
+                          size_t elem_count_begin ,
+                          size_t elem_count_end ,
+                          size_t count_run );
+
+void test_host_nonlinear_quadratic( comm::Machine machine ,
+                                    size_t gang_count ,
+                                    size_t gang_worker_count ,
+                                    size_t elem_count_begin ,
+                                    size_t elem_count_end ,
+                                    size_t count_run );
+
+
+//----------------------------------------------------------------------------
+// CUDA tests; this file only calls them when KOKKOS_HAVE_CUDA is defined.
+void test_cuda_query( comm::Machine );
+
+void test_cuda_fixture( comm::Machine machine ,
+                        size_t nx , size_t ny , size_t nz );
+
+void test_cuda_implicit( comm::Machine machine ,
+                         size_t elem_count_begin ,
+                         size_t elem_count_end ,
+                         size_t count_run );
+
+void test_cuda_explicit( comm::Machine machine ,
+                         size_t elem_count_begin ,
+                         size_t elem_count_end ,
+                         size_t count_run );
+
+void test_cuda_nonlinear( comm:: Machine machine ,
+                          size_t elem_count_begin ,
+                          size_t elem_count_end ,
+                          size_t count_run );
+
+void test_cuda_nonlinear_quadratic( comm::Machine machine ,
+                                    size_t elem_count_begin ,
+                                    size_t elem_count_end ,
+                                    size_t count_run );
+
+
+//----------------------------------------------------------------------------
+
+//----------------------------------------------------------------------------
+//----------------------------------------------------------------------------
+
+namespace {
+
+// Reads the host test name and its arguments from 'input' and runs that test
+// with the given gang configuration.
+//
+//   input                   stream positioned just before the test name
+//   machine                 parallel machine handle forwarded to the test
+//   host_gang_count         forwarded unchanged to the selected test
+//   host_gang_worker_count  forwarded unchanged to the selected test
+//
+// Returns true when the test name is not recognized (a command line error)
+// and false after a recognized test has been run.
+bool run_host( std::istream & input ,
+               comm::Machine machine ,
+               const size_t host_gang_count ,
+               const size_t host_gang_worker_count )
+{
+  // Common signature of the tests driven by an element-count range.
+  typedef void (*range_test_type)( comm::Machine ,
+                                   size_t , size_t ,
+                                   size_t , size_t , size_t );
+
+  std::string test_name ;
+  input >> test_name ;
+
+  if ( test_name == "fixture" ) {
+    // The fixture test takes three mesh dimensions instead of a range.
+    size_t nx = 0 , ny = 0 , nz = 0 ;
+    input >> nx >> ny >> nz ;
+    test_host_fixture( machine , host_gang_count , host_gang_worker_count , nx , ny , nz );
+    return false ;
+  }
+
+  range_test_type selected = 0 ;
+
+  if ( test_name == "explicit" ) {
+    selected = & test_host_explicit ;
+  }
+  else if ( test_name == "implicit" ) {
+    selected = & test_host_implicit ;
+  }
+  else if ( test_name == "nonlinear" ) {
+    selected = & test_host_nonlinear ;
+  }
+  else if ( test_name == "nonlinear_quadratic" ) {
+    selected = & test_host_nonlinear_quadratic ;
+  }
+
+  if ( 0 == selected ) { return true ; }
+
+  // Defaults, overwritten by whatever values the stream provides.
+  size_t mesh_node_begin = 100 ;
+  size_t mesh_node_end   = 300 ;
+  size_t run_count       =   1 ;
+  input >> mesh_node_begin >> mesh_node_end >> run_count ;
+
+  (*selected)( machine , host_gang_count , host_gang_worker_count ,
+               mesh_node_begin , mesh_node_end , run_count );
+  return false ;
+}
+
+#if defined( KOKKOS_HAVE_CUDA )
+// Reads the CUDA test name and its arguments from 'input' and runs that test.
+// Compiled only when KOKKOS_HAVE_CUDA is defined.
+//
+//   input    stream positioned just before the test name
+//   machine  parallel machine handle forwarded to the test
+//
+// Returns true when the test name is not recognized (a command line error)
+// and false after a recognized test has been run.
+bool run_cuda( std::istream & input , comm::Machine machine )
+{
+  // Common signature of the tests driven by an element-count range.
+  typedef void (*range_test_type)( comm::Machine , size_t , size_t , size_t );
+
+  std::string test_name ;
+  input >> test_name ;
+
+  if ( test_name == "fixture" ) {
+    // The fixture test takes three mesh dimensions instead of a range.
+    size_t nx = 0 , ny = 0 , nz = 0 ;
+    input >> nx >> ny >> nz ;
+    test_cuda_fixture( machine , nx , ny , nz );
+    return false ;
+  }
+
+  // Select the range-driven test by name; 'selected' stays null when unknown.
+  range_test_type selected = 0 ;
+
+  if ( test_name == "explicit" ) {
+    selected = & test_cuda_explicit ;
+  }
+  else if ( test_name == "implicit" ) {
+    selected = & test_cuda_implicit ;
+  }
+  else if ( test_name == "nonlinear" ) {
+    selected = & test_cuda_nonlinear ;
+  }
+  else if ( test_name == "nonlinear_quadratic" ) {
+    selected = & test_cuda_nonlinear_quadratic ;
+  }
+
+  if ( 0 == selected ) {
+    // Unknown test name: report a command error without running anything.
+    return true ;
+  }
+
+  // Defaults, overwritten by whatever values the stream provides.
+  size_t mesh_node_begin = 100 ;
+  size_t mesh_node_end   = 300 ;
+  size_t run_count       =   1 ;
+  input >> mesh_node_begin >> mesh_node_end >> run_count ;
+
+  (*selected)( machine , mesh_node_begin , mesh_node_end , run_count );
+
+  return false ;
+}
+#endif
+
+// Interprets one command line ('argline') and runs the requested action:
+// "query", "partition", or a host / cuda test.  When the command is not
+// recognized, rank 0 prints the usage summary to std::cout.
+void run( const std::string & argline , comm::Machine machine )
+{
+  // Topology counts reported by Kokkos::hwloc.
+  const unsigned numa_count       = Kokkos::hwloc::get_available_numa_count();
+  const unsigned cores_per_numa   = Kokkos::hwloc::get_available_cores_per_numa();
+  const unsigned threads_per_core = Kokkos::hwloc::get_available_threads_per_core();
+
+  std::istringstream input( argline );
+  std::string command ;
+  input >> command ;
+
+  bool cmd_error = false ;
+
+  // Dispatch on the first word of the command line.
+  if ( command == "query" ) {
+    std::cout << "P" << comm::rank( machine )
+              << ": hwloc { NUMA[" << numa_count << "]"
+              << " CORE[" << cores_per_numa << "]"
+              << " PU[" << threads_per_core << "] }"
+              << std::endl ;
+#if defined( KOKKOS_HAVE_CUDA )
+    test_cuda_query( machine );
+#endif
+  }
+  else if ( command == "partition" ) {
+    // Only rank 0 runs the box partition test.
+    if ( 0 == comm::rank( machine ) ) {
+      test_box_partition( false /* print flag */ );
+    }
+  }
+  else if ( command == "host" ) {
+    // Gang count and workers per gang are read from the command line.
+    size_t host_gang_count        = 0 ;
+    size_t host_gang_worker_count = 1 ;
+    input >> host_gang_count >> host_gang_worker_count ;
+    cmd_error = run_host( input , machine , host_gang_count , host_gang_worker_count );
+  }
+  else if ( command == "host-all" ) {
+    // Gang count = NUMA count; workers per gang = cores x threads per core.
+    const size_t workers = cores_per_numa * threads_per_core ;
+    cmd_error = run_host( input , machine , numa_count , workers );
+  }
+  else if ( command == "host-most" ) {
+    // Same as "host-all" but with one core fewer per NUMA region.
+    const size_t workers = ( cores_per_numa - 1 ) * threads_per_core ;
+    cmd_error = run_host( input , machine , numa_count , workers );
+  }
+#if defined( KOKKOS_HAVE_CUDA )
+  else if ( command == "cuda" ) {
+    cmd_error = run_cuda( input , machine );
+  }
+#endif
+  else {
+    cmd_error = true ;
+  }
+
+  if ( ! cmd_error || 0 != comm::rank( machine ) ) { return ; }
+
+  // Usage summary, printed by rank 0 only.
+  std::cout << "Expecting command line with" << std::endl
+            << "    query" << std::endl
+            << "    partition" << std::endl
+            << "    host NumNumaNode NumThreadPerNode <test>" << std::endl
+            << "    host-all <test>" << std::endl
+            << "    host-most <test>" << std::endl
+            << "    cuda <test>" << std::endl
+            << "where <test> is" << std::endl
+            << "    fixture   NumElemX NumElemY NumElemZ" << std::endl
+            << "    implicit  NumElemBegin NumElemEnd NumRun" << std::endl
+            << "    explicit  NumElemBegin NumElemEnd NumRun" << std::endl
+            << "    nonlinear NumElemBegin NumElemEnd NumRun" << std::endl
+            << "    nonlinear_quadratic NumElemBegin NumElemEnd NumRun" << std::endl ;
+}
+
+} // namespace
+
+//----------------------------------------------------------------------------
+//----------------------------------------------------------------------------
+
+// Initializes the machine, runs the command line, and exits non-zero if run() threw.
+int main( int argc , char ** argv )
+{
+  comm::Machine machine = comm::Machine::init( & argc , & argv );
+  const unsigned comm_rank = comm::rank( machine );
+  const std::string argline = comm::command_line( machine , argc , argv );
+  int status = 0 ; // set to 1 when an exception escapes run()
+
+  try {
+    run( argline , machine );
+  }
+  catch( const std::exception & x ) {
+    std::cerr << "P" << comm_rank << " throw: " << x.what() << std::endl ;
+    status = 1 ;
+  }
+  catch( ... ) {
+    std::cerr << "P" << comm_rank << " throw: unknown exception" << std::endl ;
+    status = 1 ;
+  }
+  comm::Machine::finalize();
+  return status ;
+}
+
diff --git a/lib/kokkos/example/query_device/CMakeLists.txt b/lib/kokkos/example/query_device/CMakeLists.txt
new file mode 100644
index 0000000000000000000000000000000000000000..dade7f01fef5c935ab3e11bcffc5722ed4b9d1d5
--- /dev/null
+++ b/lib/kokkos/example/query_device/CMakeLists.txt
@@ -0,0 +1,14 @@
+
+INCLUDE_DIRECTORIES(${CMAKE_CURRENT_BINARY_DIR})
+INCLUDE_DIRECTORIES(${CMAKE_CURRENT_SOURCE_DIR})
+
+# List the source explicitly rather than globbing: a glob is evaluated only
+# at configure time, so files added later are missed until CMake is re-run.
+# query_device.cpp is the only .cpp file of this example.
+
+TRIBITS_ADD_EXECUTABLE(
+  query_device
+  SOURCES query_device.cpp
+  COMM serial mpi
+  )
+
diff --git a/lib/kokkos/example/query_device/Makefile b/lib/kokkos/example/query_device/Makefile
new file mode 100644
index 0000000000000000000000000000000000000000..bf8fbea3e09a5d71f900de85ff2100cf41bd5738
--- /dev/null
+++ b/lib/kokkos/example/query_device/Makefile
@@ -0,0 +1,53 @@
+# Compiles every .cpp located next to this Makefile and links one Kokkos executable.
+KOKKOS_PATH ?= ../..
+MAKEFILE_PATH := $(abspath $(lastword $(MAKEFILE_LIST)))
+SRC_DIR := $(dir $(MAKEFILE_PATH))
+
+SRC = $(wildcard $(SRC_DIR)/*.cpp)
+OBJ = $(SRC:$(SRC_DIR)/%.cpp=%.o)
+
+# OBJ drops the directory part of each source, so object files are written
+# to the directory make is invoked from.
+
+default: build
+	echo "Start Build"
+
+# use installed Makefile.kokkos
+include $(KOKKOS_PATH)/Makefile.kokkos
+
+ifneq (,$(findstring Cuda,$(KOKKOS_DEVICES)))
+CXX = $(NVCC_WRAPPER)
+CXXFLAGS = -I$(SRC_DIR) -O3
+LINK = $(CXX)
+LINKFLAGS = 
+EXE = $(addsuffix .cuda, $(shell basename $(SRC_DIR)))
+#KOKKOS_DEVICES = "Cuda,OpenMP"
+#KOKKOS_ARCH = "SNB,Kepler35"
+else
+CXX = g++
+CXXFLAGS = -I$(SRC_DIR) -O3
+LINK = $(CXX)
+LINKFLAGS =  
+EXE = $(addsuffix .host, $(shell basename $(SRC_DIR)))
+#KOKKOS_DEVICES = "OpenMP"
+#KOKKOS_ARCH = "SNB"
+endif
+
+DEPFLAGS = -M
+
+LIB =
+
+
+build: $(EXE)
+
+$(EXE): $(OBJ) $(KOKKOS_LINK_DEPENDS)
+	$(LINK) $(KOKKOS_LDFLAGS) $(LINKFLAGS) $(EXTRA_PATH) $(OBJ) $(KOKKOS_LIBS) $(LIB) -o $(EXE)
+
+clean: 
+	rm -f *.a *.o *.cuda *.host
+
+# Compilation rules
+
+%.o:$(SRC_DIR)/%.cpp $(KOKKOS_CPP_DEPENDS)
+	$(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) $(EXTRA_INC) -c $<
+
diff --git a/lib/kokkos/example/query_device/query_device.cpp b/lib/kokkos/example/query_device/query_device.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..ced8cc4e95a5170441ecf1f9fae9113c885dd7d4
--- /dev/null
+++ b/lib/kokkos/example/query_device/query_device.cpp
@@ -0,0 +1,100 @@
+/*
+//@HEADER
+// ************************************************************************
+// 
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+// 
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+// 
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact  H. Carter Edwards (hcedwar@sandia.gov)
+// 
+// ************************************************************************
+//@HEADER
+*/
+
+#include <iostream>
+#include <sstream>
+
+#include <Kokkos_Macros.hpp>
+
+#if defined( KOKKOS_HAVE_MPI )
+#include <mpi.h>
+#endif
+
+#include <Kokkos_Core.hpp>
+
+//----------------------------------------------------------------------------
+//----------------------------------------------------------------------------
+
+// Prints one report per process: the MPI rank (when built with MPI), the
+// hwloc counts (when hwloc is available) and the CUDA configuration
+// (when built with CUDA), enclosed in braces.
+int main( int argc , char ** argv )
+{
+  // The whole report is assembled first and written with a single insertion.
+  std::ostringstream report ;
+
+#if defined( KOKKOS_HAVE_MPI )
+  MPI_Init( & argc , & argv );
+  int my_rank = 0 ;
+  MPI_Comm_rank( MPI_COMM_WORLD , & my_rank );
+  report << "MPI rank(" << my_rank << ") " ;
+#endif
+
+  report << "{" << std::endl ;
+
+  if ( Kokkos::hwloc::available() ) {
+    const unsigned numa_count       = Kokkos::hwloc::get_available_numa_count();
+    const unsigned cores_per_numa   = Kokkos::hwloc::get_available_cores_per_numa();
+    const unsigned threads_per_core = Kokkos::hwloc::get_available_threads_per_core();
+
+    report << "hwloc( NUMA[" << numa_count
+           << "] x CORE["    << cores_per_numa
+           << "] x HT["      << threads_per_core
+           << "] )"
+           << std::endl ;
+  }
+
+#if defined( KOKKOS_HAVE_CUDA )
+  Kokkos::Cuda::print_configuration( report );
+#endif
+
+  report << "}" << std::endl ;
+
+  std::cout << report.str();
+
+#if defined( KOKKOS_HAVE_MPI )
+  MPI_Finalize();
+#endif
+  return 0 ;
+}
+
diff --git a/lib/kokkos/example/sort_array/CMakeLists.txt b/lib/kokkos/example/sort_array/CMakeLists.txt
new file mode 100644
index 0000000000000000000000000000000000000000..0c7da74f4a9b94dbcdb2a2dc5d192203a319b048
--- /dev/null
+++ b/lib/kokkos/example/sort_array/CMakeLists.txt
@@ -0,0 +1,14 @@
+
+INCLUDE_DIRECTORIES(${CMAKE_CURRENT_BINARY_DIR})
+INCLUDE_DIRECTORIES(${CMAKE_CURRENT_SOURCE_DIR})
+
+# List the source explicitly rather than globbing: a glob is evaluated only
+# at configure time, so files added later are missed until CMake is re-run.
+# main.cpp is the only .cpp file of this example.
+
+TRIBITS_ADD_EXECUTABLE(
+  sort_array
+  SOURCES main.cpp
+  COMM serial mpi
+  )
+
diff --git a/lib/kokkos/example/sort_array/Makefile b/lib/kokkos/example/sort_array/Makefile
new file mode 100644
index 0000000000000000000000000000000000000000..bf8fbea3e09a5d71f900de85ff2100cf41bd5738
--- /dev/null
+++ b/lib/kokkos/example/sort_array/Makefile
@@ -0,0 +1,53 @@
+# Compiles every .cpp located next to this Makefile and links one Kokkos executable.
+KOKKOS_PATH ?= ../..
+MAKEFILE_PATH := $(abspath $(lastword $(MAKEFILE_LIST)))
+SRC_DIR := $(dir $(MAKEFILE_PATH))
+
+SRC = $(wildcard $(SRC_DIR)/*.cpp)
+OBJ = $(SRC:$(SRC_DIR)/%.cpp=%.o)
+
+# OBJ drops the directory part of each source, so object files are written
+# to the directory make is invoked from.
+
+default: build
+	echo "Start Build"
+
+# use installed Makefile.kokkos
+include $(KOKKOS_PATH)/Makefile.kokkos
+
+ifneq (,$(findstring Cuda,$(KOKKOS_DEVICES)))
+CXX = $(NVCC_WRAPPER)
+CXXFLAGS = -I$(SRC_DIR) -O3
+LINK = $(CXX)
+LINKFLAGS = 
+EXE = $(addsuffix .cuda, $(shell basename $(SRC_DIR)))
+#KOKKOS_DEVICES = "Cuda,OpenMP"
+#KOKKOS_ARCH = "SNB,Kepler35"
+else
+CXX = g++
+CXXFLAGS = -I$(SRC_DIR) -O3
+LINK = $(CXX)
+LINKFLAGS =  
+EXE = $(addsuffix .host, $(shell basename $(SRC_DIR)))
+#KOKKOS_DEVICES = "OpenMP"
+#KOKKOS_ARCH = "SNB"
+endif
+
+DEPFLAGS = -M
+
+LIB =
+
+
+build: $(EXE)
+
+$(EXE): $(OBJ) $(KOKKOS_LINK_DEPENDS)
+	$(LINK) $(KOKKOS_LDFLAGS) $(LINKFLAGS) $(EXTRA_PATH) $(OBJ) $(KOKKOS_LIBS) $(LIB) -o $(EXE)
+
+clean: 
+	rm -f *.a *.o *.cuda *.host
+
+# Compilation rules
+
+%.o:$(SRC_DIR)/%.cpp $(KOKKOS_CPP_DEPENDS)
+	$(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) $(EXTRA_INC) -c $<
+
diff --git a/lib/kokkos/example/sort_array/main.cpp b/lib/kokkos/example/sort_array/main.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..74c4ab154cf0856bd53f654e2f33dd884d49ddcd
--- /dev/null
+++ b/lib/kokkos/example/sort_array/main.cpp
@@ -0,0 +1,95 @@
+/*
+//@HEADER
+// ************************************************************************
+// 
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+// 
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+// 
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact  H. Carter Edwards (hcedwar@sandia.gov)
+// 
+// ************************************************************************
+//@HEADER
+*/
+
+#include <string.h>
+#include <stdlib.h>
+#include <iostream>
+#include <sstream>
+
+#include <Kokkos_Core.hpp>
+
+#include <sort_array.hpp>
+
+
+// Runs Example::sort_array on every Kokkos device that reports is_initialized().
+// Usage: <exe> [length_array N]  (N = span length, default 100000; total = 100 * N).
+int main( int argc , char ** argv )
+{
+#if defined( KOKKOS_HAVE_CUDA ) || defined( KOKKOS_HAVE_PTHREAD ) || defined( KOKKOS_HAVE_OPENMP )
+  Kokkos::initialize( argc , argv );
+
+  int length_array = 100000 ;
+  // Stop one short of argc so the value after "length_array" always exists;
+  // a trailing "length_array" without a value is ignored, not dereferenced.
+  for ( int i = 0 ; i + 1 < argc ; ++i ) {
+    if ( 0 == strcmp( argv[i] , "length_array" ) ) {
+      length_array = atoi( argv[i+1] );
+    }
+  }
+
+  int length_total_array  = length_array * 100;
+
+#if defined( KOKKOS_HAVE_CUDA )
+  if ( Kokkos::Cuda::is_initialized() ) {
+    std::cout << "Kokkos::Cuda" << std::endl ;
+    Example::sort_array< Kokkos::Cuda >( length_array , length_total_array );
+  }
+#endif
+#if defined( KOKKOS_HAVE_PTHREAD )
+  if ( Kokkos::Threads::is_initialized() ) {
+    std::cout << "Kokkos::Threads" << std::endl ;
+    Example::sort_array< Kokkos::Threads >( length_array , length_total_array );
+  }
+#endif
+#if defined( KOKKOS_HAVE_OPENMP )
+  if ( Kokkos::OpenMP::is_initialized() ) {
+    std::cout << "Kokkos::OpenMP" << std::endl ;
+    Example::sort_array< Kokkos::OpenMP >( length_array , length_total_array );
+  }
+#endif
+
+  Kokkos::finalize();
+#endif
+  return 0 ;
+}
+
diff --git a/lib/kokkos/example/sort_array/sort_array.hpp b/lib/kokkos/example/sort_array/sort_array.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..d21f9989582c7be28e7c5c1c0f325330cc340e78
--- /dev/null
+++ b/lib/kokkos/example/sort_array/sort_array.hpp
@@ -0,0 +1,190 @@
+/*
+//@HEADER
+// ************************************************************************
+// 
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+// 
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+// 
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact  H. Carter Edwards (hcedwar@sandia.gov)
+// 
+// ************************************************************************
+//@HEADER
+*/
+
+#ifndef EXAMPLE_SORT_ARRAY
+#define EXAMPLE_SORT_ARRAY
+
+#include <stdlib.h>
+#include <algorithm>
+
+#include <Kokkos_Core.hpp>
+
+#include <impl/Kokkos_Timer.hpp>
+
+//----------------------------------------------------------------------------
+//----------------------------------------------------------------------------
+
+namespace Example {
+
+// Constructing a SortView sorts elements [begin,end) of 'v' in place with std::sort.
+template< class Device >
+struct SortView {
+  template< typename ValueType >
+  SortView( const Kokkos::View<ValueType*,Device> v , int begin , int end ) {
+    ValueType * const data = v.ptr_on_device();
+    std::sort( data + begin , data + end );
+  }
+};
+
+}
+
+#if defined(KOKKOS_HAVE_CUDA)
+
+#include <thrust/device_ptr.h>
+#include <thrust/sort.h>
+
+namespace Example {
+
+// Kokkos::Cuda specialization: sorts [begin,end) of 'v' with thrust::sort over device_ptr iterators.
+template<>
+struct SortView< Kokkos::Cuda > {
+  template< typename ValueType >
+  SortView( const Kokkos::View<ValueType*,Kokkos::Cuda> v , int begin , int end ) {
+    typedef thrust::device_ptr<ValueType> device_iterator ;
+    thrust::sort( device_iterator( v.ptr_on_device() + begin ) , device_iterator( v.ptr_on_device() + end ) );
+  }
+};
+
+}
+
+#endif
+
+//----------------------------------------------------------------------------
+//----------------------------------------------------------------------------
+
+namespace Example {
+
+// Fills a host array of 'total_length' ints, then per span of 'array_length':
+// copies it to 'Device', sorts it, copies it back, counts out-of-order pairs.
+// NOTE(review): the 'print' parameter is not read anywhere in this function.
+template< class Device >
+void sort_array( const size_t array_length /* length of spans of array to sort */
+               , const size_t total_length /* total length of array */
+               , const int print = 1 )
+{
+  typedef Device execution_space ;
+  typedef Kokkos::View<int*,Device>  device_array_type ;
+
+#if defined( KOKKOS_HAVE_CUDA )
+  // For Kokkos::Cuda the host array is a View in CudaHostPinnedSpace;
+  // for any other Device it is the device view's HostMirror type.
+  typedef typename
+    Kokkos::Impl::if_c< Kokkos::Impl::is_same< Device , Kokkos::Cuda >::value
+                      , Kokkos::View<int*,Kokkos::Cuda::array_layout,Kokkos::CudaHostPinnedSpace>
+                      , typename device_array_type::HostMirror
+                      >::type  host_array_type ;
+#else
+
+  typedef typename device_array_type::HostMirror  host_array_type ;
+
+#endif
+
+  Kokkos::Timer timer;
+
+  const device_array_type  work_array("work_array" , array_length );
+  const host_array_type    host_array("host_array" , total_length );
+
+  std::cout << "sort_array length( " << total_length << " )"
+            << " in chunks( " << array_length << " )"
+            << std::endl ;
+
+  double sec = timer.seconds();
+  std::cout << "declaring Views took "
+            << sec << " seconds" << std::endl;
+  timer.reset();
+  // presumably scales lrand48() (documented as < 2^31) into [0,total_length) -- TODO confirm
+  for ( size_t i = 0 ; i < total_length ; ++i ) {
+    host_array(i) = ( lrand48() * total_length ) >> 31 ;
+  }
+
+  sec = timer.seconds();
+  std::cout << "initializing " << total_length << " elements on host took "
+            << sec << " seconds" << std::endl;
+  timer.reset();
+
+  double sec_copy_in  = 0 ;
+  double sec_sort     = 0 ;
+  double sec_copy_out = 0 ;
+  double sec_error    = 0 ;
+  size_t error_count  = 0 ;
+
+  for ( size_t begin = 0 ; begin < total_length ; begin += array_length ) {
+    // [begin,end) is the current span; the final span is clipped to total_length.
+    const size_t end = begin + array_length < total_length
+                     ? begin + array_length : total_length ;
+    const std::pair<size_t,size_t> host_range(begin,end);
+    const host_array_type host_subarray = Kokkos::subview( host_array , host_range );
+    timer.reset();
+    // NOTE(review): work_array always has array_length entries while a clipped
+    // final span is shorter -- confirm deep_copy accepts differing extents.
+    Kokkos::deep_copy( work_array , host_subarray );
+
+    sec_copy_in += timer.seconds(); timer.reset();
+    // Constructing the temporary SortView performs the sort.
+    SortView< execution_space >( work_array , 0 , end - begin );
+
+    sec_sort += timer.seconds(); timer.reset();
+    Kokkos::deep_copy( host_subarray , work_array );
+
+    sec_copy_out += timer.seconds(); timer.reset();
+    // Count adjacent pairs inside the span that are out of order.
+    for ( size_t i = begin + 1 ; i < end ; ++i ) {
+      if ( host_array(i) < host_array(i-1) ) ++error_count ;
+    }
+
+    sec_error += timer.seconds(); timer.reset();
+  }
+
+  std::cout << "copy to   device " << sec_copy_in  << " seconds" << std::endl
+            << "sort on   device " << sec_sort     << " seconds" << std::endl
+            << "copy from device " << sec_copy_out << " seconds" << std::endl
+            << "errors " << error_count << " took " << sec_error << " seconds" << std::endl
+            ;
+}
+
+} // namespace Example
+
+//----------------------------------------------------------------------------
+
+#endif /* #ifndef EXAMPLE_SORT_ARRAY */
+
diff --git a/lib/kokkos/example/tutorial/01_hello_world/CMakeLists.txt b/lib/kokkos/example/tutorial/01_hello_world/CMakeLists.txt
new file mode 100644
index 0000000000000000000000000000000000000000..5e5b1fcb46ffbdcb7dacf3bcb6627fa90c7a1157
--- /dev/null
+++ b/lib/kokkos/example/tutorial/01_hello_world/CMakeLists.txt
@@ -0,0 +1,11 @@
+
+INCLUDE_DIRECTORIES(
+  ${CMAKE_CURRENT_BINARY_DIR}
+  ${CMAKE_CURRENT_SOURCE_DIR})
+# Tutorial executable only: it is built but not registered as a CTest test.
+TRIBITS_ADD_EXECUTABLE(
+  tutorial_01_hello_world
+  SOURCES hello_world.cpp
+  COMM serial mpi
+  )
+
diff --git a/lib/kokkos/example/tutorial/01_hello_world/Makefile b/lib/kokkos/example/tutorial/01_hello_world/Makefile
new file mode 100644
index 0000000000000000000000000000000000000000..78a9fed0cce641b48c85f4d67a1d0ab6c5a63388
--- /dev/null
+++ b/lib/kokkos/example/tutorial/01_hello_world/Makefile
@@ -0,0 +1,43 @@
+# Builds the tutorial executable from the .cpp files in the current directory.
+KOKKOS_PATH = ../../..
+SRC = $(wildcard *.cpp)
+
+default: build
+	echo "Start Build"
+# KOKKOS_DEVICES is tested before either branch assigns it, so the Cuda branch
+# is taken only when the variable already contains "Cuda" at this point.
+ifneq (,$(findstring Cuda,$(KOKKOS_DEVICES)))
+CXX = ../../../config/nvcc_wrapper
+CXXFLAGS = -O3
+LINK = ${CXX}
+LINKFLAGS = 
+EXE = $(SRC:.cpp=.cuda)
+KOKKOS_DEVICES = "Cuda,OpenMP"
+KOKKOS_ARCH = "SNB,Kepler35"
+else
+CXX = g++
+CXXFLAGS = -O3
+LINK = ${CXX}
+LINKFLAGS =  
+EXE = $(SRC:.cpp=.host)
+KOKKOS_DEVICES = "OpenMP"
+KOKKOS_ARCH = "SNB"
+endif
+
+DEPFLAGS = -M
+OBJ = $(SRC:.cpp=.o)
+LIB =
+
+include $(KOKKOS_PATH)/Makefile.kokkos
+
+build: $(EXE)
+
+$(EXE): $(OBJ) $(KOKKOS_LINK_DEPENDS)
+	$(LINK) $(KOKKOS_LDFLAGS) $(LINKFLAGS) $(EXTRA_PATH) $(OBJ) $(KOKKOS_LIBS) $(LIB) -o $(EXE)
+
+clean: kokkos-clean 
+	rm -f *.o *.cuda *.host
+
+# Compilation rules
+%.o:%.cpp $(KOKKOS_CPP_DEPENDS)
+	$(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) $(EXTRA_INC) -c $<
diff --git a/lib/kokkos/example/tutorial/01_hello_world/hello_world.cpp b/lib/kokkos/example/tutorial/01_hello_world/hello_world.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..459b9b094fc93475ef62482ab4fff62d956cc9eb
--- /dev/null
+++ b/lib/kokkos/example/tutorial/01_hello_world/hello_world.cpp
@@ -0,0 +1,130 @@
+/*
+//@HEADER
+// ************************************************************************
+// 
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+// 
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+// 
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact  H. Carter Edwards (hcedwar@sandia.gov)
+// 
+// ************************************************************************
+//@HEADER
+*/
+
+#include <Kokkos_Core.hpp>
+#include <cstdio>
+#include <typeinfo>
+
+//
+// "Hello world" parallel_for example:
+//   1. Start up Kokkos
+//   2. Execute a parallel for loop in the default execution space,
+//      using a functor to define the loop body
+//   3. Shut down Kokkos
+//
+// If Kokkos was built with C++11 enabled, try comparing this example
+// to 01_hello_world_lambda.  The latter uses C++11 lambdas (anonymous
+// functions) to define the loop body of the parallel_for.  That makes
+// the code much more concise and readable.  On the other hand,
+// breaking out the loop body into an explicit functor makes it easier
+// to test the loop independently of the parallel pattern.
+//
+
+// Functor that defines the parallel_for's loop body.
+//
+// A "functor" is just a class or struct with a public operator()
+// instance method.
+struct hello_world {
+  // If a functor has an "execution_space" (or "device_type", for
+  // backwards compatibility) public typedef, parallel_* will only run
+  // the functor in that execution space.  That's a good way to mark a
+  // functor as specific to an execution space.  If the functor lacks
+  // this typedef, parallel_for will run it in the default execution
+  // space, unless you tell it otherwise (that's an advanced topic;
+  // see "execution policies").
+
+  // The functor's operator() defines the loop body.  It takes an
+  // integer argument which is the parallel for loop index.  Other
+  // arguments are possible; see the "hierarchical parallelism" part
+  // of the tutorial.
+  //
+  // The operator() method must be const, and must be marked with the
+  // KOKKOS_INLINE_FUNCTION macro.  If building with CUDA, this macro
+  // will mark your method as suitable for running on the CUDA device
+  // (as well as on the host).  If not building with CUDA, the macro
+  // is unnecessary but harmless.
+  KOKKOS_INLINE_FUNCTION
+  void operator() (const int i) const {
+    printf ("Hello from i = %i\n", i);
+  }
+};
+
+int main (int argc, char* argv[]) {
+  // You must call initialize() before using anything else in Kokkos.
+  //
+  // With no arguments, this initializes the default execution space
+  // (and potentially its host execution space) with default
+  // parameters.  You may also pass in argc and argv, analogously to
+  // MPI_Init().  It reads and removes command-line arguments that
+  // start with "--kokkos-".
+  Kokkos::initialize (argc, argv);
+
+  // Print the name of Kokkos' default execution space.  We're using
+  // typeid here, so the name might get a bit mangled by the compiler,
+  // but you should still be able to figure out what it is.
+  printf ("Hello World on Kokkos execution space %s\n",
+          typeid (Kokkos::DefaultExecutionSpace).name ());
+
+  // Run the above functor on the default Kokkos execution space in
+  // parallel, with a parallel for loop count of 15.
+  //
+  // The Kokkos::DefaultExecutionSpace typedef gives the default
+  // execution space.  Depending on how Kokkos was configured, this
+  // could be OpenMP, Threads, Cuda, Serial, or even some other
+  // execution space.
+  //
+  // The following line of code would look like this in OpenMP:
+  //
+  // #pragma omp parallel for
+  // for (int i = 0; i < 15; ++i) {
+  //   printf ("Hello from i = %i\n", i);
+  // }
+  //
+  // You may notice that the printed numbers do not print out in
+  // order.  Parallel for loops may execute in any order.
+  Kokkos::parallel_for ("HelloWorld",15, hello_world ());
+
+  // You must call finalize() after you are done using Kokkos.
+  Kokkos::finalize ();
+}
+
diff --git a/lib/kokkos/example/tutorial/01_hello_world_lambda/CMakeLists.txt b/lib/kokkos/example/tutorial/01_hello_world_lambda/CMakeLists.txt
new file mode 100644
index 0000000000000000000000000000000000000000..3fcca4bceba577bf644f1929e1c62c1893b5d5a5
--- /dev/null
+++ b/lib/kokkos/example/tutorial/01_hello_world_lambda/CMakeLists.txt
@@ -0,0 +1,13 @@
+
+INCLUDE_DIRECTORIES(${CMAKE_CURRENT_BINARY_DIR})
+INCLUDE_DIRECTORIES(${CMAKE_CURRENT_SOURCE_DIR})
+
+IF (Kokkos_ENABLE_CXX11)
+  # This is a tutorial, not a test, so we don't ask CTest to run it.
+  TRIBITS_ADD_EXECUTABLE(
+    tutorial_01_hello_world_lambda
+    SOURCES hello_world_lambda.cpp
+    COMM serial mpi
+    )
+ENDIF ()
+
diff --git a/lib/kokkos/example/tutorial/01_hello_world_lambda/Makefile b/lib/kokkos/example/tutorial/01_hello_world_lambda/Makefile
new file mode 100644
index 0000000000000000000000000000000000000000..95ee2c47feacf363f99052173a28596144a75734
--- /dev/null
+++ b/lib/kokkos/example/tutorial/01_hello_world_lambda/Makefile
@@ -0,0 +1,44 @@
+KOKKOS_PATH = ../../..
+SRC = $(wildcard *.cpp)
+
+default: build
+	echo "Start Build"
+
+ifneq (,$(findstring Cuda,$(KOKKOS_DEVICES)))
+CXX = ../../../config/nvcc_wrapper
+CXXFLAGS = -O3
+LINK = ${CXX}
+LINKFLAGS = 
+EXE = $(SRC:.cpp=.cuda)
+KOKKOS_DEVICES = "Cuda,OpenMP"
+KOKKOS_ARCH = "SNB,Kepler35"
+KOKKOS_CUDA_OPTIONS = "enable_lambda"
+else
+CXX = g++
+CXXFLAGS = -O3
+LINK = ${CXX}
+LINKFLAGS =  
+EXE = $(SRC:.cpp=.host)
+KOKKOS_DEVICES = "OpenMP"
+KOKKOS_ARCH = "SNB"
+endif
+
+DEPFLAGS = -M
+
+OBJ = $(SRC:.cpp=.o)
+LIB =
+
+include $(KOKKOS_PATH)/Makefile.kokkos
+
+build: $(EXE)
+
+$(EXE): $(OBJ) $(KOKKOS_LINK_DEPENDS)
+	$(LINK) $(KOKKOS_LDFLAGS) $(LINKFLAGS) $(EXTRA_PATH) $(OBJ) $(KOKKOS_LIBS) $(LIB) -o $(EXE)
+
+clean: kokkos-clean 
+	rm -f *.o *.cuda *.host
+
+# Compilation rules
+
+%.o:%.cpp $(KOKKOS_CPP_DEPENDS)
+	$(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) $(EXTRA_INC) -c $<
diff --git a/lib/kokkos/example/tutorial/01_hello_world_lambda/hello_world_lambda.cpp b/lib/kokkos/example/tutorial/01_hello_world_lambda/hello_world_lambda.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..b6c9cc5e4380d4ea8b825c9305f2e7cea6316a10
--- /dev/null
+++ b/lib/kokkos/example/tutorial/01_hello_world_lambda/hello_world_lambda.cpp
@@ -0,0 +1,109 @@
+/*
+//@HEADER
+// ************************************************************************
+// 
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+// 
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+// 
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact  H. Carter Edwards (hcedwar@sandia.gov)
+// 
+// ************************************************************************
+//@HEADER
+*/
+
+#include <Kokkos_Core.hpp>
+#include <cstdio>
+#include <typeinfo>
+
+//
+// "Hello world" parallel_for example:
+//   1. Start up Kokkos
+//   2. Execute a parallel for loop in the default execution space,
+//      using a C++11 lambda to define the loop body
+//   3. Shut down Kokkos
+//
+// This example only builds if C++11 is enabled.  Compare this example
+// to 01_hello_world, which uses functors (explicitly defined classes)
+// to define the loop body of the parallel_for.  Both functors and
+// lambdas have their places.
+//
+
+int main (int argc, char* argv[]) {
+  // You must call initialize() before using anything else in Kokkos.
+  //
+  // With no arguments, this initializes the default execution space
+  // (and potentially its host execution space) with default
+  // parameters.  You may also pass in argc and argv, analogously to
+  // MPI_Init().  It reads and removes command-line arguments that
+  // start with "--kokkos-".
+  Kokkos::initialize (argc, argv);
+
+  // Print the name of Kokkos' default execution space.  We're using
+  // typeid here, so the name might get a bit mangled by the compiler,
+  // but you should still be able to figure out what it is.
+  printf ("Hello World on Kokkos execution space %s\n",
+          typeid (Kokkos::DefaultExecutionSpace).name ());
+
+  // Run lambda on the default Kokkos execution space in parallel,
+  // with a parallel for loop count of 15.  The lambda's argument is
+  // an integer which is the parallel for's loop index.  As you learn
+  // about different kinds of parallelism, you will find out that
+  // there are other valid argument types as well.
+  //
+  // For a single level of parallelism, we prefer that you use the
+  // KOKKOS_LAMBDA macro.  If CUDA is disabled, this just turns into
+  // [=].  That captures variables from the surrounding scope by
+  // value.  Do NOT capture them by reference!  If CUDA is enabled,
+  // this macro may have a special definition that makes the lambda
+  // work correctly with CUDA.  Compare to the KOKKOS_INLINE_FUNCTION
+  // macro, which has a special meaning if CUDA is enabled.
+  //
+  // The following parallel_for would look like this if we were using
+  // OpenMP by itself, instead of Kokkos:
+  //
+  // #pragma omp parallel for
+  // for (int i = 0; i < 15; ++i) {
+  //   printf ("Hello from i = %i\n", i);
+  // }
+  //
+  // You may notice that the printed numbers do not print out in
+  // order.  Parallel for loops may execute in any order.
+  Kokkos::parallel_for (15, KOKKOS_LAMBDA (const int i) {
+      // printf works in a CUDA parallel kernel; std::ostream does not.
+      printf ("Hello from i = %i\n", i);
+    });
+
+  // You must call finalize() after you are done using Kokkos.
+  Kokkos::finalize ();
+}
+
diff --git a/lib/kokkos/example/tutorial/02_simple_reduce/CMakeLists.txt b/lib/kokkos/example/tutorial/02_simple_reduce/CMakeLists.txt
new file mode 100644
index 0000000000000000000000000000000000000000..7c78db840f849fd9625676c6a73e8aa037b52b4d
--- /dev/null
+++ b/lib/kokkos/example/tutorial/02_simple_reduce/CMakeLists.txt
@@ -0,0 +1,10 @@
+
+INCLUDE_DIRECTORIES(${CMAKE_CURRENT_BINARY_DIR})
+INCLUDE_DIRECTORIES(${CMAKE_CURRENT_SOURCE_DIR})
+
+# This is a tutorial, not a test, so we don't ask CTest to run it.
+TRIBITS_ADD_EXECUTABLE(
+  tutorial_02_simple_reduce
+  SOURCES simple_reduce.cpp
+  COMM serial mpi
+  )
diff --git a/lib/kokkos/example/tutorial/02_simple_reduce/Makefile b/lib/kokkos/example/tutorial/02_simple_reduce/Makefile
new file mode 100644
index 0000000000000000000000000000000000000000..78a9fed0cce641b48c85f4d67a1d0ab6c5a63388
--- /dev/null
+++ b/lib/kokkos/example/tutorial/02_simple_reduce/Makefile
@@ -0,0 +1,43 @@
+KOKKOS_PATH = ../../..
+SRC = $(wildcard *.cpp)
+
+default: build
+	echo "Start Build"
+
+ifneq (,$(findstring Cuda,$(KOKKOS_DEVICES)))
+CXX = ../../../config/nvcc_wrapper
+CXXFLAGS = -O3
+LINK = ${CXX}
+LINKFLAGS = 
+EXE = $(SRC:.cpp=.cuda)
+KOKKOS_DEVICES = "Cuda,OpenMP"
+KOKKOS_ARCH = "SNB,Kepler35"
+else
+CXX = g++
+CXXFLAGS = -O3
+LINK = ${CXX}
+LINKFLAGS =  
+EXE = $(SRC:.cpp=.host)
+KOKKOS_DEVICES = "OpenMP"
+KOKKOS_ARCH = "SNB"
+endif
+
+DEPFLAGS = -M
+
+OBJ = $(SRC:.cpp=.o)
+LIB =
+
+include $(KOKKOS_PATH)/Makefile.kokkos
+
+build: $(EXE)
+
+$(EXE): $(OBJ) $(KOKKOS_LINK_DEPENDS)
+	$(LINK) $(KOKKOS_LDFLAGS) $(LINKFLAGS) $(EXTRA_PATH) $(OBJ) $(KOKKOS_LIBS) $(LIB) -o $(EXE)
+
+clean: kokkos-clean 
+	rm -f *.o *.cuda *.host
+
+# Compilation rules
+
+%.o:%.cpp $(KOKKOS_CPP_DEPENDS)
+	$(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) $(EXTRA_INC) -c $<
diff --git a/lib/kokkos/example/tutorial/02_simple_reduce/simple_reduce.cpp b/lib/kokkos/example/tutorial/02_simple_reduce/simple_reduce.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..236618f7be9cf59044169cb82b0894e43bf8351e
--- /dev/null
+++ b/lib/kokkos/example/tutorial/02_simple_reduce/simple_reduce.cpp
@@ -0,0 +1,101 @@
+/*
+//@HEADER
+// ************************************************************************
+// 
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+// 
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+// 
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact  H. Carter Edwards (hcedwar@sandia.gov)
+// 
+// ************************************************************************
+//@HEADER
+*/
+
+#include <Kokkos_Core.hpp>
+#include <cstdio>
+
+//
+// First reduction (parallel_reduce) example:
+//   1. Start up Kokkos
+//   2. Execute a parallel_reduce loop in the default execution space,
+//      using a functor to define the loop body
+//   3. Shut down Kokkos
+//
+// Compare this example to 02_simple_reduce_lambda, which uses a C++11
+// lambda to define the loop body of the parallel_reduce.
+//
+
+// Reduction functor for computing the sum of squares.
+//
+// More advanced reduction examples will show how to control the
+// reduction's "join" operator.  If the join operator is not provided,
+// it defaults to binary operator+ (adding numbers together).
+struct squaresum {
+  // Specify the type of the reduction value with a "value_type"
+  // typedef.  In this case, the reduction value has type int.
+  typedef int value_type;
+
+  // The reduction functor's operator() looks a little different than
+  // the parallel_for functor's operator().  For the reduction, we
+  // pass in both the loop index i, and the intermediate reduction
+  // value lsum.  The latter MUST be passed in by nonconst reference.
+  // (If the reduction type is an array like int[], indicating an
+  // array reduction result, then the second argument is just int[].)
+  KOKKOS_INLINE_FUNCTION
+  void operator () (const int i, int& lsum) const {
+    lsum += i*i; // compute the sum of squares
+  }
+};
+
+int main (int argc, char* argv[]) {
+  Kokkos::initialize (argc, argv);
+  const int n = 10;
+
+  // Compute the sum of squares of integers from 0 to n-1, in
+  // parallel, using Kokkos.
+  int sum = 0;
+  Kokkos::parallel_reduce (n, squaresum (), sum);
+  printf ("Sum of squares of integers from 0 to %i, "
+          "computed in parallel, is %i\n", n - 1, sum);
+
+  // Compare to a sequential loop.
+  int seqSum = 0;
+  for (int i = 0; i < n; ++i) {
+    seqSum += i*i;
+  }
+  printf ("Sum of squares of integers from 0 to %i, "
+          "computed sequentially, is %i\n", n - 1, seqSum);
+  Kokkos::finalize ();
+  return (sum == seqSum) ? 0 : -1;
+}
+
diff --git a/lib/kokkos/example/tutorial/02_simple_reduce_lambda/CMakeLists.txt b/lib/kokkos/example/tutorial/02_simple_reduce_lambda/CMakeLists.txt
new file mode 100644
index 0000000000000000000000000000000000000000..e2e3a929f1ade97ce639670a3f28c43bb9ce084f
--- /dev/null
+++ b/lib/kokkos/example/tutorial/02_simple_reduce_lambda/CMakeLists.txt
@@ -0,0 +1,12 @@
+
+INCLUDE_DIRECTORIES(${CMAKE_CURRENT_BINARY_DIR})
+INCLUDE_DIRECTORIES(${CMAKE_CURRENT_SOURCE_DIR})
+
+IF (Kokkos_ENABLE_CXX11)
+  # This is a tutorial, not a test, so we don't ask CTest to run it.
+  TRIBITS_ADD_EXECUTABLE(
+    tutorial_02_simple_reduce_lambda
+    SOURCES simple_reduce_lambda.cpp
+    COMM serial mpi
+    )
+ENDIF ()
diff --git a/lib/kokkos/example/tutorial/02_simple_reduce_lambda/Makefile b/lib/kokkos/example/tutorial/02_simple_reduce_lambda/Makefile
new file mode 100644
index 0000000000000000000000000000000000000000..95ee2c47feacf363f99052173a28596144a75734
--- /dev/null
+++ b/lib/kokkos/example/tutorial/02_simple_reduce_lambda/Makefile
@@ -0,0 +1,44 @@
+KOKKOS_PATH = ../../..
+SRC = $(wildcard *.cpp)
+
+default: build
+	echo "Start Build"
+
+ifneq (,$(findstring Cuda,$(KOKKOS_DEVICES)))
+CXX = ../../../config/nvcc_wrapper
+CXXFLAGS = -O3
+LINK = ${CXX}
+LINKFLAGS = 
+EXE = $(SRC:.cpp=.cuda)
+KOKKOS_DEVICES = "Cuda,OpenMP"
+KOKKOS_ARCH = "SNB,Kepler35"
+KOKKOS_CUDA_OPTIONS = "enable_lambda"
+else
+CXX = g++
+CXXFLAGS = -O3
+LINK = ${CXX}
+LINKFLAGS =  
+EXE = $(SRC:.cpp=.host)
+KOKKOS_DEVICES = "OpenMP"
+KOKKOS_ARCH = "SNB"
+endif
+
+DEPFLAGS = -M
+
+OBJ = $(SRC:.cpp=.o)
+LIB =
+
+include $(KOKKOS_PATH)/Makefile.kokkos
+
+build: $(EXE)
+
+$(EXE): $(OBJ) $(KOKKOS_LINK_DEPENDS)
+	$(LINK) $(KOKKOS_LDFLAGS) $(LINKFLAGS) $(EXTRA_PATH) $(OBJ) $(KOKKOS_LIBS) $(LIB) -o $(EXE)
+
+clean: kokkos-clean 
+	rm -f *.o *.cuda *.host
+
+# Compilation rules
+
+%.o:%.cpp $(KOKKOS_CPP_DEPENDS)
+	$(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) $(EXTRA_INC) -c $<
diff --git a/lib/kokkos/example/tutorial/02_simple_reduce_lambda/simple_reduce_lambda.cpp b/lib/kokkos/example/tutorial/02_simple_reduce_lambda/simple_reduce_lambda.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..a403633a8a898375f2f5c0d4015fc3930570ef0d
--- /dev/null
+++ b/lib/kokkos/example/tutorial/02_simple_reduce_lambda/simple_reduce_lambda.cpp
@@ -0,0 +1,86 @@
+/*
+//@HEADER
+// ************************************************************************
+// 
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+// 
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+// 
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact  H. Carter Edwards (hcedwar@sandia.gov)
+// 
+// ************************************************************************
+//@HEADER
+*/
+
+#include <Kokkos_Core.hpp>
+#include <cstdio>
+
+//
+// First reduction (parallel_reduce) example:
+//   1. Start up Kokkos
+//   2. Execute a parallel_reduce loop in the default execution space,
+//      using a C++11 lambda to define the loop body
+//   3. Shut down Kokkos
+//
+// This example only builds if C++11 is enabled.  Compare this example
+// to 02_simple_reduce, which uses a functor to define the loop body
+// of the parallel_reduce.
+//
+
+int main (int argc, char* argv[]) {
+  Kokkos::initialize (argc, argv);
+  const int n = 10;
+
+  // Compute the sum of squares of integers from 0 to n-1, in
+  // parallel, using Kokkos.  This time, use a lambda instead of a
+  // functor.  The lambda takes the same arguments as the functor's
+  // operator().
+  int sum = 0;
+  // The KOKKOS_LAMBDA macro replaces the capture-by-value clause [=].
+  // It also handles any other syntax needed for CUDA.
+  Kokkos::parallel_reduce (n, KOKKOS_LAMBDA (const int i, int& lsum) {
+      lsum += i*i;
+    }, sum);
+  printf ("Sum of squares of integers from 0 to %i, "
+          "computed in parallel, is %i\n", n - 1, sum);
+
+  // Compare to a sequential loop.
+  int seqSum = 0;
+  for (int i = 0; i < n; ++i) {
+    seqSum += i*i;
+  }
+  printf ("Sum of squares of integers from 0 to %i, "
+          "computed sequentially, is %i\n", n - 1, seqSum);
+  Kokkos::finalize ();
+  return (sum == seqSum) ? 0 : -1;
+}
+
diff --git a/lib/kokkos/example/tutorial/03_simple_view/CMakeLists.txt b/lib/kokkos/example/tutorial/03_simple_view/CMakeLists.txt
new file mode 100644
index 0000000000000000000000000000000000000000..7475a99e492bcf88c6a3ca9b98cc698fa9a38b3d
--- /dev/null
+++ b/lib/kokkos/example/tutorial/03_simple_view/CMakeLists.txt
@@ -0,0 +1,10 @@
+
+INCLUDE_DIRECTORIES(${CMAKE_CURRENT_BINARY_DIR})
+INCLUDE_DIRECTORIES(${CMAKE_CURRENT_SOURCE_DIR})
+
+# This is a tutorial, not a test, so we don't ask CTest to run it.
+TRIBITS_ADD_EXECUTABLE(
+  tutorial_03_simple_view
+  SOURCES simple_view.cpp
+  COMM serial mpi
+  )
diff --git a/lib/kokkos/example/tutorial/03_simple_view/Makefile b/lib/kokkos/example/tutorial/03_simple_view/Makefile
new file mode 100644
index 0000000000000000000000000000000000000000..78a9fed0cce641b48c85f4d67a1d0ab6c5a63388
--- /dev/null
+++ b/lib/kokkos/example/tutorial/03_simple_view/Makefile
@@ -0,0 +1,43 @@
+KOKKOS_PATH = ../../..
+SRC = $(wildcard *.cpp)
+
+default: build
+	echo "Start Build"
+
+ifneq (,$(findstring Cuda,$(KOKKOS_DEVICES)))
+CXX = ../../../config/nvcc_wrapper
+CXXFLAGS = -O3
+LINK = ${CXX}
+LINKFLAGS = 
+EXE = $(SRC:.cpp=.cuda)
+KOKKOS_DEVICES = "Cuda,OpenMP"
+KOKKOS_ARCH = "SNB,Kepler35"
+else
+CXX = g++
+CXXFLAGS = -O3
+LINK = ${CXX}
+LINKFLAGS =  
+EXE = $(SRC:.cpp=.host)
+KOKKOS_DEVICES = "OpenMP"
+KOKKOS_ARCH = "SNB"
+endif
+
+DEPFLAGS = -M
+
+OBJ = $(SRC:.cpp=.o)
+LIB =
+
+include $(KOKKOS_PATH)/Makefile.kokkos
+
+build: $(EXE)
+
+$(EXE): $(OBJ) $(KOKKOS_LINK_DEPENDS)
+	$(LINK) $(KOKKOS_LDFLAGS) $(LINKFLAGS) $(EXTRA_PATH) $(OBJ) $(KOKKOS_LIBS) $(LIB) -o $(EXE)
+
+clean: kokkos-clean 
+	rm -f *.o *.cuda *.host
+
+# Compilation rules
+
+%.o:%.cpp $(KOKKOS_CPP_DEPENDS)
+	$(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) $(EXTRA_INC) -c $<
diff --git a/lib/kokkos/example/tutorial/03_simple_view/simple_view.cpp b/lib/kokkos/example/tutorial/03_simple_view/simple_view.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..dbbb9d867dc626ae251efe3fa3f5be27b2b8dfcf
--- /dev/null
+++ b/lib/kokkos/example/tutorial/03_simple_view/simple_view.cpp
@@ -0,0 +1,142 @@
+/*
+//@HEADER
+// ************************************************************************
+// 
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+// 
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+// 
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact  H. Carter Edwards (hcedwar@sandia.gov)
+// 
+// ************************************************************************
+//@HEADER
+*/
+
+//
+// First Kokkos::View (multidimensional array) example:
+//   1. Start up Kokkos
+//   2. Allocate a Kokkos::View
+//   3. Execute a parallel_for and a parallel_reduce over that View's data
+//   4. Shut down Kokkos
+//
+// Compare this example to 03_simple_view_lambda, which uses C++11
+// lambdas to define the loop bodies of the parallel_for and
+// parallel_reduce.
+//
+
+#include <Kokkos_Core.hpp>
+#include <cstdio>
+
+// A Kokkos::View is an array of zero or more dimensions.  The number
+// of dimensions is specified at compile time, as part of the type of
+// the View.  This array has two dimensions.  The first one
+// (represented by the asterisk) is a run-time dimension, and the
+// second (represented by [3]) is a compile-time dimension.  Thus,
+// this View type is an N x 3 array of type double, where N is
+// specified at run time in the View's constructor.
+//
+// The first dimension of the View is the dimension over which it is
+// efficient for Kokkos to parallelize.
+typedef Kokkos::View<double*[3]> view_type;
+
+// parallel_for functor that fills the View given to its constructor.
+// The View must already have been allocated.
+struct InitView {
+  view_type a;
+
+  // Views have "view semantics."  This means that they behave like
+  // pointers, not like std::vector.  Their copy constructor and
+  // operator= only do shallow copies.  Thus, you can pass View
+  // objects around by "value"; they won't do a deep copy unless you
+  // explicitly ask for a deep copy.
+  InitView (view_type a_) :
+    a (a_)
+  {}
+
+  // Fill the View with some data.  The parallel_for loop will iterate
+  // over the View's first dimension N.
+  KOKKOS_INLINE_FUNCTION
+  void operator () (const int i) const {
+    // Access the View just like a Fortran array.  The layout depends
+    // on the View's memory space, so don't rely on the View's
+    // physical memory layout unless you know what you're doing.
+    a(i,0) = 1.0*i;
+    a(i,1) = 1.0*i*i;
+    a(i,2) = 1.0*i*i*i;
+  }
+};
+
+// Reduction functor that reads the View given to its constructor.
+struct ReduceFunctor {
+  view_type a;
+
+  // Constructor takes View by "value"; this does a shallow copy.
+  ReduceFunctor (view_type a_) : a (a_) {}
+
+  // If you write a functor to do a reduction, you must specify the
+  // type of the reduction result via a public 'value_type' typedef.
+  typedef double value_type;
+
+  KOKKOS_INLINE_FUNCTION
+  void operator() (int i, double &lsum) const {
+    lsum += a(i,0)*a(i,1)/(a(i,2)+0.1);
+  }
+};
+
+int main (int argc, char* argv[]) {
+  Kokkos::initialize (argc, argv);
+  const int N = 10;
+
+  // Allocate the View.  The first dimension is a run-time parameter
+  // N.  We set N = 10 here.  The second dimension is a compile-time
+  // parameter, 3.  We don't specify it here because we already set it
+  // by declaring the type of the View.
+  //
+  // Views get initialized to zero by default.  This happens in
+  // parallel, using the View's memory space's default execution
+  // space.  Parallel initialization ensures first-touch allocation.
+  // There is a way to shut off default initialization.
+  //
+  // You may NOT allocate a View inside of a parallel_{for, reduce,
+  // scan}.  Treat View allocation as a "thread collective."
+  //
+  // The string "A" is just the label; it only matters for debugging.
+  // Different Views may have the same label.
+  view_type a ("A", N);
+
+  Kokkos::parallel_for (N, InitView (a));
+  double sum = 0;
+  Kokkos::parallel_reduce (N, ReduceFunctor (a), sum);
+  printf ("Result: %f\n", sum);
+  Kokkos::finalize ();
+}
+
diff --git a/lib/kokkos/example/tutorial/03_simple_view_lambda/CMakeLists.txt b/lib/kokkos/example/tutorial/03_simple_view_lambda/CMakeLists.txt
new file mode 100644
index 0000000000000000000000000000000000000000..601fe452a4c90a1506aa012a6a99a617fbc1d9af
--- /dev/null
+++ b/lib/kokkos/example/tutorial/03_simple_view_lambda/CMakeLists.txt
@@ -0,0 +1,12 @@
+# Tutorial 03_simple_view_lambda: put this directory's build and source trees on the include path.
+INCLUDE_DIRECTORIES(${CMAKE_CURRENT_BINARY_DIR})
+INCLUDE_DIRECTORIES(${CMAKE_CURRENT_SOURCE_DIR})
+# The example is only configured when Kokkos_ENABLE_CXX11 is set (presumably because it uses lambdas -- confirm).
+IF (Kokkos_ENABLE_CXX11)
+  # This is a tutorial, not a test, so we don't ask CTest to run it.
+  TRIBITS_ADD_EXECUTABLE(
+    tutorial_03_simple_view_lambda
+    SOURCES simple_view_lambda.cpp
+    COMM serial mpi
+    )
+ENDIF ()
diff --git a/lib/kokkos/example/tutorial/03_simple_view_lambda/Makefile b/lib/kokkos/example/tutorial/03_simple_view_lambda/Makefile
new file mode 100644
index 0000000000000000000000000000000000000000..95ee2c47feacf363f99052173a28596144a75734
--- /dev/null
+++ b/lib/kokkos/example/tutorial/03_simple_view_lambda/Makefile
@@ -0,0 +1,44 @@
+KOKKOS_PATH = ../../..
+SRC = $(wildcard *.cpp)
+# Default goal: 'build' is brought up to date first, then the echo recipe below runs.
+default: build
+	echo "Start Build"
+# Toolchain selection.  KOKKOS_DEVICES has not been assigned by this file at this point, so the Cuda branch is taken only when the caller already set it to a value containing "Cuda".
+ifneq (,$(findstring Cuda,$(KOKKOS_DEVICES)))
+CXX = ../../../config/nvcc_wrapper
+CXXFLAGS = -O3
+LINK = ${CXX}
+LINKFLAGS = 
+EXE = $(SRC:.cpp=.cuda)
+KOKKOS_DEVICES = "Cuda,OpenMP"
+KOKKOS_ARCH = "SNB,Kepler35"
+KOKKOS_CUDA_OPTIONS = "enable_lambda"
+else
+CXX = g++
+CXXFLAGS = -O3
+LINK = ${CXX}
+LINKFLAGS =  
+EXE = $(SRC:.cpp=.host)
+KOKKOS_DEVICES = "OpenMP"
+KOKKOS_ARCH = "SNB"
+endif
+
+DEPFLAGS = -M
+
+OBJ = $(SRC:.cpp=.o)
+LIB =
+# The variables above are set before this include; Makefile.kokkos presumably consumes them and defines the KOKKOS_* variables and kokkos-clean used below -- confirm there.
+include $(KOKKOS_PATH)/Makefile.kokkos
+
+build: $(EXE)
+
+$(EXE): $(OBJ) $(KOKKOS_LINK_DEPENDS)
+	$(LINK) $(KOKKOS_LDFLAGS) $(LINKFLAGS) $(EXTRA_PATH) $(OBJ) $(KOKKOS_LIBS) $(LIB) -o $(EXE)
+
+clean: kokkos-clean 
+	rm -f *.o *.cuda *.host
+
+# Compilation rules
+
+%.o:%.cpp $(KOKKOS_CPP_DEPENDS)
+	$(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) $(EXTRA_INC) -c $<
diff --git a/lib/kokkos/example/tutorial/03_simple_view_lambda/simple_view_lambda.cpp b/lib/kokkos/example/tutorial/03_simple_view_lambda/simple_view_lambda.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..974af747763bfba23a2f6d3dfeefe68fb9ec4e25
--- /dev/null
+++ b/lib/kokkos/example/tutorial/03_simple_view_lambda/simple_view_lambda.cpp
@@ -0,0 +1,116 @@
+/*
+//@HEADER
+// ************************************************************************
+// 
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+// 
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+// 
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact  H. Carter Edwards (hcedwar@sandia.gov)
+// 
+// ************************************************************************
+//@HEADER
+*/
+
+//
+// First Kokkos::View (multidimensional array) example:
+//   1. Start up Kokkos
+//   2. Allocate a Kokkos::View
+//   3. Execute a parallel_for and a parallel_reduce over that View's data
+//   4. Shut down Kokkos
+//
+// Compare this example to 03_simple_view, which uses functors to
+// define the loop bodies of the parallel_for and parallel_reduce.
+//
+
+#include <Kokkos_Core.hpp>
+#include <cstdio>
+
+// A Kokkos::View is an array of zero or more dimensions.  The number
+// of dimensions is specified at compile time, as part of the type of
+// the View.  This array has two dimensions.  The first one
+// (represented by the asterisk) is a run-time dimension, and the
+// second (represented by [3]) is a compile-time dimension.  Thus,
+// this View type is an N x 3 array of type double, where N is
+// specified at run time in the View's constructor.
+//
+// The first dimension of the View is the dimension over which it is
+// efficient for Kokkos to parallelize.
+typedef Kokkos::View<double*[3]> view_type;
+
+int main (int argc, char* argv[]) {
+  Kokkos::initialize (argc, argv);
+  const int N = 10;
+
+  // Allocate an N x 3 View.  Only the run-time extent N (10 here) is
+  // passed to the constructor; the compile-time extent 3 is already
+  // part of view_type.
+  //
+  // By default a View's entries are zero-initialized.  That happens
+  // in parallel, on the default execution space of the View's memory
+  // space, which also gives first-touch allocation.  Default
+  // initialization can be turned off.
+  //
+  // Never allocate a View inside a parallel_for, parallel_reduce, or
+  // parallel_scan.  Think of View allocation as a "thread collective."
+  //
+  // "A" is the View's label.  It only matters for debugging, and
+  // several Views may share the same label.
+  view_type a ("A", N);
+
+  // Fill the View with some data.  The parallel_for iterates over
+  // the View's first (run-time) dimension, i = 0, ..., N-1.
+  //
+  // The lambda receives the View by value: KOKKOS_LAMBDA includes
+  // the capture-by-value clause [=], which tells the lambda to
+  // capture the variables of the enclosing scope by value.  That
+  // is cheap, because Views have "view semantics": they behave
+  // like pointers, not like std::vector, so passing one by value is
+  // only a shallow copy.  A deep copy never happens unless you
+  // explicitly ask for one.
+  Kokkos::parallel_for (N, KOKKOS_LAMBDA (const int i) {
+    // Access the View just like a Fortran array.  The layout depends
+    // on the View's memory space, so don't rely on the View's
+    // physical memory layout unless you know what you're doing.
+    a(i,0) = 1.0*i;
+    a(i,1) = 1.0*i*i;
+    a(i,2) = 1.0*i*i*i;
+  });
+  // Reduce over the View's rows; the lambda adds each row's term to its partial sum.
+  double sum = 0;
+  Kokkos::parallel_reduce (N, KOKKOS_LAMBDA (const int i, double& partial) {
+    partial += a(i,0)*a(i,1)/(a(i,2)+0.1);
+  }, sum);
+  printf ("Result: %f\n", sum);
+  Kokkos::finalize ();
+}
+
diff --git a/lib/kokkos/example/tutorial/04_simple_memoryspaces/CMakeLists.txt b/lib/kokkos/example/tutorial/04_simple_memoryspaces/CMakeLists.txt
new file mode 100644
index 0000000000000000000000000000000000000000..09f209077a08d64c86454a59875ecda8d329e2f7
--- /dev/null
+++ b/lib/kokkos/example/tutorial/04_simple_memoryspaces/CMakeLists.txt
@@ -0,0 +1,10 @@
+# Tutorial 04_simple_memoryspaces: put this directory's build and source trees on the include path.
+INCLUDE_DIRECTORIES(${CMAKE_CURRENT_BINARY_DIR})
+INCLUDE_DIRECTORIES(${CMAKE_CURRENT_SOURCE_DIR})
+
+# This is a tutorial, not a test, so we don't ask CTest to run it.
+TRIBITS_ADD_EXECUTABLE(
+  tutorial_04_simple_memoryspaces
+  SOURCES simple_memoryspaces.cpp
+  COMM serial mpi
+  )
diff --git a/lib/kokkos/example/tutorial/04_simple_memoryspaces/Makefile b/lib/kokkos/example/tutorial/04_simple_memoryspaces/Makefile
new file mode 100644
index 0000000000000000000000000000000000000000..78a9fed0cce641b48c85f4d67a1d0ab6c5a63388
--- /dev/null
+++ b/lib/kokkos/example/tutorial/04_simple_memoryspaces/Makefile
@@ -0,0 +1,43 @@
+KOKKOS_PATH = ../../..
+SRC = $(wildcard *.cpp)
+# Default goal: 'build' is brought up to date first, then the echo recipe below runs.
+default: build
+	echo "Start Build"
+# Toolchain selection.  KOKKOS_DEVICES has not been assigned by this file at this point, so the Cuda branch is taken only when the caller already set it to a value containing "Cuda".
+ifneq (,$(findstring Cuda,$(KOKKOS_DEVICES)))
+CXX = ../../../config/nvcc_wrapper
+CXXFLAGS = -O3
+LINK = ${CXX}
+LINKFLAGS = 
+EXE = $(SRC:.cpp=.cuda)
+KOKKOS_DEVICES = "Cuda,OpenMP"
+KOKKOS_ARCH = "SNB,Kepler35"
+else
+CXX = g++
+CXXFLAGS = -O3
+LINK = ${CXX}
+LINKFLAGS =  
+EXE = $(SRC:.cpp=.host)
+KOKKOS_DEVICES = "OpenMP"
+KOKKOS_ARCH = "SNB"
+endif
+
+DEPFLAGS = -M
+
+OBJ = $(SRC:.cpp=.o)
+LIB =
+# The variables above are set before this include; Makefile.kokkos presumably consumes them and defines the KOKKOS_* variables and kokkos-clean used below -- confirm there.
+include $(KOKKOS_PATH)/Makefile.kokkos
+
+build: $(EXE)
+
+$(EXE): $(OBJ) $(KOKKOS_LINK_DEPENDS)
+	$(LINK) $(KOKKOS_LDFLAGS) $(LINKFLAGS) $(EXTRA_PATH) $(OBJ) $(KOKKOS_LIBS) $(LIB) -o $(EXE)
+
+clean: kokkos-clean 
+	rm -f *.o *.cuda *.host
+
+# Compilation rules
+
+%.o:%.cpp $(KOKKOS_CPP_DEPENDS)
+	$(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) $(EXTRA_INC) -c $<
diff --git a/lib/kokkos/example/tutorial/04_simple_memoryspaces/simple_memoryspaces.cpp b/lib/kokkos/example/tutorial/04_simple_memoryspaces/simple_memoryspaces.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..c6a4bebfa525c6114681122af59e0dfaf1a71c70
--- /dev/null
+++ b/lib/kokkos/example/tutorial/04_simple_memoryspaces/simple_memoryspaces.cpp
@@ -0,0 +1,101 @@
+/*
+//@HEADER
+// ************************************************************************
+// 
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+// 
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+// 
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact  H. Carter Edwards (hcedwar@sandia.gov)
+// 
+// ************************************************************************
+//@HEADER
+*/
+
+#include <Kokkos_Core.hpp>
+#include <cstdio>
+
+// The type of a two-dimensional N x 3 array of double.
+// It lives in Kokkos' default memory space.
+typedef Kokkos::View<double*[3]> view_type;
+
+// The "HostMirror" type corresponding to view_type above is also a
+// two-dimensional N x 3 array of double.  However, it lives in the
+// host memory space corresponding to view_type's memory space.  For
+// example, if view_type lives in CUDA device memory, host_view_type
+// lives in host (CPU) memory.  Furthermore, declaring host_view_type
+// as the host mirror of view_type means that host_view_type has the
+// same layout as view_type.  This makes it easier to copy between the
+// two Views.
+// Advanced issues: If a memory space is accessible from the host without
+// performance penalties then it is its own host_mirror_space. This is
+// the case for HostSpace, CudaUVMSpace and CudaHostPinnedSpace.
+
+typedef view_type::HostMirror host_view_type;
+
+struct ReduceFunctor {
+  typedef int value_type; // type of the reduction result (second argument of operator())
+  view_type a;
+  ReduceFunctor (view_type view) : a (view) {}
+  KOKKOS_INLINE_FUNCTION
+  void operator() (int i, int &partial) const {
+    const double row_term = a(i,0)-a(i,1)+a(i,2);
+    partial += row_term;
+  }
+};
+
+int main() {
+  Kokkos::initialize();
+
+  view_type a ("A", 10);
+  // When view_type and host_view_type live in the same memory space,
+  // the "mirror view" is just an alias of 'a' and deep_copy does
+  // nothing.  Otherwise the mirror view of a device View lives in
+  // host memory, and deep_copy really copies the data.
+  host_view_type h_a = Kokkos::create_mirror_view (a);
+
+  // h_a lives in host (CPU) memory, so ordinary sequential code may
+  // fill it directly.
+  for (int row = 0; row < 10; ++row) {
+    const int row_base = row*10;
+    for (int col = 0; col < 3; ++col) {
+      h_a(row,col) = row_base + col;
+    }
+  }
+  Kokkos::deep_copy (a, h_a); // Copy from host to device.
+
+  int sum = 0;
+  Kokkos::parallel_reduce (10, ReduceFunctor (a), sum);
+  printf ("Result is %i\n",sum);
+  Kokkos::finalize ();
+}
+
diff --git a/lib/kokkos/example/tutorial/05_simple_atomics/CMakeLists.txt b/lib/kokkos/example/tutorial/05_simple_atomics/CMakeLists.txt
new file mode 100644
index 0000000000000000000000000000000000000000..5a5790fb0488be791112c3ef0c38655e6da78724
--- /dev/null
+++ b/lib/kokkos/example/tutorial/05_simple_atomics/CMakeLists.txt
@@ -0,0 +1,10 @@
+# Tutorial 05_simple_atomics: put this directory's build and source trees on the include path.
+INCLUDE_DIRECTORIES(${CMAKE_CURRENT_BINARY_DIR})
+INCLUDE_DIRECTORIES(${CMAKE_CURRENT_SOURCE_DIR})
+
+# This is a tutorial, not a test, so we don't ask CTest to run it.
+TRIBITS_ADD_EXECUTABLE(
+  tutorial_05_simple_atomics
+  SOURCES simple_atomics.cpp
+  COMM serial mpi
+  )
diff --git a/lib/kokkos/example/tutorial/05_simple_atomics/Makefile b/lib/kokkos/example/tutorial/05_simple_atomics/Makefile
new file mode 100644
index 0000000000000000000000000000000000000000..78a9fed0cce641b48c85f4d67a1d0ab6c5a63388
--- /dev/null
+++ b/lib/kokkos/example/tutorial/05_simple_atomics/Makefile
@@ -0,0 +1,43 @@
+KOKKOS_PATH = ../../..
+SRC = $(wildcard *.cpp)
+# Default goal: 'build' is brought up to date first, then the echo recipe below runs.
+default: build
+	echo "Start Build"
+# Toolchain selection.  KOKKOS_DEVICES has not been assigned by this file at this point, so the Cuda branch is taken only when the caller already set it to a value containing "Cuda".
+ifneq (,$(findstring Cuda,$(KOKKOS_DEVICES)))
+CXX = ../../../config/nvcc_wrapper
+CXXFLAGS = -O3
+LINK = ${CXX}
+LINKFLAGS = 
+EXE = $(SRC:.cpp=.cuda)
+KOKKOS_DEVICES = "Cuda,OpenMP"
+KOKKOS_ARCH = "SNB,Kepler35"
+else
+CXX = g++
+CXXFLAGS = -O3
+LINK = ${CXX}
+LINKFLAGS =  
+EXE = $(SRC:.cpp=.host)
+KOKKOS_DEVICES = "OpenMP"
+KOKKOS_ARCH = "SNB"
+endif
+
+DEPFLAGS = -M
+
+OBJ = $(SRC:.cpp=.o)
+LIB =
+# The variables above are set before this include; Makefile.kokkos presumably consumes them and defines the KOKKOS_* variables and kokkos-clean used below -- confirm there.
+include $(KOKKOS_PATH)/Makefile.kokkos
+
+build: $(EXE)
+
+$(EXE): $(OBJ) $(KOKKOS_LINK_DEPENDS)
+	$(LINK) $(KOKKOS_LDFLAGS) $(LINKFLAGS) $(EXTRA_PATH) $(OBJ) $(KOKKOS_LIBS) $(LIB) -o $(EXE)
+
+clean: kokkos-clean 
+	rm -f *.o *.cuda *.host
+
+# Compilation rules
+
+%.o:%.cpp $(KOKKOS_CPP_DEPENDS)
+	$(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) $(EXTRA_INC) -c $<
diff --git a/lib/kokkos/example/tutorial/05_simple_atomics/simple_atomics.cpp b/lib/kokkos/example/tutorial/05_simple_atomics/simple_atomics.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..de9c219d5b0b9576b042caaefcc2f7b149901647
--- /dev/null
+++ b/lib/kokkos/example/tutorial/05_simple_atomics/simple_atomics.cpp
@@ -0,0 +1,137 @@
+/*
+//@HEADER
+// ************************************************************************
+// 
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+// 
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+// 
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact  H. Carter Edwards (hcedwar@sandia.gov)
+// 
+// ************************************************************************
+//@HEADER
+*/
+
+#include <Kokkos_Core.hpp>
+#include <cstdio>
+#include <cstdlib>
+#include <cmath>
+
+// Type of a one-dimensional length-N array of int.
+typedef Kokkos::View<int*> view_type;
+typedef view_type::HostMirror host_view_type;
+// This is a "zero-dimensional" View, that is, a View of a single
+// value (an int, in this case).  Access the value using operator()
+// with no arguments: e.g., 'count()'.
+//
+// Zero-dimensional Views are useful for reduction results that stay
+// resident in device memory, as well as for irregularly updated
+// shared state.  We use it for the latter in this example.
+typedef Kokkos::View<int> count_type;
+typedef count_type::HostMirror host_count_type;
+
+
+// Functor for finding a list of primes in a given set of numbers.  If
+// run in parallel, the order of results is nondeterministic, because
+// hardware atomic updates do not guarantee an order of execution.
+struct findprimes {
+  view_type data;
+  view_type result;
+  count_type count;
+
+  findprimes (view_type data_, view_type result_, count_type count_) :
+    data (data_), result (result_), count (count_)
+  {}
+
+  // Test if data(i) is prime.  If it is, increment the count of
+  // primes (stored in the zero-dimensional View 'count') and add the
+  // value to the current list of primes 'result'.
+  KOKKOS_INLINE_FUNCTION
+  void operator() (const int i) const {
+    const int number = data(i); // the current number
+
+    // 0 and 1 are not prime and 2 is the only even prime.  Any other
+    // candidate is trial-divided by the odd numbers from 3 up to
+    // about sqrt(number).  Not the most efficient test, but it works.
+    const int upper_bound = sqrt(1.0*number)+1;
+    bool is_prime = (number == 2) || (number > 2 && number%2 != 0);
+    int k = 3;
+    while (k < upper_bound && is_prime) {
+      is_prime = !(number%k == 0);
+      k += 2; // don't have to test even numbers
+    }
+
+    if (is_prime) {
+      // Use an atomic update both to update the current count of
+      // primes, and to find a place in the current list of primes for
+      // the new result.
+      //
+      // atomic_fetch_add returns the _current_ count, but increments
+      // it (by 1 in this case).  The current count of primes indexes
+      // into the first unoccupied position of the 'result' array.
+      const int idx = Kokkos::atomic_fetch_add (&count(), 1);
+      result(idx) = number;
+    }
+  }
+
+};
+
+int main () {
+  Kokkos::initialize ();
+
+  srand (61391); // Set the random seed
+
+  int nnumbers = 100000;
+  view_type data ("RND", nnumbers);
+  view_type result ("Prime", nnumbers);
+  count_type count ("Count");
+
+  host_view_type h_data = Kokkos::create_mirror_view (data);
+  host_view_type h_result = Kokkos::create_mirror_view (result);
+  host_count_type h_count = Kokkos::create_mirror_view (count);
+
+  typedef view_type::size_type size_type;
+  const size_type num_entries = data.dimension_0 ();
+  // Fill 'data' on the host with random numbers.  The assumption is
+  // that they come from a host-only process in some library, which
+  // is true in this case.
+  for (size_type i = 0; i != num_entries; ++i) {
+    h_data(i) = rand () % nnumbers;
+  }
+  Kokkos::deep_copy (data, h_data); // copy from host to device
+
+  Kokkos::parallel_for (num_entries, findprimes (data, result, count));
+  Kokkos::deep_copy (h_count, count); // copy from device to host
+  printf ("Found %i prime numbers in %i random numbers\n", h_count(), nnumbers);
+  Kokkos::finalize ();
+}
+
diff --git a/lib/kokkos/example/tutorial/Advanced_Views/01_data_layouts/CMakeLists.txt b/lib/kokkos/example/tutorial/Advanced_Views/01_data_layouts/CMakeLists.txt
new file mode 100644
index 0000000000000000000000000000000000000000..2eb3a8f6c98d69c394f83591e59aa7073f1e59e2
--- /dev/null
+++ b/lib/kokkos/example/tutorial/Advanced_Views/01_data_layouts/CMakeLists.txt
@@ -0,0 +1,10 @@
+# Tutorial Advanced_Views/01_data_layouts: put this directory's build and source trees on the include path.
+INCLUDE_DIRECTORIES(${CMAKE_CURRENT_BINARY_DIR})
+INCLUDE_DIRECTORIES(${CMAKE_CURRENT_SOURCE_DIR})
+
+# This is a tutorial, not a test, so we don't ask CTest to run it.
+TRIBITS_ADD_EXECUTABLE(
+  tutorial_advancedviews_01_data_layouts
+  SOURCES data_layouts.cpp
+  COMM serial mpi
+  )
diff --git a/lib/kokkos/example/tutorial/Advanced_Views/01_data_layouts/Makefile b/lib/kokkos/example/tutorial/Advanced_Views/01_data_layouts/Makefile
new file mode 100644
index 0000000000000000000000000000000000000000..12ad36b31e458d155aa6dc653ab8188a7773bd18
--- /dev/null
+++ b/lib/kokkos/example/tutorial/Advanced_Views/01_data_layouts/Makefile
@@ -0,0 +1,43 @@
+KOKKOS_PATH = ../../../..
+SRC = $(wildcard *.cpp)
+# Default goal: 'build' is brought up to date first, then the echo recipe below runs.
+default: build
+	echo "Start Build"
+# Toolchain selection.  KOKKOS_DEVICES has not been assigned by this file at this point, so the Cuda branch is taken only when the caller already set it to a value containing "Cuda".
+ifneq (,$(findstring Cuda,$(KOKKOS_DEVICES)))
+CXX = ../../../../config/nvcc_wrapper
+CXXFLAGS = -O3
+LINK = ${CXX}
+LINKFLAGS = 
+EXE = $(SRC:.cpp=.cuda)
+KOKKOS_DEVICES = "Cuda,OpenMP"
+KOKKOS_ARCH = "SNB,Kepler35"
+else
+CXX = g++
+CXXFLAGS = -O3
+LINK = ${CXX}
+LINKFLAGS =  
+EXE = $(SRC:.cpp=.host)
+KOKKOS_DEVICES = "OpenMP"
+KOKKOS_ARCH = "SNB"
+endif
+
+DEPFLAGS = -M
+
+OBJ = $(SRC:.cpp=.o)
+LIB =
+# The variables above are set before this include; Makefile.kokkos presumably consumes them and defines the KOKKOS_* variables and kokkos-clean used below -- confirm there.
+include $(KOKKOS_PATH)/Makefile.kokkos
+
+build: $(EXE)
+
+$(EXE): $(OBJ) $(KOKKOS_LINK_DEPENDS)
+	$(LINK) $(KOKKOS_LDFLAGS) $(LINKFLAGS) $(EXTRA_PATH) $(OBJ) $(KOKKOS_LIBS) $(LIB) -o $(EXE)
+
+clean: kokkos-clean 
+	rm -f *.o *.cuda *.host
+
+# Compilation rules
+
+%.o:%.cpp $(KOKKOS_CPP_DEPENDS)
+	$(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) $(EXTRA_INC) -c $<
diff --git a/lib/kokkos/example/tutorial/Advanced_Views/01_data_layouts/data_layouts.cpp b/lib/kokkos/example/tutorial/Advanced_Views/01_data_layouts/data_layouts.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..8406c504c9678cbe86dd8bd84b0f9e00391e8737
--- /dev/null
+++ b/lib/kokkos/example/tutorial/Advanced_Views/01_data_layouts/data_layouts.cpp
@@ -0,0 +1,171 @@
+/*
+//@HEADER
+// ************************************************************************
+// 
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+// 
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+// 
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact  H. Carter Edwards (hcedwar@sandia.gov)
+// 
+// ************************************************************************
+//@HEADER
+*/
+
+#include <Kokkos_Core.hpp>
+#include <impl/Kokkos_Timer.hpp>
+#include <cstdio>
+
+// These two View types are both 2-D arrays of double.  However, they
+// have different layouts in memory.  left_type has "layout left,"
+// which means "column major," the same as in Fortran, the BLAS, or
+// LAPACK.  right_type has "layout right," which means "row major,"
+// the same as in C, C++, or Java.
+typedef Kokkos::View<double**, Kokkos::LayoutLeft> left_type;
+typedef Kokkos::View<double**, Kokkos::LayoutRight> right_type;
+// This is a one-dimensional View, so the layout matters less.
+// However, it still has a layout!  Since its layout is not specified
+// explicitly in the type, its layout is a function of the memory
+// space.  For example, the default Cuda layout is LayoutLeft, and the
+// default Host layout is LayoutRight.
+typedef Kokkos::View<double*> view_type;
+
+// parallel_for functor that fills the given View with some data.  It
+// expects to access the View by rows in parallel: each call i of
+// operator() accesses a row.
+template<class ViewType>
+struct init_view {
+  ViewType a;
+  init_view (ViewType view) : a (view) {}
+
+  KOKKOS_INLINE_FUNCTION
+  void operator() (const typename ViewType::size_type i) const {
+    // CPUs can vectorize this loop when j is the stride-1 index of a
+    // (i.e. a is LayoutRight).  GPUs want coalesced loads and stores
+    // across threads, which means i should be the stride-1 index.
+    const double row_base = 1.0*a.dimension_0()*i;
+    for (typename ViewType::size_type j = 0; j < a.dimension_1 (); ++j) {
+      a(i,j) = row_base + 1.0*j;
+    }
+  }
+};
+
+// Compute a contraction of v1 and v2 into a:
+//
+//   a(i) := sum_j (v1(i,j) * v2(j,i))
+//
+// Since the functor is templated on the View types themselves, it doesn't matter
+// what their layouts are. That means you can use different layouts on different
+// architectures.
+template<class ViewType1, class ViewType2>
+struct contraction {
+  view_type a;
+  typename ViewType1::const_type v1;
+  typename ViewType2::const_type v2;
+  contraction (view_type a_, ViewType1 v1_, ViewType2 v2_) :
+    a (a_), v1 (v1_), v2 (v2_) {}
+
+  // As with the initialization functor, performance depends on the
+  // architecture and the layouts.  CPUs want the inner loop vectorized,
+  // so j should be the stride-1 index: v1 LayoutRight, v2 LayoutLeft.
+  // GPUs want coalesced access with i (close to the thread index) as
+  // the stride-1 index: v1 LayoutLeft, v2 LayoutRight.  The products are
+  // summed in a local so that a(i) ends up holding the sum over all j.
+  KOKKOS_INLINE_FUNCTION
+  void operator() (const view_type::size_type i) const {
+    double sum = 0;
+    for (view_type::size_type j = 0; j < v1.dimension_1 (); ++j) {
+      sum += v1(i,j)*v2(j,i);
+    }
+    a(i) = sum;
+  }
+};
+
+// Compute a dot product. This is used for result verification.
+struct dot {
+  typedef double value_type; // type of the reduction result (second argument of operator())
+  view_type a;
+  dot (view_type view) : a (view) {}
+  KOKKOS_INLINE_FUNCTION
+  void operator() (const view_type::size_type i, double &partial) const {
+    partial += a(i)*a(i);
+  }
+};
+
+int main (int narg, char* arg[]) {
+  // When initializing Kokkos, you may pass in command-line arguments,
+  // just like with MPI_Init().  Kokkos reserves the right to remove
+  // arguments from the list that start with '--kokkos-'.
+  Kokkos::initialize (narg, arg);
+
+  int size = 10000;
+  view_type a("A",size); // output of the contraction kernels, one entry per index i
+
+  // Define two size x 10000 views of double (about 800 MB each), one LayoutLeft and one LayoutRight.
+  left_type l("L",size,10000);
+  right_type r("R",size,10000);
+
+  // Fill both views with the same values; only their memory layouts differ.
+  Kokkos::parallel_for(size,init_view<left_type>(l));
+  Kokkos::parallel_for(size,init_view<right_type>(r));
+  Kokkos::fence();
+
+  // Measure time to execute the contraction kernel when giving it a
+  // LayoutLeft view for v1 and a LayoutRight view for v2. This should be
+  // fast on GPUs and slow on CPUs
+  Kokkos::Timer time1;
+  Kokkos::parallel_for(size,contraction<left_type,right_type>(a,l,r));
+  Kokkos::fence(); // intended to ensure the kernel has finished before the timer is read
+  double sec1 = time1.seconds();
+
+  double sum1 = 0;
+  Kokkos::parallel_reduce(size,dot(a),sum1);
+  Kokkos::fence();
+
+  // Measure time to execute the contraction kernel when giving it a
+  // LayoutRight view for v1 and a LayoutLeft view for v2. This should be
+  // fast on CPUs and slow on GPUs
+  Kokkos::Timer time2;
+  Kokkos::parallel_for(size,contraction<right_type,left_type>(a,r,l));
+  Kokkos::fence(); // intended to ensure the kernel has finished before the timer is read
+  double sec2 = time2.seconds();
+
+  double sum2 = 0;
+  Kokkos::parallel_reduce(size,dot(a),sum2);
+
+  // Print both timings in seconds.  Kokkos' reductions are deterministic,
+  // so sum1 and sum2 should always compare equal (printed as 1).
+  printf("Result Left/Right %f Right/Left %f (equal result: %i)\n",sec1,sec2,sum2==sum1);
+
+  Kokkos::finalize();
+}
+
diff --git a/lib/kokkos/example/tutorial/Advanced_Views/02_memory_traits/CMakeLists.txt b/lib/kokkos/example/tutorial/Advanced_Views/02_memory_traits/CMakeLists.txt
new file mode 100644
index 0000000000000000000000000000000000000000..1963e544d7a113e8b50cf3fa2444df2f95d983e2
--- /dev/null
+++ b/lib/kokkos/example/tutorial/Advanced_Views/02_memory_traits/CMakeLists.txt
@@ -0,0 +1,10 @@
+# Tutorial Advanced_Views/02_memory_traits: put this directory's build and source trees on the include path.
+INCLUDE_DIRECTORIES(${CMAKE_CURRENT_BINARY_DIR})
+INCLUDE_DIRECTORIES(${CMAKE_CURRENT_SOURCE_DIR})
+
+# This is a tutorial, not a test, so we don't ask CTest to run it.
+TRIBITS_ADD_EXECUTABLE(
+  tutorial_advancedviews_02_memory_traits
+  SOURCES memory_traits.cpp
+  COMM serial mpi
+  )
diff --git a/lib/kokkos/example/tutorial/Advanced_Views/02_memory_traits/Makefile b/lib/kokkos/example/tutorial/Advanced_Views/02_memory_traits/Makefile
new file mode 100644
index 0000000000000000000000000000000000000000..12ad36b31e458d155aa6dc653ab8188a7773bd18
--- /dev/null
+++ b/lib/kokkos/example/tutorial/Advanced_Views/02_memory_traits/Makefile
@@ -0,0 +1,43 @@
+KOKKOS_PATH = ../../../..
+SRC = $(wildcard *.cpp)
+# Default goal: 'build' is brought up to date first, then the echo recipe below runs.
+default: build
+	echo "Start Build"
+# Toolchain selection.  KOKKOS_DEVICES has not been assigned by this file at this point, so the Cuda branch is taken only when the caller already set it to a value containing "Cuda".
+ifneq (,$(findstring Cuda,$(KOKKOS_DEVICES)))
+CXX = ../../../../config/nvcc_wrapper
+CXXFLAGS = -O3
+LINK = ${CXX}
+LINKFLAGS = 
+EXE = $(SRC:.cpp=.cuda)
+KOKKOS_DEVICES = "Cuda,OpenMP"
+KOKKOS_ARCH = "SNB,Kepler35"
+else
+CXX = g++
+CXXFLAGS = -O3
+LINK = ${CXX}
+LINKFLAGS =  
+EXE = $(SRC:.cpp=.host)
+KOKKOS_DEVICES = "OpenMP"
+KOKKOS_ARCH = "SNB"
+endif
+
+DEPFLAGS = -M
+
+OBJ = $(SRC:.cpp=.o)
+LIB =
+# The variables above are set before this include; Makefile.kokkos presumably consumes them and defines the KOKKOS_* variables and kokkos-clean used below -- confirm there.
+include $(KOKKOS_PATH)/Makefile.kokkos
+
+build: $(EXE)
+
+$(EXE): $(OBJ) $(KOKKOS_LINK_DEPENDS)
+	$(LINK) $(KOKKOS_LDFLAGS) $(LINKFLAGS) $(EXTRA_PATH) $(OBJ) $(KOKKOS_LIBS) $(LIB) -o $(EXE)
+
+clean: kokkos-clean 
+	rm -f *.o *.cuda *.host
+
+# Compilation rules
+
+%.o:%.cpp $(KOKKOS_CPP_DEPENDS)
+	$(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) $(EXTRA_INC) -c $<
diff --git a/lib/kokkos/example/tutorial/Advanced_Views/02_memory_traits/memory_traits.cpp b/lib/kokkos/example/tutorial/Advanced_Views/02_memory_traits/memory_traits.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..ddd28a97c3739bea326b7b71e09c06e42b05f9f9
--- /dev/null
+++ b/lib/kokkos/example/tutorial/Advanced_Views/02_memory_traits/memory_traits.cpp
@@ -0,0 +1,141 @@
+/*
+//@HEADER
+// ************************************************************************
+// 
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+// 
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+// 
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact  H. Carter Edwards (hcedwar@sandia.gov)
+// 
+// ************************************************************************
+//@HEADER
+*/
+
+#include <Kokkos_Core.hpp>
+#include <impl/Kokkos_Timer.hpp>
+#include <cstdio>
+#include <cstdlib>
+
+typedef Kokkos::View<double*> view_type;  // plain 1-D View of double; no memory traits specified
+// Kokkos::Views have a MemoryTraits template parameter which
+// allows users to specify usage scenarios of a View.
+// Some of those act simply as hints, which can be used to insert
+// optimal load and store paths, others change the semantics of the
+// access. The trait Kokkos::Atomic is one of the latter. A view with
+// that MemoryTrait will perform any access atomically (read, write, update).
+//
+// In this example we use a view with a usage hint for RandomAccess.
+// Kokkos::RandomAccess means that we expect to use this view
+// with indirect indexing.
+//
+// In CUDA, RandomAccess allows accesses through the texture
+// cache.  This only works if the View is read-only, which we enforce
+// through the first template parameter.
+//
+// Note that we are still talking about views of the data, it's not a new allocation.
+// For example you can have an atomic view of a default view. While you even
+// could use both in the same kernel, this could lead to undefined behaviour because
+// one of your access paths is not atomic. Think of it in the same way as you think of
+// pointers to const data and pointers to non-const data (i.e. const double* and double*).
+// While these pointers can point to the same data you should not use them together if that
+// breaks the const guarantee of the first pointer.
+typedef Kokkos::View<const double*, Kokkos::MemoryTraits<Kokkos::RandomAccess> > view_type_rnd;  // read-only 1-D view carrying the RandomAccess hint
+typedef Kokkos::View<int**> idx_type;  // 2-D table of indices; main() allocates it as size x 64
+typedef idx_type::HostMirror idx_type_host;  // host mirror of idx_type; main() fills it before the deep_copy
+
+// We template this functor on the ViewTypes to show the effect of the RandomAccess trait.
+template<class DestType, class SrcType>
+struct localsum {
+  idx_type::const_type idx;  // read-only index table: row i lists the src entries used for dest(i)
+  DestType dest;             // output view, one entry per row of idx
+  SrcType src;               // input view, read indirectly through idx
+  localsum (idx_type idx_, DestType dest_, SrcType src_)
+    : idx (idx_), dest (dest_), src (src_) {}
+
+  // Accumulate one term per column of row i of idx and store the total in dest(i).
+  KOKKOS_INLINE_FUNCTION
+  void operator() (const int i) const {
+    const int ncols = (int) idx.dimension_1 ();
+    double sum = 0.0;
+    for (int col = 0; col < ncols; ++col) {
+      // Indirect access: the position read from src is itself looked up in idx.
+      const double val = src(idx(i,col));
+      sum += val*val + 0.5*(idx.dimension_0()*val -idx.dimension_1()*val);
+    }
+    dest(i) = sum;
+  }
+};
+
+int main(int narg, char* arg[]) {
+  Kokkos::initialize (narg, arg);
+  // All Views live in this inner scope so that they are destroyed before
+  // Kokkos::finalize() is called.
+  {
+    int size = 1000000;
+    idx_type idx("Idx",size,64);
+    idx_type_host h_idx = Kokkos::create_mirror_view (idx);
+    view_type dest ("Dest", size);
+    view_type src ("Src", size);
+
+    // Fill the index table on the host: entries stay within 250 of their row (mod size).
+    srand(134231);
+    for (int i = 0; i < size; i++) {
+      for (view_type::size_type j = 0; j < h_idx.dimension_1 (); ++j) {
+        h_idx(i,j) = (size + i + (rand () % 500 - 250)) % size;
+      }
+    }
+
+    // Deep copy the initial data to the device
+    Kokkos::deep_copy(idx,h_idx);
+    // Run the first kernel to warm up caches
+    Kokkos::parallel_for(size,localsum<view_type,view_type_rnd>(idx,dest,src));
+    Kokkos::fence();
+
+    // Time the localsum functor using the RandomAccess trait.  On CPUs there
+    // should not be any difference in performance to not using the trait.
+    // On GPUs there can be a dramatic difference.
+    Kokkos::Timer time1;
+    Kokkos::parallel_for(size,localsum<view_type,view_type_rnd>(idx,dest,src));
+    Kokkos::fence();
+    double sec1 = time1.seconds();
+
+    // Same kernel again, this time reading src through a plain View.
+    Kokkos::Timer time2;
+    Kokkos::parallel_for(size,localsum<view_type,view_type>(idx,dest,src));
+    Kokkos::fence();
+    double sec2 = time2.seconds();
+    printf("Time with Trait RandomAccess: %f with Plain: %f \n",sec1,sec2);
+  }
+  Kokkos::finalize();
+}
+
diff --git a/lib/kokkos/example/tutorial/Advanced_Views/03_subviews/CMakeLists.txt b/lib/kokkos/example/tutorial/Advanced_Views/03_subviews/CMakeLists.txt
new file mode 100644
index 0000000000000000000000000000000000000000..cbe394c78b832f7bee3bb659b2776d5b246adbd1
--- /dev/null
+++ b/lib/kokkos/example/tutorial/Advanced_Views/03_subviews/CMakeLists.txt
@@ -0,0 +1,10 @@
+
+# Look for headers in both the build and the source directory of this example.
+INCLUDE_DIRECTORIES(${CMAKE_CURRENT_BINARY_DIR} ${CMAKE_CURRENT_SOURCE_DIR})
+
+# Build the tutorial executable.  It is a tutorial rather than a test, so
+# CTest is not asked to run it.
+TRIBITS_ADD_EXECUTABLE(
+  tutorial_advancedviews_03_subviews
+  SOURCES subviews.cpp
+  COMM serial mpi)
diff --git a/lib/kokkos/example/tutorial/Advanced_Views/03_subviews/Makefile b/lib/kokkos/example/tutorial/Advanced_Views/03_subviews/Makefile
new file mode 100644
index 0000000000000000000000000000000000000000..12ad36b31e458d155aa6dc653ab8188a7773bd18
--- /dev/null
+++ b/lib/kokkos/example/tutorial/Advanced_Views/03_subviews/Makefile
@@ -0,0 +1,43 @@
+KOKKOS_PATH = ../../../..
+SRC = $(wildcard *.cpp)
+
+default: build
+	echo "Start Build"
+# default, build and clean name actions rather than files, so declare them phony.
+.PHONY: default build clean
+
+# Use nvcc_wrapper from $(KOKKOS_PATH) when KOKKOS_DEVICES mentions Cuda, else g++.
+ifneq (,$(findstring Cuda,$(KOKKOS_DEVICES)))
+CXX = $(KOKKOS_PATH)/config/nvcc_wrapper
+CXXFLAGS = -O3
+LINK = ${CXX}
+LINKFLAGS =
+EXE = $(SRC:.cpp=.cuda)
+KOKKOS_DEVICES = "Cuda,OpenMP"
+KOKKOS_ARCH = "SNB,Kepler35"
+else
+CXX = g++
+CXXFLAGS = -O3
+LINK = ${CXX}
+LINKFLAGS =
+EXE = $(SRC:.cpp=.host)
+KOKKOS_DEVICES = "OpenMP"
+KOKKOS_ARCH = "SNB"
+endif
+
+DEPFLAGS = -M
+OBJ = $(SRC:.cpp=.o)
+LIB =
+include $(KOKKOS_PATH)/Makefile.kokkos
+
+build: $(EXE)
+
+$(EXE): $(OBJ) $(KOKKOS_LINK_DEPENDS)
+	$(LINK) $(KOKKOS_LDFLAGS) $(LINKFLAGS) $(EXTRA_PATH) $(OBJ) $(KOKKOS_LIBS) $(LIB) -o $(EXE)
+
+clean: kokkos-clean
+	rm -f *.o *.cuda *.host
+
+# Compilation rules
+%.o:%.cpp $(KOKKOS_CPP_DEPENDS)
+	$(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) $(EXTRA_INC) -c $<
diff --git a/lib/kokkos/example/tutorial/Advanced_Views/03_subviews/subviews.cpp b/lib/kokkos/example/tutorial/Advanced_Views/03_subviews/subviews.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..c8001ebaa60664dcb7b644c8b55b5ac1d7a0ba76
--- /dev/null
+++ b/lib/kokkos/example/tutorial/Advanced_Views/03_subviews/subviews.cpp
@@ -0,0 +1,190 @@
+/*
+//@HEADER
+// ************************************************************************
+// 
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+// 
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+// 
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact  H. Carter Edwards (hcedwar@sandia.gov)
+// 
+// ************************************************************************
+//@HEADER
+*/
+
+// This example simulates one timestep of an explicit
+// finite-difference discretization of a time-dependent partial
+// differential equation (PDE).  It shows how to take subviews of the
+// mesh in order to represent particular boundaries or the interior of
+// the mesh.
+
+#include <Kokkos_Core.hpp>
+#include <impl/Kokkos_Timer.hpp>
+#include <cstdio>
+
+typedef Kokkos::View<double***, Kokkos::LayoutRight> mesh_type;  // full 3-D mesh; main() allocates it with size+2 points per dimension
+
+// These View types represent subviews of the mesh.  Some of the Views
+// have layout LayoutStride, meaning that they have run-time "strides"
+// in each dimension which may differ from that dimension.  For
+// example, inner_mesh_type (which represents the interior of the
+// mesh) has to skip over the boundaries when computing its stride;
+// the dimensions of the interior mesh differ from these strides.  You
+// may safely always use a LayoutStride layout when taking a subview
+// of a LayoutRight or LayoutLeft View, but strided accesses may
+// cost a bit more, especially for 1-D Views.
+typedef Kokkos::View<double**, Kokkos::LayoutStride> xz_plane_type;  // Y faces in main(): subview(A, ALL, index, ALL)
+typedef Kokkos::View<double**, Kokkos::LayoutRight> yz_plane_type;  // X faces in main(): subview(A, index, ALL, ALL)
+typedef Kokkos::View<double**, Kokkos::LayoutStride> xy_plane_type;  // Z faces in main(): subview(A, ALL, ALL, index)
+typedef Kokkos::View<double***, Kokkos::LayoutStride> inner_mesh_type;  // interior of the mesh (Ai in main())
+
+// Functor to set all entries of a boundary of the mesh to a constant
+// value.  The functor is templated on ViewType because different
+// boundaries may have different layouts.
+template<class ViewType>
+struct set_boundary {
+  ViewType a;     // rank-2 view of the boundary plane to fill
+  double value;   // constant stored into every entry of a
+
+  set_boundary (ViewType a_, double value_) : a (a_), value (value_) {}
+
+  // Store value into every entry of row i of the plane.
+  KOKKOS_INLINE_FUNCTION
+  void operator() (const typename ViewType::size_type i) const {
+    typedef typename ViewType::size_type index_type;
+    for (index_type col = 0; col < a.dimension_1 (); ++col) {
+      a(i,col) = value;
+    }
+  }
+};
+
+// Functor to set all entries of the interior of the mesh to a
+// constant value.  The functor is templated on ViewType so that it
+// accepts whatever layout the interior subview has.
+template<class ViewType>
+struct set_inner {
+  ViewType a;     // rank-3 view to fill
+  double value;   // constant stored into every entry of a
+
+  set_inner (ViewType a_, double value_)
+    : a (a_), value (value_) {}
+
+  // Store value into every entry a(i,row,col) of the slab with first index i.
+  KOKKOS_INLINE_FUNCTION
+  void operator () (const typename ViewType::size_type i) const {
+    typedef typename ViewType::size_type index_type;
+    for (index_type row = 0; row < a.dimension_1 (); ++row) {
+      for (index_type col = 0; col < a.dimension_2 (); ++col) {
+        a(i,row,col) = value;
+      }
+    }
+  }
+};
+
+// Update the interior of the mesh.  This simulates one timestep of a
+// finite-difference method.
+template<class ViewType>
+struct update {
+  ViewType a;        // the mesh, boundary layers included
+  const double dt;   // time-step size
+
+  update (ViewType a_, const double dt_) : a (a_), dt (dt_) {}
+
+  // Update, in place, the interior entries of the plane with first index i+1.
+  // NOTE(review): reads a(x-1,..) and a(x+1,..), which other iterations may be writing; confirm.
+  KOKKOS_INLINE_FUNCTION
+  void operator() (typename ViewType::size_type i) const {
+    typedef typename ViewType::size_type index_type;
+    const index_type x = i + 1;   // shift by one to skip the layer at index 0
+    for (index_type y = 1; y < a.dimension_1()-1; y++) {
+      for (index_type z = 1; z < a.dimension_2()-1; z++) {
+        a(x,y,z) += dt* (a(x,y,z+1) - a(x,y,z-1) +
+                         a(x,y+1,z) - a(x,y-1,z) +
+                         a(x+1,y,z) - a(x-1,y,z));
+      }
+    }
+  }
+};
+
+
+int main (int narg, char* arg[]) {
+  using Kokkos::ALL;
+  using Kokkos::pair;
+  using Kokkos::parallel_for;
+  using Kokkos::subview;
+  typedef mesh_type::size_type size_type;
+
+  Kokkos::initialize (narg, arg);
+  // All Views live in this inner scope so that they are destroyed
+  // before Kokkos::finalize() is called.
+  {
+    // The number of mesh points along each dimension of the mesh, not
+    // including boundaries.
+    const size_type size = 100;
+    // A is the full cubic 3-D mesh, including the boundaries.
+    mesh_type A ("A", size+2, size+2, size+2);
+    // Ai is the "inner" part of A, _not_ including the boundaries.
+    //
+    // A pair of indices in a particular dimension means the contiguous
+    // zero-based index range in that dimension, including the first
+    // entry of the pair but _not_ including the second entry.
+    inner_mesh_type Ai = subview(A, pair<size_type, size_type> (1, size+1),
+                                    pair<size_type, size_type> (1, size+1),
+                                    pair<size_type, size_type> (1, size+1));
+    // A has six boundaries, one for each face of the cube: the planes
+    // at index 0 and at index size+1 of each dimension.
+    // ALL() means "select all indices in that dimension."
+    xy_plane_type Zneg_halo = subview(A, ALL (), ALL (), 0);
+    xy_plane_type Zpos_halo = subview(A, ALL (), ALL (), size+1);
+    xz_plane_type Yneg_halo = subview(A, ALL (), 0, ALL ());
+    xz_plane_type Ypos_halo = subview(A, ALL (), size+1, ALL ());
+    yz_plane_type Xneg_halo = subview(A, 0, ALL (), ALL ());
+    yz_plane_type Xpos_halo = subview(A, size+1, ALL (), ALL ());
+
+    // Set the boundaries to their initial conditions.
+    parallel_for (Zneg_halo.dimension_0 (), set_boundary<xy_plane_type> (Zneg_halo,  1));
+    parallel_for (Zpos_halo.dimension_0 (), set_boundary<xy_plane_type> (Zpos_halo, -1));
+    parallel_for (Yneg_halo.dimension_0 (), set_boundary<xz_plane_type> (Yneg_halo,  2));
+    parallel_for (Ypos_halo.dimension_0 (), set_boundary<xz_plane_type> (Ypos_halo, -2));
+    parallel_for (Xneg_halo.dimension_0 (), set_boundary<yz_plane_type> (Xneg_halo,  3));
+    parallel_for (Xpos_halo.dimension_0 (), set_boundary<yz_plane_type> (Xpos_halo, -3));
+
+    // Set the interior of the mesh to its initial condition.
+    parallel_for (Ai.dimension_0 (), set_inner<inner_mesh_type> (Ai, 0));
+
+    // Update the interior of the mesh: one simulated timestep with dt = 0.1.
+    parallel_for (Ai.dimension_0 (), update<mesh_type> (A, 0.1));
+    printf ("Done\n");
+  }
+  Kokkos::finalize ();
+}
+
diff --git a/lib/kokkos/example/tutorial/Advanced_Views/04_dualviews/CMakeLists.txt b/lib/kokkos/example/tutorial/Advanced_Views/04_dualviews/CMakeLists.txt
new file mode 100644
index 0000000000000000000000000000000000000000..300dab128e45779002cf123d7e7238777abab4d5
--- /dev/null
+++ b/lib/kokkos/example/tutorial/Advanced_Views/04_dualviews/CMakeLists.txt
@@ -0,0 +1,10 @@
+
+# Look for headers in both the build and the source directory of this example.
+INCLUDE_DIRECTORIES(${CMAKE_CURRENT_BINARY_DIR} ${CMAKE_CURRENT_SOURCE_DIR})
+
+# Build the tutorial executable.  It is a tutorial rather than a test, so
+# CTest is not asked to run it.
+TRIBITS_ADD_EXECUTABLE(
+  tutorial_advancedviews_04_dualviews
+  SOURCES dual_view.cpp
+  COMM serial mpi)
diff --git a/lib/kokkos/example/tutorial/Advanced_Views/04_dualviews/Makefile b/lib/kokkos/example/tutorial/Advanced_Views/04_dualviews/Makefile
new file mode 100644
index 0000000000000000000000000000000000000000..12ad36b31e458d155aa6dc653ab8188a7773bd18
--- /dev/null
+++ b/lib/kokkos/example/tutorial/Advanced_Views/04_dualviews/Makefile
@@ -0,0 +1,43 @@
+KOKKOS_PATH = ../../../..
+SRC = $(wildcard *.cpp)
+
+default: build
+	echo "Start Build"
+# default, build and clean name actions rather than files, so declare them phony.
+.PHONY: default build clean
+
+# Use nvcc_wrapper from $(KOKKOS_PATH) when KOKKOS_DEVICES mentions Cuda, else g++.
+ifneq (,$(findstring Cuda,$(KOKKOS_DEVICES)))
+CXX = $(KOKKOS_PATH)/config/nvcc_wrapper
+CXXFLAGS = -O3
+LINK = ${CXX}
+LINKFLAGS =
+EXE = $(SRC:.cpp=.cuda)
+KOKKOS_DEVICES = "Cuda,OpenMP"
+KOKKOS_ARCH = "SNB,Kepler35"
+else
+CXX = g++
+CXXFLAGS = -O3
+LINK = ${CXX}
+LINKFLAGS =
+EXE = $(SRC:.cpp=.host)
+KOKKOS_DEVICES = "OpenMP"
+KOKKOS_ARCH = "SNB"
+endif
+
+DEPFLAGS = -M
+OBJ = $(SRC:.cpp=.o)
+LIB =
+include $(KOKKOS_PATH)/Makefile.kokkos
+
+build: $(EXE)
+
+$(EXE): $(OBJ) $(KOKKOS_LINK_DEPENDS)
+	$(LINK) $(KOKKOS_LDFLAGS) $(LINKFLAGS) $(EXTRA_PATH) $(OBJ) $(KOKKOS_LIBS) $(LIB) -o $(EXE)
+
+clean: kokkos-clean
+	rm -f *.o *.cuda *.host
+
+# Compilation rules
+%.o:%.cpp $(KOKKOS_CPP_DEPENDS)
+	$(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) $(EXTRA_INC) -c $<
diff --git a/lib/kokkos/example/tutorial/Advanced_Views/04_dualviews/dual_view.cpp b/lib/kokkos/example/tutorial/Advanced_Views/04_dualviews/dual_view.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..4905e4bf88485c70527d9080844940a61c60365c
--- /dev/null
+++ b/lib/kokkos/example/tutorial/Advanced_Views/04_dualviews/dual_view.cpp
@@ -0,0 +1,218 @@
+/*
+//@HEADER
+// ************************************************************************
+// 
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+// 
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+// 
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact  H. Carter Edwards (hcedwar@sandia.gov)
+// 
+// ************************************************************************
+//@HEADER
+*/
+
+#include <Kokkos_Core.hpp>
+#include <Kokkos_DualView.hpp>
+#include <impl/Kokkos_Timer.hpp>
+#include <cstdio>
+#include <cstdlib>
+
+// DualView helps you manage data and computations that take place on
+// two different memory spaces.  Examples include CUDA device memory
+// and (CPU) host memory (currently implemented), or Intel Knights
+// Landing MCDRAM and DRAM (not yet implemented).  For example, if you
+// have ported only some parts of your application to run in CUDA,
+// DualView can help manage moving data between the parts of your
+// application that work best with CUDA, and the parts that work
+// better on the CPU.
+//
+// A DualView takes the same template parameters as a View, but
+// contains two Views: One that lives in the DualView's memory space,
+// and one that lives in that memory space's host mirror space.  If
+// both memory spaces are the same, then the two Views just alias one
+// another.  This means that you can use DualView all the time, even
+// when not running in a memory space like CUDA.  DualView's
+// operations to help you manage memory take almost no time in that
+// case.  This makes your code even more performance portable.
+
+typedef Kokkos::DualView<double*> view_type;  // 1-D DualView of double (used for dest and src in main())
+typedef Kokkos::DualView<int**> idx_type;  // 2-D DualView of int (the index table; size x 64 in main())
+
+
+template<class ExecutionSpace>
+struct localsum {
+  // Kokkos runs a functor in the space named by its public
+  // 'execution_space' typedef, if it has one, rather than in the
+  // default execution space.
+  typedef ExecutionSpace execution_space;
+
+  // Memory space to work in: the DualView's own memory space when
+  // ExecutionSpace is the default execution space, and its host mirror
+  // space otherwise.
+  typedef typename Kokkos::Impl::if_c<Kokkos::Impl::is_same<ExecutionSpace,Kokkos::DefaultExecutionSpace>::value ,
+     idx_type::memory_space, idx_type::host_mirror_space>::type memory_space;
+
+  // Plain Views of the three DualViews in memory_space.
+  //
+  // "const_data_type" is a typedef in View (and DualView) that names
+  // the const version of the View's first template parameter; idx and
+  // src are therefore read-only inside the functor.
+  Kokkos::View<idx_type::const_data_type, idx_type::array_layout, memory_space> idx;
+  // "scalar_array_type" is a typedef in ViewTraits (and DualView) that
+  // names the array version of the value(s) stored in the View.
+  Kokkos::View<view_type::scalar_array_type, view_type::array_layout, memory_space> dest;
+  Kokkos::View<view_type::const_data_type, view_type::array_layout,
+               memory_space, Kokkos::MemoryRandomAccess> src;
+
+  // Build the functor from the three DualViews: fetch their Views in
+  // memory_space, sync all three to that space, and mark dest (the
+  // only one the kernel writes) as modified there.
+  localsum (idx_type dv_idx, view_type dv_dest, view_type dv_src)
+  {
+    // DualView::view() is a template method, templated on the memory
+    // space.  If the DualView has a View in that space, view() returns
+    // it.
+    idx = dv_idx.view<memory_space> ();
+    dest = dv_dest.template view<memory_space> ();
+    src = dv_src.template view<memory_space> ();
+
+    // DualView::sync() is templated on a memory space as well, and
+    // synchronizes the DualView in a one-way fashion to that space:
+    // data is copied from the other memory space into the requested
+    // one.  sync() does _nothing_ if the Views on the two memory
+    // spaces are already in sync.  DualView determines this from the
+    // user manually marking one side or the other as modified; see
+    // the modify() call below.
+    dv_idx.sync<memory_space> ();
+    dv_dest.template sync<memory_space> ();
+    dv_src.template sync<memory_space> ();
+
+    // The kernel writes dest, so mark it as modified in memory_space.
+    dv_dest.template modify<memory_space> ();
+  }
+
+  // Compute the local sum for row i of idx and add it to dest(i).
+  KOKKOS_INLINE_FUNCTION
+  void operator() (const int i) const {
+    const int ncols = (int) idx.dimension_1();
+    double sum = 0.0;
+    for (int col = 0; col < ncols; ++col) {
+      // Indirect access: the position read from src comes from idx.
+      const double val = src(idx(i,col));
+      sum += val*val + 0.5*(idx.dimension_0()*val -idx.dimension_1()*val);
+    }
+    dest(i) += sum;
+  }
+};
+
+// Record type stored in a DualView by main(); q_over_m caches q/m.
+class ParticleType {
+public:
+  double q;
+  double m;
+  double q_over_m;
+  KOKKOS_INLINE_FUNCTION
+  ParticleType (double q_ = -1, double m_ = 1)
+    : q (q_), m (m_), q_over_m (q_ / m_) {}
+};
+
+  typedef Kokkos::DualView<ParticleType[10]> ParticleTypes;
+int main (int narg, char* arg[]) {
+  Kokkos::initialize (narg, arg);
+
+  // Everything that owns a View lives in this inner scope, so that it
+  // is destroyed before Kokkos::finalize() is called.
+  {
+    // DualView of a type with a user-provided constructor.
+    ParticleTypes test("Test");
+    Kokkos::fence();
+    test.h_view(0) = ParticleType(-1e4,1);
+    Kokkos::fence();
+
+    int size = 1000000;
+
+    // Create DualViews. This allocates on both the device and its
+    // host mirror space.
+    idx_type idx ("Idx",size,64);
+    view_type dest ("Dest",size);
+    view_type src ("Src",size);
+
+    srand (134231);
+
+    // Grab the host View of idx directly (equivalent to
+    // idx.view<idx_type::host_mirror_space>() ) and fill it.
+    idx_type::t_host h_idx = idx.h_view;
+    for (int row = 0; row < size; ++row) {
+      for (view_type::size_type col = 0; col < h_idx.dimension_1 (); ++col) {
+        h_idx(row,col) = (size + row + (rand () % 500 - 250)) % size;
+      }
+    }
+
+    // Mark idx as modified on the host_mirror_space so that a sync to
+    // the device actually moves data.  The sync itself happens in the
+    // functor's constructor.
+    idx.modify<idx_type::host_mirror_space> ();
+
+    // First device run: idx was marked as modified on the host, so this
+    // run includes syncing it to the device.
+    Kokkos::Timer timer;
+    Kokkos::parallel_for(size,localsum<view_type::execution_space>(idx,dest,src));
+    Kokkos::fence();
+    double dev_with_sync = timer.seconds();
+
+    timer.reset();
+    Kokkos::parallel_for(size,localsum<view_type::execution_space>(idx,dest,src));
+    Kokkos::fence();
+    double dev_no_sync = timer.seconds();
+
+    // Run on the host's default execution space (could be the same as
+    // the device).  This causes dest to be synced back to the host.
+    // Note that if the device is CUDA, the data layout will not be optimal
+    // on the host, so performance is lower than for a pure host compilation.
+    timer.reset();
+    Kokkos::parallel_for(size,localsum<Kokkos::HostSpace::execution_space>(idx,dest,src));
+    Kokkos::fence();
+    double host_with_sync = timer.seconds();
+
+    timer.reset();
+    Kokkos::parallel_for(size,localsum<Kokkos::HostSpace::execution_space>(idx,dest,src));
+    Kokkos::fence();
+    double host_no_sync = timer.seconds();
+
+    printf("Device Time with Sync: %f without Sync: %f \n",dev_with_sync,dev_no_sync);
+    printf("Host   Time with Sync: %f without Sync: %f \n",host_with_sync,host_no_sync);
+  }
+
+  Kokkos::finalize();
+}
+
diff --git a/lib/kokkos/example/tutorial/Advanced_Views/05_NVIDIA_UVM/CMakeLists.txt b/lib/kokkos/example/tutorial/Advanced_Views/05_NVIDIA_UVM/CMakeLists.txt
new file mode 100644
index 0000000000000000000000000000000000000000..f0ed569f9f48a02ebcca091adced52a8c3a1f2ad
--- /dev/null
+++ b/lib/kokkos/example/tutorial/Advanced_Views/05_NVIDIA_UVM/CMakeLists.txt
@@ -0,0 +1,13 @@
+
+# Look for headers in both the build and the source directory of this example.
+INCLUDE_DIRECTORIES(${CMAKE_CURRENT_BINARY_DIR} ${CMAKE_CURRENT_SOURCE_DIR})
+
+# The example is only built when Kokkos_ENABLE_Cuda_UVM is set.  It is a
+# tutorial rather than a test, so CTest is not asked to run it.
+IF (Kokkos_ENABLE_Cuda_UVM)
+  TRIBITS_ADD_EXECUTABLE(
+    tutorial_advancedviews_05_nvidia_uvm
+    SOURCES uvm_example.cpp
+    COMM serial mpi
+    DEPLIBS kokkoscontainers kokkoscore)
+ENDIF ()
diff --git a/lib/kokkos/example/tutorial/Advanced_Views/05_NVIDIA_UVM/Makefile b/lib/kokkos/example/tutorial/Advanced_Views/05_NVIDIA_UVM/Makefile
new file mode 100644
index 0000000000000000000000000000000000000000..12ad36b31e458d155aa6dc653ab8188a7773bd18
--- /dev/null
+++ b/lib/kokkos/example/tutorial/Advanced_Views/05_NVIDIA_UVM/Makefile
@@ -0,0 +1,43 @@
+KOKKOS_PATH = ../../../..
+SRC = $(wildcard *.cpp)
+
+default: build
+	echo "Start Build"
+# default, build and clean name actions rather than files, so declare them phony.
+.PHONY: default build clean
+
+# Use nvcc_wrapper from $(KOKKOS_PATH) when KOKKOS_DEVICES mentions Cuda, else g++.
+ifneq (,$(findstring Cuda,$(KOKKOS_DEVICES)))
+CXX = $(KOKKOS_PATH)/config/nvcc_wrapper
+CXXFLAGS = -O3
+LINK = ${CXX}
+LINKFLAGS =
+EXE = $(SRC:.cpp=.cuda)
+KOKKOS_DEVICES = "Cuda,OpenMP"
+KOKKOS_ARCH = "SNB,Kepler35"
+else
+CXX = g++
+CXXFLAGS = -O3
+LINK = ${CXX}
+LINKFLAGS =
+EXE = $(SRC:.cpp=.host)
+KOKKOS_DEVICES = "OpenMP"
+KOKKOS_ARCH = "SNB"
+endif
+
+DEPFLAGS = -M
+OBJ = $(SRC:.cpp=.o)
+LIB =
+include $(KOKKOS_PATH)/Makefile.kokkos
+
+build: $(EXE)
+
+$(EXE): $(OBJ) $(KOKKOS_LINK_DEPENDS)
+	$(LINK) $(KOKKOS_LDFLAGS) $(LINKFLAGS) $(EXTRA_PATH) $(OBJ) $(KOKKOS_LIBS) $(LIB) -o $(EXE)
+
+clean: kokkos-clean
+	rm -f *.o *.cuda *.host
+
+# Compilation rules
+%.o:%.cpp $(KOKKOS_CPP_DEPENDS)
+	$(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) $(EXTRA_INC) -c $<
diff --git a/lib/kokkos/example/tutorial/Advanced_Views/05_NVIDIA_UVM/uvm_example.cpp b/lib/kokkos/example/tutorial/Advanced_Views/05_NVIDIA_UVM/uvm_example.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..cf5326b687199ff8c5c14580b18a9e406279cd11
--- /dev/null
+++ b/lib/kokkos/example/tutorial/Advanced_Views/05_NVIDIA_UVM/uvm_example.cpp
@@ -0,0 +1,134 @@
+/*
+//@HEADER
+// ************************************************************************
+// 
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+// 
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+// 
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact  H. Carter Edwards (hcedwar@sandia.gov)
+// 
+// ************************************************************************
+//@HEADER
+*/
+
+#include <Kokkos_Core.hpp>
+#include <Kokkos_DualView.hpp>
+#include <impl/Kokkos_Timer.hpp>
+#include <cstdio>
+#include <cstdlib>
+
+typedef Kokkos::View<double*> view_type;
+typedef Kokkos::View<int**> idx_type;
+
+
+template<class Device>
+struct localsum {
+  // Execution space this functor is meant to run in; it comes from the template
+  // argument instead of Kokkos' DefaultExecutionSpace.
+  typedef Device execution_space;
+  // Captured views: idx is read-only, dest is accumulated into, and src is a
+  // const view of the data that carries the MemoryRandomAccess trait.
+  idx_type::const_type idx;
+  view_type dest;
+  Kokkos::View<view_type::const_data_type, view_type::array_layout, view_type::execution_space, Kokkos::MemoryRandomAccess > src;
+
+  localsum(idx_type indices, view_type output, view_type input)
+    : idx(indices), dest(output), src(input) {}
+  // Adds to dest(i) a sum over every column of row i of idx, built from src(idx(i,col)).
+  KOKKOS_INLINE_FUNCTION
+  void operator() (int i) const {
+    double acc = 0.0;
+    for(int col = 0; col < idx.dimension_1(); ++col) {
+      const double v = src(idx(i,col));
+      acc += v*v + 0.5*(idx.dimension_0()*v -idx.dimension_1()*v);
+    }
+    dest(i) += acc;
+  }
+};
+
+int main(int narg, char* arg[]) {
+  Kokkos::initialize(narg,arg);
+
+  int size = 1000000;
+
+  // Create Views
+  idx_type idx("Idx",size,64);
+  view_type dest("Dest",size);
+  view_type src("Src",size);
+
+  srand(134231);
+
+  // When using UVM Cuda views can be accessed on the Host directly
+  for(int i=0; i<size; i++) {
+    for(int j=0; j<idx.dimension_1(); j++)
+      idx(i,j) = (size + i + (rand()%500 - 250))%size;
+  }
+
+  Kokkos::fence();
+  // Run on the device
+  // This will cause a sync of idx to the device since it was modified on the host
+  Kokkos::Timer timer;
+  Kokkos::parallel_for(size,localsum<view_type::execution_space>(idx,dest,src));
+  Kokkos::fence();
+  double sec1_dev = timer.seconds();
+
+  // No data transfer will happen now, since nothing is accessed on the host
+  timer.reset();
+  Kokkos::parallel_for(size,localsum<view_type::execution_space>(idx,dest,src));
+  Kokkos::fence();
+  double sec2_dev = timer.seconds();
+
+  // Run on the host
+  // This will cause a sync back to the host of dest which was changed on the device
+  // Compare runtime here with the dual_view example: dest will be copied back in 4k blocks
+  // when they are accessed the first time during the parallel_for. Due to the latency of a memcpy
+  // this gives lower effective bandwidth than doing a manual copy via dual views
+  timer.reset();
+  Kokkos::parallel_for(size,localsum<Kokkos::HostSpace::execution_space>(idx,dest,src));
+  Kokkos::fence();
+  double sec1_host = timer.seconds();
+
+  // No data transfers will happen now
+  timer.reset();
+  Kokkos::parallel_for(size,localsum<Kokkos::HostSpace::execution_space>(idx,dest,src));
+  Kokkos::fence();
+  double sec2_host = timer.seconds();
+
+
+
+  printf("Device Time with Sync: %lf without Sync: %lf \n",sec1_dev,sec2_dev);
+  printf("Host   Time with Sync: %lf without Sync: %lf \n",sec1_host,sec2_host);
+
+  Kokkos::finalize();
+}
+
diff --git a/lib/kokkos/example/tutorial/Advanced_Views/06_AtomicViews/Makefile b/lib/kokkos/example/tutorial/Advanced_Views/06_AtomicViews/Makefile
new file mode 100644
index 0000000000000000000000000000000000000000..12ad36b31e458d155aa6dc653ab8188a7773bd18
--- /dev/null
+++ b/lib/kokkos/example/tutorial/Advanced_Views/06_AtomicViews/Makefile
@@ -0,0 +1,43 @@
+KOKKOS_PATH = ../../../..
+SRC = $(wildcard *.cpp)
+
+default: build
+	echo "Start Build"
+
+ifneq (,$(findstring Cuda,$(KOKKOS_DEVICES)))
+CXX = ../../../../config/nvcc_wrapper
+CXXFLAGS = -O3
+LINK = ${CXX}
+LINKFLAGS = 
+EXE = $(SRC:.cpp=.cuda)
+KOKKOS_DEVICES = "Cuda,OpenMP"
+KOKKOS_ARCH = "SNB,Kepler35"
+else
+CXX = g++
+CXXFLAGS = -O3
+LINK = ${CXX}
+LINKFLAGS =  
+EXE = $(SRC:.cpp=.host)
+KOKKOS_DEVICES = "OpenMP"
+KOKKOS_ARCH = "SNB"
+endif
+
+DEPFLAGS = -M
+
+OBJ = $(SRC:.cpp=.o)
+LIB =
+
+include $(KOKKOS_PATH)/Makefile.kokkos
+
+build: $(EXE)
+
+$(EXE): $(OBJ) $(KOKKOS_LINK_DEPENDS)
+	$(LINK) $(KOKKOS_LDFLAGS) $(LINKFLAGS) $(EXTRA_PATH) $(OBJ) $(KOKKOS_LIBS) $(LIB) -o $(EXE)
+
+clean: kokkos-clean 
+	rm -f *.o *.cuda *.host
+
+# Compilation rules
+
+%.o:%.cpp $(KOKKOS_CPP_DEPENDS)
+	$(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) $(EXTRA_INC) -c $<
diff --git a/lib/kokkos/example/tutorial/Advanced_Views/07_Overlapping_DeepCopy/Makefile b/lib/kokkos/example/tutorial/Advanced_Views/07_Overlapping_DeepCopy/Makefile
new file mode 100644
index 0000000000000000000000000000000000000000..60a514f4d50ccf3e36fa2a8233de90c46f3bbe5d
--- /dev/null
+++ b/lib/kokkos/example/tutorial/Advanced_Views/07_Overlapping_DeepCopy/Makefile
@@ -0,0 +1,43 @@
+KOKKOS_PATH = ../../../..
+SRC = $(wildcard *.cpp)
+
+default: build
+	echo "Start Build"
+
+ifneq (,$(findstring Cuda,$(KOKKOS_DEVICES)))
+CXX = ../../../../config/nvcc_wrapper
+CXXFLAGS = -O3 --default-stream per-thread 
+LINK = ${CXX}
+LINKFLAGS = 
+EXE = $(SRC:.cpp=.cuda)
+KOKKOS_DEVICES = "Cuda,OpenMP"
+KOKKOS_ARCH = "SNB,Kepler35"
+else
+CXX = g++
+CXXFLAGS = -O3
+LINK = ${CXX}
+LINKFLAGS =  
+EXE = $(SRC:.cpp=.host)
+KOKKOS_DEVICES = "OpenMP"
+KOKKOS_ARCH = "SNB"
+endif
+
+DEPFLAGS = -M
+
+OBJ = $(SRC:.cpp=.o)
+LIB =
+
+include $(KOKKOS_PATH)/Makefile.kokkos
+
+build: $(EXE)
+
+$(EXE): $(OBJ) $(KOKKOS_LINK_DEPENDS)
+	$(LINK) $(KOKKOS_LDFLAGS) $(LINKFLAGS) $(EXTRA_PATH) $(OBJ) $(KOKKOS_LIBS) $(LIB) -o $(EXE)
+
+clean: kokkos-clean 
+	rm -f *.o *.cuda *.host
+
+# Compilation rules
+
+%.o:%.cpp $(KOKKOS_CPP_DEPENDS)
+	$(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) $(EXTRA_INC) -c $<
diff --git a/lib/kokkos/example/tutorial/Advanced_Views/07_Overlapping_DeepCopy/overlapping_deepcopy.cpp b/lib/kokkos/example/tutorial/Advanced_Views/07_Overlapping_DeepCopy/overlapping_deepcopy.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..5da3bf76c98ebc1cfbf4c6d81d3e6fc7d3e13171
--- /dev/null
+++ b/lib/kokkos/example/tutorial/Advanced_Views/07_Overlapping_DeepCopy/overlapping_deepcopy.cpp
@@ -0,0 +1,148 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+//
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact  H. Carter Edwards (hcedwar@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#include <Kokkos_Core.hpp>
+#include <cstdio>
+#include <typeinfo>
+#include <cmath>
+#include <impl/Kokkos_Timer.hpp>
+
+struct FillDevice {
+  double value;
+  Kokkos::View<double*,Kokkos::LayoutLeft,Kokkos::CudaSpace> a;
+  FillDevice(const double& val, const Kokkos::View<double*,Kokkos::LayoutLeft,Kokkos::CudaSpace>& d_a):
+     value(val),a(d_a){}
+
+  KOKKOS_INLINE_FUNCTION
+  void operator() (const int& i) const {
+    a(i) = value;
+  }
+};
+
+struct ComputeADevice {
+  int iter;
+  Kokkos::View<double*,Kokkos::LayoutLeft,Kokkos::CudaSpace> a;
+  Kokkos::View<double*,Kokkos::LayoutLeft,Kokkos::CudaSpace> b;
+  ComputeADevice(const int& iter_,
+                 const Kokkos::View<double*,Kokkos::LayoutLeft,Kokkos::CudaSpace>& d_a,
+                 const Kokkos::View<double*,Kokkos::LayoutLeft,Kokkos::CudaSpace>& d_b):
+     iter(iter_),a(d_a),b(d_b){}
+
+  KOKKOS_INLINE_FUNCTION
+  void operator() (const int& i) const {
+    for(int j=1;j<iter;j++) {
+      a(i) += std::pow(b(i),1.0+1.0/iter);
+    }
+  }
+};
+
+struct ComputeAHost {
+  Kokkos::View<double*,Kokkos::LayoutLeft,Kokkos::CudaHostPinnedSpace> a;
+  Kokkos::View<double*,Kokkos::LayoutLeft,Kokkos::CudaHostPinnedSpace> b;
+  ComputeAHost(  const Kokkos::View<double*,Kokkos::LayoutLeft,Kokkos::CudaHostPinnedSpace>& d_a,
+                 const Kokkos::View<double*,Kokkos::LayoutLeft,Kokkos::CudaHostPinnedSpace>& d_b):
+     a(d_a),b(d_b){}
+
+  KOKKOS_INLINE_FUNCTION
+  void operator() (const int& i) const {
+    a(i) += b(i);
+  }
+};
+
+struct MergeDevice {
+  Kokkos::View<double*,Kokkos::LayoutLeft,Kokkos::CudaSpace> a;
+  Kokkos::View<double*,Kokkos::LayoutLeft,Kokkos::CudaSpace> b;
+  MergeDevice(
+                 const Kokkos::View<double*,Kokkos::LayoutLeft,Kokkos::CudaSpace>& d_a,
+                 const Kokkos::View<double*,Kokkos::LayoutLeft,Kokkos::CudaSpace>& d_b):
+     a(d_a),b(d_b){}
+
+  KOKKOS_INLINE_FUNCTION
+  void operator() (const int& i) const {
+    a(i) += b(i);
+  }
+};
+
+int main(int argc, char * argv[]) {
+  // argv[1] selects how data moves between host and device (see the synch
+  // tests below), so refuse to run without it instead of reading a null argv[1].
+  if(argc < 2) {
+    fprintf(stderr,"Usage: pass the synch mode (0, 1 or 2) as the first argument\n");
+    return 1;
+  }
+  int size = 100000000;
+  Kokkos::initialize();
+  int synch = atoi(argv[1]);
+  Kokkos::View<double*,Kokkos::LayoutLeft,Kokkos::CudaSpace> d_a("Device A",size);
+  Kokkos::View<double*,Kokkos::LayoutLeft,Kokkos::CudaSpace> d_b("Device B",size);
+  Kokkos::View<double*,Kokkos::LayoutLeft,Kokkos::CudaSpace> d_tmp("Device tmp",size);
+  Kokkos::View<double*,Kokkos::LayoutLeft,Kokkos::CudaHostPinnedSpace> h_a("Host A",size);
+  Kokkos::View<double*,Kokkos::LayoutLeft,Kokkos::CudaHostPinnedSpace> h_b("Host B",size);
+
+  Kokkos::parallel_for(Kokkos::RangePolicy<Kokkos::Cuda>(0,size),FillDevice(0.0,d_a));
+  Kokkos::parallel_for(Kokkos::RangePolicy<Kokkos::Cuda>(0,size),FillDevice(1.3513,d_b));
+  Kokkos::fence();
+  Kokkos::Timer timer;
+  Kokkos::parallel_for(Kokkos::RangePolicy<Kokkos::Cuda>(0,size),ComputeADevice(20,d_a,d_b));
+  // synch: 1 = deep_copy given an execution space instance, 2 = plain deep_copy, else no copy
+  if(synch==1)
+    Kokkos::deep_copy(Kokkos::OpenMP(),h_b,d_b);
+  if(synch==2)
+    Kokkos::deep_copy(h_b,d_b);
+
+  Kokkos::parallel_for(Kokkos::RangePolicy<Kokkos::OpenMP>(0,size),[=] (const int& i) {
+    h_a(i) = 0.0;
+  });
+  Kokkos::parallel_for(Kokkos::RangePolicy<Kokkos::OpenMP>(0,size),ComputeAHost(h_a,h_b));
+  Kokkos::OpenMP::fence();
+  if(synch==1)
+    Kokkos::deep_copy(Kokkos::OpenMP(), d_tmp,h_a);
+  if(synch==2)
+    Kokkos::deep_copy(d_tmp,h_a);
+  Kokkos::fence();
+
+  std::cout << "Time " << timer.seconds() << std::endl;
+  Kokkos::parallel_for(Kokkos::RangePolicy<Kokkos::Cuda>(0,size),MergeDevice(d_a,d_tmp));
+  Kokkos::deep_copy(h_a,d_a);
+  std::cout << "h_a(0): " << h_a(0) << " ( Correct: 27.4154 )" << std::endl;
+  Kokkos::finalize();
+}
diff --git a/lib/kokkos/example/tutorial/Advanced_Views/CMakeLists.txt b/lib/kokkos/example/tutorial/Advanced_Views/CMakeLists.txt
new file mode 100644
index 0000000000000000000000000000000000000000..f4f1addc5553d9ce7131456f02af664554757daa
--- /dev/null
+++ b/lib/kokkos/example/tutorial/Advanced_Views/CMakeLists.txt
@@ -0,0 +1,9 @@
+
+TRIBITS_ADD_EXAMPLE_DIRECTORIES(01_data_layouts)
+TRIBITS_ADD_EXAMPLE_DIRECTORIES(02_memory_traits)
+TRIBITS_ADD_EXAMPLE_DIRECTORIES(03_subviews)
+TRIBITS_ADD_EXAMPLE_DIRECTORIES(04_dualviews)
+
+IF (Kokkos_ENABLE_Cuda_UVM)
+  TRIBITS_ADD_EXAMPLE_DIRECTORIES(05_NVIDIA_UVM)
+ENDIF ()
diff --git a/lib/kokkos/example/tutorial/Advanced_Views/Makefile b/lib/kokkos/example/tutorial/Advanced_Views/Makefile
new file mode 100644
index 0000000000000000000000000000000000000000..19053b61b037f6a21f1be0874b1c23cbbb02a234
--- /dev/null
+++ b/lib/kokkos/example/tutorial/Advanced_Views/Makefile
@@ -0,0 +1,84 @@
+# Recipes use "cd DIR && $(MAKE)" so a failed cd aborts instead of running make in the wrong directory.
+default:
+	cd ./01_data_layouts && \
+	$(MAKE) -j 4
+	cd ./02_memory_traits && \
+	$(MAKE) -j 4
+	cd ./03_subviews && \
+	$(MAKE) -j 4
+	cd ./04_dualviews && \
+	$(MAKE) -j 4
+	cd ./05_NVIDIA_UVM && \
+	$(MAKE) -j 4
+	cd ./06_AtomicViews && \
+	$(MAKE) -j 4
+
+openmp:
+	cd ./01_data_layouts && \
+	$(MAKE) -j 4 KOKKOS_DEVICES=OpenMP
+	cd ./02_memory_traits && \
+	$(MAKE) -j 4 KOKKOS_DEVICES=OpenMP
+	cd ./03_subviews && \
+	$(MAKE) -j 4 KOKKOS_DEVICES=OpenMP
+	cd ./04_dualviews && \
+	$(MAKE) -j 4 KOKKOS_DEVICES=OpenMP
+	cd ./05_NVIDIA_UVM && \
+	$(MAKE) -j 4 KOKKOS_DEVICES=OpenMP
+	cd ./06_AtomicViews && \
+	$(MAKE) -j 4 KOKKOS_DEVICES=OpenMP
+
+pthreads:
+	cd ./01_data_layouts && \
+	$(MAKE) -j 4 KOKKOS_DEVICES=Pthreads
+	cd ./02_memory_traits && \
+	$(MAKE) -j 4 KOKKOS_DEVICES=Pthreads
+	cd ./03_subviews && \
+	$(MAKE) -j 4 KOKKOS_DEVICES=Pthreads
+	cd ./04_dualviews && \
+	$(MAKE) -j 4 KOKKOS_DEVICES=Pthreads
+	cd ./05_NVIDIA_UVM && \
+	$(MAKE) -j 4 KOKKOS_DEVICES=Pthreads
+	cd ./06_AtomicViews && \
+	$(MAKE) -j 4 KOKKOS_DEVICES=Pthreads
+
+serial:
+	cd ./01_data_layouts && \
+	$(MAKE) -j 4 KOKKOS_DEVICES=Serial
+	cd ./02_memory_traits && \
+	$(MAKE) -j 4 KOKKOS_DEVICES=Serial
+	cd ./03_subviews && \
+	$(MAKE) -j 4 KOKKOS_DEVICES=Serial
+	cd ./04_dualviews && \
+	$(MAKE) -j 4 KOKKOS_DEVICES=Serial
+	cd ./05_NVIDIA_UVM && \
+	$(MAKE) -j 4 KOKKOS_DEVICES=Serial
+	cd ./06_AtomicViews && \
+	$(MAKE) -j 4 KOKKOS_DEVICES=Serial
+
+cuda:
+	cd ./01_data_layouts && \
+	$(MAKE) -j 4 KOKKOS_DEVICES=Cuda,Serial
+	cd ./02_memory_traits && \
+	$(MAKE) -j 4 KOKKOS_DEVICES=Cuda,Serial
+	cd ./03_subviews && \
+	$(MAKE) -j 4 KOKKOS_DEVICES=Cuda,Serial
+	cd ./04_dualviews && \
+	$(MAKE) -j 4 KOKKOS_DEVICES=Cuda,Serial
+	cd ./05_NVIDIA_UVM && \
+	$(MAKE) -j 4 KOKKOS_DEVICES=Cuda,Serial
+	cd ./06_AtomicViews && \
+	$(MAKE) -j 4 KOKKOS_DEVICES=Cuda,Serial
+
+clean:
+	cd ./01_data_layouts && \
+	$(MAKE) clean
+	cd ./02_memory_traits && \
+	$(MAKE) clean
+	cd ./03_subviews && \
+	$(MAKE) clean
+	cd ./04_dualviews && \
+	$(MAKE) clean
+	cd ./05_NVIDIA_UVM && \
+	$(MAKE) clean
+	cd ./06_AtomicViews && \
+	$(MAKE) clean
diff --git a/lib/kokkos/example/tutorial/Algorithms/01_random_numbers/Makefile b/lib/kokkos/example/tutorial/Algorithms/01_random_numbers/Makefile
new file mode 100644
index 0000000000000000000000000000000000000000..12ad36b31e458d155aa6dc653ab8188a7773bd18
--- /dev/null
+++ b/lib/kokkos/example/tutorial/Algorithms/01_random_numbers/Makefile
@@ -0,0 +1,43 @@
+KOKKOS_PATH = ../../../..
+SRC = $(wildcard *.cpp)
+
+default: build
+	echo "Start Build"
+
+ifneq (,$(findstring Cuda,$(KOKKOS_DEVICES)))
+CXX = ../../../../config/nvcc_wrapper
+CXXFLAGS = -O3
+LINK = ${CXX}
+LINKFLAGS = 
+EXE = $(SRC:.cpp=.cuda)
+KOKKOS_DEVICES = "Cuda,OpenMP"
+KOKKOS_ARCH = "SNB,Kepler35"
+else
+CXX = g++
+CXXFLAGS = -O3
+LINK = ${CXX}
+LINKFLAGS =  
+EXE = $(SRC:.cpp=.host)
+KOKKOS_DEVICES = "OpenMP"
+KOKKOS_ARCH = "SNB"
+endif
+
+DEPFLAGS = -M
+
+OBJ = $(SRC:.cpp=.o)
+LIB =
+
+include $(KOKKOS_PATH)/Makefile.kokkos
+
+build: $(EXE)
+
+$(EXE): $(OBJ) $(KOKKOS_LINK_DEPENDS)
+	$(LINK) $(KOKKOS_LDFLAGS) $(LINKFLAGS) $(EXTRA_PATH) $(OBJ) $(KOKKOS_LIBS) $(LIB) -o $(EXE)
+
+clean: kokkos-clean 
+	rm -f *.o *.cuda *.host
+
+# Compilation rules
+
+%.o:%.cpp $(KOKKOS_CPP_DEPENDS)
+	$(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) $(EXTRA_INC) -c $<
diff --git a/lib/kokkos/example/tutorial/Algorithms/01_random_numbers/random_numbers.cpp b/lib/kokkos/example/tutorial/Algorithms/01_random_numbers/random_numbers.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..3e6175a75652d54af1f0ad3c3c818485ccc59b07
--- /dev/null
+++ b/lib/kokkos/example/tutorial/Algorithms/01_random_numbers/random_numbers.cpp
@@ -0,0 +1,152 @@
+/*
+//@HEADER
+// ************************************************************************
+// 
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+// 
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+// 
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact  H. Carter Edwards (hcedwar@sandia.gov)
+// 
+// ************************************************************************
+//@HEADER
+*/
+
+#include <Kokkos_Core.hpp>
+#include <Kokkos_Random.hpp>
+#include <Kokkos_DualView.hpp>
+#include <impl/Kokkos_Timer.hpp>
+#include <cstdlib>
+
+typedef Kokkos::HostSpace::execution_space DefaultHostType;
+
+// Kokkos provides two different random number generators with a 64 bit and a 1024 bit state.
+// These generators are based on Vigna, Sebastiano (2014). "An experimental exploration of Marsaglia's xorshift generators, scrambled"
+// See: http://arxiv.org/abs/1402.6246
+// The generators can be used fully independently on each thread and have been tested to
+// produce good statistics for both inter and intra thread numbers.
+// Note that within a kernel NO random number operations are (team) collective operations.
+// Everything can be called within branches. This is a difference to the curand library where
+// certain operations are required to be called by all threads in a block.
+//
+// In Kokkos you are required to create a pool of generator states, so that threads can
+// grab their own. On CPU architectures the pool size is equal to the thread number,
+// on CUDA about 128k states are generated (enough to give every potentially simultaneously
+// running thread its own state). Within a kernel a thread is required to acquire a state from the
+// pool and later return it.
+// On CPUs the random number generator is deterministic if using the same number of threads.
+// On GPUs (i.e. using the CUDA backend) it is not deterministic because threads acquire states via
+// atomics.
+
+// A Functor for generating uint64_t random numbers templated on the GeneratorPool type
+template<class GeneratorPool>
+struct generate_random {
+
+  // The GeneratorPool
+  GeneratorPool rand_pool;
+
+  // Output View for the random numbers
+  Kokkos::View<uint64_t*> vals;
+  int samples;  // Number of values each index i writes into vals
+
+  // Initialize all members, listed in declaration order (the order C++ constructs them in)
+  generate_random(Kokkos::View<uint64_t*> vals_,
+                       GeneratorPool rand_pool_,
+                       int samples_):
+                       rand_pool(rand_pool_),vals(vals_),samples(samples_) {}
+
+  KOKKOS_INLINE_FUNCTION
+  void operator() (int i) const {
+    // Get a random number state from the pool for the active thread
+    typename GeneratorPool::generator_type rand_gen = rand_pool.get_state();
+
+    // Draw samples numbers from the pool as urand64 between 0 and rand_pool.MAX_URAND64
+    // Note there are function calls to get other type of scalars, and also to specify
+    // Ranges or get a normal distributed float.
+    for(int k = 0;k<samples;k++)
+      vals(i*samples+k) = rand_gen.urand64();
+
+    // Give the state back, which will allow another thread to acquire it
+    rand_pool.free_state(rand_gen);
+  }
+};
+
+
+
+
+int main(int argc, char* args[]) {
+  // Expects two arguments: size (work items) and samples (numbers drawn per work item).
+  if (argc != 3){
+    printf("Please pass two integers on the command line\n");
+    return 1;
+  }
+
+  // Initialize Kokkos
+  Kokkos::initialize(argc,args);
+  int size = atoi(args[1]);
+  int samples = atoi(args[2]);
+
+  // Create two random number generator pools one for 64bit states and one for 1024 bit states
+  // Both take a 64 bit unsigned integer seed to initialize a Random_XorShift64 generator which
+  // is used to fill the generators of the pool.
+  Kokkos::Random_XorShift64_Pool<> rand_pool64(5374857);
+  Kokkos::Random_XorShift1024_Pool<> rand_pool1024(5374857);
+  Kokkos::DualView<uint64_t*> vals("Vals",size*samples);
+
+  // Run some performance comparisons
+  Kokkos::Timer timer;
+  Kokkos::parallel_for(size,generate_random<Kokkos::Random_XorShift64_Pool<> >(vals.d_view,rand_pool64,samples));
+  Kokkos::fence();
+
+  timer.reset();
+  Kokkos::parallel_for(size,generate_random<Kokkos::Random_XorShift64_Pool<> >(vals.d_view,rand_pool64,samples));
+  Kokkos::fence();
+  double time_64 = timer.seconds();
+
+  Kokkos::parallel_for(size,generate_random<Kokkos::Random_XorShift1024_Pool<> >(vals.d_view,rand_pool1024,samples));
+  Kokkos::fence();
+
+  timer.reset();
+  Kokkos::parallel_for(size,generate_random<Kokkos::Random_XorShift1024_Pool<> >(vals.d_view,rand_pool1024,samples));
+  Kokkos::fence();
+  double time_1024 = timer.seconds();
+
+  printf("#Time XorShift64*:   %lf %lf\n",time_64,1.0e-9*samples*size/time_64 );
+  printf("#Time XorShift1024*: %lf %lf\n",time_1024,1.0e-9*samples*size/time_1024 );
+
+  Kokkos::deep_copy(vals.h_view,vals.d_view);
+
+  Kokkos::finalize();
+  return 0;
+}
+
+
diff --git a/lib/kokkos/example/tutorial/Algorithms/Makefile b/lib/kokkos/example/tutorial/Algorithms/Makefile
new file mode 100644
index 0000000000000000000000000000000000000000..edc2a36024fc24a791a27064e4f36febfec81c1a
--- /dev/null
+++ b/lib/kokkos/example/tutorial/Algorithms/Makefile
@@ -0,0 +1,24 @@
+# Recipes use "cd DIR && $(MAKE)" so a failed cd aborts instead of running make in the wrong directory.
+default:
+	cd ./01_random_numbers && \
+	$(MAKE) -j 4
+
+openmp:
+	cd ./01_random_numbers && \
+	$(MAKE) -j 4 KOKKOS_DEVICES=OpenMP
+
+pthreads:
+	cd ./01_random_numbers && \
+	$(MAKE) -j 4 KOKKOS_DEVICES=Pthreads
+
+serial:
+	cd ./01_random_numbers && \
+	$(MAKE) -j 4 KOKKOS_DEVICES=Serial
+
+cuda:
+	cd ./01_random_numbers && \
+	$(MAKE) -j 4 KOKKOS_DEVICES=Cuda,Serial
+
+clean:
+	cd ./01_random_numbers && \
+	$(MAKE) clean
diff --git a/lib/kokkos/example/tutorial/CMakeLists.txt b/lib/kokkos/example/tutorial/CMakeLists.txt
new file mode 100644
index 0000000000000000000000000000000000000000..d1fd4c0ae9b31f01c8fb351e5730cf2a452655fe
--- /dev/null
+++ b/lib/kokkos/example/tutorial/CMakeLists.txt
@@ -0,0 +1,17 @@
+
+TRIBITS_ADD_EXAMPLE_DIRECTORIES(01_hello_world)
+TRIBITS_ADD_EXAMPLE_DIRECTORIES(02_simple_reduce)
+TRIBITS_ADD_EXAMPLE_DIRECTORIES(03_simple_view)
+TRIBITS_ADD_EXAMPLE_DIRECTORIES(04_simple_memoryspaces)
+TRIBITS_ADD_EXAMPLE_DIRECTORIES(05_simple_atomics)
+TRIBITS_ADD_EXAMPLE_DIRECTORIES(Advanced_Views)
+TRIBITS_ADD_EXAMPLE_DIRECTORIES(Hierarchical_Parallelism)
+
+IF (Kokkos_ENABLE_CXX11)
+  TRIBITS_ADD_EXAMPLE_DIRECTORIES(01_hello_world_lambda)
+  TRIBITS_ADD_EXAMPLE_DIRECTORIES(02_simple_reduce_lambda)
+  TRIBITS_ADD_EXAMPLE_DIRECTORIES(03_simple_view_lambda)
+ENDIF ()
+
+
+
diff --git a/lib/kokkos/example/tutorial/Hierarchical_Parallelism/01_thread_teams/CMakeLists.txt b/lib/kokkos/example/tutorial/Hierarchical_Parallelism/01_thread_teams/CMakeLists.txt
new file mode 100644
index 0000000000000000000000000000000000000000..2d8a514a4549aad63f735721b41e47516a570070
--- /dev/null
+++ b/lib/kokkos/example/tutorial/Hierarchical_Parallelism/01_thread_teams/CMakeLists.txt
@@ -0,0 +1,10 @@
+
+INCLUDE_DIRECTORIES(${CMAKE_CURRENT_BINARY_DIR})
+INCLUDE_DIRECTORIES(${CMAKE_CURRENT_SOURCE_DIR})
+
+# This is a tutorial, not a test, so we don't ask CTest to run it.
+TRIBITS_ADD_EXECUTABLE(
+  tutorial_hierarchicalparallelism_01_thread_teams
+  SOURCES thread_teams.cpp
+  COMM serial mpi
+  )
diff --git a/lib/kokkos/example/tutorial/Hierarchical_Parallelism/01_thread_teams/Makefile b/lib/kokkos/example/tutorial/Hierarchical_Parallelism/01_thread_teams/Makefile
new file mode 100644
index 0000000000000000000000000000000000000000..12ad36b31e458d155aa6dc653ab8188a7773bd18
--- /dev/null
+++ b/lib/kokkos/example/tutorial/Hierarchical_Parallelism/01_thread_teams/Makefile
@@ -0,0 +1,43 @@
+KOKKOS_PATH = ../../../..
+SRC = $(wildcard *.cpp)
+
+default: build
+	echo "Start Build"
+
+ifneq (,$(findstring Cuda,$(KOKKOS_DEVICES)))
+CXX = ../../../../config/nvcc_wrapper
+CXXFLAGS = -O3
+LINK = ${CXX}
+LINKFLAGS = 
+EXE = $(SRC:.cpp=.cuda)
+KOKKOS_DEVICES = "Cuda,OpenMP"
+KOKKOS_ARCH = "SNB,Kepler35"
+else
+CXX = g++
+CXXFLAGS = -O3
+LINK = ${CXX}
+LINKFLAGS =  
+EXE = $(SRC:.cpp=.host)
+KOKKOS_DEVICES = "OpenMP"
+KOKKOS_ARCH = "SNB"
+endif
+
+DEPFLAGS = -M
+
+OBJ = $(SRC:.cpp=.o)
+LIB =
+
+include $(KOKKOS_PATH)/Makefile.kokkos
+
+build: $(EXE)
+
+$(EXE): $(OBJ) $(KOKKOS_LINK_DEPENDS)
+	$(LINK) $(KOKKOS_LDFLAGS) $(LINKFLAGS) $(EXTRA_PATH) $(OBJ) $(KOKKOS_LIBS) $(LIB) -o $(EXE)
+
+clean: kokkos-clean 
+	rm -f *.o *.cuda *.host
+
+# Compilation rules
+
+%.o:%.cpp $(KOKKOS_CPP_DEPENDS)
+	$(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) $(EXTRA_INC) -c $<
diff --git a/lib/kokkos/example/tutorial/Hierarchical_Parallelism/01_thread_teams/thread_teams.cpp b/lib/kokkos/example/tutorial/Hierarchical_Parallelism/01_thread_teams/thread_teams.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..f2de0809a796128e76e44cdb32837f4c82c98022
--- /dev/null
+++ b/lib/kokkos/example/tutorial/Hierarchical_Parallelism/01_thread_teams/thread_teams.cpp
@@ -0,0 +1,94 @@
+/*
+//@HEADER
+// ************************************************************************
+// 
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+// 
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+// 
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact  H. Carter Edwards (hcedwar@sandia.gov)
+// 
+// ************************************************************************
+//@HEADER
+*/
+
+#include <Kokkos_Core.hpp>
+#include <cstdio>
+
+// Using default execution space define a TeamPolicy and its member_type
+// The member_type is what the operator of a functor or Lambda gets, for
+// a simple RangePolicy the member_type is simply an integer
+// For a TeamPolicy it is a much richer object, since it provides all information
+// to identify a thread uniquely and some team related function calls such as a
+// barrier (which will be used in a subsequent example).
+// A ThreadTeam consists of 1 to n threads where the maximum value of n is
+// determined by the hardware. On a dual socket CPU machine with 8 cores per socket
+// the maximum size of a team is 8. The number of teams (i.e. the league_size) is
+// not limited by physical constraints. It is a purely logical number.
+
+typedef Kokkos::TeamPolicy<>              team_policy ;
+typedef team_policy::member_type team_member ;
+
+// Define a functor which can be launched using the TeamPolicy
+struct hello_world {
+  typedef int value_type; //Specify value type for reduction target, sum
+
+  // This is a reduction operator which now takes as first argument the
+  // TeamPolicy member_type. Every member of the team contributes to the
+  // total sum.
+  // It is helpful to think of this operator as a parallel region for a team
+  // (i.e. every team member is active and will execute the code).
+  KOKKOS_INLINE_FUNCTION
+  void operator() ( const team_member & thread, int& sum) const {
+    sum+=1;
+    // The TeamPolicy<>::member_type provides functions to query the multi
+    // dimensional index of a thread as well as the number of thread-teams and the size
+    // of each team.
+    printf("Hello World: %i %i // %i %i\n",thread.league_rank(),thread.team_rank(),thread.league_size(),thread.team_size());
+  }
+};
+
+int main(int narg, char* args[]) {
+  Kokkos::initialize(narg,args);
+
+  // Launch 12 teams of the maximum number of threads per team
+  const team_policy policy( 12 , team_policy::team_size_max( hello_world() ) );
+  
+  int sum = 0;
+  Kokkos::parallel_reduce( policy , hello_world() , sum );
+
+  // The result will be 12*team_policy::team_size_max( hello_world())
+  printf("Result %i\n",sum);
+
+  Kokkos::finalize();
+}
+
diff --git a/lib/kokkos/example/tutorial/Hierarchical_Parallelism/01_thread_teams_lambda/CMakeLists.txt b/lib/kokkos/example/tutorial/Hierarchical_Parallelism/01_thread_teams_lambda/CMakeLists.txt
new file mode 100644
index 0000000000000000000000000000000000000000..ec7f1e1159fcf7f12209defea154c494fb48540e
--- /dev/null
+++ b/lib/kokkos/example/tutorial/Hierarchical_Parallelism/01_thread_teams_lambda/CMakeLists.txt
@@ -0,0 +1,13 @@
+
+# Put the build and source directories of this example on the include path.
+INCLUDE_DIRECTORIES(${CMAKE_CURRENT_BINARY_DIR} ${CMAKE_CURRENT_SOURCE_DIR})
+
+# The executable is only added when C++11 support is enabled.
+# Tutorial executable only: it is built, but not registered as a CTest test.
+IF (Kokkos_ENABLE_CXX11)
+  TRIBITS_ADD_EXECUTABLE(
+    tutorial_hierarchical_01_thread_teams_lambda
+    SOURCES thread_teams_lambda.cpp
+    COMM serial mpi)
+ENDIF ()
+
diff --git a/lib/kokkos/example/tutorial/Hierarchical_Parallelism/01_thread_teams_lambda/Makefile b/lib/kokkos/example/tutorial/Hierarchical_Parallelism/01_thread_teams_lambda/Makefile
new file mode 100644
index 0000000000000000000000000000000000000000..965b72b4e9a7aac83f1a748d3f0c4fe611aafabb
--- /dev/null
+++ b/lib/kokkos/example/tutorial/Hierarchical_Parallelism/01_thread_teams_lambda/Makefile
@@ -0,0 +1,53 @@
+# Builds the tutorial source(s) in this directory against the Kokkos tree at
+# KOKKOS_PATH. If KOKKOS_DEVICES (e.g. from the command line) contains "Cuda",
+# the nvcc_wrapper toolchain is used; otherwise the example is built with g++.
+KOKKOS_PATH = ../../../..
+SRC = $(wildcard *.cpp)
+
+# These targets do not create files of the same name.
+.PHONY: default build clean
+
+default: build
+	echo "Start Build"
+
+ifneq (,$(findstring Cuda,$(KOKKOS_DEVICES)))
+# Take the wrapper from the selected Kokkos tree so that overriding
+# KOKKOS_PATH also switches the compiler wrapper.
+CXX = $(KOKKOS_PATH)/config/nvcc_wrapper
+CXXFLAGS = -O3
+LINK = ${CXX}
+LINKFLAGS =
+EXE = $(SRC:.cpp=.cuda)
+KOKKOS_DEVICES = "Cuda,OpenMP"
+KOKKOS_ARCH = "SNB,Kepler35"
+KOKKOS_CUDA_OPTIONS = "enable_lambda"
+else
+CXX = g++
+CXXFLAGS = -O3
+LINK = ${CXX}
+LINKFLAGS =
+EXE = $(SRC:.cpp=.host)
+KOKKOS_DEVICES = "OpenMP"
+KOKKOS_ARCH = "SNB"
+endif
+
+DEPFLAGS = -M
+
+OBJ = $(SRC:.cpp=.o)
+LIB =
+
+# Expected to define the KOKKOS_* variables and the kokkos-clean target used below.
+include $(KOKKOS_PATH)/Makefile.kokkos
+
+build: $(EXE)
+
+$(EXE): $(OBJ) $(KOKKOS_LINK_DEPENDS)
+	$(LINK) $(KOKKOS_LDFLAGS) $(LINKFLAGS) $(EXTRA_PATH) $(OBJ) $(KOKKOS_LIBS) $(LIB) -o $(EXE)
+
+clean: kokkos-clean
+	rm -f *.o *.cuda *.host
+
+# Compilation rules
+
+%.o:%.cpp $(KOKKOS_CPP_DEPENDS)
+	$(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) $(EXTRA_INC) -c $<
diff --git a/lib/kokkos/example/tutorial/Hierarchical_Parallelism/01_thread_teams_lambda/thread_teams_lambda.cpp b/lib/kokkos/example/tutorial/Hierarchical_Parallelism/01_thread_teams_lambda/thread_teams_lambda.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..565dd22e82849fde2fe527f25179ae49346222f9
--- /dev/null
+++ b/lib/kokkos/example/tutorial/Hierarchical_Parallelism/01_thread_teams_lambda/thread_teams_lambda.cpp
@@ -0,0 +1,94 @@
+/*
+//@HEADER
+// ************************************************************************
+// 
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+// 
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+// 
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact  H. Carter Edwards (hcedwar@sandia.gov)
+// 
+// ************************************************************************
+//@HEADER
+*/
+
+#include <Kokkos_Core.hpp>
+#include <cstdio>
+
+// Demonstrate a parallel reduction using thread teams (TeamPolicy).
+//
+// A thread team consists of 1 to n threads.  The hardware determines
+// the maximum value of n. On a dual-socket CPU machine with 8 cores
+// per socket, the maximum size of a team is 8. The number of teams
+// (the league_size) is not limited by physical constraints (up to
+// some reasonable bound, which eventually depends upon the hardware
+// and programming model implementation).
+
+int main (int narg, char* args[]) {
+  // Local aliases for the policy and its per-thread member handle.
+  typedef Kokkos::TeamPolicy<>       team_policy;
+  typedef team_policy::member_type   team_member;
+
+  Kokkos::initialize (narg, args);
+
+  // A league of 12 teams.  Kokkos::AUTO leaves the number of threads
+  // per team for Kokkos to pick.
+  const int league_size = 12;
+  const team_policy policy (league_size, Kokkos::AUTO);
+
+  // Reducing over a TeamPolicy changes what the lambda receives as
+  // its first argument: instead of the integer index a RangePolicy
+  // hands out, it is a TeamPolicy::member_type handle.  The handle
+  // identifies the calling thread (league rank and team rank) and
+  // reports the league and team sizes.  It also provides team-level
+  // calls such as a team barrier (used by a subsequent example).
+  //
+  // Treat the lambda body as a "team parallel region": each member
+  // of each team executes it, and each one adds 1 to the reduction.
+  int total = 0;
+  Kokkos::parallel_reduce (policy,
+    KOKKOS_LAMBDA (const team_member& member, int& partial) {
+      // Query this thread's position in the hierarchy and the sizes
+      // of the league and of its team, then report them.
+      const int lrank = member.league_rank (), trank = member.team_rank ();
+      const int lsize = member.league_size (), tsize = member.team_size ();
+      printf ("Hello World: %i %i // %i %i\n", lrank, trank, lsize, tsize);
+      partial += 1;
+    }, total);
+
+  // Expected value: 12 times the team size selected by Kokkos::AUTO.
+  printf ("Result %i\n", total);
+
+  // Shut the Kokkos runtime down before leaving main.
+  Kokkos::finalize ();
+}
+
diff --git a/lib/kokkos/example/tutorial/Hierarchical_Parallelism/02_nested_parallel_for/CMakeLists.txt b/lib/kokkos/example/tutorial/Hierarchical_Parallelism/02_nested_parallel_for/CMakeLists.txt
new file mode 100644
index 0000000000000000000000000000000000000000..e660405345167858b985261362d6135d5e6d5c4d
--- /dev/null
+++ b/lib/kokkos/example/tutorial/Hierarchical_Parallelism/02_nested_parallel_for/CMakeLists.txt
@@ -0,0 +1,10 @@
+
+# Put the build and source directories of this example on the include path.
+INCLUDE_DIRECTORIES(${CMAKE_CURRENT_BINARY_DIR} ${CMAKE_CURRENT_SOURCE_DIR})
+
+# Tutorial executable only: it is built, but not registered as a CTest test.
+TRIBITS_ADD_EXECUTABLE(
+  tutorial_hierarchicalparallelism_02_nested_parallel_for
+  SOURCES nested_parallel_for.cpp
+  COMM serial mpi
+)
diff --git a/lib/kokkos/example/tutorial/Hierarchical_Parallelism/02_nested_parallel_for/Makefile b/lib/kokkos/example/tutorial/Hierarchical_Parallelism/02_nested_parallel_for/Makefile
new file mode 100644
index 0000000000000000000000000000000000000000..12ad36b31e458d155aa6dc653ab8188a7773bd18
--- /dev/null
+++ b/lib/kokkos/example/tutorial/Hierarchical_Parallelism/02_nested_parallel_for/Makefile
@@ -0,0 +1,52 @@
+# Builds the tutorial source(s) in this directory against the Kokkos tree at
+# KOKKOS_PATH. If KOKKOS_DEVICES (e.g. from the command line) contains "Cuda",
+# the nvcc_wrapper toolchain is used; otherwise the example is built with g++.
+KOKKOS_PATH = ../../../..
+SRC = $(wildcard *.cpp)
+
+# These targets do not create files of the same name.
+.PHONY: default build clean
+
+default: build
+	echo "Start Build"
+
+ifneq (,$(findstring Cuda,$(KOKKOS_DEVICES)))
+# Take the wrapper from the selected Kokkos tree so that overriding
+# KOKKOS_PATH also switches the compiler wrapper.
+CXX = $(KOKKOS_PATH)/config/nvcc_wrapper
+CXXFLAGS = -O3
+LINK = ${CXX}
+LINKFLAGS =
+EXE = $(SRC:.cpp=.cuda)
+KOKKOS_DEVICES = "Cuda,OpenMP"
+KOKKOS_ARCH = "SNB,Kepler35"
+else
+CXX = g++
+CXXFLAGS = -O3
+LINK = ${CXX}
+LINKFLAGS =
+EXE = $(SRC:.cpp=.host)
+KOKKOS_DEVICES = "OpenMP"
+KOKKOS_ARCH = "SNB"
+endif
+
+DEPFLAGS = -M
+
+OBJ = $(SRC:.cpp=.o)
+LIB =
+
+# Expected to define the KOKKOS_* variables and the kokkos-clean target used below.
+include $(KOKKOS_PATH)/Makefile.kokkos
+
+build: $(EXE)
+
+$(EXE): $(OBJ) $(KOKKOS_LINK_DEPENDS)
+	$(LINK) $(KOKKOS_LDFLAGS) $(LINKFLAGS) $(EXTRA_PATH) $(OBJ) $(KOKKOS_LIBS) $(LIB) -o $(EXE)
+
+clean: kokkos-clean
+	rm -f *.o *.cuda *.host
+
+# Compilation rules
+
+%.o:%.cpp $(KOKKOS_CPP_DEPENDS)
+	$(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) $(EXTRA_INC) -c $<
diff --git a/lib/kokkos/example/tutorial/Hierarchical_Parallelism/02_nested_parallel_for/nested_parallel_for.cpp b/lib/kokkos/example/tutorial/Hierarchical_Parallelism/02_nested_parallel_for/nested_parallel_for.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..4357b4fa1a19b6b36fed42fe3f0a2b526b1fa8b4
--- /dev/null
+++ b/lib/kokkos/example/tutorial/Hierarchical_Parallelism/02_nested_parallel_for/nested_parallel_for.cpp
@@ -0,0 +1,89 @@
+/*
+//@HEADER
+// ************************************************************************
+// 
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+// 
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+// 
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact  H. Carter Edwards (hcedwar@sandia.gov)
+// 
+// ************************************************************************
+//@HEADER
+*/
+
+#include <Kokkos_Core.hpp>
+#include <cstdio>
+
+// See 01_thread_teams for an explanation of a basic TeamPolicy
+typedef Kokkos::TeamPolicy<>              team_policy ;
+typedef typename team_policy::member_type team_member ;
+
+struct hello_world {
+  // Type of the value being reduced (the `sum` argument below).
+  typedef int value_type;
+
+  KOKKOS_INLINE_FUNCTION
+  void operator() ( const team_member & thread, int& sum) const {
+    // The operator is a parallel region for the whole team: statements
+    // outside the nested loop, like this increment, run on every member.
+    ++sum;
+
+    // Inside a TeamPolicy operator Kokkos permits nested parallel patterns
+    // (for, reduce, scan) with largely the same syntax as at global level.
+    // Thread-level nesting uses Kokkos::TeamThreadRange(member, count): the
+    // team executes the loop jointly and the `count` iterations are divided
+    // among its threads. (Vector-level nesting is the next tutorial.)
+    // The division need not be even and is architecture dependent, as with
+    // RangePolicy: with 31 iterations, 8 CPU threads would get 4 each except
+    // one that gets 3, and on a GPU team larger than 31 only 31 threads do
+    // any work. The nested call is not guaranteed to be synchronous.
+    // Each iteration prints which team member executed it.
+    const int trip_count = 31;
+    Kokkos::parallel_for(Kokkos::TeamThreadRange(thread,trip_count), [&] (const int& i) {
+       printf("Hello World: (%i , %i) executed loop %i \n",thread.league_rank(),thread.team_rank(),i);
+    });
+  }
+};
+
+int main(int narg, char* args[]) {
+  Kokkos::initialize(narg,args);
+
+  // Three teams, each with the largest team size the functor supports.
+  const hello_world functor = hello_world();
+  const team_policy policy( 3 , team_policy::team_size_max( functor ) );
+  // Each team member contributes 1 to the reduction result.
+  int total = 0;
+  Kokkos::parallel_reduce( policy , functor , total );
+  printf("Result %i\n",total);
+  Kokkos::finalize();
+}
diff --git a/lib/kokkos/example/tutorial/Hierarchical_Parallelism/03_vectorization/CMakeLists.txt b/lib/kokkos/example/tutorial/Hierarchical_Parallelism/03_vectorization/CMakeLists.txt
new file mode 100644
index 0000000000000000000000000000000000000000..ea6b0b1e42694c2b0b5994b54309e19647a09e5f
--- /dev/null
+++ b/lib/kokkos/example/tutorial/Hierarchical_Parallelism/03_vectorization/CMakeLists.txt
@@ -0,0 +1,16 @@
+
+# Put the build and source directories of this example on the include path.
+INCLUDE_DIRECTORIES(${CMAKE_CURRENT_BINARY_DIR} ${CMAKE_CURRENT_SOURCE_DIR})
+
+# vectorization.cpp compiles its code only under KOKKOS_HAVE_CXX11, so the
+# executable is added only when C++11 support is enabled.
+#
+# Tutorial executable only: it is built, but not registered as a CTest test.
+IF(Kokkos_ENABLE_CXX11)
+  TRIBITS_ADD_EXECUTABLE(
+    tutorial_hierarchicalparallelism_03_vectorization
+    SOURCES vectorization.cpp
+    COMM serial mpi
+  )
+ENDIF()
+
diff --git a/lib/kokkos/example/tutorial/Hierarchical_Parallelism/03_vectorization/Makefile b/lib/kokkos/example/tutorial/Hierarchical_Parallelism/03_vectorization/Makefile
new file mode 100644
index 0000000000000000000000000000000000000000..12ad36b31e458d155aa6dc653ab8188a7773bd18
--- /dev/null
+++ b/lib/kokkos/example/tutorial/Hierarchical_Parallelism/03_vectorization/Makefile
@@ -0,0 +1,52 @@
+# Builds the tutorial source(s) in this directory against the Kokkos tree at
+# KOKKOS_PATH. If KOKKOS_DEVICES (e.g. from the command line) contains "Cuda",
+# the nvcc_wrapper toolchain is used; otherwise the example is built with g++.
+KOKKOS_PATH = ../../../..
+SRC = $(wildcard *.cpp)
+
+# These targets do not create files of the same name.
+.PHONY: default build clean
+
+default: build
+	echo "Start Build"
+
+ifneq (,$(findstring Cuda,$(KOKKOS_DEVICES)))
+# Take the wrapper from the selected Kokkos tree so that overriding
+# KOKKOS_PATH also switches the compiler wrapper.
+CXX = $(KOKKOS_PATH)/config/nvcc_wrapper
+CXXFLAGS = -O3
+LINK = ${CXX}
+LINKFLAGS =
+EXE = $(SRC:.cpp=.cuda)
+KOKKOS_DEVICES = "Cuda,OpenMP"
+KOKKOS_ARCH = "SNB,Kepler35"
+else
+CXX = g++
+CXXFLAGS = -O3
+LINK = ${CXX}
+LINKFLAGS =
+EXE = $(SRC:.cpp=.host)
+KOKKOS_DEVICES = "OpenMP"
+KOKKOS_ARCH = "SNB"
+endif
+
+DEPFLAGS = -M
+
+OBJ = $(SRC:.cpp=.o)
+LIB =
+
+# Expected to define the KOKKOS_* variables and the kokkos-clean target used below.
+include $(KOKKOS_PATH)/Makefile.kokkos
+
+build: $(EXE)
+
+$(EXE): $(OBJ) $(KOKKOS_LINK_DEPENDS)
+	$(LINK) $(KOKKOS_LDFLAGS) $(LINKFLAGS) $(EXTRA_PATH) $(OBJ) $(KOKKOS_LIBS) $(LIB) -o $(EXE)
+
+clean: kokkos-clean
+	rm -f *.o *.cuda *.host
+
+# Compilation rules
+
+%.o:%.cpp $(KOKKOS_CPP_DEPENDS)
+	$(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) $(EXTRA_INC) -c $<
diff --git a/lib/kokkos/example/tutorial/Hierarchical_Parallelism/03_vectorization/vectorization.cpp b/lib/kokkos/example/tutorial/Hierarchical_Parallelism/03_vectorization/vectorization.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..99d5958edfe12aba723b4d5455328313009fc8f6
--- /dev/null
+++ b/lib/kokkos/example/tutorial/Hierarchical_Parallelism/03_vectorization/vectorization.cpp
@@ -0,0 +1,162 @@
+/*
+//@HEADER
+// ************************************************************************
+// 
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+// 
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+// 
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact  H. Carter Edwards (hcedwar@sandia.gov)
+// 
+// ************************************************************************
+//@HEADER
+*/
+
+#include <Kokkos_Core.hpp>
+#include <Kokkos_Random.hpp>
+#include <cstdio>
+
+#ifdef KOKKOS_HAVE_CXX11
+
+// The TeamPolicy actually supports 3D parallelism: Teams, Threads, Vector
+// Kokkos::parallel_{for/reduce/scan} calls can be completely free nested.
+// The execution policies for the nested layers are TeamThreadRange and
+// ThreadVectorRange.
+// The only restriction on nesting is that a given level can only be nested in a
+// higher one. e.g. a ThreadVectorRange can be nested inside a TeamPolicy operator
+// and inside a TeamThreadRange, but you can not nest a ThreadVectorRange or a
+// TeamThreadRange inside another ThreadVectorRange.
+// As with the 2D execution of TeamPolicy the operator has to be considered as
+// a parallel region even with respect to VectorLanes. That means even outside
+// a TeamThread or VectorThread loop all threads of a team and all vector lanes
+// of a thread execute every line of the operator as long as there are no restrictions
+// on them.
+// Code lines can be restricted using Kokkos::single to either execute once PerThread
+// or execute once PerTeam.
+typedef typename Kokkos::TeamPolicy<>::member_type team_member ;
+
+struct SomeCorrelation {
+  typedef int value_type; // Reduction value type. NOTE(review): appears unused by the parallel_for launch in main - confirm
+  typedef Kokkos::DefaultExecutionSpace::scratch_memory_space shared_space;
+  typedef Kokkos::View<int*,shared_space,Kokkos::MemoryUnmanaged> shared_1d_int;
+
+  Kokkos::View<const int***,Kokkos::LayoutRight> data;
+  Kokkos::View<int> gsum;
+
+  SomeCorrelation(Kokkos::View<int***,Kokkos::LayoutRight> data_in,
+                  Kokkos::View<int> sum):data(data_in),gsum(sum){}
+  // Per team (slice i = league rank): count the multiples of 4 in each row data(i,j,:), then add to gsum the number of adjacent rows with equal counts.
+  KOKKOS_INLINE_FUNCTION
+  void operator() ( const team_member & thread) const {
+    int i = thread.league_rank();
+
+    // Team-wide array with one int per row j, taken from the team's scratch memory (thread.team_shmem()).
+    shared_1d_int count(thread.team_shmem(),data.dimension_1());
+
+    // Within each team, run a parallel_for over the rows j using the team's threads
+    Kokkos::parallel_for(Kokkos::TeamThreadRange(thread,data.dimension_1()), [=] (const int& j) {
+      int tsum;
+      // Run a vector loop reduction over the inner dimension of data
+      // Count how many values are multiples of 4
+      // Every vector lane gets the same reduction value (tsum) back; it is broadcast to all vector lanes
+      Kokkos::parallel_reduce(Kokkos::ThreadVectorRange(thread,data.dimension_2()), [=] (const int& k, int & vsum) {
+        vsum+= (data(i,j,k) % 4 == 0)?1:0;
+      },tsum);
+
+      // Make sure only one vector lane stores the reduction value in the shared array, i.e. execute
+      // the next line only once PerThread
+      Kokkos::single(Kokkos::PerThread(thread),[=] () {
+        count(j) = tsum;
+      });
+    });
+
+    // Wait for all threads to finish the parallel_for so that all writes to count are done before it is read
+    thread.team_barrier();
+
+    // Check with one vector lane from each thread how many consecutive
+    // data segments (rows j and j+1) have the same number of values divisible by 4
+    // The team reduction value is again broadcast to every team member (and every vector lane)
+    int team_sum = 0;
+    Kokkos::parallel_reduce(Kokkos::TeamThreadRange(thread, data.dimension_1()-1), [=] (const int& j, int& thread_sum) {
+      // It is not valid to directly add to thread_sum
+      // Use a single function with broadcast instead
+      // thread_sum is handed to the lambda as `sum` (i.e. it provides the initial value of sum);
+      // the end value of sum will be broadcast to all vector lanes in the thread.
+      Kokkos::single(Kokkos::PerThread(thread),[=] (int& sum) {
+        if(count(j)==count(j+1)) sum++;
+      },thread_sum);
+    },team_sum);
+
+    // With one thread and one vector lane of the team, atomically add team_sum to the global value gsum
+    Kokkos::single(Kokkos::PerTeam(thread),[=] () {
+      Kokkos::atomic_add(&gsum(),team_sum);
+    });
+  }
+
+  // Shared (scratch) memory requested per team: room for `count`, one int per row. The team_size argument is not used.
+  size_t team_shmem_size( int team_size ) const {
+    return shared_1d_int::shmem_size(data.dimension_1());
+  }
+};
+
+int main(int narg, char* args[]) {
+  Kokkos::initialize(narg,args);
+
+  // Problem size: 512 slices of 512 x 32 integers, and 16 vector lanes.
+  const int num_slices = 512, num_rows = 512, num_cols = 32;
+  const int vector_length = 16;
+
+  // Fill a 3D view with random integers (Algorithms/01_random_numbers
+  // covers the random number pool in more detail).
+  Kokkos::View<int***,Kokkos::LayoutRight> data("Data",num_slices,num_rows,num_cols);
+  Kokkos::Random_XorShift64_Pool<> rand_pool64(5374857);
+  Kokkos::fill_random(data,rand_pool64,100);
+
+  // Rank-0 view that receives the global result.
+  Kokkos::View<int> result("Sum");
+
+  // One team per slice. Kokkos::AUTO lets Kokkos choose the threads per
+  // team. The vector length must be a power of 2; its maximum is hardware
+  // dependent, but a smaller value than the hardware allows is always fine.
+  const Kokkos::TeamPolicy<> policy( num_slices , Kokkos::AUTO , vector_length );
+  Kokkos::parallel_for( policy , SomeCorrelation(data,result) );
+  Kokkos::fence();
+
+  // Bring the result back into a host integer and print it.
+  int sum = 0;
+  Kokkos::deep_copy(sum,result);
+  printf("Result %i\n",sum);
+
+  Kokkos::finalize();
+}
+
+#endif //KOKKOS_HAVE_CXX11
diff --git a/lib/kokkos/example/tutorial/Hierarchical_Parallelism/04_team_scan/CMakeLists.txt b/lib/kokkos/example/tutorial/Hierarchical_Parallelism/04_team_scan/CMakeLists.txt
new file mode 100644
index 0000000000000000000000000000000000000000..15ad5d780340dd0e10c338530f7c88222e742169
--- /dev/null
+++ b/lib/kokkos/example/tutorial/Hierarchical_Parallelism/04_team_scan/CMakeLists.txt
@@ -0,0 +1,10 @@
+
+# Put the build and source directories of this example on the include path.
+INCLUDE_DIRECTORIES(${CMAKE_CURRENT_BINARY_DIR} ${CMAKE_CURRENT_SOURCE_DIR})
+
+# Tutorial executable only: it is built, but not registered as a CTest test.
+TRIBITS_ADD_EXECUTABLE(
+  tutorial_hierarchicalparallelism_04_team_scan
+  SOURCES team_scan.cpp
+  COMM serial mpi
+)
diff --git a/lib/kokkos/example/tutorial/Hierarchical_Parallelism/04_team_scan/Makefile b/lib/kokkos/example/tutorial/Hierarchical_Parallelism/04_team_scan/Makefile
new file mode 100644
index 0000000000000000000000000000000000000000..12ad36b31e458d155aa6dc653ab8188a7773bd18
--- /dev/null
+++ b/lib/kokkos/example/tutorial/Hierarchical_Parallelism/04_team_scan/Makefile
@@ -0,0 +1,52 @@
+# Builds the tutorial source(s) in this directory against the Kokkos tree at
+# KOKKOS_PATH. If KOKKOS_DEVICES (e.g. from the command line) contains "Cuda",
+# the nvcc_wrapper toolchain is used; otherwise the example is built with g++.
+KOKKOS_PATH = ../../../..
+SRC = $(wildcard *.cpp)
+
+# These targets do not create files of the same name.
+.PHONY: default build clean
+
+default: build
+	echo "Start Build"
+
+ifneq (,$(findstring Cuda,$(KOKKOS_DEVICES)))
+# Take the wrapper from the selected Kokkos tree so that overriding
+# KOKKOS_PATH also switches the compiler wrapper.
+CXX = $(KOKKOS_PATH)/config/nvcc_wrapper
+CXXFLAGS = -O3
+LINK = ${CXX}
+LINKFLAGS =
+EXE = $(SRC:.cpp=.cuda)
+KOKKOS_DEVICES = "Cuda,OpenMP"
+KOKKOS_ARCH = "SNB,Kepler35"
+else
+CXX = g++
+CXXFLAGS = -O3
+LINK = ${CXX}
+LINKFLAGS =
+EXE = $(SRC:.cpp=.host)
+KOKKOS_DEVICES = "OpenMP"
+KOKKOS_ARCH = "SNB"
+endif
+
+DEPFLAGS = -M
+
+OBJ = $(SRC:.cpp=.o)
+LIB =
+
+# Expected to define the KOKKOS_* variables and the kokkos-clean target used below.
+include $(KOKKOS_PATH)/Makefile.kokkos
+
+build: $(EXE)
+
+$(EXE): $(OBJ) $(KOKKOS_LINK_DEPENDS)
+	$(LINK) $(KOKKOS_LDFLAGS) $(LINKFLAGS) $(EXTRA_PATH) $(OBJ) $(KOKKOS_LIBS) $(LIB) -o $(EXE)
+
+clean: kokkos-clean
+	rm -f *.o *.cuda *.host
+
+# Compilation rules
+
+%.o:%.cpp $(KOKKOS_CPP_DEPENDS)
+	$(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) $(EXTRA_INC) -c $<
diff --git a/lib/kokkos/example/tutorial/Hierarchical_Parallelism/04_team_scan/team_scan.cpp b/lib/kokkos/example/tutorial/Hierarchical_Parallelism/04_team_scan/team_scan.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..c12b11d04ddc99957ec4be93c3928b9c3558cb92
--- /dev/null
+++ b/lib/kokkos/example/tutorial/Hierarchical_Parallelism/04_team_scan/team_scan.cpp
@@ -0,0 +1,141 @@
+/*
+//@HEADER
+// ************************************************************************
+// 
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+// 
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+// 
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact  H. Carter Edwards (hcedwar@sandia.gov)
+// 
+// ************************************************************************
+//@HEADER
+*/
+
+#include <Kokkos_Core.hpp>
+#include <Kokkos_DualView.hpp>
+#include <impl/Kokkos_Timer.hpp>
+#include <cstdio>
+#include <cstdlib>
+
+typedef Kokkos::DefaultExecutionSpace       Device ;
+typedef Kokkos::HostSpace::execution_space  Host ;
+
+typedef Kokkos::TeamPolicy< Device >      team_policy ;
+typedef team_policy::member_type team_member ;
+
+static const int TEAM_SIZE = 16 ;
+
+struct find_2_tuples {
+  int chunk_size;                  // number of consecutive pairs handled per team
+  Kokkos::View<const int*> data;
+  Kokkos::View<int**> histogram;   // histogram(k,l): occurrences of value k followed by value l
+  // Captures the device views and syncs both DualViews to Device; the
+  // histogram is marked as modified there because operator() writes to it.
+  find_2_tuples(int chunk_size_, Kokkos::DualView<int*> data_,
+                Kokkos::DualView<int**> histogram_):chunk_size(chunk_size_),
+                data(data_.d_view),histogram(histogram_.d_view) {
+      data_.sync<Device>();
+      histogram_.sync<Device>();
+      histogram_.modify<Device>();
+  }
+  // One team per chunk: stage the chunk in team scratch memory, count its
+  // (value, next value) pairs locally, then add them to the global histogram.
+  KOKKOS_INLINE_FUNCTION
+  void operator() ( const team_member & dev) const {
+    Kokkos::View<int**,Kokkos::MemoryUnmanaged> l_histogram(dev.team_shmem(),TEAM_SIZE,TEAM_SIZE);
+    Kokkos::View<int*,Kokkos::MemoryUnmanaged> l_data(dev.team_shmem(),chunk_size+1);
+    const int i = dev.league_rank() * chunk_size;
+    for(int j = dev.team_rank(); j<chunk_size+1; j+=dev.team_size())
+      l_data(j) = data(i+j);
+    for(int k = dev.team_rank(); k < TEAM_SIZE; k+=dev.team_size())
+      for(int l = 0; l < TEAM_SIZE; l++)
+        l_histogram(k,l) = 0;
+    dev.team_barrier(); // l_data is read by every team member below
+
+    // Each member only touches the rows k it strides over, so no atomics are needed here.
+    for(int j = 0; j<chunk_size; j++)
+      for(int k = dev.team_rank(); k < TEAM_SIZE; k+=dev.team_size())
+        for(int l = 0; l < TEAM_SIZE; l++)
+          if((l_data(j) == k) && (l_data(j+1)==l))
+            l_histogram(k,l)++;
+
+    for(int k = dev.team_rank(); k < TEAM_SIZE; k+=dev.team_size())
+      for(int l = 0; l < TEAM_SIZE; l++)
+        Kokkos::atomic_fetch_add(&histogram(k,l),l_histogram(k,l));
+    dev.team_barrier();
+  }
+  // Scratch bytes per team. The scratch views are TEAM_SIZE x TEAM_SIZE and chunk_size+1 ints whatever
+  // the actual team size is, so the request must not shrink when fewer threads per team are used.
+  size_t team_shmem_size( int team_size ) const { return sizeof(int)*(chunk_size+2 + TEAM_SIZE * TEAM_SIZE ); }
+};
+
+int main(int narg, char* args[]) {
+  Kokkos::initialize(narg,args);
+
+  const int chunk_size = 1024;
+  const int nchunks = 100000; //1024*1024;
+  Kokkos::DualView<int*> data("data",nchunks*chunk_size+1);
+
+  // Fill the host side with pseudo-random values in [0,TEAM_SIZE) and sync it to the device.
+  srand(1231093);
+  const int num_values = (int) data.dimension_0();
+  for(int idx = 0; idx < num_values; ++idx)
+    data.h_view(idx) = rand()%TEAM_SIZE;
+  data.modify<Host>();
+  data.sync<Device>();
+
+  Kokkos::DualView<int**> histogram("histogram",TEAM_SIZE,TEAM_SIZE);
+
+  // Time the kernel: one team per chunk; threads/team is automatically limited to the device maximum.
+  Kokkos::Timer timer;
+  Kokkos::parallel_for( team_policy( nchunks , TEAM_SIZE )
+                      , find_2_tuples(chunk_size,data,histogram) );
+  Kokkos::fence();
+  const double elapsed = timer.seconds();
+  histogram.sync<Host>();
+
+  // Print the histogram row by row, then its total next to the number of pairs examined.
+  printf("Time: %f \n\n",elapsed);
+  int sum = 0;
+  for(int k=0; k<TEAM_SIZE; k++) {
+    for(int l=0; l<TEAM_SIZE; l++) {
+      const int entry = histogram.h_view(k,l);
+      printf("%i ",entry);
+      sum += entry;
+    }
+    printf("\n");
+  }
+  printf("Result: %i %i\n",sum,chunk_size*nchunks);
+  Kokkos::finalize();
+}
+
diff --git a/lib/kokkos/example/tutorial/Hierarchical_Parallelism/CMakeLists.txt b/lib/kokkos/example/tutorial/Hierarchical_Parallelism/CMakeLists.txt
new file mode 100644
index 0000000000000000000000000000000000000000..e03d7aeb901871aec70c712808dea9c322cd6176
--- /dev/null
+++ b/lib/kokkos/example/tutorial/Hierarchical_Parallelism/CMakeLists.txt
@@ -0,0 +1,11 @@
+
+# Functor-based examples (no lambdas in their sources): always added.
+TRIBITS_ADD_EXAMPLE_DIRECTORIES(01_thread_teams)
+TRIBITS_ADD_EXAMPLE_DIRECTORIES(04_team_scan)
+
+# Examples whose sources use lambdas: only added when C++11 is enabled.
+IF (Kokkos_ENABLE_CXX11)
+  TRIBITS_ADD_EXAMPLE_DIRECTORIES(01_thread_teams_lambda)
+  TRIBITS_ADD_EXAMPLE_DIRECTORIES(02_nested_parallel_for)
+  TRIBITS_ADD_EXAMPLE_DIRECTORIES(03_vectorization)
+ENDIF ()
diff --git a/lib/kokkos/example/tutorial/Hierarchical_Parallelism/Makefile b/lib/kokkos/example/tutorial/Hierarchical_Parallelism/Makefile
new file mode 100644
index 0000000000000000000000000000000000000000..9d6fff7981806a6d28d7704f9d4a0e6c776c8ed0
--- /dev/null
+++ b/lib/kokkos/example/tutorial/Hierarchical_Parallelism/Makefile
@@ -0,0 +1,27 @@
+# Builds or cleans every Hierarchical_Parallelism tutorial example by running
+# make in each example directory listed in SUBDIRS.
+SUBDIRS = 01_thread_teams 01_thread_teams_lambda 02_nested_parallel_for \
+          03_vectorization 04_team_scan
+
+# Extra sub-make argument; empty unless a device target below overrides it.
+DEVICE_ARG =
+
+.PHONY: default openmp pthreads serial cuda clean
+
+# $(MAKE) -C fails cleanly when a directory is missing, whereas "cd dir; make"
+# would go on to run make in this directory again.
+default openmp pthreads serial cuda:
+	for dir in $(SUBDIRS); do \
+		$(MAKE) -C ./$$dir -j 4 $(DEVICE_ARG) || exit 1; \
+	done
+
+clean:
+	for dir in $(SUBDIRS); do \
+		$(MAKE) -C ./$$dir clean || exit 1; \
+	done
+
+# Device selection forced by each convenience target.
+openmp:   DEVICE_ARG = KOKKOS_DEVICES=OpenMP
+pthreads: DEVICE_ARG = KOKKOS_DEVICES=Pthreads
+serial:   DEVICE_ARG = KOKKOS_DEVICES=Serial
+cuda:     DEVICE_ARG = KOKKOS_DEVICES=Cuda,Serial
diff --git a/lib/kokkos/example/tutorial/Makefile b/lib/kokkos/example/tutorial/Makefile
new file mode 100644
index 0000000000000000000000000000000000000000..300d98ab44340404b31dfb8690ce2a5577b55636
--- /dev/null
+++ b/lib/kokkos/example/tutorial/Makefile
@@ -0,0 +1,30 @@
+# Builds or cleans every tutorial example by running make in each directory
+# listed in SUBDIRS.
+SUBDIRS = 01_hello_world 01_hello_world_lambda \
+          02_simple_reduce 02_simple_reduce_lambda \
+          03_simple_view 03_simple_view_lambda \
+          04_simple_memoryspaces 05_simple_atomics \
+          Advanced_Views Algorithms Hierarchical_Parallelism
+
+# Extra sub-make argument; empty unless a device target below overrides it.
+DEVICE_ARG =
+
+.PHONY: default openmp pthreads serial cuda clean
+
+# $(MAKE) -C fails cleanly when a directory is missing, whereas "cd dir; make"
+# would go on to run make in this directory again.
+default openmp pthreads serial cuda:
+	for dir in $(SUBDIRS); do \
+		$(MAKE) -C ./$$dir -j 4 $(DEVICE_ARG) || exit 1; \
+	done
+
+clean:
+	for dir in $(SUBDIRS); do \
+		$(MAKE) -C ./$$dir clean || exit 1; \
+	done
+
+# Device selection forced by each convenience target.
+openmp:   DEVICE_ARG = KOKKOS_DEVICES=OpenMP
+pthreads: DEVICE_ARG = KOKKOS_DEVICES=Pthreads
+serial:   DEVICE_ARG = KOKKOS_DEVICES=Serial
+cuda:     DEVICE_ARG = KOKKOS_DEVICES=Cuda,Serial
diff --git a/lib/kokkos/example/tutorial/README b/lib/kokkos/example/tutorial/README
new file mode 100644
index 0000000000000000000000000000000000000000..4ba0b3a5d9e15e3c58326559d7a7f30e5b51ea4c
--- /dev/null
+++ b/lib/kokkos/example/tutorial/README
@@ -0,0 +1,17 @@
+Build the examples by typing in each directory: 
+make -j 16
+
+To specify a target device:
+make openmp -j 16
+make pthreads -j 16
+make serial -j 16
+make cuda -j 16
+
+The lambda variants cannot be built with CUDA=yes at the moment, since
+CUDA does not support lambdas from the host.
+Some of the advanced topics try to highlight performance impacts by timing
+different variants of doing the same thing.
+Also, some of the advanced topics (in particular hierarchical parallelism)
+require C++11 even without using host-side lambdas. CUDA 6.5 can be used
+to compile those.
+
diff --git a/lib/kokkos/generate_makefile.bash b/lib/kokkos/generate_makefile.bash
new file mode 100755
index 0000000000000000000000000000000000000000..86f136da96ed10e0a2f23c0cb2752eaaa5287d90
--- /dev/null
+++ b/lib/kokkos/generate_makefile.bash
@@ -0,0 +1,336 @@
+#!/bin/bash
+
+KOKKOS_DEVICES=""
+# Parse "--option" / "--option=value" arguments one at a time.
+while [[ $# -gt 0 ]]
+do
+key="$1"
+# ${key#*=} below strips everything up to and including the first '='.
+case $key in
+    --kokkos-path*)
+    KOKKOS_PATH="${key#*=}"
+    ;;
+    --prefix*)
+    PREFIX="${key#*=}"
+    ;;
+    --with-cuda)
+    KOKKOS_DEVICES="${KOKKOS_DEVICES},Cuda"
+    CUDA_PATH_NVCC=$(command -v nvcc)
+    CUDA_PATH=${CUDA_PATH_NVCC%/bin/nvcc}
+    ;;
+    # Catch this before '--with-cuda*'
+    --with-cuda-options*)
+    KOKKOS_CUDA_OPT="${key#*=}"
+    ;;
+    --with-cuda*)
+    KOKKOS_DEVICES="${KOKKOS_DEVICES},Cuda"
+    CUDA_PATH="${key#*=}"
+    ;;
+    --with-openmp)
+    KOKKOS_DEVICES="${KOKKOS_DEVICES},OpenMP"
+    ;;
+    --with-pthread)
+    KOKKOS_DEVICES="${KOKKOS_DEVICES},Pthread"
+    ;;
+    --with-serial)
+    KOKKOS_DEVICES="${KOKKOS_DEVICES},Serial"
+    ;;
+    --with-qthread*)
+    KOKKOS_DEVICES="${KOKKOS_DEVICES},Qthread"
+    QTHREAD_PATH="${key#*=}"
+    ;;
+    --with-devices*)
+    DEVICES="${key#*=}"
+    KOKKOS_DEVICES="${KOKKOS_DEVICES},${DEVICES}"
+    ;;
+    --with-gtest*)
+    GTEST_PATH="${key#*=}"
+    ;;
+    --with-hwloc*)
+    HWLOC_PATH="${key#*=}"
+    ;;
+    --arch*)
+    KOKKOS_ARCH="${key#*=}"
+    ;;
+    --cxxflags*)
+    CXXFLAGS="${key#*=}"
+    ;;
+    --ldflags*)
+    LDFLAGS="${key#*=}"
+    ;;
+    --debug|-dbg)
+    KOKKOS_DEBUG=yes
+    ;;
+    --compiler*)
+    COMPILER="${key#*=}"
+    ;;
+    --with-options*)
+    KOKKOS_OPT="${key#*=}"
+    ;;
+    --help)
+    echo "Kokkos configure options:"
+    echo "--kokkos-path=/Path/To/Kokkos: Path to the Kokkos root directory"
+    echo "--prefix=/Install/Path:        Path to where the Kokkos library should be installed"
+    echo ""
+    echo "--with-cuda[=/Path/To/Cuda]:      enable Cuda and set path to Cuda Toolkit"
+    echo "--with-openmp:                    enable OpenMP backend"
+    echo "--with-pthread:                   enable Pthreads backend"
+    echo "--with-serial:                    enable Serial backend"
+    echo "--with-qthread=/Path/To/Qthread:  enable Qthread backend"
+    echo "--with-devices:                   explicitly add a set of backends"
+    echo ""
+    echo "--arch=[OPTIONS]:            set target architectures. Options are:"
+    echo "                               SNB = Intel Sandy/Ivy Bridge CPUs"
+    echo "                               HSW = Intel Haswell CPUs"
+    echo "                               KNC = Intel Knights Corner Xeon Phi"
+    echo "                               KNL = Intel Knights Landing Xeon Phi"
+    echo "                               Kepler30  = NVIDIA Kepler generation CC 3.0"
+    echo "                               Kepler35  = NVIDIA Kepler generation CC 3.5"
+    echo "                               Kepler37  = NVIDIA Kepler generation CC 3.7"
+    echo "                               Maxwell50 = NVIDIA Maxwell generation CC 5.0"
+    echo "                               Power8 = IBM Power 8 CPUs"
+    echo ""
+    echo "--compiler=/Path/To/Compiler set the compiler"
+    echo "--debug,-dbg:                enable Debugging"
+    echo "--cxxflags=[FLAGS]           overwrite CXXFLAGS for library build and test build"
+    echo "                               This will still set certain required flags via"
+    echo "                               KOKKOS_CXXFLAGS (such as -fopenmp, --std=c++11, etc.)"
+    echo "--ldflags=[FLAGS]            overwrite LDFLAGS for library build and test build"
+    echo "                               This will still set certain required flags via"
+    echo "                               KOKKOS_LDFLAGS (such as -fopenmp, -lpthread, etc.)"
+    echo "--with-gtest=/Path/To/Gtest: set path to gtest (used in unit and performance tests)"
+    echo "--with-hwloc=/Path/To/Hwloc: set path to hwloc"
+    echo "--with-options=[OPTIONS]:    additional options to Kokkos:"
+    echo "                               aggressive_vectorization = add ivdep on loops"
+    echo "--with-cuda-options=[OPTIONS]: additional options to CUDA:"
+    echo "                               force_uvm, use_ldg, enable_lambda, rdc"
+    exit 0
+    ;;
+    *)
+    echo "warning: ignoring unknown option $key"
+    ;;
+esac
+shift
+done
+
+# If KOKKOS_PATH is undefined, use the directory containing this script;
+# otherwise turn the user-supplied path into an absolute one.
+if [ -z "$KOKKOS_PATH" ]; then
+    KOKKOS_PATH=$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )
+else
+    KOKKOS_PATH=$( cd "$KOKKOS_PATH" && pwd ) || exit 1
+fi
+
+# Refuse to run in the Kokkos root: this script writes its output into the current directory.
+if [ "${KOKKOS_PATH}"  = "${PWD}" ] || [ "${KOKKOS_PATH}"  = "${PWD}/" ]; then
+echo "Running generate_makefile.bash in the Kokkos root directory is not allowed" >&2
+exit 1
+fi
+
+KOKKOS_OPTIONS="KOKKOS_PATH=${KOKKOS_PATH}"
+# Append a NAME=value make argument for every setting that is non-empty.
+if [[ -n "${COMPILER}" ]]; then
+    KOKKOS_OPTIONS+=" CXX=${COMPILER}"
+fi
+if [[ -n "${PREFIX}" ]]; then
+    KOKKOS_OPTIONS+=" PREFIX=${PREFIX}"
+fi
+if [[ -n "${KOKKOS_DEVICES}" ]]; then
+    KOKKOS_OPTIONS+=" KOKKOS_DEVICES=${KOKKOS_DEVICES}"
+fi
+if [[ -n "${KOKKOS_ARCH}" ]]; then
+    KOKKOS_OPTIONS+=" KOKKOS_ARCH=${KOKKOS_ARCH}"
+fi
+if [[ -n "${KOKKOS_DEBUG}" ]]; then
+    KOKKOS_OPTIONS+=" KOKKOS_DEBUG=${KOKKOS_DEBUG}"
+fi
+if [[ -n "${CUDA_PATH}" ]]; then
+    KOKKOS_OPTIONS+=" CUDA_PATH=${CUDA_PATH}"
+fi
+if [[ -n "${CXXFLAGS}" ]]; then
+    KOKKOS_OPTIONS+=" CXXFLAGS=\"${CXXFLAGS}\""
+fi
+if [[ -n "${LDFLAGS}" ]]; then
+    KOKKOS_OPTIONS+=" LDFLAGS=\"${LDFLAGS}\""
+fi
+# GTEST_PATH is always passed; when none was given it falls back to the
+# copy under ${KOKKOS_PATH}/tpls/gtest.
+if [[ -z "${GTEST_PATH}" ]]; then
+    GTEST_PATH=${KOKKOS_PATH}/tpls/gtest
+fi
+KOKKOS_OPTIONS+=" GTEST_PATH=${GTEST_PATH}"
+if [[ -n "${HWLOC_PATH}" ]]; then
+    KOKKOS_OPTIONS+=" HWLOC_PATH=${HWLOC_PATH} KOKKOS_USE_TPLS=hwloc"
+fi
+if [[ -n "${QTHREAD_PATH}" ]]; then
+    KOKKOS_OPTIONS+=" QTHREAD_PATH=${QTHREAD_PATH}"
+fi
+if [[ -n "${KOKKOS_OPT}" ]]; then
+    KOKKOS_OPTIONS+=" KOKKOS_OPTIONS=${KOKKOS_OPT}"
+fi
+if [[ -n "${KOKKOS_CUDA_OPT}" ]]; then
+    KOKKOS_OPTIONS+=" KOKKOS_CUDA_OPTIONS=${KOKKOS_CUDA_OPT}"
+fi
+mkdir core
+mkdir core/unit_test
+mkdir core/perf_test
+mkdir containers
+mkdir containers/unit_tests
+mkdir containers/performance_tests
+mkdir algorithms
+mkdir algorithms/unit_tests
+mkdir algorithms/performance_tests
+mkdir example
+mkdir example/fixture
+mkdir example/feint
+mkdir example/fenl
+
+if [ ${#KOKKOS_ENABLE_EXAMPLE_ICHOL} -gt 0 ]; then
+mkdir example/ichol
+fi
+
+# Generate subdirectory makefiles: each forwards all/test/clean to the matching Makefile under KOKKOS_PATH via $(MAKE).
+echo "KOKKOS_OPTIONS=${KOKKOS_OPTIONS}" > core/unit_test/Makefile
+echo "" >> core/unit_test/Makefile
+echo "all:" >> core/unit_test/Makefile
+echo -e "\t\$(MAKE) -j -f ${KOKKOS_PATH}/core/unit_test/Makefile ${KOKKOS_OPTIONS}" >> core/unit_test/Makefile
+echo "" >> core/unit_test/Makefile
+echo "test: all" >> core/unit_test/Makefile
+echo -e "\t\$(MAKE) -f ${KOKKOS_PATH}/core/unit_test/Makefile ${KOKKOS_OPTIONS} test" >> core/unit_test/Makefile
+echo "" >> core/unit_test/Makefile
+echo "clean:" >> core/unit_test/Makefile
+echo -e "\t\$(MAKE) -f ${KOKKOS_PATH}/core/unit_test/Makefile ${KOKKOS_OPTIONS} clean" >> core/unit_test/Makefile
+
+echo "KOKKOS_OPTIONS=${KOKKOS_OPTIONS}" > core/perf_test/Makefile
+echo "" >> core/perf_test/Makefile
+echo "all:" >> core/perf_test/Makefile
+echo -e "\t\$(MAKE) -j -f ${KOKKOS_PATH}/core/perf_test/Makefile ${KOKKOS_OPTIONS}" >> core/perf_test/Makefile
+echo "" >> core/perf_test/Makefile
+echo "test: all" >> core/perf_test/Makefile
+echo -e "\t\$(MAKE) -f ${KOKKOS_PATH}/core/perf_test/Makefile ${KOKKOS_OPTIONS} test" >> core/perf_test/Makefile
+echo "" >> core/perf_test/Makefile
+echo "clean:" >> core/perf_test/Makefile
+echo -e "\t\$(MAKE) -f ${KOKKOS_PATH}/core/perf_test/Makefile ${KOKKOS_OPTIONS} clean" >> core/perf_test/Makefile
+
+echo "KOKKOS_OPTIONS=${KOKKOS_OPTIONS}" > containers/unit_tests/Makefile
+echo "" >> containers/unit_tests/Makefile
+echo "all:" >> containers/unit_tests/Makefile
+echo -e "\t\$(MAKE) -j -f ${KOKKOS_PATH}/containers/unit_tests/Makefile ${KOKKOS_OPTIONS}" >> containers/unit_tests/Makefile
+echo "" >> containers/unit_tests/Makefile
+echo "test: all" >> containers/unit_tests/Makefile
+echo -e "\t\$(MAKE) -f ${KOKKOS_PATH}/containers/unit_tests/Makefile ${KOKKOS_OPTIONS} test" >> containers/unit_tests/Makefile
+echo "" >> containers/unit_tests/Makefile
+echo "clean:" >> containers/unit_tests/Makefile
+echo -e "\t\$(MAKE) -f ${KOKKOS_PATH}/containers/unit_tests/Makefile ${KOKKOS_OPTIONS} clean" >> containers/unit_tests/Makefile
+
+echo "KOKKOS_OPTIONS=${KOKKOS_OPTIONS}" > containers/performance_tests/Makefile
+echo "" >> containers/performance_tests/Makefile
+echo "all:" >> containers/performance_tests/Makefile
+echo -e "\t\$(MAKE) -j -f ${KOKKOS_PATH}/containers/performance_tests/Makefile ${KOKKOS_OPTIONS}" >> containers/performance_tests/Makefile
+echo "" >> containers/performance_tests/Makefile
+echo "test: all" >> containers/performance_tests/Makefile
+echo -e "\t\$(MAKE) -f ${KOKKOS_PATH}/containers/performance_tests/Makefile ${KOKKOS_OPTIONS} test" >> containers/performance_tests/Makefile
+echo "" >> containers/performance_tests/Makefile
+echo "clean:" >> containers/performance_tests/Makefile
+echo -e "\t\$(MAKE) -f ${KOKKOS_PATH}/containers/performance_tests/Makefile ${KOKKOS_OPTIONS} clean" >> containers/performance_tests/Makefile
+
+echo "KOKKOS_OPTIONS=${KOKKOS_OPTIONS}" > algorithms/unit_tests/Makefile
+echo "" >> algorithms/unit_tests/Makefile
+echo "all:" >> algorithms/unit_tests/Makefile
+echo -e "\t\$(MAKE) -j -f ${KOKKOS_PATH}/algorithms/unit_tests/Makefile ${KOKKOS_OPTIONS}" >> algorithms/unit_tests/Makefile
+echo "" >> algorithms/unit_tests/Makefile
+echo "test: all" >> algorithms/unit_tests/Makefile
+echo -e "\t\$(MAKE) -f ${KOKKOS_PATH}/algorithms/unit_tests/Makefile ${KOKKOS_OPTIONS} test" >> algorithms/unit_tests/Makefile
+echo "" >> algorithms/unit_tests/Makefile
+echo "clean:" >> algorithms/unit_tests/Makefile
+echo -e "\t\$(MAKE) -f ${KOKKOS_PATH}/algorithms/unit_tests/Makefile ${KOKKOS_OPTIONS} clean" >> algorithms/unit_tests/Makefile
+
+echo "KOKKOS_OPTIONS=${KOKKOS_OPTIONS}" > example/fixture/Makefile
+echo "" >> example/fixture/Makefile
+echo "all:" >> example/fixture/Makefile
+echo -e "\t\$(MAKE) -f ${KOKKOS_PATH}/example/fixture/Makefile ${KOKKOS_OPTIONS}" >> example/fixture/Makefile
+echo "" >> example/fixture/Makefile
+echo "test: all" >> example/fixture/Makefile
+echo -e "\t\$(MAKE) -f ${KOKKOS_PATH}/example/fixture/Makefile ${KOKKOS_OPTIONS} test" >> example/fixture/Makefile
+echo "" >> example/fixture/Makefile
+echo "clean:" >> example/fixture/Makefile
+echo -e "\t\$(MAKE) -f ${KOKKOS_PATH}/example/fixture/Makefile ${KOKKOS_OPTIONS} clean" >> example/fixture/Makefile
+
+echo "KOKKOS_OPTIONS=${KOKKOS_OPTIONS}" > example/feint/Makefile
+echo "" >> example/feint/Makefile
+echo "all:" >> example/feint/Makefile
+echo -e "\t\$(MAKE) -f ${KOKKOS_PATH}/example/feint/Makefile ${KOKKOS_OPTIONS}" >> example/feint/Makefile
+echo "" >> example/feint/Makefile
+echo "test: all" >> example/feint/Makefile
+echo -e "\t\$(MAKE) -f ${KOKKOS_PATH}/example/feint/Makefile ${KOKKOS_OPTIONS} test" >> example/feint/Makefile
+echo "" >> example/feint/Makefile
+echo "clean:" >> example/feint/Makefile
+echo -e "\t\$(MAKE) -f ${KOKKOS_PATH}/example/feint/Makefile ${KOKKOS_OPTIONS} clean" >> example/feint/Makefile
+
+echo "KOKKOS_OPTIONS=${KOKKOS_OPTIONS}" > example/fenl/Makefile
+echo "" >> example/fenl/Makefile
+echo "all:" >> example/fenl/Makefile
+echo -e "\t\$(MAKE) -f ${KOKKOS_PATH}/example/fenl/Makefile ${KOKKOS_OPTIONS}" >> example/fenl/Makefile
+echo "" >> example/fenl/Makefile
+echo "test: all" >> example/fenl/Makefile
+echo -e "\t\$(MAKE) -f ${KOKKOS_PATH}/example/fenl/Makefile ${KOKKOS_OPTIONS} test" >> example/fenl/Makefile
+echo "" >> example/fenl/Makefile
+echo "clean:" >> example/fenl/Makefile
+echo -e "\t\$(MAKE) -f ${KOKKOS_PATH}/example/fenl/Makefile ${KOKKOS_OPTIONS} clean" >> example/fenl/Makefile
+# The ichol wrapper is written only when KOKKOS_ENABLE_EXAMPLE_ICHOL is non-empty.
+if [ ${#KOKKOS_ENABLE_EXAMPLE_ICHOL} -gt 0 ]; then
+echo "KOKKOS_OPTIONS=${KOKKOS_OPTIONS}" > example/ichol/Makefile
+echo "" >> example/ichol/Makefile
+echo "all:" >> example/ichol/Makefile
+echo -e "\t\$(MAKE) -f ${KOKKOS_PATH}/example/ichol/Makefile ${KOKKOS_OPTIONS}" >> example/ichol/Makefile
+echo "" >> example/ichol/Makefile
+echo "test: all" >> example/ichol/Makefile
+echo -e "\t\$(MAKE) -f ${KOKKOS_PATH}/example/ichol/Makefile ${KOKKOS_OPTIONS} test" >> example/ichol/Makefile
+echo "" >> example/ichol/Makefile
+echo "clean:" >> example/ichol/Makefile
+echo -e "\t\$(MAKE) -f ${KOKKOS_PATH}/example/ichol/Makefile ${KOKKOS_OPTIONS} clean" >> example/ichol/Makefile
+fi
+
+# Generate top level directory makefile.
+echo "Generating Makefiles with options " ${KOKKOS_OPTIONS}
+echo "KOKKOS_OPTIONS=${KOKKOS_OPTIONS}" > Makefile
+echo "" >> Makefile
+echo "lib:" >> Makefile
+echo -e "\tcd core; \\" >> Makefile
+echo -e "\tmake -j -f ${KOKKOS_PATH}/core/src/Makefile ${KOKKOS_OPTIONS}" >> Makefile
+echo "" >> Makefile
+echo "install: lib" >> Makefile
+echo -e "\tcd core; \\" >> Makefile
+echo -e "\tmake -j -f ${KOKKOS_PATH}/core/src/Makefile ${KOKKOS_OPTIONS} install" >> Makefile
+echo "" >> Makefile
+echo "build-test:" >> Makefile
+echo -e "\tmake -C core/unit_test" >> Makefile
+echo -e "\tmake -C core/perf_test" >> Makefile
+echo -e "\tmake -C containers/unit_tests" >> Makefile
+echo -e "\tmake -C containers/performance_tests" >> Makefile
+echo -e "\tmake -C algorithms/unit_tests" >> Makefile
+echo -e "\tmake -C example/fixture" >> Makefile
+echo -e "\tmake -C example/feint" >> Makefile
+echo -e "\tmake -C example/fenl" >> Makefile
+echo "" >> Makefile
+echo "test: build-test" >> Makefile
+echo -e "\tmake -C core/unit_test test" >> Makefile
+echo -e "\tmake -C core/perf_test test" >> Makefile
+echo -e "\tmake -C containers/unit_tests test" >> Makefile
+echo -e "\tmake -C containers/performance_tests test" >> Makefile
+echo -e "\tmake -C algorithms/unit_tests test" >> Makefile
+echo -e "\tmake -C example/fixture test" >> Makefile
+echo -e "\tmake -C example/feint test" >> Makefile
+echo -e "\tmake -C example/fenl test" >> Makefile
+echo "" >> Makefile
+echo "clean:" >> Makefile
+echo -e "\tmake -C core/unit_test clean" >> Makefile
+echo -e "\tmake -C core/perf_test clean" >> Makefile
+echo -e "\tmake -C containers/unit_tests clean" >> Makefile
+echo -e "\tmake -C containers/performance_tests clean" >> Makefile
+echo -e "\tmake -C algorithms/unit_tests clean" >> Makefile
+echo -e "\tmake -C example/fixture clean" >> Makefile
+echo -e "\tmake -C example/feint clean" >> Makefile
+echo -e "\tmake -C example/fenl clean" >> Makefile
+echo -e "\tcd core; \\" >> Makefile
+echo -e "\tmake -f ${KOKKOS_PATH}/core/src/Makefile ${KOKKOS_OPTIONS} clean" >> Makefile
diff --git a/lib/kokkos/tpls/gtest/gtest/LICENSE b/lib/kokkos/tpls/gtest/gtest/LICENSE
new file mode 100644
index 0000000000000000000000000000000000000000..1941a11f8ce94389160b458927a29ba217542818
--- /dev/null
+++ b/lib/kokkos/tpls/gtest/gtest/LICENSE
@@ -0,0 +1,28 @@
+Copyright 2008, Google Inc.
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+
+    * Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+    * Redistributions in binary form must reproduce the above
+copyright notice, this list of conditions and the following disclaimer
+in the documentation and/or other materials provided with the
+distribution.
+    * Neither the name of Google Inc. nor the names of its
+contributors may be used to endorse or promote products derived from
+this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
diff --git a/lib/kokkos/tpls/gtest/gtest/README b/lib/kokkos/tpls/gtest/gtest/README
new file mode 100644
index 0000000000000000000000000000000000000000..82964ecc329b474002c66cf534999519e8fc39a3
--- /dev/null
+++ b/lib/kokkos/tpls/gtest/gtest/README
@@ -0,0 +1,13 @@
+This is a fused source version of gtest 1.7.0. All that should be necessary to
+start using gtest in your package is to declare the dependency and include
+gtest/gtest.h.
+
+However, because some of the packages that are developed in Sierra do not use a
+fused source version of gtest, we need to make it possible for them to build with
+this version as well as with their native build. To facilitate this, we have
+created symlinks for the other gtest headers that they use to the fused source
+gtest.h. This will make it possible for them to find the headers while still using
+the fused source version. This should not have any ill effects since the header is
+protected and allows for only using the non-gtest.h headers in their files.
+
+
diff --git a/lib/kokkos/tpls/gtest/gtest/gtest-all.cc b/lib/kokkos/tpls/gtest/gtest/gtest-all.cc
new file mode 100644
index 0000000000000000000000000000000000000000..538c78db930ea72b5de6d5a9282c2f69e71e5c13
--- /dev/null
+++ b/lib/kokkos/tpls/gtest/gtest/gtest-all.cc
@@ -0,0 +1,9594 @@
+// Copyright 2008, Google Inc.
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+//     * Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//     * Redistributions in binary form must reproduce the above
+// copyright notice, this list of conditions and the following disclaimer
+// in the documentation and/or other materials provided with the
+// distribution.
+//     * Neither the name of Google Inc. nor the names of its
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Author: mheule@google.com (Markus Heule)
+//
+// Google C++ Testing Framework (Google Test)
+//
+// Sometimes it's desirable to build Google Test by compiling a single file.
+// This file serves this purpose.
+
+// This line ensures that gtest.h can be compiled on its own, even
+// when it's fused.
+#include "gtest/gtest.h"
+
+// The following lines pull in the real gtest *.cc files.
+// Copyright 2005, Google Inc.
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+//     * Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//     * Redistributions in binary form must reproduce the above
+// copyright notice, this list of conditions and the following disclaimer
+// in the documentation and/or other materials provided with the
+// distribution.
+//     * Neither the name of Google Inc. nor the names of its
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Author: wan@google.com (Zhanyong Wan)
+//
+// The Google C++ Testing Framework (Google Test)
+
+// Copyright 2007, Google Inc.
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+//     * Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//     * Redistributions in binary form must reproduce the above
+// copyright notice, this list of conditions and the following disclaimer
+// in the documentation and/or other materials provided with the
+// distribution.
+//     * Neither the name of Google Inc. nor the names of its
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Author: wan@google.com (Zhanyong Wan)
+//
+// Utilities for testing Google Test itself and code that uses Google Test
+// (e.g. frameworks built on top of Google Test).
+
+#ifndef GTEST_INCLUDE_GTEST_GTEST_SPI_H_
+#define GTEST_INCLUDE_GTEST_GTEST_SPI_H_
+
+
+namespace testing {
+
+// This helper class can be used to mock out Google Test failure reporting
+// so that we can test Google Test or code that builds on Google Test.
+//
+// An object of this class appends a TestPartResult object to the
+// TestPartResultArray object given in the constructor whenever a Google Test
+// failure is reported. It can either intercept only failures that are
+// generated in the same thread that created this object or it can intercept
+// all generated failures. The scope of this mock object is selected by the
+// first argument (intercept_mode) of the two-argument constructor.
+class GTEST_API_ ScopedFakeTestPartResultReporter
+    : public TestPartResultReporterInterface {
+ public:
+  // The two possible mocking modes of this object.
+  enum InterceptMode {
+    INTERCEPT_ONLY_CURRENT_THREAD,  // Intercepts only thread local failures.
+    INTERCEPT_ALL_THREADS           // Intercepts all failures.
+  };
+
+  // The c'tor sets this object as the test part result reporter used
+  // by Google Test.  The 'result' parameter specifies where to report the
+  // results. This reporter will only catch failures generated in the current
+  // thread. DEPRECATED - the two-argument form below states the scope.
+  explicit ScopedFakeTestPartResultReporter(TestPartResultArray* result);
+
+  // Same as above, but 'intercept_mode' chooses the interception scope.
+  ScopedFakeTestPartResultReporter(InterceptMode intercept_mode,
+                                   TestPartResultArray* result);
+
+  // The d'tor restores the previous test part result reporter.
+  virtual ~ScopedFakeTestPartResultReporter();
+
+  // Appends the TestPartResult object to the TestPartResultArray
+  // received in the constructor.
+  //
+  // This method is from the TestPartResultReporterInterface
+  // interface.
+  virtual void ReportTestPartResult(const TestPartResult& result);
+ private:
+  void Init();  // Defined out of line; presumably shared c'tor setup.
+
+  const InterceptMode intercept_mode_;  // Fixed for the object's lifetime.
+  TestPartResultReporterInterface* old_reporter_;  // Presumably restored by d'tor.
+  TestPartResultArray* const result_;  // Array given to the constructor.
+
+  GTEST_DISALLOW_COPY_AND_ASSIGN_(ScopedFakeTestPartResultReporter);
+};
+
+namespace internal {
+
+// A helper class for implementing EXPECT_FATAL_FAILURE() and
+// EXPECT_NONFATAL_FAILURE().  Its destructor verifies that the given
+// TestPartResultArray contains exactly one failure that has the given
+// type and contains the given substring.  If that's not the case, a
+// non-fatal failure will be generated.
+class GTEST_API_ SingleFailureChecker {
+ public:
+  // The constructor only remembers its arguments; the check runs in the d'tor.
+  SingleFailureChecker(const TestPartResultArray* results,
+                       TestPartResult::Type type,
+                       const string& substr);
+  ~SingleFailureChecker();
+ private:
+  const TestPartResultArray* const results_;  // Array inspected by the d'tor.
+  const TestPartResult::Type type_;  // Failure type the single result must have.
+  const string substr_;  // Text the failure message must contain.
+
+  GTEST_DISALLOW_COPY_AND_ASSIGN_(SingleFailureChecker);
+};
+
+}  // namespace internal
+
+}  // namespace testing
+
+// A pair of macros for testing Google Test assertions or code that's expected
+// to generate Google Test fatal failures.  Each verifies that the given
+// statement causes exactly one fatal Google Test failure whose message
+// contains 'substr'.
+//
+// The two versions differ only in interception scope: EXPECT_FATAL_FAILURE
+// uses INTERCEPT_ONLY_CURRENT_THREAD (failures from the current thread), while
+// EXPECT_FATAL_FAILURE_ON_ALL_THREADS uses INTERCEPT_ALL_THREADS.
+//
+// The verification of the assertion is done correctly even when the statement
+// throws an exception or aborts the current function.
+//
+// Known restrictions ('statement' becomes a static member function body):
+//   - 'statement' cannot reference local non-static variables or
+//     non-static members of the current object.
+//   - 'statement' cannot return a value.
+//   - You cannot stream a failure message to this macro.
+//
+// Note that even though the implementations of the following two
+// macros are much alike, we cannot refactor them to use a common
+// helper macro, due to some peculiarity in how the preprocessor
+// works.  The AcceptsMacroThatExpandsToUnprotectedComma test in
+// gtest_unittest.cc will fail to compile if we do that.
+#define EXPECT_FATAL_FAILURE(statement, substr) \
+  do { \
+    class GTestExpectFatalFailureHelper {\
+     public:\
+      static void Execute() { statement; }\
+    };\
+    ::testing::TestPartResultArray gtest_failures;\
+    ::testing::internal::SingleFailureChecker gtest_checker(\
+        &gtest_failures, ::testing::TestPartResult::kFatalFailure, (substr));\
+    {\
+      ::testing::ScopedFakeTestPartResultReporter gtest_reporter(\
+          ::testing::ScopedFakeTestPartResultReporter:: \
+          INTERCEPT_ONLY_CURRENT_THREAD, &gtest_failures);\
+      GTestExpectFatalFailureHelper::Execute();\
+    }\
+  } while (::testing::internal::AlwaysFalse())
+
+#define EXPECT_FATAL_FAILURE_ON_ALL_THREADS(statement, substr) \
+  do { \
+    class GTestExpectFatalFailureHelper {\
+     public:\
+      static void Execute() { statement; }\
+    };\
+    ::testing::TestPartResultArray gtest_failures;\
+    ::testing::internal::SingleFailureChecker gtest_checker(\
+        &gtest_failures, ::testing::TestPartResult::kFatalFailure, (substr));\
+    {\
+      ::testing::ScopedFakeTestPartResultReporter gtest_reporter(\
+          ::testing::ScopedFakeTestPartResultReporter:: \
+          INTERCEPT_ALL_THREADS, &gtest_failures);\
+      GTestExpectFatalFailureHelper::Execute();\
+    }\
+  } while (::testing::internal::AlwaysFalse())
+
+// A pair of macros for testing Google Test assertions or code that's expected
+// to generate Google Test non-fatal failures.  Each asserts that the given
+// statement causes exactly one non-fatal Google Test failure whose message
+// contains 'substr'.
+//
+// The two versions differ only in interception scope: EXPECT_NONFATAL_FAILURE
+// uses INTERCEPT_ONLY_CURRENT_THREAD (failures from the current thread), while
+// EXPECT_NONFATAL_FAILURE_ON_ALL_THREADS uses INTERCEPT_ALL_THREADS.
+//
+// 'statement' is allowed to reference local variables and members of
+// the current object (it is expanded inline in the enclosing function).
+//
+// The verification of the assertion is done correctly even when the statement
+// throws an exception or aborts the current function.
+//
+// Known restrictions:
+//   - You cannot stream a failure message to this macro.
+//
+// Note that even though the implementations of the following two
+// macros are much alike, we cannot refactor them to use a common
+// helper macro, due to some peculiarity in how the preprocessor
+// works.  If we do that, the code won't compile when the user gives
+// EXPECT_NONFATAL_FAILURE() a statement that contains a macro that
+// expands to code containing an unprotected comma.  The
+// AcceptsMacroThatExpandsToUnprotectedComma test in gtest_unittest.cc
+// catches that.
+//
+// For the same reason, we have to write
+//   if (::testing::internal::AlwaysTrue()) { statement; }
+// instead of
+//   GTEST_SUPPRESS_UNREACHABLE_CODE_WARNING_BELOW_(statement)
+// to avoid an MSVC warning on unreachable code.
+#define EXPECT_NONFATAL_FAILURE(statement, substr) \
+  do {\
+    ::testing::TestPartResultArray gtest_failures;\
+    ::testing::internal::SingleFailureChecker gtest_checker(\
+        &gtest_failures, ::testing::TestPartResult::kNonFatalFailure, \
+        (substr));\
+    {\
+      ::testing::ScopedFakeTestPartResultReporter gtest_reporter(\
+          ::testing::ScopedFakeTestPartResultReporter:: \
+          INTERCEPT_ONLY_CURRENT_THREAD, &gtest_failures);\
+      if (::testing::internal::AlwaysTrue()) { statement; }\
+    }\
+  } while (::testing::internal::AlwaysFalse())
+
+#define EXPECT_NONFATAL_FAILURE_ON_ALL_THREADS(statement, substr) \
+  do {\
+    ::testing::TestPartResultArray gtest_failures;\
+    ::testing::internal::SingleFailureChecker gtest_checker(\
+        &gtest_failures, ::testing::TestPartResult::kNonFatalFailure, \
+        (substr));\
+    {\
+      ::testing::ScopedFakeTestPartResultReporter gtest_reporter(\
+          ::testing::ScopedFakeTestPartResultReporter::INTERCEPT_ALL_THREADS, \
+          &gtest_failures);\
+      if (::testing::internal::AlwaysTrue()) { statement; }\
+    }\
+  } while (::testing::internal::AlwaysFalse())
+
+#endif  // GTEST_INCLUDE_GTEST_GTEST_SPI_H_
+
+#include <ctype.h>
+#include <math.h>
+#include <stdarg.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <time.h>
+#include <wchar.h>
+#include <wctype.h>
+
+#include <algorithm>
+#include <iomanip>
+#include <limits>
+#include <ostream>  // NOLINT
+#include <sstream>
+#include <vector>
+
+#if GTEST_OS_LINUX
+
+// TODO(kenton@google.com): Use autoconf to detect availability of
+// gettimeofday().
+# define GTEST_HAS_GETTIMEOFDAY_ 1
+
+# include <fcntl.h>  // NOLINT
+# include <limits.h>  // NOLINT
+# include <sched.h>  // NOLINT
+// Declares vsnprintf().  This header is not available on Windows.
+# include <strings.h>  // NOLINT
+# include <sys/mman.h>  // NOLINT
+# include <sys/time.h>  // NOLINT
+# include <unistd.h>  // NOLINT
+# include <string>
+
+#elif GTEST_OS_SYMBIAN
+# define GTEST_HAS_GETTIMEOFDAY_ 1
+# include <sys/time.h>  // NOLINT
+
+#elif GTEST_OS_ZOS
+# define GTEST_HAS_GETTIMEOFDAY_ 1
+# include <sys/time.h>  // NOLINT
+
+// On z/OS we additionally need strings.h for strcasecmp.
+# include <strings.h>  // NOLINT
+
+#elif GTEST_OS_WINDOWS_MOBILE  // We are on Windows CE.
+
+# include <windows.h>  // NOLINT
+
+#elif GTEST_OS_WINDOWS  // We are on Windows proper.
+
+# include <io.h>  // NOLINT
+# include <sys/timeb.h>  // NOLINT
+# include <sys/types.h>  // NOLINT
+# include <sys/stat.h>  // NOLINT
+
+# if GTEST_OS_WINDOWS_MINGW
+// MinGW has gettimeofday() but not _ftime64().
+// TODO(kenton@google.com): Use autoconf to detect availability of
+//   gettimeofday().
+// TODO(kenton@google.com): There are other ways to get the time on
+//   Windows, like GetTickCount() or GetSystemTimeAsFileTime().  MinGW
+//   supports these.  consider using them instead.
+#  define GTEST_HAS_GETTIMEOFDAY_ 1
+#  include <sys/time.h>  // NOLINT
+# endif  // GTEST_OS_WINDOWS_MINGW
+
+// cpplint thinks that the header is already included, so we want to
+// silence it.
+# include <windows.h>  // NOLINT
+
+#else
+
+// Assume other platforms have gettimeofday().
+// TODO(kenton@google.com): Use autoconf to detect availability of
+//   gettimeofday().
+# define GTEST_HAS_GETTIMEOFDAY_ 1
+
+// cpplint thinks that the header is already included, so we want to
+// silence it.
+# include <sys/time.h>  // NOLINT
+# include <unistd.h>  // NOLINT
+
+#endif  // GTEST_OS_LINUX
+
+#if GTEST_HAS_EXCEPTIONS
+# include <stdexcept>
+#endif
+
+#if GTEST_CAN_STREAM_RESULTS_
+# include <arpa/inet.h>  // NOLINT
+# include <netdb.h>  // NOLINT
+#endif
+
+// Indicates that this translation unit is part of Google Test's
+// implementation.  It must come before gtest-internal-inl.h is
+// included, or there will be a compiler error.  This trick is to
+// prevent a user from accidentally including gtest-internal-inl.h in
+// his code.
+#define GTEST_IMPLEMENTATION_ 1
+// Copyright 2005, Google Inc.
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+//     * Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//     * Redistributions in binary form must reproduce the above
+// copyright notice, this list of conditions and the following disclaimer
+// in the documentation and/or other materials provided with the
+// distribution.
+//     * Neither the name of Google Inc. nor the names of its
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+// Utility functions and classes used by the Google C++ testing framework.
+//
+// Author: wan@google.com (Zhanyong Wan)
+//
+// This file contains purely Google Test's internal implementation.  Please
+// DO NOT #INCLUDE IT IN A USER PROGRAM.
+
+#ifndef GTEST_SRC_GTEST_INTERNAL_INL_H_
+#define GTEST_SRC_GTEST_INTERNAL_INL_H_
+
+// GTEST_IMPLEMENTATION_ is defined to 1 iff the current translation unit is
+// part of Google Test's implementation; otherwise it's undefined.
+#if !GTEST_IMPLEMENTATION_
+// A user is trying to include this from his code - just say no.
+# error "gtest-internal-inl.h is part of Google Test's internal implementation."
+# error "It must not be included except by Google Test itself."
+#endif  // GTEST_IMPLEMENTATION_
+
+#ifndef _WIN32_WCE
+# include <errno.h>
+#endif  // !_WIN32_WCE
+#include <stddef.h>
+#include <stdlib.h>  // For strtoll/_strtoul64/malloc/free.
+#include <string.h>  // For memmove.
+
+#include <algorithm>
+#include <string>
+#include <vector>
+
+
+#if GTEST_CAN_STREAM_RESULTS_
+# include <arpa/inet.h>  // NOLINT
+# include <netdb.h>  // NOLINT
+#endif
+
+#if GTEST_OS_WINDOWS
+# include <windows.h>  // NOLINT
+#endif  // GTEST_OS_WINDOWS
+
+
+namespace testing {
+
+// Declares the death_test_use_fork flag.
+//
+// We don't want the users to modify this flag in the code, but want
+// Google Test's own unit tests to be able to access it. Therefore we
+// declare it here as opposed to in gtest.h.
+GTEST_DECLARE_bool_(death_test_use_fork);
+
+namespace internal {
+
+// The value of GetTestTypeId() as seen from within the Google Test
+// library.  This is solely for testing GetTestTypeId().
+GTEST_API_ extern const TypeId kTestTypeIdInGoogleTest;
+
+// Names of the flags, spelled without any "gtest_" prefix (needed for parsing).
+const char kAlsoRunDisabledTestsFlag[] = "also_run_disabled_tests";
+const char kBreakOnFailureFlag[] = "break_on_failure";
+const char kCatchExceptionsFlag[] = "catch_exceptions";
+const char kColorFlag[] = "color";
+const char kFilterFlag[] = "filter";
+const char kListTestsFlag[] = "list_tests";
+const char kOutputFlag[] = "output";
+const char kPrintTimeFlag[] = "print_time";
+const char kRandomSeedFlag[] = "random_seed";
+const char kRepeatFlag[] = "repeat";
+const char kShuffleFlag[] = "shuffle";
+const char kStackTraceDepthFlag[] = "stack_trace_depth";
+const char kStreamResultToFlag[] = "stream_result_to";
+const char kThrowOnFailureFlag[] = "throw_on_failure";
+
+// Upper bound of the valid random seed range, which is [1, kMaxRandomSeed].
+const int kMaxRandomSeed = 99999;
+
+// g_help_flag is true iff the --help flag or an equivalent form is
+// specified on the command line.
+GTEST_API_ extern bool g_help_flag;
+
+// Returns the current time in milliseconds.
+GTEST_API_ TimeInMillis GetTimeInMillis();
+
+// Returns true iff Google Test should use colors in the output.
+GTEST_API_ bool ShouldUseColor(bool stdout_is_tty);
+
+// Formats the given time in milliseconds as seconds.
+GTEST_API_ std::string FormatTimeInMillisAsSeconds(TimeInMillis ms);
+
+// Converts the given time in milliseconds to a date string in the ISO 8601
+// format, without the timezone information.  N.B.: due to the use of the
+// non-reentrant localtime() function, this function is not thread safe.  Do
+// not use it in any code that can be called from multiple threads.
+GTEST_API_ std::string FormatEpochTimeInMillisAsIso8601(TimeInMillis ms);
+
+// Parses the string 'str' for the Int32 flag 'flag', in the form "--flag=value".
+//
+// On success, stores the value of the flag in *value, and returns
+// true.  On failure, returns false without changing *value.
+GTEST_API_ bool ParseInt32Flag(
+    const char* str, const char* flag, Int32* value);
+
+// Maps the value of the --gtest_random_seed flag onto a seed in
+// [1, kMaxRandomSeed].
+inline int GetRandomSeedFromFlag(Int32 random_seed_flag) {
+  // Flag 0 means "use the clock"; any other value is taken as given.
+  unsigned int seed_bits;
+  if (random_seed_flag != 0) {
+    seed_bits = static_cast<unsigned int>(random_seed_flag);
+  } else {
+    seed_bits = static_cast<unsigned int>(GetTimeInMillis());
+  }
+  // Fold the raw seed into [1, kMaxRandomSeed] so that it is easy to type.
+  const unsigned int span = static_cast<unsigned int>(kMaxRandomSeed);
+  return static_cast<int>((seed_bits - 1U) % span) + 1;
+}
+
+// Advances to the valid random seed that follows 'seed', wrapping from
+// kMaxRandomSeed back around to 1.  The behavior is undefined if 'seed'
+// is not itself a valid seed.
+inline int GetNextRandomSeed(int seed) {
+  GTEST_CHECK_(1 <= seed && seed <= kMaxRandomSeed)
+      << "Invalid random seed " << seed << " - must be in [1, "
+      << kMaxRandomSeed << "].";
+  if (seed + 1 > kMaxRandomSeed) return 1;
+  return seed + 1;
+}
+
+// Snapshots the value of every Google Test flag when constructed and
+// writes the saved values back to the flags when destroyed.
+class GTestFlagSaver {
+ public:
+  // Captures the current value of each flag.  The initializers are listed
+  // in member-declaration order.
+  GTestFlagSaver()
+      : also_run_disabled_tests_(GTEST_FLAG(also_run_disabled_tests)),
+        break_on_failure_(GTEST_FLAG(break_on_failure)),
+        catch_exceptions_(GTEST_FLAG(catch_exceptions)),
+        color_(GTEST_FLAG(color)),
+        death_test_style_(GTEST_FLAG(death_test_style)),
+        death_test_use_fork_(GTEST_FLAG(death_test_use_fork)),
+        filter_(GTEST_FLAG(filter)),
+        internal_run_death_test_(GTEST_FLAG(internal_run_death_test)),
+        list_tests_(GTEST_FLAG(list_tests)),
+        output_(GTEST_FLAG(output)),
+        print_time_(GTEST_FLAG(print_time)),
+        random_seed_(GTEST_FLAG(random_seed)),
+        repeat_(GTEST_FLAG(repeat)),
+        shuffle_(GTEST_FLAG(shuffle)),
+        stack_trace_depth_(GTEST_FLAG(stack_trace_depth)),
+        stream_result_to_(GTEST_FLAG(stream_result_to)),
+        throw_on_failure_(GTEST_FLAG(throw_on_failure)) {}
+
+  // Restores every flag.  Non-virtual d'tor: DO NOT INHERIT FROM THIS CLASS.
+  ~GTestFlagSaver() {
+    GTEST_FLAG(also_run_disabled_tests) = also_run_disabled_tests_;
+    GTEST_FLAG(break_on_failure) = break_on_failure_;
+    GTEST_FLAG(catch_exceptions) = catch_exceptions_;
+    GTEST_FLAG(color) = color_;
+    GTEST_FLAG(death_test_style) = death_test_style_;
+    GTEST_FLAG(death_test_use_fork) = death_test_use_fork_;
+    GTEST_FLAG(filter) = filter_;
+    GTEST_FLAG(internal_run_death_test) = internal_run_death_test_;
+    GTEST_FLAG(list_tests) = list_tests_;
+    GTEST_FLAG(output) = output_;
+    GTEST_FLAG(print_time) = print_time_;
+    GTEST_FLAG(random_seed) = random_seed_;
+    GTEST_FLAG(repeat) = repeat_;
+    GTEST_FLAG(shuffle) = shuffle_;
+    GTEST_FLAG(stack_trace_depth) = stack_trace_depth_;
+    GTEST_FLAG(stream_result_to) = stream_result_to_;
+    GTEST_FLAG(throw_on_failure) = throw_on_failure_;
+  }
+
+ private:
+  // One saved copy per flag, named after the flag with a trailing underscore.
+  bool also_run_disabled_tests_;
+  bool break_on_failure_;
+  bool catch_exceptions_;
+  std::string color_;
+  std::string death_test_style_;
+  bool death_test_use_fork_;
+  std::string filter_;
+  std::string internal_run_death_test_;
+  bool list_tests_;
+  std::string output_;
+  bool print_time_;
+  internal::Int32 random_seed_;
+  internal::Int32 repeat_;
+  bool shuffle_;
+  internal::Int32 stack_trace_depth_;
+  std::string stream_result_to_;
+  bool throw_on_failure_;
+} GTEST_ATTRIBUTE_UNUSED_;
+
+// Converts a Unicode code point to a narrow string in UTF-8 encoding.
+// code_point parameter is of type UInt32 because wchar_t may not be
+// wide enough to contain a code point.
+// If the code_point is not a valid Unicode code point
+// (i.e. outside of Unicode range U+0 to U+10FFFF) it will be converted
+// to "(Invalid Unicode 0xXXXXXXXX)".
+GTEST_API_ std::string CodePointToUtf8(UInt32 code_point);
+
+// Converts a wide string to a narrow string in UTF-8 encoding.
+// The wide string is assumed to have the following encoding:
+//   UTF-16 if sizeof(wchar_t) == 2 (on Windows, Cygwin, Symbian OS)
+//   UTF-32 if sizeof(wchar_t) == 4 (on Linux)
+// Parameter str points to a null-terminated wide string.
+// Parameter num_chars may additionally limit the number
+// of wchar_t characters processed. -1 is used when the entire string
+// should be processed.
+// If the string contains code points that are not valid Unicode code points
+// (i.e. outside of Unicode range U+0 to U+10FFFF) they will be output
+// as '(Invalid Unicode 0xXXXXXXXX)'. If the string is in UTF16 encoding
+// and contains invalid UTF-16 surrogate pairs, values in those pairs will be
+// encoded as individual Unicode characters from the Basic Multilingual Plane.
+GTEST_API_ std::string WideStringToUtf8(const wchar_t* str, int num_chars);
+
+// Reads the GTEST_SHARD_STATUS_FILE environment variable, and creates the file
+// if the variable is present. If a file already exists at this location, this
+// function will write over it. If the variable is present, but the file cannot
+// be created, prints an error and exits.
+void WriteToShardStatusFileIfNeeded();
+
+// Checks whether sharding is enabled by examining the relevant
+// environment variable values. If the variables are present,
+// but inconsistent (e.g., shard_index >= total_shards), prints
+// an error and exits. If in_subprocess_for_death_test, sharding is
+// disabled because it must only be applied to the original test
+// process. Otherwise, we could filter out death tests we intended to execute.
+GTEST_API_ bool ShouldShard(const char* total_shards_str,
+                            const char* shard_index_str,
+                            bool in_subprocess_for_death_test);
+
+// Parses the environment variable env_var as an Int32. If it is unset,
+// returns default_val. If it is not an Int32, prints an error
+// and aborts.
+GTEST_API_ Int32 Int32FromEnvOrDie(const char* env_var, Int32 default_val);
+
+// Given the total number of shards, the shard index, and the test id,
+// returns true iff the test should be run on this shard. The test id is
+// some arbitrary but unique non-negative integer assigned to each test
+// method. Assumes that 0 <= shard_index < total_shards.
+GTEST_API_ bool ShouldRunTestOnShard(
+    int total_shards, int shard_index, int test_id);
+
+// STL container utilities.
+
+// Counts how many elements of container 'c' make 'predicate' return true.
+// The predicate is applied once to each element, from begin() to end().
+template <class Container, typename Predicate>
+inline int CountIf(const Container& c, Predicate predicate) {
+  // std::count_if() is avoided on purpose: its signature in libCstd on
+  // Solaris is non-standard, so the tally is kept by hand.
+  int matches = 0;
+  typename Container::const_iterator pos = c.begin();
+  for (; pos != c.end(); ++pos) {
+    matches += predicate(*pos) ? 1 : 0;
+  }
+  return matches;
+}
+
+// Applies 'functor' to each element of 'c' via std::for_each; returns nothing.
+template <class Container, typename Functor>
+void ForEach(const Container& c, Functor functor) {
+  std::for_each(c.begin(), c.end(), functor);
+}
+
+// Bounds-checked element access: yields v[i] when 0 <= i < v.size(), and
+// default_value for every other index.
+template <typename E>
+inline E GetElementOr(const std::vector<E>& v, int i, E default_value) {
+  return (i >= 0 && i < static_cast<int>(v.size())) ? v[i] : default_value;
+}
+
+// Shuffles, in place, the elements of *v whose indices lie in the STL-style
+// half-open range [begin, end).  Passing 'end' == v->size() shuffles
+// through the last element.  Both bounds are checked with GTEST_CHECK_
+// before any element is moved.
+template <typename E>
+void ShuffleRange(internal::Random* random, int begin, int end,
+                  std::vector<E>* v) {
+  const int size = static_cast<int>(v->size());
+  GTEST_CHECK_(0 <= begin && begin <= size)
+      << "Invalid shuffle range start " << begin << ": must be in range [0, "
+      << size << "].";
+  GTEST_CHECK_(begin <= end && end <= size)
+      << "Invalid shuffle range finish " << end << ": must be in range ["
+      << begin << ", " << size << "].";
+
+  // Fisher-Yates shuffle (http://en.wikipedia.org/wiki/Fisher-Yates_shuffle):
+  // walk the last slot down toward 'begin', swapping a random pick into it.
+  for (int last = end - 1; last > begin; --last) {
+    const int candidates = last - begin + 1;  // Width of the unshuffled part.
+    const int pick = begin + random->Generate(candidates);
+    std::swap((*v)[pick], (*v)[last]);
+  }
+}
+
+// Shuffles all of *v in place: ShuffleRange() over indices [0, v->size()).
+template <typename E>
+inline void Shuffle(internal::Random* random, std::vector<E>* v) {
+  ShuffleRange(random, 0, static_cast<int>(v->size()), v);
+}
+
+// Deletes the object that x points to, using scalar 'delete' (not
+// 'delete[]').  Being a plain function, it is handy for use as a functor.
+template <typename T>
+static void Delete(T* x) {
+  delete x;
+}
+
+// A predicate that checks the key of a TestProperty against a known key.
+//
+// TestPropertyKeyIs is copyable.
+class TestPropertyKeyIs {
+ public:
+  // Constructor; stores a copy of 'key' to compare against.
+  //
+  // TestPropertyKeyIs has NO default constructor.
+  explicit TestPropertyKeyIs(const std::string& key) : key_(key) {}
+
+  // Returns true iff the key of test_property compares equal to key_.
+  bool operator()(const TestProperty& test_property) const {
+    return test_property.key() == key_;
+  }
+
+ private:
+  std::string key_;  // The key every examined TestProperty is matched against.
+};
+
+// Class UnitTestOptions.
+//
+// This class contains functions for processing options the user
+// specifies when running the tests.  It has only static members.
+//
+// In most cases, the user can specify an option using either an
+// environment variable or a command line flag.  E.g. you can set the
+// test filter using either GTEST_FILTER or --gtest_filter.  If both
+// the variable and the flag are present, the latter overrides the
+// former.
+class GTEST_API_ UnitTestOptions {
+ public:
+  // Functions for processing the gtest_output flag.
+
+  // Returns the output format, or "" for normal printed output.
+  static std::string GetOutputFormat();
+
+  // Returns the absolute path of the requested output file, or the
+  // default (test_detail.xml in the original working directory) if
+  // none was explicitly specified.
+  static std::string GetAbsolutePathToOutputFile();
+
+  // Functions for processing the gtest_filter flag.
+
+  // Returns true iff the wildcard pattern matches the string.  The
+  // first ':' or '\0' character in pattern marks the end of it.
+  //
+  // This recursive algorithm isn't very efficient, but is clear and
+  // works well enough for matching test names, which are short.
+  static bool PatternMatchesString(const char *pattern, const char *str);
+
+  // Returns true iff the user-specified filter matches the given
+  // combination of test case name and test name.
+  static bool FilterMatchesTest(const std::string &test_case_name,
+                                const std::string &test_name);
+
+#if GTEST_OS_WINDOWS
+  // Function for supporting the gtest_catch_exception flag (Windows only).
+
+  // Returns EXCEPTION_EXECUTE_HANDLER if Google Test should handle the
+  // given SEH exception, or EXCEPTION_CONTINUE_SEARCH otherwise.
+  // This function is useful as an __except condition.
+  static int GTestShouldProcessSEH(DWORD exception_code);
+#endif  // GTEST_OS_WINDOWS
+
+  // Returns true if "name" matches at least one pattern in the ':' separated
+  // list of glob-style filters in "filter" - presumably any match suffices.
+  static bool MatchesFilter(const std::string& name, const char* filter);
+};
+
+// Returns the current application's name, removing the directory path if
+// one is present.  Used by UnitTestOptions::GetAbsolutePathToOutputFile.
+GTEST_API_ FilePath GetCurrentExecutableName();
+
+// Abstract interface for getting the OS stack trace as a string.
+class OsStackTraceGetterInterface {
+ public:
+  OsStackTraceGetterInterface() {}
+  virtual ~OsStackTraceGetterInterface() {}  // virtual: used polymorphically
+
+  // Returns the current OS stack trace as a string.  Parameters:
+  //
+  //   max_depth  - the maximum number of stack frames to be included
+  //                in the trace.
+  //   skip_count - the number of top frames to be skipped; doesn't count
+  //                against max_depth.
+  virtual string CurrentStackTrace(int max_depth, int skip_count) = 0;
+
+  // UponLeavingGTest() should be called immediately before Google Test calls
+  // user code.  It saves some information about the current stack that
+  // CurrentStackTrace() will use to find and hide Google Test stack frames.
+  virtual void UponLeavingGTest() = 0;
+
+ private:
+  GTEST_DISALLOW_COPY_AND_ASSIGN_(OsStackTraceGetterInterface);
+};
+
+// A concrete implementation of the OsStackTraceGetterInterface interface.
+class OsStackTraceGetter : public OsStackTraceGetterInterface {
+ public:
+  OsStackTraceGetter() : caller_frame_(NULL) {}  // no frame saved yet
+
+  virtual string CurrentStackTrace(int max_depth, int skip_count)
+      GTEST_LOCK_EXCLUDED_(mutex_);
+
+  virtual void UponLeavingGTest() GTEST_LOCK_EXCLUDED_(mutex_);
+
+  // This string is inserted in place of stack frames that are part of
+  // Google Test's implementation.
+  static const char* const kElidedFramesMarker;
+
+ private:
+  Mutex mutex_;  // protects all internal state
+
+  // We save the stack frame below the frame that calls user code.
+  // We do this because the address of the frame immediately below
+  // the user code changes between the call to UponLeavingGTest()
+  // and any calls to CurrentStackTrace() from within the user code.
+  void* caller_frame_;
+
+  GTEST_DISALLOW_COPY_AND_ASSIGN_(OsStackTraceGetter);
+};
+
+// Plain record describing one Google Test trace point.
+struct TraceInfo {
+  const char* file;     // presumably the source file of the trace point
+  int line;             // presumably the line number within `file`
+  std::string message;  // text attached to the trace point
+};
+
+// This is the default global test part result reporter used in UnitTestImpl.
+// This class should only be used by UnitTestImpl.
+class DefaultGlobalTestPartResultReporter
+  : public TestPartResultReporterInterface {
+ public:
+  explicit DefaultGlobalTestPartResultReporter(UnitTestImpl* unit_test);
+  // Implements the TestPartResultReporterInterface.  Reports the test part
+  // result in the current test.
+  virtual void ReportTestPartResult(const TestPartResult& result);
+
+ private:
+  UnitTestImpl* const unit_test_;  // const pointer: never re-seated
+
+  GTEST_DISALLOW_COPY_AND_ASSIGN_(DefaultGlobalTestPartResultReporter);
+};
+
+// This is the default per-thread test part result reporter used in
+// UnitTestImpl.  This class should only be used by UnitTestImpl.
+class DefaultPerThreadTestPartResultReporter
+    : public TestPartResultReporterInterface {
+ public:
+  explicit DefaultPerThreadTestPartResultReporter(UnitTestImpl* unit_test);
+  // Implements the TestPartResultReporterInterface.  The implementation just
+  // delegates to the current global test part result reporter of *unit_test_.
+  virtual void ReportTestPartResult(const TestPartResult& result);
+
+ private:
+  UnitTestImpl* const unit_test_;  // const pointer: never re-seated
+
+  GTEST_DISALLOW_COPY_AND_ASSIGN_(DefaultPerThreadTestPartResultReporter);
+};
+
+// The private implementation of the UnitTest class.  We don't protect
+// the methods under a mutex, as this class is not accessible by a
+// user and the UnitTest class that delegates work to this class does
+// proper locking.
+class GTEST_API_ UnitTestImpl {
+ public:
+  explicit UnitTestImpl(UnitTest* parent);
+  virtual ~UnitTestImpl();
+
+  // There are two different ways to register your own TestPartResultReporter.
+  // You can register your own reporter to listen either only for test results
+  // from the current thread or for results from all threads.
+  // By default, each per-thread test result reporter just passes a new
+  // TestPartResult to the global test result reporter, which registers the
+  // test part result for the currently running test.
+
+  // Returns the global test part result reporter.
+  TestPartResultReporterInterface* GetGlobalTestPartResultReporter();
+
+  // Sets the global test part result reporter.
+  void SetGlobalTestPartResultReporter(
+      TestPartResultReporterInterface* reporter);
+
+  // Returns the test part result reporter for the current thread.
+  TestPartResultReporterInterface* GetTestPartResultReporterForCurrentThread();
+
+  // Sets the test part result reporter for the current thread.
+  void SetTestPartResultReporterForCurrentThread(
+      TestPartResultReporterInterface* reporter);
+
+  // Gets the number of successful test cases.
+  int successful_test_case_count() const;
+
+  // Gets the number of failed test cases.
+  int failed_test_case_count() const;
+
+  // Gets the number of all test cases.
+  int total_test_case_count() const;
+
+  // Gets the number of all test cases that contain at least one test
+  // that should run.
+  int test_case_to_run_count() const;
+
+  // Gets the number of successful tests.
+  int successful_test_count() const;
+
+  // Gets the number of failed tests.
+  int failed_test_count() const;
+
+  // Gets the number of disabled tests that will be reported in the XML report.
+  int reportable_disabled_test_count() const;
+
+  // Gets the number of disabled tests.
+  int disabled_test_count() const;
+
+  // Gets the number of tests to be printed in the XML report.
+  int reportable_test_count() const;
+
+  // Gets the number of all tests.
+  int total_test_count() const;
+
+  // Gets the number of tests that should run.
+  int test_to_run_count() const;
+
+  // Gets the time of the test program start, in ms from the start of the
+  // UNIX epoch.
+  TimeInMillis start_timestamp() const { return start_timestamp_; }
+
+  // Gets the elapsed time, in milliseconds.
+  TimeInMillis elapsed_time() const { return elapsed_time_; }
+
+  // Returns true iff the unit test passed (i.e. all test cases passed).
+  bool Passed() const { return !Failed(); }
+
+  // Returns true iff the unit test failed, i.e. at least one test case
+  // failed or the ad hoc test result (failures recorded outside of all
+  // tests) reports a failure.
+  bool Failed() const {
+    if (failed_test_case_count() > 0) {
+      return true;
+    }
+    return ad_hoc_test_result()->Failed();
+  }
+
+  // Gets the i-th test case among all the test cases, following the order
+  // recorded in test_case_indices_.  i can range from 0 to
+  // total_test_case_count() - 1.  If i is not in that range, returns NULL.
+  const TestCase* GetTestCase(int i) const {
+    // test_case_indices_[i] is the position in test_cases_ of the i-th test
+    // case; GetElementOr() yields the fallback -1 when i is out of range.
+    const int index = GetElementOr(test_case_indices_, i, -1);
+    // Index with the looked-up position rather than with i itself, so that
+    // the indirection is honored exactly as in GetMutableTestCase().
+    return index < 0 ? NULL : test_cases_[index];
+  }
+
+  // Mutable access to the i-th test case.  Valid values of i run from 0 to
+  // total_test_case_count() - 1; any other value yields NULL.
+  TestCase* GetMutableTestCase(int i) {
+    const int position = GetElementOr(test_case_indices_, i, -1);
+    if (position < 0) {
+      return NULL;
+    }
+    return test_cases_[position];
+  }
+
+  // Provides access to the event listener list.
+  TestEventListeners* listeners() { return &listeners_; }
+
+  // Returns the TestResult for the test that's currently running, or
+  // the TestResult for the ad hoc test if no test is running.
+  TestResult* current_test_result();
+
+  // Returns the TestResult for the ad hoc test.
+  const TestResult* ad_hoc_test_result() const { return &ad_hoc_test_result_; }
+
+  // Sets the OS stack trace getter.
+  //
+  // Does nothing if the input and the current OS stack trace getter
+  // are the same; otherwise, deletes the old getter and makes the
+  // input the current getter.
+  void set_os_stack_trace_getter(OsStackTraceGetterInterface* getter);
+
+  // Returns the current OS stack trace getter if it is not NULL;
+  // otherwise, creates an OsStackTraceGetter, makes it the current
+  // getter, and returns it.
+  OsStackTraceGetterInterface* os_stack_trace_getter();
+
+  // Returns the current OS stack trace as an std::string.
+  //
+  // The maximum number of stack frames to be included is specified by
+  // the gtest_stack_trace_depth flag.  The skip_count parameter
+  // specifies the number of top frames to be skipped, which doesn't
+  // count against the number of frames to be included.
+  //
+  // For example, if Foo() calls Bar(), which in turn calls
+  // CurrentOsStackTraceExceptTop(1), Foo() will be included in the
+  // trace but Bar() and CurrentOsStackTraceExceptTop() won't.
+  std::string CurrentOsStackTraceExceptTop(int skip_count) GTEST_NO_INLINE_;
+
+  // Finds and returns a TestCase with the given name.  If one doesn't
+  // exist, creates one and returns it.
+  //
+  // Arguments:
+  //
+  //   test_case_name: name of the test case
+  //   type_param:     the name of the test's type parameter, or NULL if
+  //                   this is not a typed or a type-parameterized test.
+  //   set_up_tc:      pointer to the function that sets up the test case
+  //   tear_down_tc:   pointer to the function that tears down the test case
+  TestCase* GetTestCase(const char* test_case_name,
+                        const char* type_param,
+                        Test::SetUpTestCaseFunc set_up_tc,
+                        Test::TearDownTestCaseFunc tear_down_tc);
+
+  // Registers test_info with the test case it belongs to; the test case is
+  // obtained through GetTestCase(), which creates it when it does not exist.
+  //
+  // Arguments:
+  //
+  //   set_up_tc:    pointer to the function that sets up the test case
+  //   tear_down_tc: pointer to the function that tears down the test case
+  //   test_info:    the TestInfo object
+  void AddTestInfo(Test::SetUpTestCaseFunc set_up_tc,
+                   Test::TearDownTestCaseFunc tear_down_tc,
+                   TestInfo* test_info) {
+    // Thread-safe death tests need the working directory the test program
+    // was originally started in.  RUN_ALL_TESTS() is too late a place to
+    // record it, because the user may have changed the current directory
+    // before calling it.  AddTestInfo() is called to register a TEST or
+    // TEST_F before main() is reached, so the directory is captured here,
+    // once, on the first registration.
+    if (original_working_dir_.IsEmpty()) {
+      original_working_dir_.Set(FilePath::GetCurrentDir());
+      GTEST_CHECK_(!original_working_dir_.IsEmpty())
+          << "Failed to get the current working directory.";
+    }
+
+    TestCase* const owning_test_case =
+        GetTestCase(test_info->test_case_name(),
+                    test_info->type_param(),
+                    set_up_tc,
+                    tear_down_tc);
+    owning_test_case->AddTestInfo(test_info);
+  }
+
+#if GTEST_HAS_PARAM_TEST
+  // Returns ParameterizedTestCaseRegistry object used to keep track of
+  // value-parameterized tests and instantiate and register them.
+  internal::ParameterizedTestCaseRegistry& parameterized_test_registry() {
+    return parameterized_test_registry_;
+  }
+#endif  // GTEST_HAS_PARAM_TEST
+
+  // Sets the TestCase object for the test that's currently running.
+  void set_current_test_case(TestCase* a_current_test_case) {
+    current_test_case_ = a_current_test_case;
+  }
+
+  // Sets the TestInfo object for the test that's currently running.  If
+  // current_test_info is NULL, the assertion results will be stored in
+  // ad_hoc_test_result_.
+  void set_current_test_info(TestInfo* a_current_test_info) {
+    current_test_info_ = a_current_test_info;
+  }
+
+  // Registers all parameterized tests defined using TEST_P and
+  // INSTANTIATE_TEST_CASE_P, creating regular tests for each test/parameter
+  // combination. This method can be called more than once; it has guards
+  // protecting from registering the tests more than once.  If
+  // value-parameterized tests are disabled, RegisterParameterizedTests is
+  // present but does nothing.
+  void RegisterParameterizedTests();
+
+  // Runs all tests in this UnitTest object, prints the result, and
+  // returns true if all tests are successful.  If any exception is
+  // thrown during a test, this test is considered to be failed, but
+  // the rest of the tests will still be run.
+  bool RunAllTests();
+
+  // Clears the results of all tests, except the ad hoc tests.
+  void ClearNonAdHocTestResult() {
+    ForEach(test_cases_, TestCase::ClearTestCaseResult);
+  }
+
+  // Clears the results of ad-hoc test assertions.
+  void ClearAdHocTestResult() {
+    ad_hoc_test_result_.Clear();
+  }
+
+  // Adds a TestProperty to the current TestResult object when invoked in a
+  // context of a test or a test case, or to the global property set. If the
+  // result already contains a property with the same key, the value will be
+  // updated.
+  void RecordProperty(const TestProperty& test_property);
+
+  enum ReactionToSharding {
+    HONOR_SHARDING_PROTOCOL,
+    IGNORE_SHARDING_PROTOCOL
+  };
+
+  // Matches the full name of each test against the user-specified
+  // filter to decide whether the test should run, then records the
+  // result in each TestCase and TestInfo object.
+  // If shard_tests == HONOR_SHARDING_PROTOCOL, further filters tests
+  // based on sharding variables in the environment.
+  // Returns the number of tests that should run.
+  int FilterTests(ReactionToSharding shard_tests);
+
+  // Prints the names of the tests matching the user-specified filter flag.
+  void ListTestsMatchingFilter();
+
+  const TestCase* current_test_case() const { return current_test_case_; }
+  TestInfo* current_test_info() { return current_test_info_; }
+  const TestInfo* current_test_info() const { return current_test_info_; }
+
+  // Returns the vector of environments that need to be set-up/torn-down
+  // before/after the tests are run.
+  std::vector<Environment*>& environments() { return environments_; }
+
+  // Getters for the per-thread Google Test trace stack.
+  std::vector<TraceInfo>& gtest_trace_stack() {
+    return *(gtest_trace_stack_.pointer());
+  }
+  const std::vector<TraceInfo>& gtest_trace_stack() const {
+    return gtest_trace_stack_.get();
+  }
+
+#if GTEST_HAS_DEATH_TEST
+  void InitDeathTestSubprocessControlInfo() {
+    internal_run_death_test_flag_.reset(ParseInternalRunDeathTestFlag());
+  }
+  // Returns a pointer to the parsed --gtest_internal_run_death_test
+  // flag, or NULL if that flag was not specified.
+  // This information is useful only in a death test child process.
+  // Must not be called before a call to InitGoogleTest.
+  const InternalRunDeathTestFlag* internal_run_death_test_flag() const {
+    return internal_run_death_test_flag_.get();
+  }
+
+  // Returns a pointer to the current death test factory.
+  internal::DeathTestFactory* death_test_factory() {
+    return death_test_factory_.get();
+  }
+
+  void SuppressTestEventsIfInSubprocess();
+
+  friend class ReplaceDeathTestFactory;
+#endif  // GTEST_HAS_DEATH_TEST
+
+  // Initializes the event listener performing XML output as specified by
+  // UnitTestOptions. Must not be called before InitGoogleTest.
+  void ConfigureXmlOutput();
+
+#if GTEST_CAN_STREAM_RESULTS_
+  // Initializes the event listener for streaming test results to a socket.
+  // Must not be called before InitGoogleTest.
+  void ConfigureStreamingOutput();
+#endif
+
+  // Performs initialization dependent upon flag values obtained in
+  // ParseGoogleTestFlagsOnly.  Is called from InitGoogleTest after the call to
+  // ParseGoogleTestFlagsOnly.  In case a user neglects to call InitGoogleTest
+  // this function is also called from RunAllTests.  Since this function can be
+  // called more than once, it has to be idempotent.
+  void PostFlagParsingInit();
+
+  // Gets the random seed used at the start of the current test iteration.
+  int random_seed() const { return random_seed_; }
+
+  // Gets the random number generator.
+  internal::Random* random() { return &random_; }
+
+  // Shuffles all test cases, and the tests within each test case,
+  // making sure that death tests are still run first.
+  void ShuffleTests();
+
+  // Restores the test cases and tests to their order before the first shuffle.
+  void UnshuffleTests();
+
+  // Returns the value of GTEST_FLAG(catch_exceptions) at the moment
+  // UnitTest::Run() starts.
+  bool catch_exceptions() const { return catch_exceptions_; }
+
+ private:
+  friend class ::testing::UnitTest;
+
+  // Used by UnitTest::Run() to capture the state of
+  // GTEST_FLAG(catch_exceptions) at the moment it starts.
+  void set_catch_exceptions(bool value) { catch_exceptions_ = value; }
+
+  // The UnitTest object that owns this implementation object.
+  UnitTest* const parent_;
+
+  // The working directory when the first TEST() or TEST_F() was
+  // executed.
+  internal::FilePath original_working_dir_;
+
+  // The default test part result reporters.
+  DefaultGlobalTestPartResultReporter default_global_test_part_result_reporter_;
+  DefaultPerThreadTestPartResultReporter
+      default_per_thread_test_part_result_reporter_;
+
+  // Points to (but doesn't own) the global test part result reporter.
+  TestPartResultReporterInterface* global_test_part_result_repoter_;
+
+  // Protects read and write access to global_test_part_result_repoter_
+  // (the member name really is spelled that way).
+  internal::Mutex global_test_part_result_reporter_mutex_;
+
+  // Points to (but doesn't own) the per-thread test part result reporter.
+  internal::ThreadLocal<TestPartResultReporterInterface*>
+      per_thread_test_part_result_reporter_;
+
+  // The vector of environments that need to be set-up/torn-down
+  // before/after the tests are run.
+  std::vector<Environment*> environments_;
+
+  // The vector of TestCases in their original order.  It owns the
+  // elements in the vector.
+  std::vector<TestCase*> test_cases_;
+
+  // Provides a level of indirection for the test case list to allow
+  // easy shuffling and restoring the test case order.  The i-th
+  // element of this vector is the index of the i-th test case in the
+  // shuffled order.
+  std::vector<int> test_case_indices_;
+
+#if GTEST_HAS_PARAM_TEST
+  // ParameterizedTestRegistry object used to register value-parameterized
+  // tests.
+  internal::ParameterizedTestCaseRegistry parameterized_test_registry_;
+
+  // Indicates whether RegisterParameterizedTests() has been called already.
+  bool parameterized_tests_registered_;
+#endif  // GTEST_HAS_PARAM_TEST
+
+  // Index of the last death test case registered.  Initially -1.
+  int last_death_test_case_;
+
+  // This points to the TestCase for the currently running test.  It
+  // changes as Google Test goes through one test case after another.
+  // When no test is running, this is set to NULL and Google Test
+  // stores assertion results in ad_hoc_test_result_.  Initially NULL.
+  TestCase* current_test_case_;
+
+  // This points to the TestInfo for the currently running test.  It
+  // changes as Google Test goes through one test after another.  When
+  // no test is running, this is set to NULL and Google Test stores
+  // assertion results in ad_hoc_test_result_.  Initially NULL.
+  TestInfo* current_test_info_;
+
+  // Normally, a user only writes assertions inside a TEST or TEST_F,
+  // or inside a function called by a TEST or TEST_F.  Since Google
+  // Test keeps track of which test is current running, it can
+  // associate such an assertion with the test it belongs to.
+  //
+  // If an assertion is encountered when no TEST or TEST_F is running,
+  // Google Test attributes the assertion result to an imaginary "ad hoc"
+  // test, and records the result in ad_hoc_test_result_.
+  TestResult ad_hoc_test_result_;
+
+  // The list of event listeners that can be used to track events inside
+  // Google Test.
+  TestEventListeners listeners_;
+
+  // The OS stack trace getter.  Will be deleted when the UnitTest
+  // object is destructed.  By default, an OsStackTraceGetter is used,
+  // but the user can set this field to use a custom getter if that is
+  // desired.
+  OsStackTraceGetterInterface* os_stack_trace_getter_;
+
+  // True iff PostFlagParsingInit() has been called.
+  bool post_flag_parse_init_performed_;
+
+  // The random number seed used at the beginning of the test run.
+  int random_seed_;
+
+  // Our random number generator.
+  internal::Random random_;
+
+  // The time of the test program start, in ms from the start of the
+  // UNIX epoch.
+  TimeInMillis start_timestamp_;
+
+  // How long the test took to run, in milliseconds.
+  TimeInMillis elapsed_time_;
+
+#if GTEST_HAS_DEATH_TEST
+  // The decomposed components of the gtest_internal_run_death_test flag,
+  // parsed when RUN_ALL_TESTS is called.
+  internal::scoped_ptr<InternalRunDeathTestFlag> internal_run_death_test_flag_;
+  internal::scoped_ptr<internal::DeathTestFactory> death_test_factory_;
+#endif  // GTEST_HAS_DEATH_TEST
+
+  // A per-thread stack of traces created by the SCOPED_TRACE() macro.
+  internal::ThreadLocal<std::vector<TraceInfo> > gtest_trace_stack_;
+
+  // The value of GTEST_FLAG(catch_exceptions) at the moment RunAllTests()
+  // starts.
+  bool catch_exceptions_;
+
+  GTEST_DISALLOW_COPY_AND_ASSIGN_(UnitTestImpl);
+};  // class UnitTestImpl
+
+// Shorthand for reaching the implementation object of the global
+// UnitTest instance.
+inline UnitTestImpl* GetUnitTestImpl() {
+  UnitTest* const unit_test = UnitTest::GetInstance();
+  return unit_test->impl();
+}
+
+#if GTEST_USES_SIMPLE_RE
+
+// Internal helper functions for implementing the simple regular
+// expression matcher.
+GTEST_API_ bool IsInSet(char ch, const char* str);
+GTEST_API_ bool IsAsciiDigit(char ch);
+GTEST_API_ bool IsAsciiPunct(char ch);
+GTEST_API_ bool IsRepeat(char ch);
+GTEST_API_ bool IsAsciiWhiteSpace(char ch);
+GTEST_API_ bool IsAsciiWordChar(char ch);
+GTEST_API_ bool IsValidEscape(char ch);
+GTEST_API_ bool AtomMatchesChar(bool escaped, char pattern, char ch);
+GTEST_API_ bool ValidateRegex(const char* regex);
+GTEST_API_ bool MatchRegexAtHead(const char* regex, const char* str);
+GTEST_API_ bool MatchRepetitionAndRegexAtHead(
+    bool escaped, char ch, char repeat, const char* regex, const char* str);
+GTEST_API_ bool MatchRegexAnywhere(const char* regex, const char* str);
+
+#endif  // GTEST_USES_SIMPLE_RE
+
+// Parses the command line for Google Test flags, without initializing
+// other parts of Google Test.
+GTEST_API_ void ParseGoogleTestFlagsOnly(int* argc, char** argv);
+GTEST_API_ void ParseGoogleTestFlagsOnly(int* argc, wchar_t** argv);
+
+#if GTEST_HAS_DEATH_TEST
+
+// Returns the message describing the last system error, regardless of the
+// platform.
+GTEST_API_ std::string GetLastErrnoDescription();
+
+# if GTEST_OS_WINDOWS
+// Owns a Windows kernel handle and closes it when the owner is destroyed
+// or the handle is replaced, so the handle cannot leak.
+class AutoHandle {
+ public:
+  // Starts out holding INVALID_HANDLE_VALUE, i.e. owning nothing.
+  AutoHandle() : handle_(INVALID_HANDLE_VALUE) {}
+  // Takes ownership of handle.
+  explicit AutoHandle(HANDLE handle) : handle_(handle) {}
+
+  // Closes the held handle unless it is INVALID_HANDLE_VALUE.
+  ~AutoHandle() { Reset(); }
+
+  // Returns the held handle; ownership stays with this object.
+  HANDLE Get() const { return handle_; }
+
+  // Closes the held handle (if valid) and holds INVALID_HANDLE_VALUE after.
+  void Reset() { Reset(INVALID_HANDLE_VALUE); }
+
+  // Replaces the held handle with handle, closing the previous one if it
+  // was valid.  Passing the handle that is already held does nothing.
+  void Reset(HANDLE handle) {
+    if (handle == handle_) {
+      return;
+    }
+    const bool holds_valid_handle = (handle_ != INVALID_HANDLE_VALUE);
+    if (holds_valid_handle) {
+      ::CloseHandle(handle_);
+    }
+    handle_ = handle;
+  }
+
+ private:
+  HANDLE handle_;  // The handle currently owned by this object.
+
+  GTEST_DISALLOW_COPY_AND_ASSIGN_(AutoHandle);
+};
+# endif  // GTEST_OS_WINDOWS
+
+// Attempts to parse str as a non-negative decimal integer ("0" is
+// accepted) and to store it in *number.
+//
+// Parameters:
+//   str    - the text to parse.  It must begin with a digit (no leading
+//            whitespace or sign) and contain nothing but the number.
+//   number - receives the parsed value on success; it is left untouched
+//            when the function returns false.
+//
+// Returns true iff the whole string was consumed, the conversion reported
+// no error, and the value is representable in Integer as a non-negative
+// number.
+//
+// GTEST_HAS_DEATH_TEST implies that we have ::std::string, so we can use
+// it here.
+template <typename Integer>
+bool ParseNaturalNumber(const ::std::string& str, Integer* number) {
+  // Fail fast if the given string does not begin with a digit;
+  // this bypasses strtoXXX's "optional leading whitespace and plus
+  // or minus sign" semantics, which are undesirable here.
+  if (str.empty() || !IsDigit(str[0])) {
+    return false;
+  }
+  // Clear errno so that an error set by the conversion below can be
+  // told apart from a stale value.
+  errno = 0;
+
+  char* end;
+  // BiggestConvertible is the largest integer type that system-provided
+  // string-to-number conversion routines can return.
+
+# if GTEST_OS_WINDOWS && !defined(__GNUC__)
+
+  // MSVC and C++ Builder define __int64 instead of the standard long long.
+  typedef unsigned __int64 BiggestConvertible;
+  const BiggestConvertible parsed = _strtoui64(str.c_str(), &end, 10);
+
+# else
+
+  typedef unsigned long long BiggestConvertible;  // NOLINT
+  const BiggestConvertible parsed = strtoull(str.c_str(), &end, 10);
+
+# endif  // GTEST_OS_WINDOWS && !defined(__GNUC__)
+
+  // Success requires that the conversion stopped at the terminating NUL
+  // and did not flag an error.
+  const bool parse_success = *end == '\0' && errno == 0;
+
+  // TODO(vladl@google.com): Convert this to compile time assertion when it is
+  // available.
+  GTEST_CHECK_(sizeof(Integer) <= sizeof(parsed));
+
+  const Integer result = static_cast<Integer>(parsed);
+  // The narrowed value must convert back to exactly what was parsed ...
+  const bool round_trips = static_cast<BiggestConvertible>(result) == parsed;
+  // ... and must not have become negative.  When Integer is signed and as
+  // wide as BiggestConvertible, a value above Integer's maximum wraps to a
+  // negative number that still round-trips, so the check above alone would
+  // accept it.  Written as "zero or positive" so that it is also valid
+  // (and warning-free) for unsigned Integer types.
+  const bool non_negative = parsed == 0 || result > 0;
+  if (parse_success && round_trips && non_negative) {
+    *number = result;
+    return true;
+  }
+  return false;
+}
+#endif  // GTEST_HAS_DEATH_TEST
+
+// TestResult contains some private methods that should be hidden from
+// Google Test users but are required for testing.  This class allows our
+// tests to access them; every member simply forwards to TestResult.
+//
+// This class is supplied only for the purpose of testing Google Test's own
+// constructs. Do not use it in user tests, either directly or indirectly.
+class TestResultAccessor {
+ public:
+  static void RecordProperty(TestResult* test_result,
+                             const std::string& xml_element,
+                             const TestProperty& property) {
+    test_result->RecordProperty(xml_element, property);  // plain forward
+  }
+
+  static void ClearTestPartResults(TestResult* test_result) {
+    test_result->ClearTestPartResults();  // plain forward
+  }
+
+  static const std::vector<testing::TestPartResult>& test_part_results(
+      const TestResult& test_result) {
+    return test_result.test_part_results();  // plain forward
+  }
+};
+
+#if GTEST_CAN_STREAM_RESULTS_
+
+// Streams test results to the given port on the given host machine.
+class StreamingListener : public EmptyTestEventListener {
+ public:
+  // Interface for objects that write strings to a socket.  Implementations
+  // must provide Send(); CloseConnection() does nothing unless overridden.
+  class AbstractSocketWriter {
+   public:
+    virtual ~AbstractSocketWriter() {}
+
+    // Writes message to the socket.
+    virtual void Send(const string& message) = 0;
+
+    // Closes the socket.  The default implementation is a no-op.
+    virtual void CloseConnection() {}
+
+    // Writes message followed by a newline character to the socket.
+    void SendLn(const string& message) {
+      const string line = message + "\n";
+      Send(line);
+    }
+  };
+
+  // Concrete AbstractSocketWriter that writes to a real socket file descriptor.
+  class SocketWriter : public AbstractSocketWriter {
+   public:
+    SocketWriter(const string& host, const string& port)
+        : sockfd_(-1), host_name_(host), port_num_(port) {
+      MakeConnection();  // attempted right away, during construction
+    }
+
+    virtual ~SocketWriter() {
+      if (sockfd_ != -1)  // -1 means there is no connection to close
+        CloseConnection();
+    }
+
+    // Sends message to the socket; an incomplete write only logs a warning.
+    virtual void Send(const string& message) {
+      GTEST_CHECK_(sockfd_ != -1)
+          << "Send() can be called only when there is a connection.";
+
+      const int len = static_cast<int>(message.length());
+      if (write(sockfd_, message.c_str(), len) != len) {
+        GTEST_LOG_(WARNING)
+            << "stream_result_to: failed to stream to "
+            << host_name_ << ":" << port_num_;
+      }
+    }
+
+   private:
+    // Creates a client socket and connects to the server.
+    void MakeConnection();
+
+    // Closes the socket.  Must only be called while a connection exists.
+    void CloseConnection() {
+      GTEST_CHECK_(sockfd_ != -1)
+          << "CloseConnection() can be called only when there is a connection.";
+
+      close(sockfd_);
+      sockfd_ = -1;  // mark as closed so the destructor does not close again
+    }
+
+    int sockfd_;  // socket file descriptor; -1 when not connected
+    const string host_name_;  // as given to the constructor; used in Send()'s log
+    const string port_num_;   // as given to the constructor; used in Send()'s log
+
+    GTEST_DISALLOW_COPY_AND_ASSIGN_(SocketWriter);
+  };  // class SocketWriter
+
+  // Escapes '=', '&', '%', and '\n' characters in str as "%xx".
+  static string UrlEncode(const char* str);
+
+  StreamingListener(const string& host, const string& port)
+      : socket_writer_(new SocketWriter(host, port)) { Start(); }
+
+  explicit StreamingListener(AbstractSocketWriter* socket_writer)
+      : socket_writer_(socket_writer) { Start(); }
+
+  void OnTestProgramStart(const UnitTest& /* unit_test */) {
+    SendLn("event=TestProgramStart");
+  }
+
+  void OnTestProgramEnd(const UnitTest& unit_test) {
+    // Note that Google Test currently only reports elapsed time for each
+    // test iteration, not for the entire test program.
+    SendLn("event=TestProgramEnd&passed=" + FormatBool(unit_test.Passed()));
+
+    // Notify the streaming server to stop.
+    socket_writer_->CloseConnection();
+  }
+
+  void OnTestIterationStart(const UnitTest& /* unit_test */, int iteration) {
+    SendLn("event=TestIterationStart&iteration=" +
+           StreamableToString(iteration));
+  }
+
+  void OnTestIterationEnd(const UnitTest& unit_test, int /* iteration */) {
+    SendLn("event=TestIterationEnd&passed=" +
+           FormatBool(unit_test.Passed()) + "&elapsed_time=" +
+           StreamableToString(unit_test.elapsed_time()) + "ms");
+  }
+
+  void OnTestCaseStart(const TestCase& test_case) {
+    SendLn(std::string("event=TestCaseStart&name=") + test_case.name());
+  }
+
+  void OnTestCaseEnd(const TestCase& test_case) {
+    SendLn("event=TestCaseEnd&passed=" + FormatBool(test_case.Passed())
+           + "&elapsed_time=" + StreamableToString(test_case.elapsed_time())
+           + "ms");
+  }
+
+  void OnTestStart(const TestInfo& test_info) {
+    SendLn(std::string("event=TestStart&name=") + test_info.name());
+  }
+
+  // Streams a "TestEnd" event carrying the pass/fail flag and the elapsed
+  // time (suffixed with "ms") of the finished test.
+  void OnTestEnd(const TestInfo& test_info) {
+    const string passed = FormatBool((test_info.result())->Passed());
+    const std::string elapsed =
+        StreamableToString((test_info.result())->elapsed_time());
+    SendLn("event=TestEnd&passed=" + passed +
+           "&elapsed_time=" + elapsed + "ms");
+  }
+
+  // Streams a "TestPartResult" event with the URL-encoded file name, the
+  // line number and the URL-encoded message of test_part_result.  A NULL
+  // file name is reported as an empty string.
+  void OnTestPartResult(const TestPartResult& test_part_result) {
+    const char* const reported_file = test_part_result.file_name();
+    const char* const file_name = (reported_file == NULL) ? "" : reported_file;
+    SendLn("event=TestPartResult&file=" + UrlEncode(file_name) +
+           "&line=" + StreamableToString(test_part_result.line_number()) +
+           "&message=" + UrlEncode(test_part_result.message()));
+  }
+
+ private:
+  // Sends the given message and a newline to the socket.
+  void SendLn(const string& message) { socket_writer_->SendLn(message); }
+
+  // Called at the start of streaming to notify the receiver what
+  // protocol we are using.
+  void Start() { SendLn("gtest_streaming_protocol_version=1.0"); }
+
+  string FormatBool(bool value) { return value ? "1" : "0"; }
+
+  const scoped_ptr<AbstractSocketWriter> socket_writer_;
+
+  GTEST_DISALLOW_COPY_AND_ASSIGN_(StreamingListener);
+};  // class StreamingListener
+
+#endif  // GTEST_CAN_STREAM_RESULTS_
+
+}  // namespace internal
+}  // namespace testing
+
+#endif  // GTEST_SRC_GTEST_INTERNAL_INL_H_
+#undef GTEST_IMPLEMENTATION_
+
+#if GTEST_OS_WINDOWS
+# define vsnprintf _vsnprintf
+#endif  // GTEST_OS_WINDOWS
+
+namespace testing {
+
+using internal::CountIf;
+using internal::ForEach;
+using internal::GetElementOr;
+using internal::Shuffle;
+
+// Constants.
+
+// A test whose test case name or test name matches this filter is
+// disabled and not run.
+static const char kDisableTestFilter[] = "DISABLED_*:*/DISABLED_*";
+
+// A test case whose name matches this filter is considered a death
+// test case and will be run before test cases whose name doesn't
+// match this filter.
+static const char kDeathTestCaseFilter[] = "*DeathTest:*DeathTest/*";
+
+// A test filter that matches everything.
+static const char kUniversalFilter[] = "*";
+
+// The default output file for XML output.
+static const char kDefaultOutputFile[] = "test_detail.xml";
+
+// The environment variable name for the test shard index.
+static const char kTestShardIndex[] = "GTEST_SHARD_INDEX";
+// The environment variable name for the total number of test shards.
+static const char kTestTotalShards[] = "GTEST_TOTAL_SHARDS";
+// The environment variable name for the test shard status file.
+static const char kTestShardStatusFile[] = "GTEST_SHARD_STATUS_FILE";
+
+namespace internal {
+
+// The text used in failure messages to indicate the start of the
+// stack trace.
+const char kStackTraceMarker[] = "\nStack trace:\n";
+
+// g_help_flag is true iff the --help flag or an equivalent form is
+// specified on the command line.
+bool g_help_flag = false;
+
+}  // namespace internal
+
+static const char* GetDefaultFilter() {
+  return kUniversalFilter;  // "*": by default the filter matches everything.
+}
+
+GTEST_DEFINE_bool_(
+    also_run_disabled_tests,
+    internal::BoolFromGTestEnv("also_run_disabled_tests", false),
+    "Run disabled tests too, in addition to the tests normally being run.");
+
+GTEST_DEFINE_bool_(
+    break_on_failure,
+    internal::BoolFromGTestEnv("break_on_failure", false),
+    "True iff a failed assertion should be a debugger break-point.");
+
+GTEST_DEFINE_bool_(
+    catch_exceptions,
+    internal::BoolFromGTestEnv("catch_exceptions", true),
+    "True iff " GTEST_NAME_
+    " should catch exceptions and treat them as test failures.");
+
+GTEST_DEFINE_string_(
+    color,
+    internal::StringFromGTestEnv("color", "auto"),
+    "Whether to use colors in the output.  Valid values: yes, no, "
+    "and auto.  'auto' means to use colors if the output is "
+    "being sent to a terminal and the TERM environment variable "
+    "is set to a terminal type that supports colors.");
+
+GTEST_DEFINE_string_(
+    filter,
+    internal::StringFromGTestEnv("filter", GetDefaultFilter()),
+    "A colon-separated list of glob (not regex) patterns "
+    "for filtering the tests to run, optionally followed by a "
+    "'-' and a : separated list of negative patterns (tests to "
+    "exclude).  A test is run if it matches one of the positive "
+    "patterns and does not match any of the negative patterns.");
+
+GTEST_DEFINE_bool_(list_tests, false,
+                   "List all tests without running them.");
+
+GTEST_DEFINE_string_(
+    output,
+    internal::StringFromGTestEnv("output", ""),
+    "A format (currently must be \"xml\"), optionally followed "
+    "by a colon and an output file name or directory. A directory "
+    "is indicated by a trailing pathname separator. "
+    "Examples: \"xml:filename.xml\", \"xml::directoryname/\". "
+    "If a directory is specified, output files will be created "
+    "within that directory, with file-names based on the test "
+    "executable's name and, if necessary, made unique by adding "
+    "digits.");
+
+GTEST_DEFINE_bool_(
+    print_time,
+    internal::BoolFromGTestEnv("print_time", true),
+    "True iff " GTEST_NAME_
+    " should display elapsed time in text output.");
+
+GTEST_DEFINE_int32_(
+    random_seed,
+    internal::Int32FromGTestEnv("random_seed", 0),
+    "Random number seed to use when shuffling test orders.  Must be in range "
+    "[1, 99999], or 0 to use a seed based on the current time.");
+
+GTEST_DEFINE_int32_(
+    repeat,
+    internal::Int32FromGTestEnv("repeat", 1),
+    "How many times to repeat each test.  Specify a negative number "
+    "for repeating forever.  Useful for shaking out flaky tests.");
+
+GTEST_DEFINE_bool_(
+    show_internal_stack_frames, false,
+    "True iff " GTEST_NAME_ " should include internal stack frames when "
+    "printing test failure stack traces.");
+
+GTEST_DEFINE_bool_(
+    shuffle,
+    internal::BoolFromGTestEnv("shuffle", false),
+    "True iff " GTEST_NAME_
+    " should randomize tests' order on every run.");
+
+GTEST_DEFINE_int32_(
+    stack_trace_depth,
+    internal::Int32FromGTestEnv("stack_trace_depth", kMaxStackTraceDepth),
+    "The maximum number of stack frames to print when an "
+    "assertion fails.  The valid range is 0 through 100, inclusive.");
+
+GTEST_DEFINE_string_(
+    stream_result_to,
+    internal::StringFromGTestEnv("stream_result_to", ""),
+    "This flag specifies the host name and the port number on which to stream "
+    "test results. Example: \"localhost:555\". The flag is effective only on "
+    "Linux.");
+
+GTEST_DEFINE_bool_(
+    throw_on_failure,
+    internal::BoolFromGTestEnv("throw_on_failure", false),
+    "When this flag is specified, a failed assertion will throw an exception "
+    "if exceptions are enabled or exit the program with a non-zero code "
+    "otherwise.");
+
+namespace internal {
+
+// Returns a pseudo-random number in [0, range), produced by a Linear
+// Congruential Generator (LCG).  Crashes (GTEST_CHECK_ failure) if
+// 'range' is 0 or greater than kMaxRange.
+UInt32 Random::Generate(UInt32 range) {
+  // Multiplier and increment are the ones used in glibc's rand(3).
+  const UInt32 kMultiplier = 1103515245U;
+  const UInt32 kIncrement = 12345U;
+  state_ = (kMultiplier * state_ + kIncrement) % kMaxRange;
+
+  GTEST_CHECK_(range > 0) << "Cannot generate a number in the range [0, 0).";
+  GTEST_CHECK_(range <= kMaxRange)
+      << "Generation of a number in [0, " << range << ") was requested, "
+      << "but this can only generate numbers in [0, " << kMaxRange << ").";
+
+  // Reducing with '%' biases the result slightly toward small values; that
+  // is accepted here because an LCG is not a strong generator anyway.
+  return state_ % range;
+}
+
+// GTestIsInitialized() returns true iff the user has initialized
+// Google Test.  Useful for catching the user mistake of not initializing
+// Google Test before calling RUN_ALL_TESTS().
+//
+// A user must call testing::InitGoogleTest() to initialize Google
+// Test.  g_init_gtest_count is set to the number of times
+// InitGoogleTest() has been called.  We don't protect this variable
+// under a mutex as it is only accessed in the main thread.
+GTEST_API_ int g_init_gtest_count = 0;
+static bool GTestIsInitialized() { return g_init_gtest_count != 0; }
+
+// Calls the given int-returning method on each TestCase in case_list and
+// returns the sum of the results (0 when the list is empty).
+static int SumOverTestCaseList(const std::vector<TestCase*>& case_list,
+                               int (TestCase::*method)() const) {
+  int total = 0;
+  for (std::vector<TestCase*>::const_iterator it = case_list.begin();
+       it != case_list.end(); ++it) {
+    total += ((*it)->*method)();
+  }
+  return total;
+}
+
+// Returns true iff the test case should run and passed.
+static bool TestCasePassed(const TestCase* test_case) {
+  return test_case->should_run() ? test_case->Passed() : false;
+}
+
+// Returns true iff the test case should run and failed.
+static bool TestCaseFailed(const TestCase* test_case) {
+  return test_case->should_run() ? test_case->Failed() : false;
+}
+
+// Returns true iff test_case contains at least one test that should
+// run.
+static bool ShouldRunTestCase(const TestCase* test_case) {
+  return test_case->should_run();
+}
+
+// AssertHelper constructor.  Stores the arguments in a new AssertHelperData.
+AssertHelper::AssertHelper(TestPartResult::Type type,
+                           const char* file,
+                           int line,
+                           const char* message)
+    : data_(new AssertHelperData(type, file, line, message)) {
+}
+
+AssertHelper::~AssertHelper() {
+  delete data_;  // Allocated with new in the constructor.
+}
+
+// Message assignment, for assertion streaming support.  Reports the stored
+// failure, with the streamed user message appended, through UnitTest.
+void AssertHelper::operator=(const Message& message) const {
+  UnitTest* const unit_test = UnitTest::GetInstance();
+  unit_test->AddTestPartResult(
+      data_->type, data_->file, data_->line,
+      AppendUserMessage(data_->message, message),
+      // Skips the stack frame for this function itself.
+      unit_test->impl()->CurrentOsStackTraceExceptTop(1));  // NOLINT
+}
+
+// Mutex for linked pointers.
+GTEST_API_ GTEST_DEFINE_STATIC_MUTEX_(g_linked_ptr_mutex);
+
+// Application pathname gotten in InitGoogleTest.
+std::string g_executable_path;
+
+// Returns the file name of the running test executable: any leading
+// directory is removed and, on Windows, so is an "exe" extension.
+FilePath GetCurrentExecutableName() {
+  const FilePath full_path(g_executable_path);
+
+#if GTEST_OS_WINDOWS
+  const FilePath executable(full_path.RemoveExtension("exe"));
+#else
+  const FilePath executable(full_path);
+#endif  // GTEST_OS_WINDOWS
+
+  return executable.RemoveDirectoryName();
+}
+
+// Functions for processing the gtest_output flag.
+
+// Returns the output format (the part of the output flag that precedes
+// the first ':'), or "" for normal printed output.
+std::string UnitTestOptions::GetOutputFormat() {
+  const char* const flag_value = GTEST_FLAG(output).c_str();
+  if (flag_value == NULL) return std::string("");
+
+  const char* const separator = strchr(flag_value, ':');
+  if (separator == NULL) return std::string(flag_value);
+  return std::string(flag_value, separator - flag_value);
+}
+
+// Returns the name of the requested output file.  When the flag names no
+// file, kDefaultOutputFile in the original working directory is used.
+std::string UnitTestOptions::GetAbsolutePathToOutputFile() {
+  const char* const flag_value = GTEST_FLAG(output).c_str();
+  if (flag_value == NULL) return "";
+
+  const char* const separator = strchr(flag_value, ':');
+  if (separator == NULL) {
+    return internal::FilePath::ConcatPaths(
+        internal::FilePath(UnitTest::GetInstance()->original_working_dir()),
+        internal::FilePath(kDefaultOutputFile)).string();
+  }
+
+  internal::FilePath target(separator + 1);
+  if (!target.IsAbsolutePath()) {
+    // TODO(wan@google.com): on Windows \some\path is not an absolute
+    // path (as its meaning depends on the current drive), yet the
+    // following logic for turning it into an absolute path is wrong.
+    // Fix it.
+    target = internal::FilePath::ConcatPaths(
+        internal::FilePath(UnitTest::GetInstance()->original_working_dir()),
+        internal::FilePath(separator + 1));
+  }
+
+  // A directory gets a unique file name based on the executable's name.
+  if (target.IsDirectory()) {
+    return internal::FilePath::GenerateUniqueFileName(
+        target, internal::GetCurrentExecutableName(),
+        GetOutputFormat().c_str()).string();
+  }
+  return target.string();
+}
+
+// Returns true iff the wildcard pattern matches the string.  The
+// first ':' or '\0' character in pattern marks the end of it.  '?'
+// matches any single character and '*' any (possibly empty) string.
+//
+// The matching is iterative: after a mismatch only the most recent '*'
+// is retried, so the work is bounded by strlen(pattern) * strlen(str)
+// and no recursion depth depends on the input.
+bool UnitTestOptions::PatternMatchesString(const char *pattern,
+                                           const char *str) {
+  const char* star = NULL;  // Pattern position right after the latest '*'.
+  const char* mark = NULL;  // Start of the text not yet absorbed by it.
+  while (*str != '\0') {
+    if (*pattern == '*') {
+      // Tentatively let the '*' match the empty string.
+      star = ++pattern;
+      mark = str;
+    } else if (*pattern == '?' || (*pattern == *str && *pattern != ':')) {
+      // '?' or a literal character consumes one character of str.  A ':'
+      // in the pattern is its end and therefore never matches literally.
+      ++pattern;
+      ++str;
+    } else if (star != NULL) {
+      // Mismatch: make the latest '*' absorb one more character and retry.
+      pattern = star;
+      str = ++mark;
+    } else {
+      return false;
+    }
+  }
+
+  // str is exhausted; only '*'s may remain before the end of the pattern.
+  while (*pattern == '*') ++pattern;
+  return *pattern == '\0' || *pattern == ':';
+}
+
+bool UnitTestOptions::MatchesFilter(
+    const std::string& name, const char* filter) {
+  // 'filter' is a ':'-separated list of patterns; one match is enough.
+  const char* const target = name.c_str();
+  const char* pattern = filter;
+  do {
+    if (PatternMatchesString(pattern, target)) {
+      return true;
+    }
+
+    // Finds the next pattern in the filter, which starts right after the
+    // pattern separator (the ':' character).  NULL means there is none.
+    const char* const separator = strchr(pattern, ':');
+    pattern = (separator == NULL) ? NULL : separator + 1;
+  } while (pattern != NULL);
+
+  return false;
+}
+
+// Returns true iff the user-specified filter matches the test case
+// name and the test name, combined as "TestCaseName.TestName".
+bool UnitTestOptions::FilterMatchesTest(const std::string &test_case_name,
+                                        const std::string &test_name) {
+  const std::string full_name = test_case_name + "." + test_name.c_str();
+
+  // Split --gtest_filter at the first '-', if there is one, to separate
+  // it into positive filter and negative filter portions.
+  const char* const flag_value = GTEST_FLAG(filter).c_str();
+  const char* const dash = strchr(flag_value, '-');
+
+  std::string positive;
+  std::string negative;
+  if (dash != NULL) {
+    positive.assign(flag_value, dash);  // Everything up to the dash
+    negative.assign(dash + 1);          // Everything after the dash
+    if (positive.empty()) {
+      // Treat '-test1' as the same as '*-test1'
+      positive = kUniversalFilter;
+    }
+  } else {
+    positive = flag_value;  // Whole string is a positive filter
+  }
+
+  // A filter is a colon-separated list of patterns.  The test is selected
+  // iff some positive pattern matches it and no negative pattern does.
+  if (!MatchesFilter(full_name, positive.c_str())) {
+    return false;
+  }
+  return !MatchesFilter(full_name, negative.c_str());
+}
+
+#if GTEST_HAS_SEH
+// Returns EXCEPTION_EXECUTE_HANDLER if Google Test should handle the
+// given SEH exception, or EXCEPTION_CONTINUE_SEARCH otherwise.
+// This function is useful as an __except condition.
+int UnitTestOptions::GTestShouldProcessSEH(DWORD exception_code) {
+  // SEH exception code for C++ exceptions.
+  // (see http://support.microsoft.com/kb/185294 for more information).
+  const DWORD kCxxExceptionCode = 0xe06d7363;
+
+  // Google Test should handle a SEH exception if:
+  //   1. the user wants it to, AND
+  //   2. this is not a breakpoint exception, AND
+  //   3. this is not a C++ exception (VC++ implements them via SEH,
+  //      apparently).
+  const bool should_handle = GTEST_FLAG(catch_exceptions) &&
+                             exception_code != EXCEPTION_BREAKPOINT &&
+                             exception_code != kCxxExceptionCode;
+
+  if (should_handle) {
+    return EXCEPTION_EXECUTE_HANDLER;
+  }
+  return EXCEPTION_CONTINUE_SEARCH;
+}
+#endif  // GTEST_HAS_SEH
+
+}  // namespace internal
+
+// Installs this object as the test part result reporter used by Google
+// Test.  Reported results are appended to 'result'.  Only failures from
+// the current thread are intercepted.
+ScopedFakeTestPartResultReporter::ScopedFakeTestPartResultReporter(
+    TestPartResultArray* result)
+    : intercept_mode_(INTERCEPT_ONLY_CURRENT_THREAD), result_(result) {
+  Init();
+}
+
+// Installs this object as the test part result reporter used by Google
+// Test, for the scope selected by 'intercept_mode'.  Reported results are
+// appended to 'result'.
+ScopedFakeTestPartResultReporter::ScopedFakeTestPartResultReporter(
+    InterceptMode intercept_mode, TestPartResultArray* result)
+    : intercept_mode_(intercept_mode), result_(result) {
+  Init();
+}
+
+void ScopedFakeTestPartResultReporter::Init() {
+  // Saves the reporter currently installed for the selected scope (all
+  // threads or only the current one) and puts this object in its place.
+  internal::UnitTestImpl* const impl = internal::GetUnitTestImpl();
+  if (intercept_mode_ != INTERCEPT_ALL_THREADS) {
+    old_reporter_ = impl->GetTestPartResultReporterForCurrentThread();
+    impl->SetTestPartResultReporterForCurrentThread(this);
+    return;
+  }
+  old_reporter_ = impl->GetGlobalTestPartResultReporter();
+  impl->SetGlobalTestPartResultReporter(this);
+}
+
+// The d'tor puts back the test part result reporter that Init() replaced,
+// for the same scope (all threads or only the current one).
+ScopedFakeTestPartResultReporter::~ScopedFakeTestPartResultReporter() {
+  internal::UnitTestImpl* const impl = internal::GetUnitTestImpl();
+  if (intercept_mode_ != INTERCEPT_ALL_THREADS) {
+    impl->SetTestPartResultReporterForCurrentThread(old_reporter_);
+    return;
+  }
+  impl->SetGlobalTestPartResultReporter(old_reporter_);
+}
+
+// Remembers the result by appending it to the array given at construction.
+// This method is from the TestPartResultReporterInterface interface.
+void ScopedFakeTestPartResultReporter::ReportTestPartResult(
+    const TestPartResult& result) {
+  result_->Append(result);
+}
+
+namespace internal {
+
+// Returns the type ID of ::testing::Test.  We should always call this
+// instead of GetTypeId< ::testing::Test>() to get the type ID of
+// testing::Test.  This is to work around a suspected linker bug when
+// using Google Test as a framework on Mac OS X.  The bug causes
+// GetTypeId< ::testing::Test>() to return different values depending
+// on whether the call is from the Google Test framework itself or
+// from user test code.  GetTestTypeId() is guaranteed to always
+// return the same value, as it always calls GetTypeId<>() from the
+// gtest.cc, which is within the Google Test framework.
+TypeId GetTestTypeId() {
+  return GetTypeId<Test>();
+}
+
+// The value of GetTestTypeId() as seen from within the Google Test
+// library.  This is solely for testing GetTestTypeId().
+extern const TypeId kTestTypeIdInGoogleTest = GetTestTypeId();
+
+// Predicate-formatter that succeeds iff 'results' holds exactly one test
+// part result, that result has the given type, and its message contains
+// the given substring.
+AssertionResult HasOneFailure(const char* /* results_expr */,
+                              const char* /* type_expr */,
+                              const char* /* substr_expr */,
+                              const TestPartResultArray& results,
+                              TestPartResult::Type type,
+                              const string& substr) {
+  const bool expect_fatal = (type == TestPartResult::kFatalFailure);
+  const std::string expected(expect_fatal ? "1 fatal failure"
+                                          : "1 non-fatal failure");
+
+  // Wrong number of results: list every one of them in the message.
+  const int count = results.size();
+  if (count != 1) {
+    Message msg;
+    msg << "Expected: " << expected << "\n"
+        << "  Actual: " << count << " failures";
+    for (int index = 0; index < count; ++index) {
+      msg << "\n" << results.GetTestPartResult(index);
+    }
+    return AssertionFailure() << msg;
+  }
+
+  const TestPartResult& only = results.GetTestPartResult(0);
+  if (only.type() != type) {
+    return AssertionFailure() << "Expected: " << expected << "\n"
+                              << "  Actual:\n"
+                              << only;
+  }
+
+  if (strstr(only.message(), substr.c_str()) != NULL) {
+    return AssertionSuccess();
+  }
+
+  return AssertionFailure() << "Expected: " << expected << " containing \""
+                            << substr << "\"\n"
+                            << "  Actual:\n"
+                            << only;
+}
+
+// Remembers where to look up the test part results, which type of
+// failure is expected, and which substring the failure message should
+// contain.  Nothing is verified here.
+SingleFailureChecker::SingleFailureChecker(const TestPartResultArray* results,
+                                           TestPartResult::Type type,
+                                           const string& substr)
+    : results_(results), type_(type), substr_(substr) {}
+
+// The destructor of SingleFailureChecker verifies that the given
+// TestPartResultArray contains exactly one failure that has the given
+// type and contains the given substring.  If that's not the case, a
+// non-fatal failure will be generated.
+SingleFailureChecker::~SingleFailureChecker() {
+  EXPECT_PRED_FORMAT3(HasOneFailure, *results_, type_, substr_);
+}
+
+DefaultGlobalTestPartResultReporter::DefaultGlobalTestPartResultReporter(
+    UnitTestImpl* unit_test) : unit_test_(unit_test) {}
+
+void DefaultGlobalTestPartResultReporter::ReportTestPartResult(
+    const TestPartResult& result) {
+  // Records the result first, then notifies the event listeners.
+  TestResult* const current = unit_test_->current_test_result();
+  current->AddTestPartResult(result);
+  TestEventListener* const repeater = unit_test_->listeners()->repeater();
+  repeater->OnTestPartResult(result);
+}
+
+DefaultPerThreadTestPartResultReporter::DefaultPerThreadTestPartResultReporter(
+    UnitTestImpl* unit_test) : unit_test_(unit_test) {}
+
+void DefaultPerThreadTestPartResultReporter::ReportTestPartResult(
+    const TestPartResult& result) {
+  // The per-thread default simply forwards to the current global reporter.
+  TestPartResultReporterInterface* const global_reporter =
+      unit_test_->GetGlobalTestPartResultReporter();
+  global_reporter->ReportTestPartResult(result);
+}
+
+// Returns the global test part result reporter; the read is mutex-guarded.
+TestPartResultReporterInterface*
+UnitTestImpl::GetGlobalTestPartResultReporter() {
+  internal::MutexLock lock(&global_test_part_result_reporter_mutex_);
+  return global_test_part_result_repoter_;
+}
+
+// Sets the global test part result reporter; the write is mutex-guarded.
+void UnitTestImpl::SetGlobalTestPartResultReporter(
+    TestPartResultReporterInterface* reporter) {
+  internal::MutexLock lock(&global_test_part_result_reporter_mutex_);
+  global_test_part_result_repoter_ = reporter;
+}
+
+// Returns the test part result reporter for the current thread.
+TestPartResultReporterInterface*
+UnitTestImpl::GetTestPartResultReporterForCurrentThread() {
+  return per_thread_test_part_result_reporter_.get();
+}
+
+// Sets the test part result reporter for the current thread.
+void UnitTestImpl::SetTestPartResultReporterForCurrentThread(
+    TestPartResultReporterInterface* reporter) {
+  per_thread_test_part_result_reporter_.set(reporter);
+}
+
+// Gets the number of test cases that should run and passed.
+int UnitTestImpl::successful_test_case_count() const {
+  return CountIf(test_cases_, TestCasePassed);
+}
+
+// Gets the number of test cases that should run and failed.
+int UnitTestImpl::failed_test_case_count() const {
+  return CountIf(test_cases_, TestCaseFailed);
+}
+
+// Gets the number of all test cases, i.e. the size of test_cases_.
+int UnitTestImpl::total_test_case_count() const {
+  return static_cast<int>(test_cases_.size());
+}
+
+// Gets the number of all test cases that contain at least one test
+// that should run (those whose should_run() is true).
+int UnitTestImpl::test_case_to_run_count() const {
+  return CountIf(test_cases_, ShouldRunTestCase);
+}
+
+// Gets the number of successful tests, summed over all test cases.
+int UnitTestImpl::successful_test_count() const {
+  return SumOverTestCaseList(test_cases_, &TestCase::successful_test_count);
+}
+
+// Gets the number of failed tests, summed over all test cases.
+int UnitTestImpl::failed_test_count() const {
+  return SumOverTestCaseList(test_cases_, &TestCase::failed_test_count);
+}
+
+// Gets the number of disabled tests that will be reported in the XML
+// report, summed over all test cases.
+int UnitTestImpl::reportable_disabled_test_count() const {
+  return SumOverTestCaseList(test_cases_,
+                             &TestCase::reportable_disabled_test_count);
+}
+
+// Gets the number of disabled tests, summed over all test cases.
+int UnitTestImpl::disabled_test_count() const {
+  return SumOverTestCaseList(test_cases_, &TestCase::disabled_test_count);
+}
+
+// Gets the number of tests to be printed in the XML report (all test cases).
+int UnitTestImpl::reportable_test_count() const {
+  return SumOverTestCaseList(test_cases_, &TestCase::reportable_test_count);
+}
+
+// Gets the number of all tests, summed over all test cases.
+int UnitTestImpl::total_test_count() const {
+  return SumOverTestCaseList(test_cases_, &TestCase::total_test_count);
+}
+
+// Gets the number of tests that should run, summed over all test cases.
+int UnitTestImpl::test_to_run_count() const {
+  return SumOverTestCaseList(test_cases_, &TestCase::test_to_run_count);
+}
+
+// Returns the current OS stack trace as an std::string.
+//
+// The maximum number of stack frames to be included is specified by
+// the gtest_stack_trace_depth flag.  The skip_count parameter
+// specifies the number of top frames to be skipped, which doesn't
+// count against the number of frames to be included.
+//
+// For example, if Foo() calls Bar(), which in turn calls
+// CurrentOsStackTraceExceptTop(1), Foo() will be included in the
+// trace but Bar() and CurrentOsStackTraceExceptTop() won't.
+std::string UnitTestImpl::CurrentOsStackTraceExceptTop(int skip_count) {
+  (void)skip_count;  // Unused: this implementation produces no stack trace.
+  return "";         // Always the empty string, whatever the flag says.
+}
+
+// Returns the current time in milliseconds, via a platform-specific API.
+TimeInMillis GetTimeInMillis() {
+#if GTEST_OS_WINDOWS_MOBILE || defined(__BORLANDC__)
+  // Difference between 1970-01-01 and 1601-01-01 in milliseconds.
+  // http://analogous.blogspot.com/2005/04/epoch.html
+  const TimeInMillis kJavaEpochToWinFileTimeDelta =
+    static_cast<TimeInMillis>(116444736UL) * 100000UL;
+  const DWORD kTenthMicrosInMilliSecond = 10000;
+
+  SYSTEMTIME now_systime;
+  FILETIME now_filetime;
+  ULARGE_INTEGER now_int64;
+  // TODO(kenton@google.com): Shouldn't this just use
+  //   GetSystemTimeAsFileTime()?
+  GetSystemTime(&now_systime);
+  if (SystemTimeToFileTime(&now_systime, &now_filetime)) {
+    now_int64.LowPart = now_filetime.dwLowDateTime;
+    now_int64.HighPart = now_filetime.dwHighDateTime;
+    now_int64.QuadPart = (now_int64.QuadPart / kTenthMicrosInMilliSecond) -
+      kJavaEpochToWinFileTimeDelta;
+    return now_int64.QuadPart;
+  }
+  return 0;  // SystemTimeToFileTime() reported failure.
+#elif GTEST_OS_WINDOWS && !GTEST_HAS_GETTIMEOFDAY_
+  __timeb64 now;
+
+# ifdef _MSC_VER
+
+  // MSVC 8 deprecates _ftime64(), so we want to suppress warning 4996
+  // (deprecated function) there.
+  // TODO(kenton@google.com): Use GetTickCount()?  Or use
+  //   SystemTimeToFileTime()
+#  pragma warning(push)          // Saves the current warning state.
+#  pragma warning(disable:4996)  // Temporarily disables warning 4996.
+  _ftime64(&now);
+#  pragma warning(pop)           // Restores the warning state.
+# else
+
+  _ftime64(&now);
+
+# endif  // _MSC_VER
+
+  return static_cast<TimeInMillis>(now.time) * 1000 + now.millitm;
+#elif GTEST_HAS_GETTIMEOFDAY_
+  struct timeval now;
+  gettimeofday(&now, NULL);
+  return static_cast<TimeInMillis>(now.tv_sec) * 1000 + now.tv_usec / 1000;
+#else
+# error "Don't know how to get the current time on your system."
+#endif
+}
+
+// Utilities
+
+// class String.
+
+#if GTEST_OS_WINDOWS_MOBILE
+// Creates a UTF-16 wide string from the given ANSI string, allocating
+// memory using new. The caller is responsible for deleting the return
+// value using delete[]. Returns the wide string, or NULL if the
+// input is NULL.
+LPCWSTR String::AnsiToUtf16(const char* ansi) {
+  if (ansi == NULL) return NULL;
+
+  // The first call only asks for the required length (no output buffer);
+  // the second call performs the conversion into the new buffer.
+  const int ansi_length = static_cast<int>(strlen(ansi));
+  const int wide_length =
+      MultiByteToWideChar(CP_ACP, 0, ansi, ansi_length, NULL, 0);
+  WCHAR* const wide = new WCHAR[wide_length + 1];
+  MultiByteToWideChar(CP_ACP, 0, ansi, ansi_length, wide, wide_length);
+  wide[wide_length] = 0;  // Terminates the result explicitly.
+  return wide;
+}
+
+// Creates an ANSI string from the given wide string, allocating
+// memory using new. The caller is responsible for deleting the return
+// value using delete[]. Returns the ANSI string, or NULL if the
+// input is NULL.
+const char* String::Utf16ToAnsi(LPCWSTR utf16_str)  {
+  if (utf16_str == NULL) return NULL;
+
+  // The first call only asks for the required size (no output buffer);
+  // the second call performs the conversion into the new buffer.
+  const int buffer_size =
+      WideCharToMultiByte(CP_ACP, 0, utf16_str, -1, NULL, 0, NULL, NULL);
+  char* const buffer = new char[buffer_size + 1];
+  WideCharToMultiByte(CP_ACP, 0, utf16_str, -1,
+                      buffer, buffer_size, NULL, NULL);
+  buffer[buffer_size] = 0;  // Terminates the result explicitly.
+  return buffer;
+}
+
+#endif  // GTEST_OS_WINDOWS_MOBILE
+
+// Compares two C strings.  Returns true iff they have the same content.
+//
+// Unlike strcmp(), this function can handle NULL argument(s).  A NULL
+// C string is considered different to any non-NULL C string,
+// including the empty string.
+bool String::CStringEquals(const char * lhs, const char * rhs) {
+  if (lhs == NULL || rhs == NULL) {
+    // With a NULL involved, the strings are equal only if both are NULL.
+    return lhs == rhs;
+  }
+  return strcmp(lhs, rhs) == 0;
+}
+
+#if GTEST_HAS_STD_WSTRING || GTEST_HAS_GLOBAL_WSTRING
+
+// Converts an array of wide chars to a narrow string using the UTF-8
+// encoding, and streams the result to the given Message object.
+static void StreamWideCharsToMessage(const wchar_t* wstr, size_t length,
+                                     Message* msg) {
+  size_t pos = 0;
+  while (pos != length) {  // NOLINT
+    if (wstr[pos] == L'\0') {
+      // An embedded NUL is streamed as a character of its own.
+      *msg << '\0';
+      ++pos;
+      continue;
+    }
+
+    // Converts the run starting here, then moves past it to the next NUL
+    // (or to the end of the array).
+    *msg << WideStringToUtf8(wstr + pos, static_cast<int>(length - pos));
+    do {
+      ++pos;
+    } while (pos != length && wstr[pos] != L'\0');
+  }
+}
+
+#endif  // GTEST_HAS_STD_WSTRING || GTEST_HAS_GLOBAL_WSTRING
+
+}  // namespace internal
+
+// Constructs an empty Message.
+// We allocate the stringstream separately because otherwise each use of
+// ASSERT/EXPECT in a procedure adds over 200 bytes to the procedure's
+// stack frame leading to huge stack frames in some cases; gcc does not reuse
+// the stack space.
+Message::Message() : ss_(new ::std::stringstream) {
+  // Two digits more than digits10, so that there is enough precision when
+  // a double is printed to a Message by default.
+  const int kDefaultPrecision = std::numeric_limits<double>::digits10 + 2;
+  *ss_ << std::setprecision(kDefaultPrecision);
+}
+
+// These two overloads stream a wide C string (const or non-const) to a
+// Message, converted by String::ShowWideCString().
+Message& Message::operator <<(const wchar_t* wide_c_str) {
+  *this << internal::String::ShowWideCString(wide_c_str);
+  return *this;
+}
+Message& Message::operator <<(wchar_t* wide_c_str) {
+  *this << internal::String::ShowWideCString(wide_c_str);
+  return *this;
+}
+
+#if GTEST_HAS_STD_WSTRING
+// Converts the given wide string to a narrow string using the UTF-8
+// encoding, and streams the result to this Message object.
+Message& Message::operator <<(const ::std::wstring& wstr) {
+  internal::StreamWideCharsToMessage(wstr.c_str(), wstr.length(), this);
+  return *this;
+}
+#endif  // GTEST_HAS_STD_WSTRING
+
+#if GTEST_HAS_GLOBAL_WSTRING
+// Converts the given wide string to a narrow string using the UTF-8
+// encoding, and streams the result to this Message object.
+Message& Message::operator <<(const ::wstring& wstr) {
+  internal::StreamWideCharsToMessage(wstr.c_str(), wstr.length(), this);
+  return *this;
+}
+#endif  // GTEST_HAS_GLOBAL_WSTRING
+
+// Gets the text streamed to this object so far as an std::string.
+// Each '\0' character in the buffer is replaced with "\\0".
+std::string Message::GetString() const {
+  return internal::StringStreamToString(ss_.get());
+}
+
+// AssertionResult copy constructor.
+// Used in EXPECT_TRUE/FALSE(assertion_result).  The message, when there
+// is one, is copied into a new string.
+AssertionResult::AssertionResult(const AssertionResult& other)
+    : success_(other.success_),
+      message_(other.message_.get() == NULL ?
+               static_cast< ::std::string*>(NULL) :
+               new ::std::string(*other.message_)) {
+}
+
+// Returns the assertion's negation, carrying over the message if there is
+// one.  Used with EXPECT/ASSERT_FALSE.
+AssertionResult AssertionResult::operator!() const {
+  AssertionResult negation(!success_);
+  if (message_.get() == NULL) {
+    return negation;
+  }
+  negation << *message_;
+  return negation;
+}
+
+// Makes a successful assertion result.
+AssertionResult AssertionSuccess() {
+  return AssertionResult(true);
+}
+
+// Makes a failed assertion result.
+AssertionResult AssertionFailure() {
+  return AssertionResult(false);
+}
+
+// Makes a failed assertion result with the given failure message.
+// Deprecated; use AssertionFailure() << message.
+AssertionResult AssertionFailure(const Message& message) {
+  return AssertionFailure() << message;
+}
+
+namespace internal {
+
+// Builds the failure result reported when an equality assertion
+// (ASSERT_EQ, EXPECT_STREQ, ...) does not hold.
+//
+// The first four arguments are the source text of the two compared
+// expressions and the printed form of their values.  For instance,
+// ASSERT_EQ(foo, bar) with foo == 5 and bar == 6 passes:
+//
+//   expected_expression: "foo"
+//   actual_expression:   "bar"
+//   expected_value:      "5"
+//   actual_value:        "6"
+//
+// ignoring_case is true iff the assertion is a *_STRCASEEQ*; the
+// text " (ignoring case)" is then added after the expected expression.
+AssertionResult EqFailure(const char* expected_expression,
+                          const char* actual_expression,
+                          const std::string& expected_value,
+                          const std::string& actual_value,
+                          bool ignoring_case) {
+  // A value is printed only when it differs from the text of its
+  // expression, so a literal is not shown twice in the message.
+  const bool show_actual = actual_value != actual_expression;
+  const bool show_expected = expected_value != expected_expression;
+
+  Message text;
+  text << "Value of: " << actual_expression;
+  if (show_actual) text << "\n  Actual: " << actual_value;
+
+  text << "\nExpected: " << expected_expression;
+  if (ignoring_case) {
+    text << " (ignoring case)";
+  }
+  if (show_expected) text << "\nWhich is: " << expected_value;
+
+  return AssertionFailure() << text;
+}
+
+// Formats the text reported when EXPECT_TRUE/EXPECT_FALSE and friends fail.
+std::string GetBoolAssertionFailureMessage(
+    const AssertionResult& assertion_result,
+    const char* expression_text,
+    const char* actual_predicate_value,
+    const char* expected_predicate_value) {
+  const char* const detail = assertion_result.message();
+  const bool has_detail = *detail != '\0';
+  Message text;
+  text << "Value of: " << expression_text;
+  text << "\n  Actual: " << actual_predicate_value;
+  if (has_detail) text << " (" << detail << ")";
+  text << "\nExpected: " << expected_predicate_value;
+  return text.GetString();
+}
+
+// ASSERT_NEAR helper: succeeds iff |val1 - val2| <= abs_error.
+AssertionResult DoubleNearPredFormat(const char* expr1,
+                                     const char* expr2,
+                                     const char* abs_error_expr,
+                                     double val1,
+                                     double val2,
+                                     double abs_error) {
+  // Equal operands differ by exactly zero.  Computing inf - inf would
+  // instead yield NaN and wrongly fail EXPECT_NEAR(inf, inf, tolerance).
+  const double diff = (val1 == val2) ? 0.0 : fabs(val1 - val2);
+  if (diff <= abs_error) return AssertionSuccess();
+  // TODO(wan): do not print the value of an expression that is a literal.
+  return AssertionFailure()
+      << "The difference between " << expr1 << " and " << expr2
+      << " is " << diff << ", which exceeds " << abs_error_expr << ", where\n"
+      << expr1 << " evaluates to " << val1 << ",\n"
+      << expr2 << " evaluates to " << val2 << ", and\n"
+      << abs_error_expr << " evaluates to " << abs_error << ".";
+}
+
+
+// Shared implementation of FloatLE() and DoubleLE().  Succeeds when val1
+// is less than, or almost equal to, val2; otherwise reports both values.
+template <typename RawType>
+AssertionResult FloatingPointLE(const char* expr1,
+                                const char* expr2,
+                                RawType val1,
+                                RawType val2) {
+  // Both tests below come out false when either operand is NaN, since
+  // the IEEE floating-point standard requires every predicate involving
+  // a NaN to return false.  A NaN therefore always reaches the failure.
+  if (!(val1 < val2)) {
+    // Not strictly less: fall back to the almost-equal comparison.
+    const FloatingPoint<RawType> left(val1), right(val2);
+    if (!left.AlmostEquals(right)) {
+      // Neither test held.  Render each operand with two digits beyond
+      // digits10 for the failure text.
+      const int precision = std::numeric_limits<RawType>::digits10 + 2;
+
+      ::std::stringstream left_text;
+      left_text << std::setprecision(precision) << val1;
+
+      ::std::stringstream right_text;
+      right_text << std::setprecision(precision) << val2;
+
+      return AssertionFailure()
+          << "Expected: (" << expr1 << ") <= (" << expr2 << ")\n"
+          << "  Actual: " << StringStreamToString(&left_text) << " vs "
+          << StringStreamToString(&right_text);
+    }
+  }
+
+  // val1 < val2, or the two are almost equal.
+  return AssertionSuccess();
+}
+
+}  // namespace internal
+
+// Asserts that val1 is less than, or almost equal to, val2, and fails
+// otherwise -- in particular whenever val1 or val2 is NaN.  float flavor.
+AssertionResult FloatLE(const char* expr1, const char* expr2,
+                        float val1, float val2) {
+  return internal::FloatingPointLE<float>(expr1, expr2, val1, val2);
+}
+
+// Asserts that val1 is less than, or almost equal to, val2, and fails
+// otherwise -- in particular whenever val1 or val2 is NaN.  double flavor.
+AssertionResult DoubleLE(const char* expr1, const char* expr2,
+                         double val1, double val2) {
+  return internal::FloatingPointLE<double>(expr1, expr2, val1, val2);
+}
+
+namespace internal {
+
+// Implements {ASSERT|EXPECT}_EQ for arguments of integer or enum
+// type, both of which are passed here as BiggestInt.
+AssertionResult CmpHelperEQ(const char* expected_expression,
+                            const char* actual_expression,
+                            BiggestInt expected,
+                            BiggestInt actual) {
+  if (expected != actual) {
+    return EqFailure(expected_expression,
+                     actual_expression,
+                     FormatForComparisonFailureMessage(expected, actual),
+                     FormatForComparisonFailureMessage(actual, expected),
+                     false);
+  }
+
+  return AssertionSuccess();
+}
+
+// Generates CmpHelper<op_name>(), the helper behind ASSERT_<op_name> and
+// EXPECT_<op_name> for integer or enum arguments: success when
+// `val1 op val2` holds, otherwise a failure showing both operands.
+#define GTEST_IMPL_CMP_HELPER_(op_name, op)\
+AssertionResult CmpHelper##op_name(const char* expr1, const char* expr2, \
+                                   BiggestInt val1, BiggestInt val2) {\
+  if (val1 op val2) {\
+    return AssertionSuccess();\
+  } else {\
+    return AssertionFailure() \
+        << "Expected: (" << expr1 << ") " #op " (" << expr2\
+        << "), actual: " << FormatForComparisonFailureMessage(val1, val2)\
+        << " vs " << FormatForComparisonFailureMessage(val2, val1);\
+  }\
+}
+
+// Defines CmpHelperNE, the helper for {ASSERT|EXPECT}_NE with int or
+// enum arguments.
+GTEST_IMPL_CMP_HELPER_(NE, !=)
+// Defines CmpHelperLE, the helper for {ASSERT|EXPECT}_LE with int or
+// enum arguments.
+GTEST_IMPL_CMP_HELPER_(LE, <=)
+// Defines CmpHelperLT, the helper for {ASSERT|EXPECT}_LT with int or
+// enum arguments.
+GTEST_IMPL_CMP_HELPER_(LT, < )
+// Defines CmpHelperGE, the helper for {ASSERT|EXPECT}_GE with int or
+// enum arguments.
+GTEST_IMPL_CMP_HELPER_(GE, >=)
+// Defines CmpHelperGT, the helper for {ASSERT|EXPECT}_GT with int or
+// enum arguments.
+GTEST_IMPL_CMP_HELPER_(GT, > )
+
+#undef GTEST_IMPL_CMP_HELPER_
+
+// Implements {ASSERT|EXPECT}_STREQ for narrow C strings.
+AssertionResult CmpHelperSTREQ(const char* expected_expression,
+                               const char* actual_expression,
+                               const char* expected,
+                               const char* actual) {
+  if (!String::CStringEquals(expected, actual)) {
+    return EqFailure(expected_expression,
+                     actual_expression,
+                     PrintToString(expected),
+                     PrintToString(actual),
+                     false);
+  }
+
+  return AssertionSuccess();
+}
+
+// Implements {ASSERT|EXPECT}_STRCASEEQ (case-insensitive equality).
+AssertionResult CmpHelperSTRCASEEQ(const char* expected_expression,
+                                   const char* actual_expression,
+                                   const char* expected,
+                                   const char* actual) {
+  if (!String::CaseInsensitiveCStringEquals(expected, actual)) {
+    return EqFailure(expected_expression,
+                     actual_expression,
+                     PrintToString(expected),
+                     PrintToString(actual),
+                     true);
+  }
+
+  return AssertionSuccess();
+}
+
+// Implements {ASSERT|EXPECT}_STRNE: fails only when CStringEquals() holds.
+AssertionResult CmpHelperSTRNE(const char* s1_expression,
+                               const char* s2_expression,
+                               const char* s1,
+                               const char* s2) {
+  if (String::CStringEquals(s1, s2)) {
+    return AssertionFailure()
+        << "Expected: (" << s1_expression << ") != ("
+        << s2_expression << "), actual: \""
+        << s1 << "\" vs \"" << s2 << "\"";
+  }
+  return AssertionSuccess();
+}
+
+// Implements {ASSERT|EXPECT}_STRCASENE: fails only when the two C
+// strings are equal once case is ignored.
+AssertionResult CmpHelperSTRCASENE(const char* s1_expression,
+                                   const char* s2_expression,
+                                   const char* s1,
+                                   const char* s2) {
+  if (String::CaseInsensitiveCStringEquals(s1, s2)) {
+    return AssertionFailure()
+        << "Expected: (" << s1_expression << ") != ("
+        << s2_expression << ") (ignoring case), actual: \""
+        << s1 << "\" vs \"" << s2 << "\"";
+  }
+  return AssertionSuccess();
+}
+
+}  // namespace internal
+
+namespace {
+
+// Helper functions for implementing IsSubstring() and IsNotSubstring().
+
+// Each function in this group of overloads returns true iff needle is a
+// substring of haystack.  NULL is considered a substring of itself
+// only.
+
+bool IsSubstringPred(const char* needle, const char* haystack) {
+  // NULL only matches NULL; strstr() must never see a NULL argument.
+  const bool any_null = (needle == NULL) || (haystack == NULL);
+  return any_null ? needle == haystack
+                  : strstr(haystack, needle) != NULL;
+}
+
+bool IsSubstringPred(const wchar_t* needle, const wchar_t* haystack) {
+  // NULL only matches NULL; wcsstr() must never see a NULL argument.
+  const bool any_null = (needle == NULL) || (haystack == NULL);
+  return any_null ? needle == haystack
+                  : wcsstr(haystack, needle) != NULL;
+}
+
+// Overload for string objects; StringType is ::std::string or ::std::wstring.
+template <typename StringType>
+bool IsSubstringPred(const StringType& needle, const StringType& haystack) {
+  const typename StringType::size_type where = haystack.find(needle);
+  return where != StringType::npos;
+}
+
+// Common implementation of IsSubstring() and IsNotSubstring(); the
+// expected_to_be_substring flag selects which of the two is checked.
+// StringType may be const char*, const wchar_t*, ::std::string, or
+// ::std::wstring.
+template <typename StringType>
+AssertionResult IsSubstringImpl(
+    bool expected_to_be_substring,
+    const char* needle_expr, const char* haystack_expr,
+    const StringType& needle, const StringType& haystack) {
+  const bool found = IsSubstringPred(needle, haystack);
+  if (found == expected_to_be_substring) return AssertionSuccess();
+
+  // Strings with elements wider than one byte are shown as L"..." literals.
+  const char* const open_quote = (sizeof(needle[0]) > 1) ? "L\"" : "\"";
+  const char* const negation = expected_to_be_substring ? "" : "not ";
+  return AssertionFailure()
+      << "Value of: " << needle_expr << "\n"
+      << "  Actual: " << open_quote << needle << "\"\n"
+      << "Expected: " << negation << "a substring of " << haystack_expr << "\n"
+      << "Which is: " << open_quote << haystack << "\"";
+}
+
+}  // namespace
+
+// IsSubstring() and IsNotSubstring() check whether needle is (or is
+// not) a substring of haystack; NULL is a substring of itself only.
+// Each overload forwards to IsSubstringImpl(), which builds the message.
+
+AssertionResult IsSubstring(  // narrow C strings
+    const char* needle_expr, const char* haystack_expr,
+    const char* needle, const char* haystack) {
+  return IsSubstringImpl(true, needle_expr, haystack_expr, needle, haystack);
+}
+
+AssertionResult IsSubstring(  // wide C strings
+    const char* needle_expr, const char* haystack_expr,
+    const wchar_t* needle, const wchar_t* haystack) {
+  return IsSubstringImpl(true, needle_expr, haystack_expr, needle, haystack);
+}
+
+AssertionResult IsNotSubstring(  // narrow C strings
+    const char* needle_expr, const char* haystack_expr,
+    const char* needle, const char* haystack) {
+  return IsSubstringImpl(false, needle_expr, haystack_expr, needle, haystack);
+}
+
+AssertionResult IsNotSubstring(  // wide C strings
+    const char* needle_expr, const char* haystack_expr,
+    const wchar_t* needle, const wchar_t* haystack) {
+  return IsSubstringImpl(false, needle_expr, haystack_expr, needle, haystack);
+}
+
+AssertionResult IsSubstring(  // ::std::string overload
+    const char* needle_expr, const char* haystack_expr,
+    const ::std::string& needle, const ::std::string& haystack) {
+  return IsSubstringImpl(true, needle_expr, haystack_expr, needle, haystack);
+}
+
+AssertionResult IsNotSubstring(  // ::std::string overload
+    const char* needle_expr, const char* haystack_expr,
+    const ::std::string& needle, const ::std::string& haystack) {
+  return IsSubstringImpl(false, needle_expr, haystack_expr, needle, haystack);
+}
+
+#if GTEST_HAS_STD_WSTRING
+AssertionResult IsSubstring(  // ::std::wstring overload
+    const char* needle_expr, const char* haystack_expr,
+    const ::std::wstring& needle, const ::std::wstring& haystack) {
+  return IsSubstringImpl(true, needle_expr, haystack_expr, needle, haystack);
+}
+
+AssertionResult IsNotSubstring(  // ::std::wstring overload
+    const char* needle_expr, const char* haystack_expr,
+    const ::std::wstring& needle, const ::std::wstring& haystack) {
+  return IsSubstringImpl(false, needle_expr, haystack_expr, needle, haystack);
+}
+#endif  // GTEST_HAS_STD_WSTRING
+
+namespace internal {
+
+#if GTEST_OS_WINDOWS
+
+namespace {
+
+// Builds the failure result shared by the IsHRESULT{Success,Failure} predicates
+AssertionResult HRESULTFailureHelper(const char* expr,
+                                     const char* expected,
+                                     long hr) {  // NOLINT
+# if GTEST_OS_WINDOWS_MOBILE
+
+  // Windows CE doesn't support FormatMessage, so no text is shown.
+  const char error_text[] = "";
+
+# else
+
+  // Looks up the human-readable system message for the HRESULT code
+  // and since we're not passing any params to FormatMessage, we don't
+  // want inserts expanded.
+  const DWORD kFlags = FORMAT_MESSAGE_FROM_SYSTEM |
+                       FORMAT_MESSAGE_IGNORE_INSERTS;
+  const DWORD kBufSize = 4096;
+  // Gets the system's human readable message string for this HRESULT.
+  char error_text[kBufSize] = { '\0' };
+  DWORD message_length = ::FormatMessageA(kFlags,
+                                          0,  // no source, we're asking system
+                                          hr,  // the error
+                                          0,  // no line width restrictions
+                                          error_text,  // output buffer
+                                          kBufSize,  // buf size
+                                          NULL);  // no arguments for inserts
+  // Trims trailing white space (FormatMessage leaves a trailing CR-LF)
+  for (; message_length && IsSpace(error_text[message_length - 1]);
+          --message_length) {
+    error_text[message_length - 1] = '\0';
+  }
+
+# endif  // GTEST_OS_WINDOWS_MOBILE
+
+  const std::string error_hex("0x" + String::FormatHexInt(hr));
+  return ::testing::AssertionFailure()
+      << "Expected: " << expr << " " << expected << ".\n"
+      << "  Actual: " << error_hex << " " << error_text << "\n";
+}
+
+}  // namespace
+
+AssertionResult IsHRESULTSuccess(const char* expr, long hr) {  // NOLINT
+  // Passes when SUCCEEDED(hr) holds; otherwise explains the HRESULT.
+  if (!SUCCEEDED(hr))
+    return HRESULTFailureHelper(expr, "succeeds", hr);
+  return AssertionSuccess();
+}
+
+AssertionResult IsHRESULTFailure(const char* expr, long hr) {  // NOLINT
+  // Passes when FAILED(hr) holds; otherwise explains the HRESULT.
+  if (!FAILED(hr))
+    return HRESULTFailureHelper(expr, "fails", hr);
+  return AssertionSuccess();
+}
+
+#endif  // GTEST_OS_WINDOWS
+
+// Utility functions for encoding Unicode text (wide strings) in
+// UTF-8.
+
+// A Unicode code-point can have up to 21 bits, and is encoded in UTF-8
+// like this:
+//
+// Code-point length   Encoding
+//   0 -  7 bits       0xxxxxxx
+//   8 - 11 bits       110xxxxx 10xxxxxx
+//  12 - 16 bits       1110xxxx 10xxxxxx 10xxxxxx
+//  17 - 21 bits       11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
+
+// The maximum code-point a one-byte UTF-8 sequence can represent (0x7F).
+const UInt32 kMaxCodePoint1 = (static_cast<UInt32>(1) <<  7) - 1;
+
+// The maximum code-point a two-byte UTF-8 sequence can represent (0x7FF).
+const UInt32 kMaxCodePoint2 = (static_cast<UInt32>(1) << (5 + 6)) - 1;
+
+// The maximum code-point a three-byte UTF-8 sequence can represent (0xFFFF).
+const UInt32 kMaxCodePoint3 = (static_cast<UInt32>(1) << (4 + 2*6)) - 1;
+
+// The maximum code-point a four-byte UTF-8 sequence can represent (0x1FFFFF).
+const UInt32 kMaxCodePoint4 = (static_cast<UInt32>(1) << (3 + 3*6)) - 1;
+
+// Removes the n lowest bits from *bits and returns them.  On return,
+// *bits has been shifted right by n bits.
+inline UInt32 ChopLowBits(UInt32* bits, int n) {
+  const UInt32 mask = (static_cast<UInt32>(1) << n) - 1;
+  const UInt32 chopped = *bits & mask;
+  *bits >>= n;
+  return chopped;
+}
+
+// Converts a Unicode code point to a narrow string in UTF-8 encoding.
+// code_point parameter is of type UInt32 because wchar_t may not be
+// wide enough to contain a code point.
+// A code_point above kMaxCodePoint4 (i.e. one that does not fit in the
+// 21 bits a four-byte sequence can carry) is not encoded; it is
+// converted to "(Invalid Unicode 0xXXXXXXXX)" instead.
+std::string CodePointToUtf8(UInt32 code_point) {
+  if (code_point > kMaxCodePoint4) {
+    return "(Invalid Unicode 0x" + String::FormatHexInt(code_point) + ")";
+  }
+
+  char str[5];  // Up to four encoded bytes plus the terminating NUL.
+  if (code_point <= kMaxCodePoint1) {
+    str[1] = '\0';
+    str[0] = static_cast<char>(code_point);                          // 0xxxxxxx
+  } else if (code_point <= kMaxCodePoint2) {
+    str[2] = '\0';
+    str[1] = static_cast<char>(0x80 | ChopLowBits(&code_point, 6));  // 10xxxxxx
+    str[0] = static_cast<char>(0xC0 | code_point);                   // 110xxxxx
+  } else if (code_point <= kMaxCodePoint3) {
+    str[3] = '\0';
+    str[2] = static_cast<char>(0x80 | ChopLowBits(&code_point, 6));  // 10xxxxxx
+    str[1] = static_cast<char>(0x80 | ChopLowBits(&code_point, 6));  // 10xxxxxx
+    str[0] = static_cast<char>(0xE0 | code_point);                   // 1110xxxx
+  } else {  // code_point <= kMaxCodePoint4
+    str[4] = '\0';
+    str[3] = static_cast<char>(0x80 | ChopLowBits(&code_point, 6));  // 10xxxxxx
+    str[2] = static_cast<char>(0x80 | ChopLowBits(&code_point, 6));  // 10xxxxxx
+    str[1] = static_cast<char>(0x80 | ChopLowBits(&code_point, 6));  // 10xxxxxx
+    str[0] = static_cast<char>(0xF0 | code_point);                   // 11110xxx
+  }
+  return str;
+}
+
+// The following two functions only make sense if the system
+// uses UTF-16 for wide string encoding. All supported systems
+// with 16 bit wchar_t (Windows, Cygwin, Symbian OS) do use UTF-16.
+
+// Tells whether (first, second) is a UTF-16 surrogate pair that should
+// be merged into a single Unicode code point by
+// CreateCodePointFromUtf16SurrogatePair.  Always false unless wchar_t is 2 bytes.
+inline bool IsUtf16SurrogatePair(wchar_t first, wchar_t second) {
+  if (sizeof(wchar_t) != 2) return false;
+  return (first & 0xFC00) == 0xD800 && (second & 0xFC00) == 0xDC00;
+}
+
+// Combines a UTF-16 surrogate pair into the Unicode code point it encodes.
+inline UInt32 CreateCodePointFromUtf16SurrogatePair(wchar_t first,
+                                                    wchar_t second) {
+  if (sizeof(wchar_t) != 2) {
+    // Not expected to be reached; pass the first unit through as a default.
+    return static_cast<UInt32>(first);
+  }
+  const UInt32 low_ten_bits = (1 << 10) - 1;
+  return (((first & low_ten_bits) << 10) | (second & low_ten_bits)) + 0x10000;
+}
+
+// Converts a wide string to a narrow string in UTF-8 encoding.
+// The wide string is assumed to be encoded as:
+//   UTF-16 if sizeof(wchar_t) == 2 (on Windows, Cygwin, Symbian OS)
+//   UTF-32 if sizeof(wchar_t) == 4 (on Linux)
+// str points to a null-terminated wide string.  num_chars may further
+// limit how many wchar_t characters are processed; pass -1 to process
+// the entire string.
+// Code points that CodePointToUtf8() rejects are output as
+// '(Invalid Unicode 0xXXXXXXXX)'.  In a UTF-16 string, a surrogate
+// that is not part of a valid surrogate pair is encoded on its own,
+// as an individual character from the Basic Multilingual Plane.
+std::string WideStringToUtf8(const wchar_t* str, int num_chars) {
+  const int limit =
+      (num_chars == -1) ? static_cast<int>(wcslen(str)) : num_chars;
+
+  ::std::stringstream utf8;
+  int pos = 0;
+  // Stop at the limit or at the first embedded terminator.
+  while (pos < limit && str[pos] != L'\0') {
+    UInt32 code_point;
+    const bool has_next = pos + 1 < limit;
+    if (has_next && IsUtf16SurrogatePair(str[pos], str[pos + 1])) {
+      // Two UTF-16 units form one code point; consume both.
+      code_point =
+          CreateCodePointFromUtf16SurrogatePair(str[pos], str[pos + 1]);
+      pos += 2;
+    } else {
+      code_point = static_cast<UInt32>(str[pos]);
+      pos += 1;
+    }
+
+    utf8 << CodePointToUtf8(code_point);
+  }
+  return StringStreamToString(&utf8);
+}
+
+// Renders a wide C string as a UTF-8 encoded std::string.
+// A NULL pointer is rendered as the text "(null)".
+std::string String::ShowWideCString(const wchar_t * wide_c_str) {
+  return wide_c_str == NULL
+      ? std::string("(null)")
+      : internal::WideStringToUtf8(wide_c_str, -1);
+}
+
+// Tells whether two wide C strings have the same content.
+//
+// Unlike wcscmp(), this function accepts NULL argument(s): NULL is
+// equal only to NULL and differs from every non-NULL wide C string,
+// the empty string included.
+bool String::WideCStringEquals(const wchar_t * lhs, const wchar_t * rhs) {
+  if (lhs == NULL || rhs == NULL) {
+    // Equal only when both are NULL.
+    return lhs == rhs;
+  }
+
+  return wcscmp(lhs, rhs) == 0;
+}
+
+// Implements *_STREQ for wide C strings.
+AssertionResult CmpHelperSTREQ(const char* expected_expression,
+                               const char* actual_expression,
+                               const wchar_t* expected,
+                               const wchar_t* actual) {
+  if (!String::WideCStringEquals(expected, actual)) {
+    return EqFailure(expected_expression,
+                     actual_expression,
+                     PrintToString(expected),
+                     PrintToString(actual),
+                     false);
+  }
+
+  return AssertionSuccess();
+}
+
+// Implements *_STRNE for wide C strings.
+AssertionResult CmpHelperSTRNE(const char* s1_expression,
+                               const char* s2_expression,
+                               const wchar_t* s1,
+                               const wchar_t* s2) {
+  if (String::WideCStringEquals(s1, s2)) {
+    return AssertionFailure()
+        << "Expected: (" << s1_expression << ") != ("
+        << s2_expression << "), actual: "
+        << PrintToString(s1) << " vs " << PrintToString(s2);
+  }
+
+  return AssertionSuccess();
+}
+
+// Tells whether two C strings have the same content once case is
+// ignored.
+//
+// Unlike strcasecmp(), this function accepts NULL argument(s): NULL is
+// equal only to NULL and differs from every non-NULL C string, the
+// empty string included.
+bool String::CaseInsensitiveCStringEquals(const char * lhs, const char * rhs) {
+  if (lhs == NULL || rhs == NULL) {
+    return lhs == rhs;  // equal only when both are NULL
+  }
+
+  return posix::StrCaseCmp(lhs, rhs) == 0;
+}
+
+  // Compares two wide C strings, ignoring case.  Returns true iff they
+  // have the same content.
+  //
+  // Unlike wcscasecmp(), this function can handle NULL argument(s).
+  // A NULL C string is considered different from any non-NULL wide C string,
+  // including the empty string.
+  // NB: The implementations on different platforms differ slightly.
+  // On Windows, this method uses _wcsicmp, which compares according to the
+  // LC_CTYPE setting. On Linux (except Android) it uses wcscasecmp,
+  // which compares according to LC_CTYPE category of the current locale.
+  // Everywhere else (e.g. Mac OS X) it uses towlower, which also uses the
+  // LC_CTYPE category of the current locale.
+bool String::CaseInsensitiveWideCStringEquals(const wchar_t* lhs,
+                                              const wchar_t* rhs) {
+  if (lhs == NULL) return rhs == NULL;
+
+  if (rhs == NULL) return false;
+
+#if GTEST_OS_WINDOWS
+  return _wcsicmp(lhs, rhs) == 0;
+#elif GTEST_OS_LINUX && !GTEST_OS_LINUX_ANDROID
+  return wcscasecmp(lhs, rhs) == 0;
+#else
+  // Android, Mac OS X and Cygwin don't define wcscasecmp, and other
+  // unknown OSes may not either, so compare lower-cased characters by hand.
+  wint_t left, right;
+  do {
+    left = towlower(*lhs++);
+    right = towlower(*rhs++);
+  } while (left && left == right);  // stop at a NUL or the first mismatch
+  return left == right;
+#endif  // OS selector
+}
+
+// Tells whether str ends with suffix when case is ignored.  Every
+// string is considered to end with the empty suffix.
+bool String::EndsWithCaseInsensitive(
+    const std::string& str, const std::string& suffix) {
+  if (str.length() < suffix.length()) {
+    return false;  // too short to contain the suffix at all
+  }
+  const char* const tail = str.c_str() + (str.length() - suffix.length());
+  return CaseInsensitiveCStringEquals(tail, suffix.c_str());
+}
+
+// Formats an int value as "%02d" (minimum width two, padded with '0').
+std::string String::FormatIntWidth2(int value) {
+  std::stringstream out;
+  out << std::setw(2) << std::setfill('0') << value;
+  return out.str();
+}
+
+// Formats an int value as "%X" (upper-case hexadecimal, no prefix).
+std::string String::FormatHexInt(int value) {
+  std::stringstream out;
+  out << std::uppercase << std::hex << value;
+  return out.str();
+}
+
+// Formats a byte as "%02X": two upper-case hex digits, zero-padded.
+std::string String::FormatByte(unsigned char value) {
+  std::stringstream out;
+  out << std::uppercase << std::hex << std::setw(2) << std::setfill('0')
+      << static_cast<unsigned int>(value);
+  return out.str();
+}
+
+// Copies the contents of a stringstream into an std::string, writing
+// each NUL byte as the two characters "\\0".
+std::string StringStreamToString(::std::stringstream* ss) {
+  const ::std::string raw = ss->str();
+
+  std::string escaped;
+  // Worst case: every byte is a NUL and doubles in size.
+  escaped.reserve(2 * raw.length());
+  for (::std::string::size_type i = 0; i < raw.length(); ++i) {
+    const char ch = raw[i];
+    if (ch != '\0') {
+      escaped += ch;
+    } else {
+      escaped += "\\0";
+    }
+  }
+
+  return escaped;
+}
+
+// Appends the user-supplied message to the Google-Test-generated one.
+// Returns gtest_msg followed by the user message on a new line, or
+// gtest_msg unchanged when the user message is empty.
+std::string AppendUserMessage(const std::string& gtest_msg,
+                              const Message& user_msg) {
+  const std::string extra = user_msg.GetString();
+
+  std::string combined = gtest_msg;
+  if (!extra.empty()) combined += "\n" + extra;
+  return combined;
+}
+
+}  // namespace internal
+
+// class TestResult
+
+// Creates an empty TestResult: zero death tests and zero elapsed time.
+TestResult::TestResult()
+    : death_test_count_(0),
+      elapsed_time_(0) {
+}
+
+// Destructor.  The body is empty.
+TestResult::~TestResult() {
+}
+
+// Returns the i-th test part result.  Valid indices run from 0 to
+// total_part_count() - 1; any other index aborts the program.
+const TestPartResult& TestResult::GetTestPartResult(int i) const {
+  const bool in_range = 0 <= i && i < total_part_count();
+  if (!in_range)
+    internal::posix::Abort();
+  return test_part_results_.at(i);
+}
+
+// Returns the i-th test property.  Valid indices run from 0 to
+// test_property_count() - 1; any other index aborts the program.
+const TestProperty& TestResult::GetTestProperty(int i) const {
+  const bool in_range = 0 <= i && i < test_property_count();
+  if (!in_range)
+    internal::posix::Abort();
+  return test_properties_.at(i);
+}
+
+// Removes every recorded test part result; other state is left untouched.
+void TestResult::ClearTestPartResults() {
+  test_part_results_.clear();
+}
+
+// Appends a copy of test_part_result to the end of the result list.
+void TestResult::AddTestPartResult(const TestPartResult& test_part_result) {
+  test_part_results_.push_back(test_part_result);
+}
+
+// Records a test property.  When a property with the same key has
+// already been recorded, its value is overwritten with the new one;
+// otherwise the property is appended.  Invalid properties are dropped.
+void TestResult::RecordProperty(const std::string& xml_element,
+                                const TestProperty& test_property) {
+  if (!ValidateTestProperty(xml_element, test_property)) {
+    return;
+  }
+  internal::MutexLock lock(&test_properites_mutex_);
+  const std::vector<TestProperty>::iterator existing =
+      std::find_if(test_properties_.begin(), test_properties_.end(),
+                   internal::TestPropertyKeyIs(test_property.key()));
+  if (existing != test_properties_.end()) {
+    existing->SetValue(test_property.value());
+  } else {
+    test_properties_.push_back(test_property);
+  }
+}
+
+// Attribute names reserved for the <testsuites> element of the XML output;
+// returned by GetReservedAttributesForElement("testsuites").
+static const char* const kReservedTestSuitesAttributes[] = {
+  "disabled",
+  "errors",
+  "failures",
+  "name",
+  "random_seed",
+  "tests",
+  "time",
+  "timestamp"
+};
+
+// Attribute names reserved for the <testsuite> element of the XML output;
+// returned by GetReservedAttributesForElement("testsuite").
+static const char* const kReservedTestSuiteAttributes[] = {
+  "disabled",
+  "errors",
+  "failures",
+  "name",
+  "tests",
+  "time"
+};
+
+// Attribute names reserved for the <testcase> element of the XML output.
+static const char* const kReservedTestCaseAttributes[] = {
+  "classname",
+  "name",
+  "status",
+  "time",
+  "type_param",
+  "value_param"
+};
+
+template <int kSize>  // kSize is deduced from the array argument
+std::vector<std::string> ArrayAsVector(const char* const (&array)[kSize]) {
+  return std::vector<std::string>(&array[0], &array[0] + kSize);
+}
+
+static std::vector<std::string> GetReservedAttributesForElement(
+    const std::string& xml_element) {
+  if (xml_element == "testsuites")
+    return ArrayAsVector(kReservedTestSuitesAttributes);
+  if (xml_element == "testsuite")
+    return ArrayAsVector(kReservedTestSuiteAttributes);
+  if (xml_element == "testcase")
+    return ArrayAsVector(kReservedTestCaseAttributes);
+
+  // Any other element name is a programming error.
+  GTEST_CHECK_(false) << "Unrecognized xml_element provided: " << xml_element;
+  // This code is unreachable but some compilers may not realize that.
+  return std::vector<std::string>();
+}
+
+// Joins words as "'a'", "'a' and 'b'" or "'a', 'b', and 'c'".
+static std::string FormatWordList(const std::vector<std::string>& words) {
+  Message word_list;
+  const size_t count = words.size();
+  for (size_t i = 0; i < count; ++i) {
+    // Separator: ", " in lists of three or more, a plain " " between
+    // exactly two words so that "and" does not touch the first one.
+    if (i > 0) word_list << (count > 2 ? ", " : " ");
+    if (count > 1 && i == count - 1) word_list << "and ";  // never for one word
+    word_list << "'" << words[i] << "'";
+  }
+  return word_list.GetString();
+}
+
+bool ValidateTestPropertyName(const std::string& property_name,
+                              const std::vector<std::string>& reserved_names) {
+  const bool is_reserved =
+      std::find(reserved_names.begin(), reserved_names.end(), property_name) !=
+      reserved_names.end();
+  if (!is_reserved) return true;
+  ADD_FAILURE() << "Reserved key used in RecordProperty(): " << property_name
+                << " (" << FormatWordList(reserved_names)
+                << " are reserved by " << GTEST_NAME_ << ")";
+  return false;
+}
+
+// Returns true iff test_property's key is not a reserved attribute of the
+// XML element named xml_element; a reserved key also records a failure.
+bool TestResult::ValidateTestProperty(const std::string& xml_element,
+                                      const TestProperty& test_property) {
+  return ValidateTestPropertyName(test_property.key(),
+                                  GetReservedAttributesForElement(xml_element));
+}
+
+// Resets the object: drops all part results and properties, zeroes counters.
+void TestResult::Clear() {
+  test_part_results_.clear();
+  test_properties_.clear();
+  death_test_count_ = 0;
+  elapsed_time_ = 0;
+}
+
+// Returns true iff at least one test part result is a failure.
+bool TestResult::Failed() const {
+  const int part_count = total_part_count();
+  int index = 0;
+  while (index < part_count && !GetTestPartResult(index).failed())
+    ++index;
+  return index < part_count;
+}
+
+// Predicate for CountIf(): true iff the test part fatally failed.
+static bool TestPartFatallyFailed(const TestPartResult& result) {
+  return result.fatally_failed();
+}
+
+// Returns true iff at least one test part fatally failed.
+bool TestResult::HasFatalFailure() const {
+  return CountIf(test_part_results_, TestPartFatallyFailed) > 0;
+}
+
+// Predicate for CountIf(): returns true iff the given test part result
+// reports a non-fatal failure.
+static bool TestPartNonfatallyFailed(const TestPartResult& result) {
+  return result.nonfatally_failed();
+}
+
+// Returns true iff at least one recorded test part result is a non-fatal
+// failure.
+bool TestResult::HasNonfatalFailure() const {
+  const int nonfatal_count =
+      CountIf(test_part_results_, TestPartNonfatallyFailed);
+  return nonfatal_count > 0;
+}
+
+// Gets the number of all test parts, i.e. the number of entries in
+// test_part_results_ (successful and failed parts alike).  The container
+// size is narrowed to int.
+int TestResult::total_part_count() const {
+  return static_cast<int>(test_part_results_.size());
+}
+
+// Returns the number of test properties recorded in test_properties_,
+// narrowed to int.
+int TestResult::test_property_count() const {
+  return static_cast<int>(test_properties_.size());
+}
+
+// class Test
+
+// Creates a Test object.
+
+// Creates a Test object.  The c'tor allocates an internal::GTestFlagSaver,
+// which saves the values of all Google Test flags; the matching d'tor
+// deletes it again.
+Test::Test()
+    : gtest_flag_saver_(new internal::GTestFlagSaver) {
+}
+
+// The d'tor restores the values of all Google Test flags by deleting the
+// flag saver that the c'tor allocated.
+Test::~Test() {
+  delete gtest_flag_saver_;
+}
+
+// Sets up the test fixture.
+//
+// The default implementation does nothing; a sub-class may override this
+// to prepare state before the test body runs.
+void Test::SetUp() {
+}
+
+// Tears down the test fixture.
+//
+// The default implementation does nothing; a sub-class may override this
+// to release state after the test body has run.
+void Test::TearDown() {
+}
+
+// Allows user supplied key value pairs to be recorded for later output.
+// Forwards the pair unchanged to the UnitTest singleton.
+void Test::RecordProperty(const std::string& key, const std::string& value) {
+  UnitTest::GetInstance()->RecordProperty(key, value);
+}
+
+// Integer convenience overload: renders value as text and records it
+// through the string overload of RecordProperty().
+void Test::RecordProperty(const std::string& key, int value) {
+  const std::string value_text = (Message() << value).GetString();
+  RecordProperty(key, value_text);
+}
+
+namespace internal {
+
+// Records a test part result of the given type whose origin is unknown:
+// no source file, no line number and no stack trace are attached.
+void ReportFailureInUnknownLocation(TestPartResult::Type result_type,
+                                    const std::string& message) {
+  // No info about the source file where the exception occurred.
+  const char* const unknown_file = NULL;
+  // We have no info on which line caused the exception.
+  const int unknown_line = -1;
+
+  // This function is a friend of UnitTest and as such has access to
+  // AddTestPartResult.
+  UnitTest* const unit_test = UnitTest::GetInstance();
+  unit_test->AddTestPartResult(result_type, unknown_file, unknown_line,
+                               message,
+                               "");  // No stack trace, either.
+}
+
+}  // namespace internal
+
+// Google Test requires every test in a test case to use the same test
+// fixture class.  Compares the fixture class of the current test with that
+// of the first test in the current test case.  Returns true when they
+// match; otherwise generates a Google Test failure explaining the likely
+// cause and returns false.
+bool Test::HasSameFixtureClass() {
+  internal::UnitTestImpl* const impl = internal::GetUnitTestImpl();
+
+  // The first test in the current test case is the reference the current
+  // test is compared against.
+  const TestInfo* const first_test_info =
+      impl->current_test_case()->test_info_list()[0];
+  const TestInfo* const this_test_info = impl->current_test_info();
+
+  const internal::TypeId first_fixture_id = first_test_info->fixture_class_id_;
+  const internal::TypeId this_fixture_id = this_test_info->fixture_class_id_;
+  if (this_fixture_id == first_fixture_id) {
+    return true;
+  }
+
+  const char* const first_test_name = first_test_info->name();
+  const char* const this_test_name = this_test_info->name();
+
+  // A test defined with TEST carries the fixture ID returned by
+  // GetTestTypeId().
+  const internal::TypeId plain_test_id = internal::GetTestTypeId();
+  const bool first_is_TEST = first_fixture_id == plain_test_id;
+  const bool this_is_TEST = this_fixture_id == plain_test_id;
+
+  if (!first_is_TEST && !this_is_TEST) {
+    // The user defined two fixture classes with the same name in two
+    // namespaces - tell them how to fix it.
+    ADD_FAILURE()
+        << "All tests in the same test case must use the same test fixture\n"
+        << "class.  However, in test case "
+        << this_test_info->test_case_name() << ",\n"
+        << "you defined test " << first_test_name
+        << " and test " << this_test_name << "\n"
+        << "using two different test fixture classes.  This can happen if\n"
+        << "the two classes are from different namespaces or translation\n"
+        << "units and have the same name.  You should probably rename one\n"
+        << "of the classes to put the tests into different test cases.";
+    return false;
+  }
+
+  // The user mixed TEST and TEST_F in this test case - tell them how to
+  // fix it.  first_is_TEST and this_is_TEST cannot both be true here,
+  // because the two fixture IDs differ.
+  const char* TEST_name = this_test_name;
+  const char* TEST_F_name = first_test_name;
+  if (first_is_TEST) {
+    TEST_name = first_test_name;
+    TEST_F_name = this_test_name;
+  }
+
+  ADD_FAILURE()
+      << "All tests in the same test case must use the same test fixture\n"
+      << "class, so mixing TEST_F and TEST in the same test case is\n"
+      << "illegal.  In test case " << this_test_info->test_case_name()
+      << ",\n"
+      << "test " << TEST_F_name << " is defined using TEST_F but\n"
+      << "test " << TEST_name << " is defined using TEST.  You probably\n"
+      << "want to change the TEST to TEST_F or move it to another test\n"
+      << "case.";
+  return false;
+}
+
+#if GTEST_HAS_SEH
+
+// Builds the "SEH exception ... thrown in <location>." message, printing
+// the exception code in hexadecimal.  The text is returned as a
+// heap-allocated std::string that the caller must delete: VC++ prohibits
+// creation of objects with destructors on the stack in functions using
+// __try (see error C2712), so callers cannot hold the string by value.
+static std::string* FormatSehExceptionMessage(DWORD exception_code,
+                                              const char* location) {
+  Message text;
+  text << "SEH exception with code 0x";
+  // Switch to base 16 for the code only, then back to base 10.
+  text << std::setbase(16) << exception_code << std::setbase(10);
+  text << " thrown in " << location << ".";
+
+  return new std::string(text.GetString());
+}
+
+#endif  // GTEST_HAS_SEH
+
+namespace internal {
+
+#if GTEST_HAS_EXCEPTIONS
+
+// Builds the message reported for a C++ exception thrown in location.
+// description is the exception's description text, or NULL when the
+// exception is of unknown type.
+static std::string FormatCxxExceptionMessage(const char* description,
+                                             const char* location) {
+  Message text;
+  if (description == NULL) {
+    text << "Unknown C++ exception";
+  } else {
+    text << "C++ exception with description \"" << description << "\"";
+  }
+  text << " thrown in " << location << ".";
+
+  return text.GetString();
+}
+
+// Forward declaration; the definition appears further down in this file.
+static std::string PrintTestPartResultToString(
+    const TestPartResult& test_part_result);
+
+// Constructs the exception from a failed test part result.  The what()
+// text of the underlying std::runtime_error is the printed form of that
+// result.
+GoogleTestFailureException::GoogleTestFailureException(
+    const TestPartResult& failure)
+    : ::std::runtime_error(PrintTestPartResultToString(failure).c_str()) {}
+
+#endif  // GTEST_HAS_EXCEPTIONS
+
+// We put these helper functions in the internal namespace as IBM's xlC
+// compiler rejects the code if they were declared static.
+
+// Runs the given method and handles SEH exceptions it throws, when
+// SEH is supported; returns the 0-value for type Result in case of an
+// SEH exception.  (Microsoft compilers cannot handle SEH and C++
+// exceptions in the same function.  Therefore, we provide a separate
+// wrapper function for handling SEH exceptions.)
+//
+// Arguments:
+//
+//   object:   the object on which to invoke the method
+//   method:   pointer to the parameterless member function to invoke
+//   location: human-readable description of the code being run; used only
+//             in the failure message
+template <class T, typename Result>
+Result HandleSehExceptionsInMethodIfSupported(
+    T* object, Result (T::*method)(), const char* location) {
+#if GTEST_HAS_SEH
+  __try {
+    return (object->*method)();
+  } __except (internal::UnitTestOptions::GTestShouldProcessSEH(  // NOLINT
+      GetExceptionCode())) {
+    // We create the exception message on the heap because VC++ prohibits
+    // creation of objects with destructors on stack in functions using __try
+    // (see error C2712).
+    std::string* exception_message = FormatSehExceptionMessage(
+        GetExceptionCode(), location);
+    // Report the SEH exception as a fatal failure with no known location.
+    internal::ReportFailureInUnknownLocation(TestPartResult::kFatalFailure,
+                                             *exception_message);
+    // The message was heap-allocated above, so release it explicitly.
+    delete exception_message;
+    return static_cast<Result>(0);
+  }
+#else
+  // Without SEH support the method is simply invoked; location is unused.
+  (void)location;
+  return (object->*method)();
+#endif  // GTEST_HAS_SEH
+}
+
+// Runs the given method and catches and reports C++ and/or SEH-style
+// exceptions, if they are supported; returns the 0-value for type
+// Result in case of an SEH exception, and also after a C++ exception
+// has been caught and reported.
+//
+// Arguments:
+//
+//   object:   the object on which to invoke the method
+//   method:   pointer to the parameterless member function to invoke
+//   location: human-readable description of the code being run; used only
+//             in failure messages
+template <class T, typename Result>
+Result HandleExceptionsInMethodIfSupported(
+    T* object, Result (T::*method)(), const char* location) {
+  // NOTE: The user code can affect the way in which Google Test handles
+  // exceptions by setting GTEST_FLAG(catch_exceptions), but only before
+  // RUN_ALL_TESTS() starts. It is technically possible to check the flag
+  // after the exception is caught and either report or re-throw the
+  // exception based on the flag's value:
+  //
+  // try {
+  //   // Perform the test method.
+  // } catch (...) {
+  //   if (GTEST_FLAG(catch_exceptions))
+  //     // Report the exception as failure.
+  //   else
+  //     throw;  // Re-throws the original exception.
+  // }
+  //
+  // However, the purpose of this flag is to allow the program to drop into
+  // the debugger when the exception is thrown. On most platforms, once the
+  // control enters the catch block, the exception origin information is
+  // lost and the debugger will stop the program at the point of the
+  // re-throw in this function -- instead of at the point of the original
+  // throw statement in the code under test.  For this reason, we perform
+  // the check early, sacrificing the ability to affect Google Test's
+  // exception handling in the method where the exception is thrown.
+  if (internal::GetUnitTestImpl()->catch_exceptions()) {
+#if GTEST_HAS_EXCEPTIONS
+    try {
+      return HandleSehExceptionsInMethodIfSupported(object, method, location);
+    } catch (const internal::GoogleTestFailureException&) {  // NOLINT
+      // This exception type can only be thrown by a failed Google
+      // Test assertion with the intention of letting another testing
+      // framework catch it.  Therefore we just re-throw it.
+      throw;
+    } catch (const std::exception& e) {  // NOLINT
+      // A standard exception: include its what() text in the report.
+      internal::ReportFailureInUnknownLocation(
+          TestPartResult::kFatalFailure,
+          FormatCxxExceptionMessage(e.what(), location));
+    } catch (...) {  // NOLINT
+      // Any other exception type: no description is available.
+      internal::ReportFailureInUnknownLocation(
+          TestPartResult::kFatalFailure,
+          FormatCxxExceptionMessage(NULL, location));
+    }
+    // Reached only after one of the two reporting handlers above.
+    return static_cast<Result>(0);
+#else
+    return HandleSehExceptionsInMethodIfSupported(object, method, location);
+#endif  // GTEST_HAS_EXCEPTIONS
+  } else {
+    // Exception catching is disabled: call the method directly so that any
+    // exception propagates to the caller.
+    return (object->*method)();
+  }
+}
+
+}  // namespace internal
+
+// Runs the test and updates the test result.
+//
+// The sequence is SetUp(), then TestBody() (only if no fatal failure has
+// been recorded by then), then TearDown() (always).  Each step goes
+// through HandleExceptionsInMethodIfSupported(), and the stack trace
+// getter is notified via UponLeavingGTest() before each step.
+void Test::Run() {
+  // Refuse to run a test whose fixture class differs from the one used by
+  // the first test of the test case; the failure has already been reported.
+  if (!HasSameFixtureClass()) return;
+
+  internal::UnitTestImpl* const impl = internal::GetUnitTestImpl();
+  impl->os_stack_trace_getter()->UponLeavingGTest();
+  internal::HandleExceptionsInMethodIfSupported(this, &Test::SetUp, "SetUp()");
+  // We will run the test only if SetUp() was successful.
+  if (!HasFatalFailure()) {
+    impl->os_stack_trace_getter()->UponLeavingGTest();
+    internal::HandleExceptionsInMethodIfSupported(
+        this, &Test::TestBody, "the test body");
+  }
+
+  // However, we want to clean up as much as possible.  Hence we will
+  // always call TearDown(), even if SetUp() or the test body has
+  // failed.
+  impl->os_stack_trace_getter()->UponLeavingGTest();
+  internal::HandleExceptionsInMethodIfSupported(
+      this, &Test::TearDown, "TearDown()");
+}
+
+// Returns true iff the result of the currently running test contains a
+// fatal failure.
+bool Test::HasFatalFailure() {
+  const TestResult* const current_result =
+      internal::GetUnitTestImpl()->current_test_result();
+  return current_result->HasFatalFailure();
+}
+
+// Returns true iff the result of the currently running test contains a
+// non-fatal failure.
+bool Test::HasNonfatalFailure() {
+  const TestResult* const current_result =
+      internal::GetUnitTestImpl()->current_test_result();
+  return current_result->HasNonfatalFailure();
+}
+
+// class TestInfo
+
+// Constructs a TestInfo object and assumes ownership of the test factory
+// object (the destructor deletes it).  a_type_param and a_value_param may
+// be NULL, in which case no parameter string is stored.  A new TestInfo
+// starts out not selected to run, not disabled and not matching the filter.
+TestInfo::TestInfo(const std::string& a_test_case_name,
+                   const std::string& a_name,
+                   const char* a_type_param,
+                   const char* a_value_param,
+                   internal::TypeId fixture_class_id,
+                   internal::TestFactoryBase* factory)
+    : test_case_name_(a_test_case_name),
+      name_(a_name),
+      type_param_(a_type_param != NULL ? new std::string(a_type_param)
+                                       : NULL),
+      value_param_(a_value_param != NULL ? new std::string(a_value_param)
+                                         : NULL),
+      fixture_class_id_(fixture_class_id),
+      should_run_(false),
+      is_disabled_(false),
+      matches_filter_(false),
+      factory_(factory),
+      result_() {
+}
+
+// Destructs a TestInfo object, deleting the test factory whose ownership
+// was assumed by the constructor.
+TestInfo::~TestInfo() { delete factory_; }
+
+namespace internal {
+
+// Allocates a TestInfo describing one test, registers it with Google Test
+// through the UnitTestImpl singleton, and returns the new object.
+//
+// Arguments:
+//
+//   test_case_name:   name of the test case
+//   name:             name of the test
+//   type_param:       the name of the test's type parameter, or NULL if
+//                     this is not a typed or a type-parameterized test.
+//   value_param:      text representation of the test's value parameter,
+//                     or NULL if this is not a value-parameterized test.
+//   fixture_class_id: ID of the test fixture class
+//   set_up_tc:        pointer to the function that sets up the test case
+//   tear_down_tc:     pointer to the function that tears down the test case
+//   factory:          pointer to the factory that creates a test object.
+//                     The newly created TestInfo instance will assume
+//                     ownership of the factory object.
+TestInfo* MakeAndRegisterTestInfo(
+    const char* test_case_name,
+    const char* name,
+    const char* type_param,
+    const char* value_param,
+    TypeId fixture_class_id,
+    SetUpTestCaseFunc set_up_tc,
+    TearDownTestCaseFunc tear_down_tc,
+    TestFactoryBase* factory) {
+  TestInfo* const new_info = new TestInfo(
+      test_case_name, name, type_param, value_param, fixture_class_id,
+      factory);
+
+  UnitTestImpl* const impl = GetUnitTestImpl();
+  impl->AddTestInfo(set_up_tc, tear_down_tc, new_info);
+
+  return new_info;
+}
+
+#if GTEST_HAS_PARAM_TEST
+// Writes to stderr an explanation that test_case_name was redefined with a
+// different fixture class, prefixed with the formatted file/line location
+// of the offending definition.
+void ReportInvalidTestCaseType(const char* test_case_name,
+                               const char* file, int line) {
+  Message errors;
+  errors << "Attempted redefinition of test case " << test_case_name << ".\n";
+  errors << "All tests in the same test case must use the same test fixture\n";
+  errors << "class.  However, in test case " << test_case_name
+         << ", you tried\n";
+  errors << "to define a test using a fixture class different from the one\n";
+  errors << "used earlier. This can happen if the two fixture classes are\n";
+  errors << "from different namespaces and have the same name. You should\n";
+  errors << "probably rename one of the classes to put the tests into "
+         << "different\n";
+  errors << "test cases.";
+
+  const std::string where = FormatFileLocation(file, line);
+  const std::string what = errors.GetString();
+  fprintf(stderr, "%s %s", where.c_str(), what.c_str());
+}
+#endif  // GTEST_HAS_PARAM_TEST
+
+}  // namespace internal
+
+namespace {
+
+// A predicate that checks the test name of a TestInfo against a known
+// value.
+//
+// This is used for implementation of the TestCase class only.  We put
+// it in the anonymous namespace to prevent polluting the outer
+// namespace.
+//
+// TestNameIs is copyable.
+
+// This class is commented out because it is not used and therefore produces
+// warnings.
+// class TestNameIs {
+// public:
+//  // Constructor.
+//  //
+//  // TestNameIs has NO default constructor.
+//  explicit TestNameIs(const char* name)
+//      : name_(name) {}
+//
+//  // Returns true iff the test name of test_info matches name_.
+//  bool operator()(const TestInfo * test_info) const {
+//    return test_info && test_info->name() == name_;
+//  }
+//
+// private:
+//  std::string name_;
+//};
+
+}  // namespace
+
+namespace internal {
+
+// Expands all parameterized tests registered with the macros TEST_P and
+// INSTANTIATE_TEST_CASE_P into regular tests and registers those.  The
+// parameterized_tests_registered_ flag ensures the work is done only once;
+// the method does nothing when GTEST_HAS_PARAM_TEST is off.
+void UnitTestImpl::RegisterParameterizedTests() {
+#if GTEST_HAS_PARAM_TEST
+  if (parameterized_tests_registered_) {
+    return;
+  }
+  parameterized_test_registry_.RegisterTests();
+  parameterized_tests_registered_ = true;
+#endif
+}
+
+}  // namespace internal
+
+// Creates the test object, runs it, records its result, and then
+// deletes it.
+//
+// Does nothing when the test is not selected to run.  Otherwise the test
+// is made the current test, listeners are notified before and after, and
+// the elapsed wall time (construction, run and destruction) is stored in
+// the test's result.
+void TestInfo::Run() {
+  if (!should_run_) return;
+
+  // Tells UnitTest where to store test result.
+  internal::UnitTestImpl* const impl = internal::GetUnitTestImpl();
+  impl->set_current_test_info(this);
+
+  TestEventListener* repeater = UnitTest::GetInstance()->listeners().repeater();
+
+  // Notifies the unit test event listeners that a test is about to start.
+  repeater->OnTestStart(*this);
+
+  const TimeInMillis start = internal::GetTimeInMillis();
+
+  impl->os_stack_trace_getter()->UponLeavingGTest();
+
+  // Creates the test object.  If the fixture's constructor throws and the
+  // exception is caught, the handler reports the failure and returns the
+  // 0-value of the result type, i.e. NULL.
+  Test* const test = internal::HandleExceptionsInMethodIfSupported(
+      factory_, &internal::TestFactoryBase::CreateTest,
+      "the test fixture's constructor");
+
+  // Runs the test only if the test object was created and its
+  // constructor didn't generate a fatal failure.
+  if ((test != NULL) && !Test::HasFatalFailure()) {
+    // This doesn't throw as all user code that can throw are wrapped into
+    // exception handling code.
+    test->Run();
+  }
+
+  // Deletes the test object.  This is skipped when no object was created:
+  // invoking a member function through a null pointer is undefined
+  // behavior, and there is nothing to delete in that case anyway.
+  if (test != NULL) {
+    impl->os_stack_trace_getter()->UponLeavingGTest();
+    internal::HandleExceptionsInMethodIfSupported(
+        test, &Test::DeleteSelf_, "the test fixture's destructor");
+  }
+
+  result_.set_elapsed_time(internal::GetTimeInMillis() - start);
+
+  // Notifies the unit test event listener that a test has just finished.
+  repeater->OnTestEnd(*this);
+
+  // Tells UnitTest to stop associating assertion results to this
+  // test.
+  impl->set_current_test_info(NULL);
+}
+
+// class TestCase
+
+// Gets the number of successful tests in this test case, i.e. the number
+// of entries of test_info_list_ that satisfy the TestPassed predicate.
+int TestCase::successful_test_count() const {
+  return CountIf(test_info_list_, TestPassed);
+}
+
+// Gets the number of failed tests in this test case, i.e. the number of
+// entries of test_info_list_ that satisfy the TestFailed predicate.
+int TestCase::failed_test_count() const {
+  return CountIf(test_info_list_, TestFailed);
+}
+
+// Gets the number of disabled tests that will be reported in the XML
+// report, i.e. the number of entries of test_info_list_ that satisfy the
+// TestReportableDisabled predicate.
+int TestCase::reportable_disabled_test_count() const {
+  return CountIf(test_info_list_, TestReportableDisabled);
+}
+
+// Gets the number of disabled tests in this test case, i.e. the number of
+// entries of test_info_list_ that satisfy the TestDisabled predicate.
+int TestCase::disabled_test_count() const {
+  return CountIf(test_info_list_, TestDisabled);
+}
+
+// Gets the number of tests to be printed in the XML report, i.e. the
+// number of entries of test_info_list_ that satisfy the TestReportable
+// predicate.
+int TestCase::reportable_test_count() const {
+  return CountIf(test_info_list_, TestReportable);
+}
+
+// Gets the number of tests in this test case that should run, i.e. the
+// number of entries of test_info_list_ that satisfy the ShouldRunTest
+// predicate.
+int TestCase::test_to_run_count() const {
+  return CountIf(test_info_list_, ShouldRunTest);
+}
+
+// Gets the number of all tests in this test case, i.e. the size of
+// test_info_list_ narrowed to int.
+int TestCase::total_test_count() const {
+  return static_cast<int>(test_info_list_.size());
+}
+
+// Creates a TestCase with the given name.  A new test case is not selected
+// to run and has an elapsed time of zero.
+//
+// Arguments:
+//
+//   a_name:       name of the test case
+//   a_type_param: the name of the test case's type parameter, or NULL if
+//                 this is not a typed or a type-parameterized test case.
+//   set_up_tc:    pointer to the function that sets up the test case
+//   tear_down_tc: pointer to the function that tears down the test case
+TestCase::TestCase(const char* a_name, const char* a_type_param,
+                   Test::SetUpTestCaseFunc set_up_tc,
+                   Test::TearDownTestCaseFunc tear_down_tc)
+    : name_(a_name),
+      type_param_(a_type_param != NULL ? new std::string(a_type_param)
+                                       : NULL),
+      set_up_tc_(set_up_tc),
+      tear_down_tc_(tear_down_tc),
+      should_run_(false),
+      elapsed_time_(0) {}
+
+// Destructor of TestCase.  The test case owns the TestInfo objects added
+// through AddTestInfo() and releases them here.
+TestCase::~TestCase() {
+  // Deletes every Test in the collection.
+  ForEach(test_info_list_, internal::Delete<TestInfo>);
+}
+
+// Returns the i-th test among all the tests, where i is a position in
+// test_indices_ (the possibly shuffled order).  i can range from 0 to
+// total_test_count() - 1.  If i is not in that range, returns NULL.
+const TestInfo* TestCase::GetTestInfo(int i) const {
+  const int index = GetElementOr(test_indices_, i, -1);
+  if (index < 0) {
+    return NULL;
+  }
+  return test_info_list_[index];
+}
+
+// Mutable counterpart of GetTestInfo(): returns the i-th test among all
+// the tests, where i is a position in test_indices_ (the possibly shuffled
+// order).  i can range from 0 to total_test_count() - 1.  If i is not in
+// that range, returns NULL.
+TestInfo* TestCase::GetMutableTestInfo(int i) {
+  const int index = GetElementOr(test_indices_, i, -1);
+  if (index < 0) {
+    return NULL;
+  }
+  return test_info_list_[index];
+}
+
+// Adds a test to this test case and appends its position to the index
+// table.  The test will be deleted upon destruction of the TestCase
+// object.
+void TestCase::AddTestInfo(TestInfo * test_info) {
+  // The new entry of the index table refers to itself, i.e. the table
+  // stays in unshuffled order as tests are added.
+  const int next_index = static_cast<int>(test_indices_.size());
+  test_info_list_.push_back(test_info);
+  test_indices_.push_back(next_index);
+}
+
+// Runs every test in this TestCase.
+//
+// Does nothing when the test case is not selected to run.  Otherwise the
+// test case is made the current one, listeners are notified before and
+// after, and the set-up and tear-down functions of the test case are
+// invoked around the tests through the exception-handling wrapper.
+void TestCase::Run() {
+  if (!should_run_) return;
+
+  internal::UnitTestImpl* const impl = internal::GetUnitTestImpl();
+  impl->set_current_test_case(this);
+
+  TestEventListener* repeater = UnitTest::GetInstance()->listeners().repeater();
+
+  repeater->OnTestCaseStart(*this);
+  impl->os_stack_trace_getter()->UponLeavingGTest();
+  internal::HandleExceptionsInMethodIfSupported(
+      this, &TestCase::RunSetUpTestCase, "SetUpTestCase()");
+
+  // The elapsed time covers only the tests themselves: the clock starts
+  // after the set-up call and stops before the tear-down call.
+  const internal::TimeInMillis start = internal::GetTimeInMillis();
+  for (int i = 0; i < total_test_count(); i++) {
+    GetMutableTestInfo(i)->Run();
+  }
+  elapsed_time_ = internal::GetTimeInMillis() - start;
+
+  impl->os_stack_trace_getter()->UponLeavingGTest();
+  internal::HandleExceptionsInMethodIfSupported(
+      this, &TestCase::RunTearDownTestCase, "TearDownTestCase()");
+
+  repeater->OnTestCaseEnd(*this);
+  // Stop associating subsequent activity with this test case.
+  impl->set_current_test_case(NULL);
+}
+
+// Clears the results of all tests in this test case, as well as the
+// test case's own ad hoc test result.
+void TestCase::ClearResult() {
+  ad_hoc_test_result_.Clear();
+  ForEach(test_info_list_, TestInfo::ClearTestResult);
+}
+
+// Shuffles the tests in this test case.  Only the index table
+// test_indices_ is permuted, using the given random generator; the list of
+// TestInfo objects itself is left untouched.
+void TestCase::ShuffleTests(internal::Random* random) {
+  Shuffle(random, &test_indices_);
+}
+
+// Restores the test order to before the first shuffle by resetting the
+// index table to the identity mapping.
+void TestCase::UnshuffleTests() {
+  const size_t index_count = test_indices_.size();
+  for (size_t position = 0; position != index_count; ++position) {
+    test_indices_[position] = static_cast<int>(position);
+  }
+}
+
+// Formats a count followed by a noun, picking the singular form when the
+// count is exactly 1 and the plural form otherwise. e.g.
+//
+// FormatCountableNoun(1, "formula", "formulae") returns "1 formula".
+// FormatCountableNoun(5, "book", "books") returns "5 books".
+static std::string FormatCountableNoun(int count,
+                                       const char * singular_form,
+                                       const char * plural_form) {
+  const char* const noun = (count == 1) ? singular_form : plural_form;
+  return internal::StreamableToString(count) + " " + noun;
+}
+
+// Formats the count of tests, e.g. "1 test" or "3 tests".
+static std::string FormatTestCount(int test_count) {
+  return FormatCountableNoun(test_count, "test", "tests");
+}
+
+// Formats the count of test cases, e.g. "1 test case" or "3 test cases".
+static std::string FormatTestCaseCount(int test_case_count) {
+  return FormatCountableNoun(test_case_count, "test case", "test cases");
+}
+
+// Converts a TestPartResult::Type enum to a human-friendly string.  Both
+// kNonFatalFailure and kFatalFailure map to the same text, as the user
+// usually doesn't care about the difference between the two when viewing
+// the test result.  The failure text differs between MSVC builds
+// ("error: ") and all other builds ("Failure\n").
+static const char * TestPartResultTypeToString(TestPartResult::Type type) {
+  if (type == TestPartResult::kSuccess) {
+    return "Success";
+  }
+
+  if (type == TestPartResult::kNonFatalFailure ||
+      type == TestPartResult::kFatalFailure) {
+#ifdef _MSC_VER
+    return "error: ";
+#else
+    return "Failure\n";
+#endif
+  }
+
+  return "Unknown result type";
+}
+
+namespace internal {
+
+// Renders a TestPartResult as text: the formatted file location, a space,
+// the result type string, and finally the result's message.
+static std::string PrintTestPartResultToString(
+    const TestPartResult& test_part_result) {
+  Message text;
+  text << internal::FormatFileLocation(test_part_result.file_name(),
+                                       test_part_result.line_number());
+  text << " " << TestPartResultTypeToString(test_part_result.type());
+  text << test_part_result.message();
+  return text.GetString();
+}
+
+// Prints a TestPartResult to stdout, followed by a newline, and flushes
+// stdout.
+static void PrintTestPartResult(const TestPartResult& test_part_result) {
+  const std::string text = PrintTestPartResultToString(test_part_result);
+  const char* const c_text = text.c_str();
+
+  printf("%s\n", c_text);
+  fflush(stdout);
+
+  // If the test program runs in Visual Studio or a debugger, the
+  // following statements add the test part result message to the Output
+  // window such that the user can double-click on it to jump to the
+  // corresponding source code location; otherwise they do nothing.
+#if GTEST_OS_WINDOWS && !GTEST_OS_WINDOWS_MOBILE
+  // We don't call OutputDebugString*() on Windows Mobile, as printing
+  // to stdout is done by OutputDebugString() there already - we don't
+  // want the same message printed twice.
+  ::OutputDebugStringA(c_text);
+  ::OutputDebugStringA("\n");
+#endif
+}
+
+// class PrettyUnitTestResultPrinter
+
+// Colors understood by ColoredPrintf().
+enum GTestColor {
+  COLOR_DEFAULT,  // No coloring: ColoredPrintf() prints the text as is.
+  COLOR_RED,
+  COLOR_GREEN,
+  COLOR_YELLOW
+};
+
+#if GTEST_OS_WINDOWS && !GTEST_OS_WINDOWS_MOBILE
+
+// Returns the Windows console character attribute for the given color.
+// Yellow is composed of red and green; any other value, including
+// COLOR_DEFAULT, yields 0.
+WORD GetColorAttribute(GTestColor color) {
+  WORD attribute = 0;
+  switch (color) {
+    case COLOR_RED:
+      attribute = FOREGROUND_RED;
+      break;
+    case COLOR_GREEN:
+      attribute = FOREGROUND_GREEN;
+      break;
+    case COLOR_YELLOW:
+      attribute = FOREGROUND_RED | FOREGROUND_GREEN;
+      break;
+    default:
+      break;
+  }
+  return attribute;
+}
+
+#else
+
+// Returns the ANSI color code digit for the given color as a string.
+// COLOR_DEFAULT is an invalid input and yields NULL, as does any other
+// unlisted value.
+const char* GetAnsiColorCode(GTestColor color) {
+  const char* code = NULL;
+  switch (color) {
+    case COLOR_RED:
+      code = "1";
+      break;
+    case COLOR_GREEN:
+      code = "2";
+      break;
+    case COLOR_YELLOW:
+      code = "3";
+      break;
+    default:
+      break;
+  }
+  return code;
+}
+
+#endif  // GTEST_OS_WINDOWS && !GTEST_OS_WINDOWS_MOBILE
+
+// Returns true iff Google Test should use colors in the output, based on
+// the value of the color flag and on whether stdout is a terminal.
+bool ShouldUseColor(bool stdout_is_tty) {
+  const char* const gtest_color = GTEST_FLAG(color).c_str();
+
+  if (!String::CaseInsensitiveCStringEquals(gtest_color, "auto")) {
+    // We take "yes", "true", "t", and "1" as meaning "yes".  If the
+    // value is neither one of these nor "auto", we treat it as "no" to
+    // be conservative.
+    return String::CaseInsensitiveCStringEquals(gtest_color, "yes") ||
+        String::CaseInsensitiveCStringEquals(gtest_color, "true") ||
+        String::CaseInsensitiveCStringEquals(gtest_color, "t") ||
+        String::CStringEquals(gtest_color, "1");
+  }
+
+  // The flag is "auto": decide from the environment.
+#if GTEST_OS_WINDOWS
+  // On Windows the TERM variable is usually not set, but the
+  // console there does support colors.
+  return stdout_is_tty;
+#else
+  // On non-Windows platforms, we rely on the TERM variable.
+  const char* const term = posix::GetEnv("TERM");
+  const char* const color_terms[] = {
+    "xterm",
+    "xterm-color",
+    "xterm-256color",
+    "screen",
+    "screen-256color",
+    "linux",
+    "cygwin"
+  };
+  const size_t color_term_count = sizeof(color_terms) / sizeof(color_terms[0]);
+
+  bool term_supports_color = false;
+  for (size_t i = 0; i < color_term_count && !term_supports_color; ++i) {
+    term_supports_color = String::CStringEquals(term, color_terms[i]);
+  }
+  return stdout_is_tty && term_supports_color;
+#endif  // GTEST_OS_WINDOWS
+}
+
+// Helpers for printing colored strings to stdout. Note that on Windows, we
+// cannot simply emit special characters and have the terminal change colors.
+// This routine must actually emit the characters rather than return a string
+// that would be colored when printed, as can be done on Linux.
+//
+// Arguments:
+//
+//   color: the color to print in; COLOR_DEFAULT prints without coloring
+//   fmt:   printf-style format string, followed by its arguments
+void ColoredPrintf(GTestColor color, const char* fmt, ...) {
+  va_list args;
+  va_start(args, fmt);
+
+#if GTEST_OS_WINDOWS_MOBILE || GTEST_OS_SYMBIAN || GTEST_OS_ZOS || GTEST_OS_IOS
+  // Colors are never used on these platforms.
+  const bool use_color = false;
+#else
+  // Function-local static: the color decision is made once, on the first
+  // call, and reused afterwards.
+  static const bool in_color_mode =
+      ShouldUseColor(posix::IsATTY(posix::FileNo(stdout)) != 0);
+  const bool use_color = in_color_mode && (color != COLOR_DEFAULT);
+#endif  // GTEST_OS_WINDOWS_MOBILE || GTEST_OS_SYMBIAN || GTEST_OS_ZOS
+  // The '!= 0' comparison is necessary to satisfy MSVC 7.1.
+
+  // Plain path: print the text unchanged and finish.
+  if (!use_color) {
+    vprintf(fmt, args);
+    va_end(args);
+    return;
+  }
+
+#if GTEST_OS_WINDOWS && !GTEST_OS_WINDOWS_MOBILE
+  const HANDLE stdout_handle = GetStdHandle(STD_OUTPUT_HANDLE);
+
+  // Gets the current text color.
+  CONSOLE_SCREEN_BUFFER_INFO buffer_info;
+  GetConsoleScreenBufferInfo(stdout_handle, &buffer_info);
+  const WORD old_color_attrs = buffer_info.wAttributes;
+
+  // We need to flush the stream buffers into the console before each
+  // SetConsoleTextAttribute call lest it affect the text that is already
+  // printed but has not yet reached the console.
+  fflush(stdout);
+  SetConsoleTextAttribute(stdout_handle,
+                          GetColorAttribute(color) | FOREGROUND_INTENSITY);
+  vprintf(fmt, args);
+
+  fflush(stdout);
+  // Restores the text color.
+  SetConsoleTextAttribute(stdout_handle, old_color_attrs);
+#else
+  // color is not COLOR_DEFAULT here (see use_color above), so the color
+  // code is non-NULL for the three defined colors.
+  printf("\033[0;3%sm", GetAnsiColorCode(color));
+  vprintf(fmt, args);
+  printf("\033[m");  // Resets the terminal to default.
+#endif  // GTEST_OS_WINDOWS && !GTEST_OS_WINDOWS_MOBILE
+  va_end(args);
+}
+
+// Text printed in Google Test's text output and --gtest_list_tests
+// output to label the type parameter and value parameter for a test.
+static const char kTypeParamLabel[] = "TypeParam";
+static const char kValueParamLabel[] = "GetParam()";
+
+// Prints ", where TypeParam = ... and GetParam() = ..." for the given
+// test, mentioning only the parameters that are present.  Prints nothing
+// when the test has neither a type parameter nor a value parameter.
+void PrintFullTestCommentIfPresent(const TestInfo& test_info) {
+  const char* const type_param = test_info.type_param();
+  const char* const value_param = test_info.value_param();
+  const bool has_type_param = type_param != NULL;
+  const bool has_value_param = value_param != NULL;
+
+  if (!has_type_param && !has_value_param) {
+    return;
+  }
+
+  printf(", where ");
+  if (has_type_param) {
+    printf("%s = %s", kTypeParamLabel, type_param);
+  }
+  if (has_type_param && has_value_param) {
+    printf(" and ");
+  }
+  if (has_value_param) {
+    printf("%s = %s", kValueParamLabel, value_param);
+  }
+}
+
+// This class implements the TestEventListener interface.  The events that
+// need no output are implemented inline as empty methods; the others are
+// defined out of line.
+//
+// Class PrettyUnitTestResultPrinter is copyable.
+class PrettyUnitTestResultPrinter : public TestEventListener {
+ public:
+  PrettyUnitTestResultPrinter() {}
+  // Prints "<test_case>.<test>" to stdout, without a trailing newline.
+  static void PrintTestName(const char * test_case, const char * test) {
+    printf("%s.%s", test_case, test);
+  }
+
+  // The following methods override what's in the TestEventListener class.
+  virtual void OnTestProgramStart(const UnitTest& /*unit_test*/) {}
+  virtual void OnTestIterationStart(const UnitTest& unit_test, int iteration);
+  virtual void OnEnvironmentsSetUpStart(const UnitTest& unit_test);
+  virtual void OnEnvironmentsSetUpEnd(const UnitTest& /*unit_test*/) {}
+  virtual void OnTestCaseStart(const TestCase& test_case);
+  virtual void OnTestStart(const TestInfo& test_info);
+  virtual void OnTestPartResult(const TestPartResult& result);
+  virtual void OnTestEnd(const TestInfo& test_info);
+  virtual void OnTestCaseEnd(const TestCase& test_case);
+  virtual void OnEnvironmentsTearDownStart(const UnitTest& unit_test);
+  virtual void OnEnvironmentsTearDownEnd(const UnitTest& /*unit_test*/) {}
+  virtual void OnTestIterationEnd(const UnitTest& unit_test, int iteration);
+  virtual void OnTestProgramEnd(const UnitTest& /*unit_test*/) {}
+
+ private:
+  // Helper for the out-of-line methods; defined outside the class body.
+  static void PrintFailedTests(const UnitTest& unit_test);
+};
+
+// Fired before each iteration of tests starts.  Prints the repeat banner,
+// any filter/shard/shuffle notes, and the "Running ..." header to stdout.
+void PrettyUnitTestResultPrinter::OnTestIterationStart(
+    const UnitTest& unit_test, int iteration) {
+  if (GTEST_FLAG(repeat) != 1) {
+    printf("\nRepeating all tests (iteration %d) . . .\n\n", iteration + 1);
+  }
+
+  // A filter other than "*" may skip tests, so remind the user it is active.
+  const char* const active_filter = GTEST_FLAG(filter).c_str();
+  if (!String::CStringEquals(active_filter, kUniversalFilter)) {
+    ColoredPrintf(COLOR_YELLOW,
+                  "Note: %s filter = %s\n", GTEST_NAME_, active_filter);
+  }
+
+  if (internal::ShouldShard(kTestTotalShards, kTestShardIndex, false)) {
+    const int shard_number =
+        static_cast<int>(Int32FromEnvOrDie(kTestShardIndex, -1)) + 1;
+    ColoredPrintf(COLOR_YELLOW, "Note: This is test shard %d of %s.\n",
+                  shard_number, internal::posix::GetEnv(kTestTotalShards));
+  }
+
+  if (GTEST_FLAG(shuffle)) {
+    ColoredPrintf(COLOR_YELLOW,
+                  "Note: Randomizing tests' orders with a seed of %d .\n",
+                  unit_test.random_seed());
+  }
+
+  ColoredPrintf(COLOR_GREEN,  "[==========] ");
+  const std::string tests = FormatTestCount(unit_test.test_to_run_count());
+  const std::string cases =
+      FormatTestCaseCount(unit_test.test_case_to_run_count());
+  printf("Running %s from %s.\n", tests.c_str(), cases.c_str());
+  fflush(stdout);
+}
+
+void PrettyUnitTestResultPrinter::OnEnvironmentsSetUpStart(
+    const UnitTest& /*unit_test*/) {
+  ColoredPrintf(COLOR_GREEN,  "[----------] ");
+  printf("Global test environment set-up.\n");  // announces global set-up
+  fflush(stdout);  // flush so the banner is written out immediately
+}
+
+// Prints "[----------] N tests from <case>[, where TypeParam = <type>]".
+void PrettyUnitTestResultPrinter::OnTestCaseStart(const TestCase& test_case) {
+  const std::string test_count =
+      FormatCountableNoun(test_case.test_to_run_count(), "test", "tests");
+  ColoredPrintf(COLOR_GREEN, "[----------] ");
+  printf("%s from %s", test_count.c_str(), test_case.name());
+  if (test_case.type_param() != NULL)
+    printf(", where %s = %s\n", kTypeParamLabel, test_case.type_param());
+  else
+    printf("\n");
+  fflush(stdout);
+}
+
+void PrettyUnitTestResultPrinter::OnTestStart(const TestInfo& test_info) {
+  ColoredPrintf(COLOR_GREEN,  "[ RUN      ] ");
+  PrintTestName(test_info.test_case_name(), test_info.name());
+  printf("\n");  // ends the "[ RUN      ] <case>.<test>" line
+  fflush(stdout);  // flush so the line is written out immediately
+}
+
+// Called for every test part result; only non-successful parts are printed.
+void PrettyUnitTestResultPrinter::OnTestPartResult(
+    const TestPartResult& result) {
+  const bool succeeded = result.type() == TestPartResult::kSuccess;
+  if (!succeeded) {
+    // Report what the assertion said (e.g. expected this and got that) and
+    // flush so the message is written out immediately.
+    PrintTestPartResult(result);
+    fflush(stdout);
+  }
+}
+
+// Prints the result line for a finished test, plus its time if print_time.
+void PrettyUnitTestResultPrinter::OnTestEnd(const TestInfo& test_info) {
+  const TestResult& outcome = *test_info.result();
+  if (outcome.Passed())
+    ColoredPrintf(COLOR_GREEN, "[       OK ] ");
+  else
+    ColoredPrintf(COLOR_RED, "[  FAILED  ] ");
+  PrintTestName(test_info.test_case_name(), test_info.name());
+  if (outcome.Failed()) PrintFullTestCommentIfPresent(test_info);
+  if (!GTEST_FLAG(print_time)) {
+    printf("\n");
+  } else {
+    const std::string millis =
+        internal::StreamableToString(outcome.elapsed_time());
+    printf(" (%s ms)\n", millis.c_str());
+  }
+  fflush(stdout);
+}
+
+// Prints the per-test-case timing summary; silent unless print_time is set.
+void PrettyUnitTestResultPrinter::OnTestCaseEnd(const TestCase& test_case) {
+  if (GTEST_FLAG(print_time)) {
+    const std::string test_count =
+        FormatCountableNoun(test_case.test_to_run_count(), "test", "tests");
+    ColoredPrintf(COLOR_GREEN, "[----------] ");
+    printf("%s from %s (%s ms total)\n\n", test_count.c_str(), test_case.name(),
+           internal::StreamableToString(test_case.elapsed_time()).c_str());
+    fflush(stdout);
+  }
+}
+
+void PrettyUnitTestResultPrinter::OnEnvironmentsTearDownStart(
+    const UnitTest& /*unit_test*/) {
+  ColoredPrintf(COLOR_GREEN,  "[----------] ");
+  printf("Global test environment tear-down\n");  // announces global tear-down
+  fflush(stdout);  // flush so the banner is written out immediately
+}
+
+// Internal helper: prints one "[  FAILED  ] <case>.<test>" line (plus any
+// type/value parameters) for every test that was run and did not pass.
+void PrettyUnitTestResultPrinter::PrintFailedTests(const UnitTest& unit_test) {
+  if (unit_test.failed_test_count() == 0) return;  // nothing to list
+
+  for (int c = 0; c < unit_test.total_test_case_count(); ++c) {
+    const TestCase& current = *unit_test.GetTestCase(c);
+    // Skip test cases that did not run or that have no failed test.
+    const bool has_failures =
+        current.should_run() && current.failed_test_count() != 0;
+    if (!has_failures) continue;
+
+    for (int t = 0; t < current.total_test_count(); ++t) {
+      const TestInfo& info = *current.GetTestInfo(t);
+      // Only tests that ran and did not pass are listed.
+      if (info.should_run() && !info.result()->Passed()) {
+        ColoredPrintf(COLOR_RED, "[  FAILED  ] ");
+        printf("%s.%s", current.name(), info.name());
+        PrintFullTestCommentIfPresent(info);
+        printf("\n");
+      }
+    }
+  }
+}
+
+// Prints the end-of-iteration summary: totals, pass/fail counts, the list of
+// failed tests and a reminder about disabled tests.
+void PrettyUnitTestResultPrinter::OnTestIterationEnd(const UnitTest& unit_test,
+                                                     int /*iteration*/) {
+  ColoredPrintf(COLOR_GREEN,  "[==========] ");
+  printf("%s from %s ran.",
+         FormatTestCount(unit_test.test_to_run_count()).c_str(),
+         FormatTestCaseCount(unit_test.test_case_to_run_count()).c_str());
+  if (GTEST_FLAG(print_time)) {
+    const std::string total_ms =
+        internal::StreamableToString(unit_test.elapsed_time());
+    printf(" (%s ms total)", total_ms.c_str());
+  }
+  printf("\n");
+  ColoredPrintf(COLOR_GREEN,  "[  PASSED  ] ");
+  printf("%s.\n", FormatTestCount(unit_test.successful_test_count()).c_str());
+
+  const int failure_count = unit_test.failed_test_count();
+  const char* const failure_noun = failure_count == 1 ? "TEST" : "TESTS";
+  if (!unit_test.Passed()) {
+    ColoredPrintf(COLOR_RED,  "[  FAILED  ] ");
+    printf("%s, listed below:\n", FormatTestCount(failure_count).c_str());
+    PrintFailedTests(unit_test);
+    printf("\n%2d FAILED %s\n", failure_count, failure_noun);
+  }
+
+  const int disabled_count = unit_test.reportable_disabled_test_count();
+  if (disabled_count != 0 && !GTEST_FLAG(also_run_disabled_tests)) {
+    // Add a spacer if no FAILURE banner is displayed.
+    if (failure_count == 0) printf("\n");
+    ColoredPrintf(COLOR_YELLOW, "  YOU HAVE %d DISABLED %s\n\n",
+                  disabled_count, disabled_count == 1 ? "TEST" : "TESTS");
+  }
+
+  // Flush so Google Test output precedes, e.g., heapchecker output.
+  fflush(stdout);
+}
+
+// End PrettyUnitTestResultPrinter
+
+// class TestEventRepeater
+//
+// Forwards each event to every appended listener; *End events go in reverse.
+class TestEventRepeater : public TestEventListener {
+ public:
+  TestEventRepeater() : forwarding_enabled_(true) {}
+  virtual ~TestEventRepeater();
+  void Append(TestEventListener *listener);  // adds to the end of listeners_
+  TestEventListener* Release(TestEventListener* listener);  // NULL if absent
+
+  // Accessors for the flag that controls whether events will be forwarded to
+  // listeners_. Set to false in death test child processes.
+  bool forwarding_enabled() const { return forwarding_enabled_; }
+  void set_forwarding_enabled(bool enable) { forwarding_enabled_ = enable; }
+
+  virtual void OnTestProgramStart(const UnitTest& unit_test);
+  virtual void OnTestIterationStart(const UnitTest& unit_test, int iteration);
+  virtual void OnEnvironmentsSetUpStart(const UnitTest& unit_test);
+  virtual void OnEnvironmentsSetUpEnd(const UnitTest& unit_test);
+  virtual void OnTestCaseStart(const TestCase& test_case);
+  virtual void OnTestStart(const TestInfo& test_info);
+  virtual void OnTestPartResult(const TestPartResult& result);
+  virtual void OnTestEnd(const TestInfo& test_info);
+  virtual void OnTestCaseEnd(const TestCase& test_case);
+  virtual void OnEnvironmentsTearDownStart(const UnitTest& unit_test);
+  virtual void OnEnvironmentsTearDownEnd(const UnitTest& unit_test);
+  virtual void OnTestIterationEnd(const UnitTest& unit_test, int iteration);
+  virtual void OnTestProgramEnd(const UnitTest& unit_test);
+
+ private:
+  // Controls whether events will be forwarded to listeners_. Set to false
+  // in death test child processes.  Starts out true (see the constructor).
+  bool forwarding_enabled_;
+  // The listeners that receive events, in the order they were appended.
+  std::vector<TestEventListener*> listeners_;
+
+  GTEST_DISALLOW_COPY_AND_ASSIGN_(TestEventRepeater);
+};
+
+TestEventRepeater::~TestEventRepeater() {  // listeners still held get Delete<>
+  ForEach(listeners_, Delete<TestEventListener>);
+}
+
+void TestEventRepeater::Append(TestEventListener *listener) {
+  listeners_.push_back(listener);  // no NULL or duplicate check is made here
+}
+
+// Removes listener from the list and returns it, or NULL if it is not there.
+// TODO(vladl@google.com): Factor the search functionality into Vector::Find.
+TestEventListener* TestEventRepeater::Release(TestEventListener *listener) {
+  typedef std::vector<TestEventListener*>::iterator ListenerIterator;
+  for (ListenerIterator it = listeners_.begin(); it != listeners_.end(); ++it) {
+    if (*it != listener) continue;
+    listeners_.erase(it);
+    return listener;
+  }
+  return NULL;
+}
+
+// Since most methods are very similar, use macros to reduce boilerplate.
+// Defines TestEventRepeater::Name(const Type&): forwards in listener order.
+#define GTEST_REPEATER_METHOD_(Name, Type) \
+void TestEventRepeater::Name(const Type& parameter) { \
+  if (forwarding_enabled_) { \
+    for (size_t i = 0; i < listeners_.size(); i++) { \
+      listeners_[i]->Name(parameter); \
+    } \
+  } \
+}
+// Same as above, but forwards to the listeners in reverse order; it is used
+// for the *End events below.  Nothing is forwarded while forwarding is off.
+#define GTEST_REVERSE_REPEATER_METHOD_(Name, Type) \
+void TestEventRepeater::Name(const Type& parameter) { \
+  if (forwarding_enabled_) { \
+    for (int i = static_cast<int>(listeners_.size()) - 1; i >= 0; i--) { \
+      listeners_[i]->Name(parameter); \
+    } \
+  } \
+}
+
+GTEST_REPEATER_METHOD_(OnTestProgramStart, UnitTest)
+GTEST_REPEATER_METHOD_(OnEnvironmentsSetUpStart, UnitTest)
+GTEST_REPEATER_METHOD_(OnTestCaseStart, TestCase)
+GTEST_REPEATER_METHOD_(OnTestStart, TestInfo)
+GTEST_REPEATER_METHOD_(OnTestPartResult, TestPartResult)
+GTEST_REPEATER_METHOD_(OnEnvironmentsTearDownStart, UnitTest)
+GTEST_REVERSE_REPEATER_METHOD_(OnEnvironmentsSetUpEnd, UnitTest)
+GTEST_REVERSE_REPEATER_METHOD_(OnEnvironmentsTearDownEnd, UnitTest)
+GTEST_REVERSE_REPEATER_METHOD_(OnTestEnd, TestInfo)
+GTEST_REVERSE_REPEATER_METHOD_(OnTestCaseEnd, TestCase)
+GTEST_REVERSE_REPEATER_METHOD_(OnTestProgramEnd, UnitTest)
+
+#undef GTEST_REPEATER_METHOD_
+#undef GTEST_REVERSE_REPEATER_METHOD_
+
+// Forwards the iteration-start event to the listeners in appended order.
+void TestEventRepeater::OnTestIterationStart(const UnitTest& unit_test,
+                                             int iteration) {
+  if (!forwarding_enabled_) return;
+  for (size_t index = 0; index < listeners_.size(); ++index) {
+    listeners_[index]->OnTestIterationStart(unit_test, iteration);
+  }
+}
+
+// Forwards the iteration-end event to the listeners in reverse order.
+void TestEventRepeater::OnTestIterationEnd(const UnitTest& unit_test,
+                                           int iteration) {
+  if (!forwarding_enabled_) return;
+  for (int index = static_cast<int>(listeners_.size()); index-- > 0;) {
+    listeners_[index]->OnTestIterationEnd(unit_test, iteration);
+  }
+}
+
+// End TestEventRepeater
+
+// This class generates an XML output file.
+class XmlUnitTestResultPrinter : public EmptyTestEventListener {
+ public:
+  explicit XmlUnitTestResultPrinter(const char* output_file);
+
+  virtual void OnTestIterationEnd(const UnitTest& unit_test, int iteration);
+
+ private:
+  // Is c a whitespace character that is normalized to a space character
+  // when it appears in an XML attribute value?
+  static bool IsNormalizableWhitespace(char c) {
+    return c == 0x9 || c == 0xA || c == 0xD;
+  }
+
+  // May c appear in a well-formed XML document?  The test is done on the
+  // unsigned value: where plain char is signed, bytes >= 0x80 (e.g. those of
+  // UTF-8 multi-byte sequences) are negative and would wrongly be rejected.
+  static bool IsValidXmlCharacter(char c) {
+    return IsNormalizableWhitespace(c) ||
+           static_cast<unsigned char>(c) >= 0x20;
+  }
+
+  // Returns an XML-escaped copy of the input string str.  If is_attribute is
+  // true, the text is meant to appear as an attribute value, and normalizable
+  // whitespace is preserved by replacing it with character references.
+  static std::string EscapeXml(const std::string& str, bool is_attribute);
+
+  // Returns the given string with all characters invalid in XML removed.
+  static std::string RemoveInvalidXmlCharacters(const std::string& str);
+
+  // Convenience wrapper around EscapeXml when str is an attribute value.
+  static std::string EscapeXmlAttribute(const std::string& str) {
+    return EscapeXml(str, true);
+  }
+
+  // Convenience wrapper around EscapeXml when str is not an attribute value.
+  static std::string EscapeXmlText(const char* str) {
+    return EscapeXml(str, false);
+  }
+
+  // Verifies that the given attribute belongs to the given element and
+  // streams the attribute as XML.
+  static void OutputXmlAttribute(
+      std::ostream* stream, const std::string& element_name,
+      const std::string& name, const std::string& value);
+
+  // Streams an XML CDATA section, escaping invalid CDATA sequences as needed.
+  static void OutputXmlCDataSection(::std::ostream* stream, const char* data);
+
+  // Streams an XML representation of a TestInfo object.
+  static void OutputXmlTestInfo(::std::ostream* stream,
+                                const char* test_case_name,
+                                const TestInfo& test_info);
+
+  // Prints an XML representation of a TestCase object
+  static void PrintXmlTestCase(::std::ostream* stream,
+                               const TestCase& test_case);
+
+  // Prints an XML summary of unit_test to output stream out.
+  static void PrintXmlUnitTest(::std::ostream* stream,
+                               const UnitTest& unit_test);
+
+  // Produces a string representing the test properties in a result as space
+  // delimited XML attributes based on the property key="value" pairs.  A
+  // non-empty result starts with a space, delimiting it from prior attributes.
+  static std::string TestPropertiesAsXmlAttributes(const TestResult& result);
+
+  // The output file.
+  const std::string output_file_;
+
+  GTEST_DISALLOW_COPY_AND_ASSIGN_(XmlUnitTestResultPrinter);
+};
+
+// Creates a new XmlUnitTestResultPrinter.
+XmlUnitTestResultPrinter::XmlUnitTestResultPrinter(const char* output_file)
+    : output_file_(output_file) {
+  if (output_file_.c_str() == NULL || output_file_.empty()) {
+    fprintf(stderr, "XML output file may not be null\n");
+    fflush(stderr);
+    exit(EXIT_FAILURE);
+  }
+}
+
+// Called after the unit test ends.  Writes the XML report to output_file_;
+// exits with EXIT_FAILURE when the file cannot be opened.
+void XmlUnitTestResultPrinter::OnTestIterationEnd(const UnitTest& unit_test,
+                                                  int /*iteration*/) {
+  FilePath report_path(output_file_);
+  FilePath report_dir(report_path.RemoveFileName());
+
+  // Only try to open the file when CreateDirectoriesRecursively() succeeds.
+  FILE* const report = report_dir.CreateDirectoriesRecursively()
+      ? posix::FOpen(output_file_.c_str(), "w")
+      : NULL;
+  if (report == NULL) {
+    // TODO(wan): report the reason of the failure.
+    //
+    // We don't do it for now as:
+    //
+    //   1. There is no urgent need for it.
+    //   2. It's a bit involved to make the errno variable thread-safe on
+    //      all three operating systems (Linux, Windows, and Mac OS).
+    //   3. To interpret the meaning of errno in a thread-safe way,
+    //      we need the strerror_r() function, which is not available on
+    //      Windows.
+    fprintf(stderr, "Unable to open file \"%s\"\n", output_file_.c_str());
+    fflush(stderr);
+    exit(EXIT_FAILURE);
+  }
+
+  std::stringstream xml;
+  PrintXmlUnitTest(&xml, unit_test);
+  fprintf(report, "%s", StringStreamToString(&xml).c_str());
+  fclose(report);
+}
+
+// Returns an XML-escaped copy of the input string str.
+//
+// '<', '>' and '&' are always replaced by their entities.  If is_attribute
+// is true, the text is meant to appear as an attribute value: quotes are
+// escaped as well, and normalizable whitespace is preserved by replacing
+// it with character references.
+//
+// Invalid XML characters in str, if any, are stripped from the output.
+// It is expected that most, if not all, of the text processed by this
+// module will consist of ordinary English text.
+// If this module is ever modified to produce version 1.1 XML output,
+// most invalid characters can be retained using character references.
+// TODO(wan): It might be nice to have a minimally invasive, human-readable
+// escaping scheme for invalid characters, rather than dropping them.
+std::string XmlUnitTestResultPrinter::EscapeXml(
+    const std::string& str, bool is_attribute) {
+  Message escaped;
+
+  for (std::string::const_iterator it = str.begin(); it != str.end(); ++it) {
+    const char ch = *it;
+    if (ch == '<') {
+      escaped << "&lt;";
+    } else if (ch == '>') {
+      escaped << "&gt;";
+    } else if (ch == '&') {
+      escaped << "&amp;";
+    } else if (ch == '\'') {
+      // Quotes are only escaped inside attribute values.
+      if (is_attribute) {
+        escaped << "&apos;";
+      } else {
+        escaped << ch;
+      }
+    } else if (ch == '"') {
+      if (is_attribute) {
+        escaped << "&quot;";
+      } else {
+        escaped << ch;
+      }
+    } else if (!IsValidXmlCharacter(ch)) {
+      // Characters that IsValidXmlCharacter() rejects are dropped from the
+      // output altogether.
+      continue;
+    } else if (is_attribute && IsNormalizableWhitespace(ch)) {
+      // Written as a "&#x..;" character reference in attribute values.
+      escaped << "&#x" << String::FormatByte(static_cast<unsigned char>(ch))
+              << ";";
+    } else {
+      escaped << ch;
+    }
+  }
+
+  return escaped.GetString();
+}
+
+// Returns a copy of str from which every character rejected by
+// IsValidXmlCharacter() has been dropped (rather than being replaced by a
+// placeholder such as . or ?).
+std::string XmlUnitTestResultPrinter::RemoveInvalidXmlCharacters(
+    const std::string& str) {
+  std::string cleaned;
+  cleaned.reserve(str.size());
+  for (size_t i = 0; i < str.size(); ++i) {
+    const char ch = str[i];
+    if (IsValidXmlCharacter(ch)) cleaned += ch;
+  }
+  return cleaned;
+}
+
+// The following routines generate an XML representation of a UnitTest
+// object.
+//
+// This is how Google Test concepts map to the DTD:
+//
+// <testsuites name="AllTests">        <-- corresponds to a UnitTest object
+//   <testsuite name="testcase-name">  <-- corresponds to a TestCase object
+//     <testcase name="test-name">     <-- corresponds to a TestInfo object
+//       <failure message="...">...</failure>
+//       <failure message="...">...</failure>
+//       <failure message="...">...</failure>
+//                                     <-- individual assertion failures
+//     </testcase>
+//   </testsuite>
+// </testsuites>
+
+// Formats the given time in milliseconds as (possibly fractional) seconds.
+std::string FormatTimeInMillisAsSeconds(TimeInMillis ms) {
+  ::std::stringstream seconds;
+  seconds << (ms / 1000.0);
+  return seconds.str();
+}
+
+// Converts the given epoch time in milliseconds to a date string in the ISO
+// 8601 format (YYYY-MM-DDThh:mm:ss), without the timezone information.
+// Returns "" when localtime() cannot convert the value.
+std::string FormatEpochTimeInMillisAsIso8601(TimeInMillis ms) {
+  const time_t seconds = static_cast<time_t>(ms / 1000);
+#ifdef _MSC_VER
+# pragma warning(push)          // Saves the current warning state.
+# pragma warning(disable:4996)  // (function or variable may be unsafe).
+#endif
+  // Using non-reentrant version as localtime_r is not portable.
+  const struct tm* const local = localtime(&seconds);  // NOLINT
+#ifdef _MSC_VER
+# pragma warning(pop)           // Restores the warning state again.
+#endif
+  if (local == NULL) return "";  // Invalid ms value
+
+  const std::string date_part = StreamableToString(local->tm_year + 1900) +
+      "-" + String::FormatIntWidth2(local->tm_mon + 1) +
+      "-" + String::FormatIntWidth2(local->tm_mday);
+  const std::string time_part = String::FormatIntWidth2(local->tm_hour) +
+      ":" + String::FormatIntWidth2(local->tm_min) +
+      ":" + String::FormatIntWidth2(local->tm_sec);
+
+  return date_part + "T" + time_part;
+}
+
+// Streams data as an XML CDATA section.  Each "]]>" inside data would end
+// the section early, so it is written as "]]>]]&gt;<![CDATA[" instead.
+void XmlUnitTestResultPrinter::OutputXmlCDataSection(::std::ostream* stream,
+                                                     const char* data) {
+  static const char kCDataEnd[] = "]]>";
+  const char* rest = data;
+  *stream << "<![CDATA[";
+
+  const char* hit = strstr(rest, kCDataEnd);
+  while (hit != NULL) {
+    stream->write(rest, static_cast<std::streamsize>(hit - rest));
+    *stream << "]]>]]&gt;<![CDATA[";
+    rest = hit + strlen(kCDataEnd);
+    hit = strstr(rest, kCDataEnd);
+  }
+  // Copy whatever follows the last "]]>" and close the section.
+  *stream << rest;
+  *stream << "]]>";
+}
+
+void XmlUnitTestResultPrinter::OutputXmlAttribute(
+    std::ostream* stream,
+    const std::string& element_name,  // element the attribute is written for
+    const std::string& name,  // must be in element_name's reserved list
+    const std::string& value) {  // XML-escaped before being streamed
+  const std::vector<std::string>& allowed_names =
+      GetReservedAttributesForElement(element_name);
+
+  GTEST_CHECK_(std::find(allowed_names.begin(), allowed_names.end(), name) !=
+                   allowed_names.end())
+      << "Attribute " << name << " is not allowed for element <" << element_name
+      << ">.";
+
+  *stream << " " << name << "=\"" << EscapeXmlAttribute(value) << "\"";
+}
+
+// Streams one <testcase> element for test_info: its attributes, properties
+// and a <failure> child for every failed test part.
+// TODO(wan): There is also value in printing properties with the plain printer.
+void XmlUnitTestResultPrinter::OutputXmlTestInfo(::std::ostream* stream,
+                                                 const char* test_case_name,
+                                                 const TestInfo& test_info) {
+  const std::string kTestcase = "testcase";
+  const TestResult& result = *test_info.result();
+  const char* const value_param = test_info.value_param();
+  const char* const type_param = test_info.type_param();
+
+  *stream << "    <testcase";
+  OutputXmlAttribute(stream, kTestcase, "name", test_info.name());
+
+  if (value_param != NULL)
+    OutputXmlAttribute(stream, kTestcase, "value_param", value_param);
+  if (type_param != NULL)
+    OutputXmlAttribute(stream, kTestcase, "type_param", type_param);
+
+  const char* const status = test_info.should_run() ? "run" : "notrun";
+  OutputXmlAttribute(stream, kTestcase, "status", status);
+  OutputXmlAttribute(stream, kTestcase, "time",
+                     FormatTimeInMillisAsSeconds(result.elapsed_time()));
+  OutputXmlAttribute(stream, kTestcase, "classname", test_case_name);
+  *stream << TestPropertiesAsXmlAttributes(result);
+
+  bool has_failure = false;  // true once the start tag was closed with ">"
+  for (int i = 0; i < result.total_part_count(); ++i) {
+    const TestPartResult& part = result.GetTestPartResult(i);
+    if (!part.failed()) continue;
+
+    // The first failure closes the start tag so children can follow.
+    if (!has_failure) {
+      *stream << ">\n";
+      has_failure = true;
+    }
+    const string location = internal::FormatCompilerIndependentFileLocation(
+        part.file_name(), part.line_number());
+    const string summary = location + "\n" + part.summary();
+    const string detail = location + "\n" + part.message();
+    *stream << "      <failure message=\""
+            << EscapeXmlAttribute(summary.c_str())
+            << "\" type=\"\">";
+    OutputXmlCDataSection(stream, RemoveInvalidXmlCharacters(detail).c_str());
+    *stream << "</failure>\n";
+  }
+
+  // A test without failures is written as a self-closing element.
+  *stream << (has_failure ? "    </testcase>\n" : " />\n");
+}
+
+// Streams a <testsuite> element for test_case: its summary attributes and
+// properties, followed by one <testcase> child per reportable test.
+void XmlUnitTestResultPrinter::PrintXmlTestCase(std::ostream* stream,
+                                                const TestCase& test_case) {
+  const std::string kTestsuite = "testsuite";
+  *stream << "  <" << kTestsuite;
+  OutputXmlAttribute(stream, kTestsuite, "name", test_case.name());
+  OutputXmlAttribute(stream, kTestsuite, "tests",
+                     StreamableToString(test_case.reportable_test_count()));
+  OutputXmlAttribute(stream, kTestsuite, "failures",
+                     StreamableToString(test_case.failed_test_count()));
+  OutputXmlAttribute(stream, kTestsuite, "disabled",
+      StreamableToString(test_case.reportable_disabled_test_count()));
+  OutputXmlAttribute(stream, kTestsuite, "errors", "0");
+  OutputXmlAttribute(stream, kTestsuite, "time",
+                     FormatTimeInMillisAsSeconds(test_case.elapsed_time()));
+  *stream << TestPropertiesAsXmlAttributes(test_case.ad_hoc_test_result());
+  *stream << ">\n";
+  for (int i = 0; i < test_case.total_test_count(); ++i) {
+    const TestInfo& test_info = *test_case.GetTestInfo(i);
+    if (!test_info.is_reportable()) continue;
+    OutputXmlTestInfo(stream, test_case.name(), test_info);
+  }
+  *stream << "  </" << kTestsuite << ">\n";
+}
+
+// Streams the whole XML report: the declaration, then a <testsuites> root
+// element whose children are the test cases with at least one reportable test.
+void XmlUnitTestResultPrinter::PrintXmlUnitTest(std::ostream* stream,
+                                                const UnitTest& unit_test) {
+  const std::string kTestsuites = "testsuites";
+
+  *stream << "<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n"
+          << "<" << kTestsuites;
+
+  OutputXmlAttribute(stream, kTestsuites, "tests",
+                     StreamableToString(unit_test.reportable_test_count()));
+  OutputXmlAttribute(stream, kTestsuites, "failures",
+                     StreamableToString(unit_test.failed_test_count()));
+  OutputXmlAttribute(stream, kTestsuites, "disabled",
+      StreamableToString(unit_test.reportable_disabled_test_count()));
+  OutputXmlAttribute(stream, kTestsuites, "errors", "0");
+  OutputXmlAttribute(stream, kTestsuites, "timestamp",
+      FormatEpochTimeInMillisAsIso8601(unit_test.start_timestamp()));
+  OutputXmlAttribute(stream, kTestsuites, "time",
+                     FormatTimeInMillisAsSeconds(unit_test.elapsed_time()));
+
+  // The random seed is reported only when shuffling is enabled.
+  if (GTEST_FLAG(shuffle)) {
+    OutputXmlAttribute(stream, kTestsuites, "random_seed",
+                       StreamableToString(unit_test.random_seed()));
+  }
+  *stream << TestPropertiesAsXmlAttributes(unit_test.ad_hoc_test_result());
+  OutputXmlAttribute(stream, kTestsuites, "name", "AllTests");
+  *stream << ">\n";
+
+  for (int i = 0; i < unit_test.total_test_case_count(); ++i) {
+    const TestCase& test_case = *unit_test.GetTestCase(i);
+    // Test cases with nothing to report are left out of the file.
+    if (test_case.reportable_test_count() > 0)
+      PrintXmlTestCase(stream, test_case);
+  }
+  *stream << "</" << kTestsuites << ">\n";
+}
+
+// Renders the test properties of result as XML attributes, each in the form
+// ' key="value"' (note the leading space), with the value XML-escaped.
+std::string XmlUnitTestResultPrinter::TestPropertiesAsXmlAttributes(
+    const TestResult& result) {
+  Message xml;
+  for (int index = 0; index < result.test_property_count(); ++index) {
+    const TestProperty& prop = result.GetTestProperty(index);
+    xml << " " << prop.key() << "="
+        << "\"" << EscapeXmlAttribute(prop.value()) << "\"";
+  }
+  return xml.GetString();
+}
+
+// End XmlUnitTestResultPrinter
+
+#if GTEST_CAN_STREAM_RESULTS_
+
+// Returns a copy of str in which every '=', '&', '%' and '\n' is replaced
+// by "%xx", where xx is the character's hexadecimal value (for example,
+// "=" becomes "%3D").  This algorithm is O(strlen(str)) in both time and
+// space -- important as the input str may contain an arbitrarily long test
+// failure message and stack trace.
+string StreamingListener::UrlEncode(const char* str) {
+  string encoded;
+  encoded.reserve(strlen(str) + 1);
+  for (const char* cursor = str; *cursor != '\0'; ++cursor) {
+    const char ch = *cursor;
+    const bool needs_escape =
+        ch == '%' || ch == '=' || ch == '&' || ch == '\n';
+    if (needs_escape) {
+      // Escape as '%' followed by the byte formatted by String::FormatByte().
+      encoded.append("%" + String::FormatByte(static_cast<unsigned char>(ch)));
+    } else {
+      // Every other character is copied through unchanged.
+      encoded.push_back(ch);
+    }
+  }
+
+  return encoded;
+}
+
+// Connects sockfd_ to host_name_:port_num_; failures only log a warning.
+void StreamingListener::SocketWriter::MakeConnection() {
+  GTEST_CHECK_(sockfd_ == -1)
+      << "MakeConnection() can't be called when there is already a connection.";
+
+  addrinfo hints;
+  memset(&hints, 0, sizeof(hints));
+  hints.ai_family = AF_UNSPEC;    // To allow both IPv4 and IPv6 addresses.
+  hints.ai_socktype = SOCK_STREAM;
+  addrinfo* servinfo = NULL;
+
+  // Use getaddrinfo() to get a linked list of IP addresses for the host name.
+  const int error_num = getaddrinfo(
+      host_name_.c_str(), port_num_.c_str(), &hints, &servinfo);
+  if (error_num != 0) {
+    GTEST_LOG_(WARNING) << "stream_result_to: getaddrinfo() failed: "
+                        << gai_strerror(error_num);
+  }
+
+  // Loop through all the results and connect to the first we can.
+  for (addrinfo* cur_addr = servinfo; sockfd_ == -1 && cur_addr != NULL;
+       cur_addr = cur_addr->ai_next) {
+    sockfd_ = socket(
+        cur_addr->ai_family, cur_addr->ai_socktype, cur_addr->ai_protocol);
+    // Connect the client socket to the server; close it again on failure.
+    if (sockfd_ != -1 &&
+        connect(sockfd_, cur_addr->ai_addr, cur_addr->ai_addrlen) == -1) {
+      close(sockfd_);
+      sockfd_ = -1;
+    }
+  }
+
+  // POSIX leaves freeaddrinfo(NULL) undefined; servinfo may still be NULL.
+  if (servinfo != NULL) freeaddrinfo(servinfo);
+
+  if (sockfd_ == -1) {
+    GTEST_LOG_(WARNING) << "stream_result_to: failed to connect to "
+                        << host_name_ << ":" << port_num_;
+  }
+}
+
+// End of class StreamingListener
+#endif  // GTEST_CAN_STREAM_RESULTS_
+
+// Class ScopedTrace
+
+// Records file, line and message in a TraceInfo and pushes it onto the
+// per-thread trace stack maintained by Google Test.
+ScopedTrace::ScopedTrace(const char* file, int line, const Message& message)
+    GTEST_LOCK_EXCLUDED_(&UnitTest::mutex_) {
+  UnitTest* const unit_test = UnitTest::GetInstance();
+  TraceInfo entry;
+  entry.message = message.GetString();
+  entry.line = line;
+  entry.file = file;
+  unit_test->PushGTestTrace(entry);
+}
+
+// Pops the trace info pushed by the constructor off UnitTest's trace stack.
+ScopedTrace::~ScopedTrace()
+    GTEST_LOCK_EXCLUDED_(&UnitTest::mutex_) {
+  UnitTest::GetInstance()->PopGTestTrace();
+}
+
+
+// class OsStackTraceGetter
+
+// Returns the current OS stack trace as an std::string.  This implementation
+// ignores both parameters and always returns an empty string.  Parameters:
+//
+//   max_depth  - the maximum number of stack frames to be included
+//                in the trace.
+//   skip_count - the number of top frames to be skipped; doesn't count
+//                against max_depth.
+string OsStackTraceGetter::CurrentStackTrace(int /* max_depth */,
+                                             int /* skip_count */)
+    GTEST_LOCK_EXCLUDED_(mutex_) {
+  return "";
+}
+
+void OsStackTraceGetter::UponLeavingGTest()
+    GTEST_LOCK_EXCLUDED_(mutex_) {  // empty body: nothing is done here
+}
+
+const char* const
+OsStackTraceGetter::kElidedFramesMarker =
+    "... " GTEST_NAME_ " internal frames ...";  // adjacent literals are joined
+
+// A helper class that creates the premature-exit file in its
+// constructor and deletes the file in its destructor.
+class ScopedPrematureExitFile {
+ public:
+  explicit ScopedPrematureExitFile(const char* premature_exit_filepath)
+      : premature_exit_filepath_(premature_exit_filepath) {
+    // If a path to the premature-exit file is specified...
+    if (premature_exit_filepath != NULL && *premature_exit_filepath != '\0') {
+      // create it holding a single "0".  I/O errors (even a failed open) are
+      // ignored: we don't want to fail the test because of this.
+      FILE* pfile = posix::FOpen(premature_exit_filepath, "w");
+      if (pfile == NULL) return;  // never hand NULL to fwrite()/fclose()
+      fwrite("0", 1, 1, pfile);
+      fclose(pfile);
+    }
+  }
+
+  ~ScopedPrematureExitFile() {
+    if (premature_exit_filepath_ != NULL && *premature_exit_filepath_ != '\0') {
+      remove(premature_exit_filepath_);
+    }
+  }
+
+ private:
+  const char* const premature_exit_filepath_;  // not copied; must stay valid
+
+  GTEST_DISALLOW_COPY_AND_ASSIGN_(ScopedPrematureExitFile);
+};
+
+}  // namespace internal
+
+// class TestEventListeners
+
+TestEventListeners::TestEventListeners()
+    : repeater_(new internal::TestEventRepeater()),  // freed by the destructor
+      default_result_printer_(NULL),  // no default printer installed yet
+      default_xml_generator_(NULL) {  // no default XML generator yet
+}
+
+TestEventListeners::~TestEventListeners() { delete repeater_; }  // owns it
+
+// Appends the given event listener to the end of the repeater's list, so it
+// receives the events the repeater forwards.  The repeater's destructor
+// cleans up the listeners it still holds (presumably deleting them), so a
+// listener should be taken back with Release() if the caller wants to keep it.
+void TestEventListeners::Append(TestEventListener* listener) {
+  repeater_->Append(listener);
+}
+
+// Detaches the given event listener from the list and hands it back; from
+// then on the caller is responsible for deleting it.  Returns NULL if the
+// listener is not found in the list.
+TestEventListener* TestEventListeners::Release(TestEventListener* listener) {
+  // If the listener is one of the two defaults, forget it so that the
+  // corresponding member no longer refers to a released object.
+  if (default_result_printer_ == listener) {
+    default_result_printer_ = NULL;
+  } else if (default_xml_generator_ == listener) {
+    default_xml_generator_ = NULL;
+  }
+  return repeater_->Release(listener);
+}
+
+// Gives access to the repeater, i.e. the listener that broadcasts the
+// TestEventListener events to all subscribers.
+TestEventListener* TestEventListeners::repeater() {
+  return repeater_;
+}
+
+// Installs the provided listener as the default_result_printer.  The
+// previous default result printer is released from the listener list and
+// deleted; the new one is appended to the list unless it is NULL.  Passing
+// the listener that is already the default is a no-op.
+void TestEventListeners::SetDefaultResultPrinter(TestEventListener* listener) {
+  if (listener == default_result_printer_) {
+    return;  // Already the default; nothing to replace.
+  }
+
+  // It is an error to pass this method a listener that is already in the
+  // list.
+  TestEventListener* const previous = Release(default_result_printer_);
+  delete previous;
+
+  default_result_printer_ = listener;
+  if (listener != NULL) {
+    Append(listener);
+  }
+}
+
+// Installs the provided listener as the default_xml_generator.  The
+// previous default XML generator is released from the listener list and
+// deleted; the new one is appended to the list unless it is NULL.  Passing
+// the listener that is already the default is a no-op.
+void TestEventListeners::SetDefaultXmlGenerator(TestEventListener* listener) {
+  if (listener == default_xml_generator_) {
+    return;  // Already the default; nothing to replace.
+  }
+
+  // It is an error to pass this method a listener that is already in the
+  // list.
+  TestEventListener* const previous = Release(default_xml_generator_);
+  delete previous;
+
+  default_xml_generator_ = listener;
+  if (listener != NULL) {
+    Append(listener);
+  }
+}
+
+// Reports whether the repeater currently forwards events to the
+// listeners in the list.
+bool TestEventListeners::EventForwardingEnabled() const {
+  const bool enabled = repeater_->forwarding_enabled();
+  return enabled;
+}
+
+// Turns event forwarding off on the repeater.
+void TestEventListeners::SuppressEventForwarding() {
+  const bool forward_events = false;
+  repeater_->set_forwarding_enabled(forward_events);
+}
+
+// class UnitTest
+
+// Gets the singleton UnitTest object.  The first time this method is
+// called, a UnitTest object is constructed and returned.  Consecutive
+// calls will return the same object.
+//
+// We don't protect this under mutex_ as a user is not supposed to
+// call this before main() starts, from which point on the return
+// value will never change.
+UnitTest* UnitTest::GetInstance() {
+  // When compiled with MSVC 7.1 in optimized mode, destroying the
+  // UnitTest object upon exiting the program messes up the exit code,
+  // causing successful tests to appear failed.  We have to use a
+  // different implementation in this case to bypass the compiler bug.
+  // This implementation makes the compiler happy, at the cost of
+  // leaking the UnitTest object.
+
+  // CodeGear C++Builder insists on a public destructor for the
+  // default implementation.  Use this implementation to keep good OO
+  // design with private destructor.
+
+#if (_MSC_VER == 1310 && !defined(_DEBUG)) || defined(__BORLANDC__)
+  // Workaround path: heap-allocate once and never delete.
+  static UnitTest* const instance = new UnitTest;
+  return instance;
+#else
+  // Default path: a function-local static object.
+  static UnitTest instance;
+  return &instance;
+#endif  // (_MSC_VER == 1310 && !defined(_DEBUG)) || defined(__BORLANDC__)
+}
+
+// Number of successful test cases, as reported by the implementation
+// object.
+int UnitTest::successful_test_case_count() const {
+  const int count = impl()->successful_test_case_count();
+  return count;
+}
+
+// Number of failed test cases, as reported by the implementation object.
+int UnitTest::failed_test_case_count() const {
+  const int count = impl()->failed_test_case_count();
+  return count;
+}
+
+// Number of all test cases, as reported by the implementation object.
+int UnitTest::total_test_case_count() const {
+  const int count = impl()->total_test_case_count();
+  return count;
+}
+
+// Number of test cases that contain at least one test that should run,
+// as reported by the implementation object.
+int UnitTest::test_case_to_run_count() const {
+  const int count = impl()->test_case_to_run_count();
+  return count;
+}
+
+// Number of successful tests, as reported by the implementation object.
+int UnitTest::successful_test_count() const {
+  const int count = impl()->successful_test_count();
+  return count;
+}
+
+// Number of failed tests, as reported by the implementation object.
+int UnitTest::failed_test_count() const {
+  return impl()->failed_test_count();
+}
+
+// Number of disabled tests that will be reported in the XML report, as
+// reported by the implementation object.
+int UnitTest::reportable_disabled_test_count() const {
+  const int count = impl()->reportable_disabled_test_count();
+  return count;
+}
+
+// Number of disabled tests, as reported by the implementation object.
+int UnitTest::disabled_test_count() const {
+  const int count = impl()->disabled_test_count();
+  return count;
+}
+
+// Number of tests to be printed in the XML report, as reported by the
+// implementation object.
+int UnitTest::reportable_test_count() const {
+  const int count = impl()->reportable_test_count();
+  return count;
+}
+
+// Number of all tests, as reported by the implementation object.
+int UnitTest::total_test_count() const {
+  return impl()->total_test_count();
+}
+
+// Number of tests that should run, as reported by the implementation
+// object.
+int UnitTest::test_to_run_count() const {
+  return impl()->test_to_run_count();
+}
+
+// Time at which the test program started, in milliseconds since the start
+// of the UNIX epoch.
+internal::TimeInMillis UnitTest::start_timestamp() const {
+  const internal::TimeInMillis started_at = impl()->start_timestamp();
+  return started_at;
+}
+
+// Elapsed time in milliseconds, as recorded by the implementation object.
+internal::TimeInMillis UnitTest::elapsed_time() const {
+  const internal::TimeInMillis elapsed = impl()->elapsed_time();
+  return elapsed;
+}
+
+// True iff the unit test passed (i.e. all test cases passed).
+bool UnitTest::Passed() const {
+  return impl()->Passed();
+}
+
+// True iff the unit test failed (i.e. some test case failed or something
+// outside of all tests failed).
+bool UnitTest::Failed() const {
+  return impl()->Failed();
+}
+
+// Looks up the i-th test case among all the test cases.  Valid indices
+// run from 0 to total_test_case_count() - 1; for any other i the result
+// is NULL.
+const TestCase* UnitTest::GetTestCase(int i) const {
+  const TestCase* const test_case = impl()->GetTestCase(i);
+  return test_case;
+}
+
+// Gives read access to the TestResult that holds test failures and
+// properties logged outside of individual test cases.
+const TestResult& UnitTest::ad_hoc_test_result() const {
+  const TestResult* const result = impl()->ad_hoc_test_result();
+  return *result;
+}
+
+// Mutable counterpart of GetTestCase(): looks up the i-th test case among
+// all the test cases.  Valid indices run from 0 to
+// total_test_case_count() - 1; for any other i the result is NULL.
+TestCase* UnitTest::GetMutableTestCase(int i) {
+  TestCase* const test_case = impl()->GetMutableTestCase(i);
+  return test_case;
+}
+
+// Gives access to the list of event listeners that can be used to track
+// events inside Google Test.
+TestEventListeners& UnitTest::listeners() { return *impl()->listeners(); }
+
+// Registers a global test environment and returns it.  A NULL argument is
+// not registered and NULL is returned.
+//
+// When a test program is run, all global test environments will be set-up
+// in the order they were registered.  After all tests in the program have
+// finished, all global test environments will be torn-down in the
+// *reverse* order they were registered.
+//
+// The UnitTest object takes ownership of the given environment.
+//
+// We don't protect this under mutex_, as we only support calling it
+// from the main thread.
+Environment* UnitTest::AddEnvironment(Environment* env) {
+  if (env != NULL) {
+    impl_->environments().push_back(env);
+  }
+  return env;
+}
+
+// Adds a TestPartResult to the current TestResult object.  All Google Test
+// assertion macros (e.g. ASSERT_TRUE, EXPECT_EQ, etc) eventually call
+// this to report their results.  The user code should use the
+// assertion macros instead of calling this directly.
+//
+// The reported message is built from, in order: the given message, one
+// line per entry of the trace stack (most recently pushed entry first),
+// and, when non-empty, the OS stack trace preceded by kStackTraceMarker.
+//
+// For any result other than kSuccess the break_on_failure flag (checked
+// first) or the throw_on_failure flag may stop the program, see below.
+void UnitTest::AddTestPartResult(
+    TestPartResult::Type result_type,
+    const char* file_name,
+    int line_number,
+    const std::string& message,
+    const std::string& os_stack_trace) GTEST_LOCK_EXCLUDED_(mutex_) {
+  Message msg;
+  msg << message;
+
+  // mutex_ is held through this scoped lock object from here on.
+  internal::MutexLock lock(&mutex_);
+  if (impl_->gtest_trace_stack().size() > 0) {
+    msg << "\n" << GTEST_NAME_ << " trace:";
+
+    // Walk the stack from its last element down to its first.
+    for (int i = static_cast<int>(impl_->gtest_trace_stack().size());
+         i > 0; --i) {
+      const internal::TraceInfo& trace = impl_->gtest_trace_stack()[i - 1];
+      msg << "\n" << internal::FormatFileLocation(trace.file, trace.line)
+          << " " << trace.message;
+    }
+  }
+
+  // std::string::c_str() never returns NULL, so emptiness is the only
+  // condition that needs checking here.
+  if (!os_stack_trace.empty()) {
+    msg << internal::kStackTraceMarker << os_stack_trace;
+  }
+
+  const TestPartResult result =
+    TestPartResult(result_type, file_name, line_number,
+                   msg.GetString().c_str());
+  impl_->GetTestPartResultReporterForCurrentThread()->
+      ReportTestPartResult(result);
+
+  if (result_type != TestPartResult::kSuccess) {
+    // gtest_break_on_failure takes precedence over
+    // gtest_throw_on_failure.  This allows a user to set the latter
+    // in the code (perhaps in order to use Google Test assertions
+    // with another testing framework) and specify the former on the
+    // command line for debugging.
+    if (GTEST_FLAG(break_on_failure)) {
+#if GTEST_OS_WINDOWS
+      // Using DebugBreak on Windows allows gtest to still break into a debugger
+      // when a failure happens and both the --gtest_break_on_failure and
+      // the --gtest_catch_exceptions flags are specified.
+      DebugBreak();
+#else
+      // Dereference NULL through a volatile pointer to prevent the compiler
+      // from removing. We use this rather than abort() or __builtin_trap() for
+      // portability: Symbian doesn't implement abort() well, and some debuggers
+      // don't correctly trap abort().
+      *static_cast<volatile int*>(NULL) = 1;
+#endif  // GTEST_OS_WINDOWS
+    } else if (GTEST_FLAG(throw_on_failure)) {
+#if GTEST_HAS_EXCEPTIONS
+      throw internal::GoogleTestFailureException(result);
+#else
+      // We cannot call abort() as it generates a pop-up in debug mode
+      // that cannot be suppressed in VC 7.1 or below.
+      exit(1);
+#endif
+    }
+  }
+}
+
+// Wraps key and value in a TestProperty and passes it to the
+// implementation object, which adds it to the current TestResult object
+// when invoked from inside a test, to the current TestCase's
+// ad_hoc_test_result_ when invoked from SetUpTestCase or TearDownTestCase,
+// or to the global property set when invoked elsewhere.  If the result
+// already contains a property with the same key, the value will be updated.
+void UnitTest::RecordProperty(const std::string& key,
+                              const std::string& value) {
+  const TestProperty property(key, value);
+  impl_->RecordProperty(property);
+}
+
+// Runs all tests in this UnitTest object and prints the result.
+// Returns 0 if successful, or 1 otherwise.
+//
+// Besides running the tests, this function creates and removes the
+// premature-exit file (see below), latches the catch_exceptions flag into
+// the implementation object, and, where GTEST_HAS_SEH is set, adjusts the
+// process error/abort reporting modes.
+//
+// We don't protect this under mutex_, as we only support calling it
+// from the main thread.
+int UnitTest::Run() {
+  // A non-empty internal_run_death_test flag marks this process as the
+  // child process of a death test.
+  const bool in_death_test_child_process =
+      internal::GTEST_FLAG(internal_run_death_test).length() > 0;
+
+  // Google Test implements this protocol for catching that a test
+  // program exits before returning control to Google Test:
+  //
+  //   1. Upon start, Google Test creates a file whose absolute path
+  //      is specified by the environment variable
+  //      TEST_PREMATURE_EXIT_FILE.
+  //   2. When Google Test has finished its work, it deletes the file.
+  //
+  // This allows a test runner to set TEST_PREMATURE_EXIT_FILE before
+  // running a Google-Test-based test program and check the existence
+  // of the file at the end of the test execution to see if it has
+  // exited prematurely.
+
+  // If we are in the child process of a death test, don't
+  // create/delete the premature exit file, as doing so is unnecessary
+  // and will confuse the parent process.  Otherwise, create/delete
+  // the file upon entering/leaving this function.  If the program
+  // somehow exits before this function has a chance to return, the
+  // premature-exit file will be left undeleted, causing a test runner
+  // that understands the premature-exit-file protocol to report the
+  // test as having failed.
+  const internal::ScopedPrematureExitFile premature_exit_file(
+      in_death_test_child_process ?
+      NULL : internal::posix::GetEnv("TEST_PREMATURE_EXIT_FILE"));
+
+  // Captures the value of GTEST_FLAG(catch_exceptions).  This value will be
+  // used for the duration of the program.
+  impl()->set_catch_exceptions(GTEST_FLAG(catch_exceptions));
+
+#if GTEST_HAS_SEH
+  // Either the user wants Google Test to catch exceptions thrown by the
+  // tests or this is executing in the context of death test child
+  // process. In either case the user does not want to see pop-up dialogs
+  // about crashes - they are expected.
+  if (impl()->catch_exceptions() || in_death_test_child_process) {
+# if !GTEST_OS_WINDOWS_MOBILE
+    // SetErrorMode doesn't exist on CE.
+    SetErrorMode(SEM_FAILCRITICALERRORS | SEM_NOALIGNMENTFAULTEXCEPT |
+                 SEM_NOGPFAULTERRORBOX | SEM_NOOPENFILEERRORBOX);
+# endif  // !GTEST_OS_WINDOWS_MOBILE
+
+# if (defined(_MSC_VER) || GTEST_OS_WINDOWS_MINGW) && !GTEST_OS_WINDOWS_MOBILE
+    // Death test children can be terminated with _abort().  On Windows,
+    // _abort() can show a dialog with a warning message.  This forces the
+    // abort message to go to stderr instead.
+    _set_error_mode(_OUT_TO_STDERR);
+# endif
+
+# if _MSC_VER >= 1400 && !GTEST_OS_WINDOWS_MOBILE
+    // In the debug version, Visual Studio pops up a separate dialog
+    // offering a choice to debug the aborted program. We need to suppress
+    // this dialog or it will pop up for every EXPECT/ASSERT_DEATH statement
+    // executed. Google Test will notify the user of any unexpected
+    // failure via stderr.
+    //
+    // VC++ doesn't define _set_abort_behavior() prior to the version 8.0.
+    // Users of prior VC versions shall suffer the agony and pain of
+    // clicking through the countless debug dialogs.
+    // TODO(vladl@google.com): find a way to suppress the abort dialog in
+    // the debug mode when compiled with VC 7.1 or lower.
+    if (!GTEST_FLAG(break_on_failure))
+      _set_abort_behavior(
+          0x0,                                    // Clear the following flags:
+          _WRITE_ABORT_MSG | _CALL_REPORTFAULT);  // pop-up window, core dump.
+# endif
+  }
+#endif  // GTEST_HAS_SEH
+
+  // RunAllTests is invoked through the exception-handling wrapper; a true
+  // result from the wrapper maps to exit code 0, a false result to 1.
+  return internal::HandleExceptionsInMethodIfSupported(
+      impl(),
+      &internal::UnitTestImpl::RunAllTests,
+      "auxiliary test code (environments or event listeners)") ? 0 : 1;
+}
+
+// The working directory at the time the first TEST() or TEST_F() was
+// executed, as a C string owned by the implementation object.
+const char* UnitTest::original_working_dir() const {
+  const char* const dir = impl_->original_working_dir_.c_str();
+  return dir;
+}
+
+// The TestCase object for the test that's currently running, or NULL if
+// no test is running.  The value is read while holding mutex_.
+const TestCase* UnitTest::current_test_case() const
+    GTEST_LOCK_EXCLUDED_(mutex_) {
+  internal::MutexLock lock(&mutex_);
+  const TestCase* const running_case = impl_->current_test_case();
+  return running_case;
+}
+
+// The TestInfo object for the test that's currently running, or NULL if
+// no test is running.  The value is read while holding mutex_.
+const TestInfo* UnitTest::current_test_info() const
+    GTEST_LOCK_EXCLUDED_(mutex_) {
+  internal::MutexLock lock(&mutex_);
+  const TestInfo* const running_test = impl_->current_test_info();
+  return running_test;
+}
+
+// The random seed used at the start of the current test run, as stored
+// by the implementation object.
+int UnitTest::random_seed() const {
+  return impl_->random_seed();
+}
+
+#if GTEST_HAS_PARAM_TEST
+// Gives access to the ParameterizedTestCaseRegistry object that keeps
+// track of value-parameterized tests and instantiates and registers them.
+// The registry itself lives in the implementation object.
+internal::ParameterizedTestCaseRegistry& UnitTest::parameterized_test_registry()
+    GTEST_LOCK_EXCLUDED_(mutex_) {
+  return impl_->parameterized_test_registry();
+}
+#endif  // GTEST_HAS_PARAM_TEST
+
+// Creates an empty UnitTest.  The implementation object is allocated
+// here, is given a pointer back to this UnitTest, and is released by the
+// destructor.
+UnitTest::UnitTest() {
+  impl_ = new internal::UnitTestImpl(this);
+}
+
+// Destroys the implementation object held in impl_.
+UnitTest::~UnitTest() { delete impl_; }
+
+// Appends a trace defined by SCOPED_TRACE() to the per-thread Google Test
+// trace stack.  The stack is modified while holding mutex_.
+void UnitTest::PushGTestTrace(const internal::TraceInfo& trace)
+    GTEST_LOCK_EXCLUDED_(mutex_) {
+  internal::MutexLock lock(&mutex_);
+  impl_->gtest_trace_stack().push_back(trace);
+}
+
+// Removes the most recently pushed trace from the per-thread Google Test
+// trace stack.  The stack is modified while holding mutex_.
+void UnitTest::PopGTestTrace() GTEST_LOCK_EXCLUDED_(mutex_) {
+  internal::MutexLock lock(&mutex_);
+  impl_->gtest_trace_stack().pop_back();
+}
+
+namespace internal {
+
+// Constructs the implementation object for the given parent UnitTest.
+// All members start in a neutral state (NULL pointers, zero counters and
+// timestamps, flags cleared); the values marked below are overridden from
+// the command-line flags before first use.  The pretty result printer is
+// installed as the default result printer.
+UnitTestImpl::UnitTestImpl(UnitTest* parent)
+    : parent_(parent),
+#ifdef _MSC_VER
+# pragma warning(push)                    // Saves the current warning state.
+# pragma warning(disable:4355)            // Temporarily disables warning 4355
+                                         // (using this in initializer).
+      default_global_test_part_result_reporter_(this),
+      default_per_thread_test_part_result_reporter_(this),
+# pragma warning(pop)                     // Restores the warning state again.
+#else
+      default_global_test_part_result_reporter_(this),
+      default_per_thread_test_part_result_reporter_(this),
+#endif  // _MSC_VER
+      // The active reporters initially point at the two default reporters
+      // constructed just above.
+      global_test_part_result_repoter_(
+          &default_global_test_part_result_reporter_),
+      per_thread_test_part_result_reporter_(
+          &default_per_thread_test_part_result_reporter_),
+#if GTEST_HAS_PARAM_TEST
+      parameterized_test_registry_(),
+      parameterized_tests_registered_(false),
+#endif  // GTEST_HAS_PARAM_TEST
+      // -1: no death test case has been registered yet (GetTestCase()
+      // increments this before using it as an insertion index).
+      last_death_test_case_(-1),
+      current_test_case_(NULL),
+      current_test_info_(NULL),
+      ad_hoc_test_result_(),
+      os_stack_trace_getter_(NULL),
+      post_flag_parse_init_performed_(false),
+      random_seed_(0),  // Will be overridden by the flag before first use.
+      random_(0),  // Will be reseeded before first use.
+      start_timestamp_(0),
+      elapsed_time_(0),
+#if GTEST_HAS_DEATH_TEST
+      death_test_factory_(new DefaultDeathTestFactory),
+#endif
+      // Will be overridden by the flag before first use.
+      catch_exceptions_(false) {
+  listeners()->SetDefaultResultPrinter(new PrettyUnitTestResultPrinter);
+}
+
+// Releases the objects held through raw pointers: every registered
+// TestCase, every registered Environment, and the stack trace getter.
+UnitTestImpl::~UnitTestImpl() {
+  // Deletes every TestCase.
+  ForEach(test_cases_, internal::Delete<TestCase>);
+
+  // Deletes every Environment.
+  ForEach(environments_, internal::Delete<Environment>);
+
+  // May still be NULL (its initial value); deleting NULL is a no-op.
+  delete os_stack_trace_getter_;
+}
+
+// Records test_property on the TestResult that matches the current
+// context, together with the name of the XML element it belongs to:
+//   - a test is running:      that test's result, element "testcase";
+//   - a test case is current: its ad_hoc_test_result_ (e.g. when invoked
+//                             from SetUpTestCase/TearDownTestCase),
+//                             element "testsuite";
+//   - otherwise:              the global ad hoc result, element
+//                             "testsuites".
+// If the result already contains a property with the same key, the value
+// will be updated.
+void UnitTestImpl::RecordProperty(const TestProperty& test_property) {
+  // Start from the global fallback and narrow it down if possible.
+  TestResult* target = &ad_hoc_test_result_;
+  std::string element_name = "testsuites";
+
+  if (current_test_info_ != NULL) {
+    target = &(current_test_info_->result_);
+    element_name = "testcase";
+  } else if (current_test_case_ != NULL) {
+    target = &(current_test_case_->ad_hoc_test_result_);
+    element_name = "testsuite";
+  }
+
+  target->RecordProperty(element_name, test_property);
+}
+
+#if GTEST_HAS_DEATH_TEST
+// Turns off event forwarding when control is currently in a death test
+// subprocess, which is detected by internal_run_death_test_flag_ being
+// set.  Must not be called before InitGoogleTest.
+void UnitTestImpl::SuppressTestEventsIfInSubprocess() {
+  const bool in_death_test_subprocess =
+      internal_run_death_test_flag_.get() != NULL;
+  if (in_death_test_subprocess) {
+    listeners()->SuppressEventForwarding();
+  }
+}
+#endif  // GTEST_HAS_DEATH_TEST
+
+// Sets up the event listener that performs XML output, as specified by
+// UnitTestOptions.  An output format of "xml" installs an XML printer as
+// the default XML generator; an empty format does nothing; any other
+// format only produces a warning on stdout.  Must not be called before
+// InitGoogleTest.
+void UnitTestImpl::ConfigureXmlOutput() {
+  const std::string& format = UnitTestOptions::GetOutputFormat();
+  if (format == "xml") {
+    XmlUnitTestResultPrinter* const xml_printer =
+        new XmlUnitTestResultPrinter(
+            UnitTestOptions::GetAbsolutePathToOutputFile().c_str());
+    listeners()->SetDefaultXmlGenerator(xml_printer);
+  } else if (!format.empty()) {
+    printf("WARNING: unrecognized output format \"%s\" ignored.\n",
+           format.c_str());
+    fflush(stdout);
+  }
+}
+
+#if GTEST_CAN_STREAM_RESULTS_
+// Sets up the event listener that streams test results in string form.
+// The stream_result_to flag is split at its first ':'; the text before
+// and after the colon is handed to a new StreamingListener (presumably as
+// host and port - confirm against StreamingListener).  An empty flag does
+// nothing; a flag without ':' only produces a warning on stdout.
+// Must not be called before InitGoogleTest.
+void UnitTestImpl::ConfigureStreamingOutput() {
+  const std::string& target = GTEST_FLAG(stream_result_to);
+  if (target.empty()) {
+    return;
+  }
+
+  const size_t colon = target.find(':');
+  if (colon == std::string::npos) {
+    printf("WARNING: unrecognized streaming target \"%s\" ignored.\n",
+           target.c_str());
+    fflush(stdout);
+    return;
+  }
+
+  listeners()->Append(new StreamingListener(target.substr(0, colon),
+                                            target.substr(colon + 1)));
+}
+#endif  // GTEST_CAN_STREAM_RESULTS_
+
+// Performs the initialization that depends on flag values obtained in
+// ParseGoogleTestFlagsOnly.  It is called from InitGoogleTest after the
+// call to ParseGoogleTestFlagsOnly and, in case a user neglects to call
+// InitGoogleTest, again from RunAllTests.  Because it can therefore be
+// called more than once, only the first call does any work.
+void UnitTestImpl::PostFlagParsingInit() {
+  // Idempotence guard: every call after the first returns immediately.
+  if (post_flag_parse_init_performed_) {
+    return;
+  }
+  post_flag_parse_init_performed_ = true;
+
+#if GTEST_HAS_DEATH_TEST
+  InitDeathTestSubprocessControlInfo();
+  SuppressTestEventsIfInSubprocess();
+#endif  // GTEST_HAS_DEATH_TEST
+
+  // Registers parameterized tests. This makes parameterized tests
+  // available to the UnitTest reflection API without running
+  // RUN_ALL_TESTS.
+  RegisterParameterizedTests();
+
+  // Configures listeners for XML output. This makes it possible for users
+  // to shut down the default XML output before invoking RUN_ALL_TESTS.
+  ConfigureXmlOutput();
+
+#if GTEST_CAN_STREAM_RESULTS_
+  // Configures listeners for streaming test results to the specified server.
+  ConfigureStreamingOutput();
+#endif  // GTEST_CAN_STREAM_RESULTS_
+}
+
+// A predicate that checks the name of a TestCase against a known
+// value.
+//
+// This is used for implementation of the UnitTest class only.  It is
+// defined inside namespace internal to keep it out of the outer
+// namespace.
+//
+// TestCaseNameIs is copyable.
+class TestCaseNameIs {
+ public:
+  // Remembers a copy of the name to compare against.
+  explicit TestCaseNameIs(const std::string& name) : name_(name) {}
+
+  // Returns true iff test_case is non-NULL and its name matches name_
+  // exactly.
+  bool operator()(const TestCase* test_case) const {
+    if (test_case == NULL) {
+      return false;
+    }
+    return strcmp(test_case->name(), name_.c_str()) == 0;
+  }
+
+ private:
+  std::string name_;
+};
+
+// Looks up the TestCase with the given name, creating and registering a
+// new one if none exists yet, and returns it.  It's the CALLER'S
+// RESPONSIBILITY to ensure that this function is only called WHEN THE
+// TESTS ARE NOT SHUFFLED.
+//
+// Arguments:
+//
+//   test_case_name: name of the test case
+//   type_param:     the name of the test case's type parameter, or NULL if
+//                   this is not a typed or a type-parameterized test case.
+//   set_up_tc:      pointer to the function that sets up the test case
+//   tear_down_tc:   pointer to the function that tears down the test case
+TestCase* UnitTestImpl::GetTestCase(const char* test_case_name,
+                                    const char* type_param,
+                                    Test::SetUpTestCaseFunc set_up_tc,
+                                    Test::TearDownTestCaseFunc tear_down_tc) {
+  // Reuse an already registered test case of that name, if any.
+  const std::vector<TestCase*>::const_iterator existing =
+      std::find_if(test_cases_.begin(), test_cases_.end(),
+                   TestCaseNameIs(test_case_name));
+  if (existing != test_cases_.end()) {
+    return *existing;
+  }
+
+  // None found, so create one.
+  TestCase* const created =
+      new TestCase(test_case_name, type_param, set_up_tc, tear_down_tc);
+
+  const bool is_death_test_case = internal::UnitTestOptions::MatchesFilter(
+      test_case_name, kDeathTestCaseFilter);
+  if (!is_death_test_case) {
+    // Ordinary test cases go to the end of the list.
+    test_cases_.push_back(created);
+  } else {
+    // Death test cases are inserted right after the last death test case
+    // defined so far.  This only works when the test cases haven't
+    // been shuffled.  Otherwise we may end up running a death test
+    // after a non-death test.
+    ++last_death_test_case_;
+    test_cases_.insert(test_cases_.begin() + last_death_test_case_, created);
+  }
+
+  // The new index entry is the identity mapping for the added slot.
+  test_case_indices_.push_back(static_cast<int>(test_case_indices_.size()));
+  return created;
+}
+
+// Helpers for setting up / tearing down the given environment.  They
+// are for use in the ForEach() function.
+static void SetUpEnvironment(Environment* env) {
+  env->SetUp();
+}
+static void TearDownEnvironment(Environment* env) {
+  env->TearDown();
+}
+
+// Runs all tests in this UnitTest object, prints the result, and
+// returns true if all tests are successful.  If any exception is
+// thrown during a test, the test is considered to be failed, but the
+// rest of the tests will still be run.
+//
+// Returns false without running anything if InitGoogleTest() was never
+// called, and true without running anything if --help or
+// --gtest_list_tests was specified.
+//
+// When parameterized tests are enabled, it expands and registers
+// parameterized tests first in RegisterParameterizedTests().
+// All other functions called from RunAllTests() may safely assume that
+// parameterized tests are ready to be counted and run.
+bool UnitTestImpl::RunAllTests() {
+  // Makes sure InitGoogleTest() was called.
+  if (!GTestIsInitialized()) {
+    printf("%s",
+           "\nThis test program did NOT call ::testing::InitGoogleTest "
+           "before calling RUN_ALL_TESTS().  Please fix it.\n");
+    return false;
+  }
+
+  // Do not run any test if the --help flag was specified.
+  if (g_help_flag)
+    return true;
+
+  // Repeats the call to the post-flag parsing initialization in case the
+  // user didn't call InitGoogleTest.
+  PostFlagParsingInit();
+
+  // Even if sharding is not on, test runners may want to use the
+  // GTEST_SHARD_STATUS_FILE to query whether the test supports the sharding
+  // protocol.
+  internal::WriteToShardStatusFileIfNeeded();
+
+  // True iff we are in a subprocess for running a thread-safe-style
+  // death test.
+  bool in_subprocess_for_death_test = false;
+
+#if GTEST_HAS_DEATH_TEST
+  in_subprocess_for_death_test = (internal_run_death_test_flag_.get() != NULL);
+#endif  // GTEST_HAS_DEATH_TEST
+
+  const bool should_shard = ShouldShard(kTestTotalShards, kTestShardIndex,
+                                        in_subprocess_for_death_test);
+
+  // Compares the full test names with the filter to decide which
+  // tests to run.
+  const bool has_tests_to_run = FilterTests(should_shard
+                                              ? HONOR_SHARDING_PROTOCOL
+                                              : IGNORE_SHARDING_PROTOCOL) > 0;
+
+  // Lists the tests and exits if the --gtest_list_tests flag was specified.
+  if (GTEST_FLAG(list_tests)) {
+    // This must be called *after* FilterTests() has been called.
+    ListTestsMatchingFilter();
+    return true;
+  }
+
+  // The seed is taken from the random_seed flag only when shuffling is
+  // requested; otherwise it is 0.
+  random_seed_ = GTEST_FLAG(shuffle) ?
+      GetRandomSeedFromFlag(GTEST_FLAG(random_seed)) : 0;
+
+  // True iff at least one test has failed.
+  bool failed = false;
+
+  TestEventListener* repeater = listeners()->repeater();
+
+  start_timestamp_ = GetTimeInMillis();
+  repeater->OnTestProgramStart(*parent_);
+
+  // How many times to repeat the tests?  We don't want to repeat them
+  // when we are inside the subprocess of a death test.
+  const int repeat = in_subprocess_for_death_test ? 1 : GTEST_FLAG(repeat);
+  // Repeats forever if the repeat count is negative.
+  const bool forever = repeat < 0;
+  for (int i = 0; forever || i != repeat; i++) {
+    // We want to preserve failures generated by ad-hoc test
+    // assertions executed before RUN_ALL_TESTS().
+    ClearNonAdHocTestResult();
+
+    const TimeInMillis start = GetTimeInMillis();
+
+    // Shuffles test cases and tests if requested.
+    if (has_tests_to_run && GTEST_FLAG(shuffle)) {
+      random()->Reseed(random_seed_);
+      // This should be done before calling OnTestIterationStart(),
+      // such that a test event listener can see the actual test order
+      // in the event.
+      ShuffleTests();
+    }
+
+    // Tells the unit test event listeners that the tests are about to start.
+    repeater->OnTestIterationStart(*parent_, i);
+
+    // Runs each test case if there is at least one test to run.
+    if (has_tests_to_run) {
+      // Sets up all environments beforehand.
+      repeater->OnEnvironmentsSetUpStart(*parent_);
+      ForEach(environments_, SetUpEnvironment);
+      repeater->OnEnvironmentsSetUpEnd(*parent_);
+
+      // Runs the tests only if there was no fatal failure during global
+      // set-up.
+      if (!Test::HasFatalFailure()) {
+        for (int test_index = 0; test_index < total_test_case_count();
+             test_index++) {
+          GetMutableTestCase(test_index)->Run();
+        }
+      }
+
+      // Tears down all environments in reverse order afterwards.
+      repeater->OnEnvironmentsTearDownStart(*parent_);
+      std::for_each(environments_.rbegin(), environments_.rend(),
+                    TearDownEnvironment);
+      repeater->OnEnvironmentsTearDownEnd(*parent_);
+    }
+
+    // elapsed_time_ covers this iteration only; it is overwritten on
+    // every pass through the loop.
+    elapsed_time_ = GetTimeInMillis() - start;
+
+    // Tells the unit test event listener that the tests have just finished.
+    repeater->OnTestIterationEnd(*parent_, i);
+
+    // Remembers a failure of this iteration.  The flag is sticky: one
+    // failing iteration makes the whole run fail.
+    if (!Passed()) {
+      failed = true;
+    }
+
+    // Restores the original test order after the iteration.  This
+    // allows the user to quickly repro a failure that happens in the
+    // N-th iteration without repeating the first (N - 1) iterations.
+    // This is not enclosed in "if (GTEST_FLAG(shuffle)) { ... }", in
+    // case the user somehow changes the value of the flag somewhere
+    // (it's always safe to unshuffle the tests).
+    UnshuffleTests();
+
+    if (GTEST_FLAG(shuffle)) {
+      // Picks a new random seed for each iteration.
+      random_seed_ = GetNextRandomSeed(random_seed_);
+    }
+  }
+
+  repeater->OnTestProgramEnd(*parent_);
+
+  return !failed;
+}
+
+// Creates the shard status file named by the GTEST_SHARD_STATUS_FILE
+// environment variable, if that variable is present.  A file that already
+// exists at this location is overwritten (it is opened in "w" mode and
+// closed again without writing).  If the variable is present but the file
+// cannot be created, prints an error and exits.
+void WriteToShardStatusFileIfNeeded() {
+  const char* const status_path = posix::GetEnv(kTestShardStatusFile);
+  if (status_path == NULL) {
+    return;  // Variable not set: nothing to do.
+  }
+
+  FILE* const status_file = posix::FOpen(status_path, "w");
+  if (status_file != NULL) {
+    fclose(status_file);
+    return;
+  }
+
+  ColoredPrintf(COLOR_RED,
+                "Could not write to the test shard status file \"%s\" "
+                "specified by the %s environment variable.\n",
+                status_path, kTestShardStatusFile);
+  fflush(stdout);
+  exit(EXIT_FAILURE);
+}
+
+// Checks whether sharding is enabled by examining the relevant
+// environment variable values. If the variables are present,
+// but inconsistent (i.e., shard_index >= total_shards), prints
+// an error and exits. If in_subprocess_for_death_test, sharding is
+// disabled because it must only be applied to the original test
+// process. Otherwise, we could filter out death tests we intended to execute.
+//
+// Arguments:
+//
+//   total_shards_env: name of the environment variable holding the total
+//                     number of shards
+//   shard_index_env:  name of the environment variable holding the index
+//                     of this shard
+//
+// Returns true iff both variables are set consistently and the total
+// number of shards is greater than one.
+bool ShouldShard(const char* total_shards_env,
+                 const char* shard_index_env,
+                 bool in_subprocess_for_death_test) {
+  if (in_subprocess_for_death_test) {
+    return false;
+  }
+
+  // -1 is the "variable is unset" sentinel.
+  const Int32 total_shards = Int32FromEnvOrDie(total_shards_env, -1);
+  const Int32 shard_index = Int32FromEnvOrDie(shard_index_env, -1);
+
+  // Every diagnostic below is passed to ColoredPrintf() as the argument
+  // of a "%s" format instead of as the format string itself, so a '%' in
+  // the assembled message can never be interpreted as a conversion.
+  if (total_shards == -1 && shard_index == -1) {
+    return false;
+  } else if (total_shards == -1 && shard_index != -1) {
+    const Message msg = Message()
+      << "Invalid environment variables: you have "
+      << kTestShardIndex << " = " << shard_index
+      << ", but have left " << kTestTotalShards << " unset.\n";
+    ColoredPrintf(COLOR_RED, "%s", msg.GetString().c_str());
+    fflush(stdout);
+    exit(EXIT_FAILURE);
+  } else if (total_shards != -1 && shard_index == -1) {
+    const Message msg = Message()
+      << "Invalid environment variables: you have "
+      << kTestTotalShards << " = " << total_shards
+      << ", but have left " << kTestShardIndex << " unset.\n";
+    ColoredPrintf(COLOR_RED, "%s", msg.GetString().c_str());
+    fflush(stdout);
+    exit(EXIT_FAILURE);
+  } else if (shard_index < 0 || shard_index >= total_shards) {
+    const Message msg = Message()
+      << "Invalid environment variables: we require 0 <= "
+      << kTestShardIndex << " < " << kTestTotalShards
+      << ", but you have " << kTestShardIndex << "=" << shard_index
+      << ", " << kTestTotalShards << "=" << total_shards << ".\n";
+    ColoredPrintf(COLOR_RED, "%s", msg.GetString().c_str());
+    fflush(stdout);
+    exit(EXIT_FAILURE);
+  }
+
+  return total_shards > 1;
+}
+
+// Reads the environment variable named var and interprets it as an Int32.
+//
+// Returns default_val when the variable is unset.  When it is set but
+// ParseInt32() rejects its value, the process exits with EXIT_FAILURE.
+Int32 Int32FromEnvOrDie(const char* var, Int32 default_val) {
+  const char* const raw_value = posix::GetEnv(var);
+
+  Int32 parsed = default_val;
+  if (raw_value != NULL &&
+      !ParseInt32(Message() << "The value of environment variable " << var,
+                  raw_value, &parsed)) {
+    exit(EXIT_FAILURE);
+  }
+  return parsed;
+}
+
+// Tells whether the test identified by test_id belongs to the shard
+// shard_index out of total_shards shards.  Tests are dealt out round-robin:
+// a test runs on the shard whose index equals test_id modulo total_shards.
+// The test id is some arbitrary but unique non-negative integer assigned to
+// each test method.  Assumes that 0 <= shard_index < total_shards.
+bool ShouldRunTestOnShard(int total_shards, int shard_index, int test_id) {
+  const int owning_shard = test_id % total_shards;
+  return owning_shard == shard_index;
+}
+
+// Compares the name of each test with the user-specified filter to
+// decide whether the test should be run, then records the result in
+// each TestCase and TestInfo object.
+// If shard_tests == true, further filters tests based on sharding
+// variables in the environment - see
+// http://code.google.com/p/googletest/wiki/GoogleTestAdvancedGuide.
+// Returns the number of tests that should run.
+//
+// For every TestInfo this sets is_disabled_, matches_filter_ and
+// should_run_; for every TestCase it sets should_run to true iff at least
+// one of its tests was selected.
+int UnitTestImpl::FilterTests(ReactionToSharding shard_tests) {
+  // The shard configuration is only read from the environment when the
+  // caller asked for sharding to be honored; otherwise both stay at -1.
+  const Int32 total_shards = shard_tests == HONOR_SHARDING_PROTOCOL ?
+      Int32FromEnvOrDie(kTestTotalShards, -1) : -1;
+  const Int32 shard_index = shard_tests == HONOR_SHARDING_PROTOCOL ?
+      Int32FromEnvOrDie(kTestShardIndex, -1) : -1;
+
+  // num_runnable_tests are the number of tests that will
+  // run across all shards (i.e., match filter and are not disabled).
+  // num_selected_tests are the number of tests to be run on
+  // this shard.
+  int num_runnable_tests = 0;
+  int num_selected_tests = 0;
+  for (size_t i = 0; i < test_cases_.size(); i++) {
+    TestCase* const test_case = test_cases_[i];
+    const std::string &test_case_name = test_case->name();
+    // Start from "do not run"; flipped below once any test is selected.
+    test_case->set_should_run(false);
+
+    for (size_t j = 0; j < test_case->test_info_list().size(); j++) {
+      TestInfo* const test_info = test_case->test_info_list()[j];
+      const std::string test_name(test_info->name());
+      // A test is disabled if test case name or test name matches
+      // kDisableTestFilter.
+      const bool is_disabled =
+          internal::UnitTestOptions::MatchesFilter(test_case_name,
+                                                   kDisableTestFilter) ||
+          internal::UnitTestOptions::MatchesFilter(test_name,
+                                                   kDisableTestFilter);
+      test_info->is_disabled_ = is_disabled;
+
+      const bool matches_filter =
+          internal::UnitTestOptions::FilterMatchesTest(test_case_name,
+                                                       test_name);
+      test_info->matches_filter_ = matches_filter;
+
+      // Runnable: matches the filter and is either enabled or the user
+      // asked to run disabled tests too.
+      const bool is_runnable =
+          (GTEST_FLAG(also_run_disabled_tests) || !is_disabled) &&
+          matches_filter;
+
+      // num_runnable_tests is read here BEFORE it is incremented below, so
+      // it serves as the 0-based id of this runnable test when assigning
+      // tests to shards.  The shard check is short-circuited away when
+      // sharding is ignored.
+      const bool is_selected = is_runnable &&
+          (shard_tests == IGNORE_SHARDING_PROTOCOL ||
+           ShouldRunTestOnShard(total_shards, shard_index,
+                                num_runnable_tests));
+
+      // The bools convert to 0 or 1, so these are conditional increments.
+      num_runnable_tests += is_runnable;
+      num_selected_tests += is_selected;
+
+      test_info->should_run_ = is_selected;
+      test_case->set_should_run(test_case->should_run() || is_selected);
+    }
+  }
+  return num_selected_tests;
+}
+
+// Prints the C-string str to stdout on a single line: every '\n' is written
+// as the two characters "\\n" instead.  Nothing is printed for a NULL str.
+//
+// Output is limited by max_length: before each input character is printed,
+// the count of characters written so far (an escaped newline counts as two)
+// is checked, and once it has reached max_length "..." is printed and the
+// rest of the string is dropped.
+static void PrintOnOneLine(const char* str, int max_length) {
+  if (str == NULL) return;
+
+  int printed = 0;
+  for (const char* cursor = str; *cursor != '\0'; ++cursor) {
+    if (printed >= max_length) {
+      printf("...");
+      return;
+    }
+
+    const bool is_newline = (*cursor == '\n');
+    if (is_newline) {
+      printf("\\n");
+    } else {
+      printf("%c", *cursor);
+    }
+    printed += is_newline ? 2 : 1;
+  }
+}
+
+// Lists, on stdout, every test whose matches_filter_ flag is set.
+//
+// Each test case that has at least one matching test is introduced by a
+// "<case name>." line, followed by one indented line per matching test.
+// Type and value parameters, when present, are appended after "  # " and
+// are printed on a single line so the output stays easy to parse.
+void UnitTestImpl::ListTestsMatchingFilter() {
+  // Print at most this many characters for each type/value parameter.
+  const int kMaxParamLength = 250;
+
+  for (size_t case_idx = 0; case_idx < test_cases_.size(); case_idx++) {
+    const TestCase* const test_case = test_cases_[case_idx];
+    // The case header is emitted lazily, with the first matching test.
+    bool header_printed = false;
+
+    for (size_t test_idx = 0;
+         test_idx < test_case->test_info_list().size(); test_idx++) {
+      const TestInfo* const test_info =
+          test_case->test_info_list()[test_idx];
+      if (!test_info->matches_filter_) continue;
+
+      if (!header_printed) {
+        header_printed = true;
+        printf("%s.", test_case->name());
+        if (test_case->type_param() != NULL) {
+          printf("  # %s = ", kTypeParamLabel);
+          PrintOnOneLine(test_case->type_param(), kMaxParamLength);
+        }
+        printf("\n");
+      }
+
+      printf("  %s", test_info->name());
+      if (test_info->value_param() != NULL) {
+        printf("  # %s = ", kValueParamLabel);
+        PrintOnOneLine(test_info->value_param(), kMaxParamLength);
+      }
+      printf("\n");
+    }
+  }
+  fflush(stdout);
+}
+
+// Installs getter as the current OS stack trace getter.
+//
+// When getter is already the current one this is a no-op; otherwise the
+// previously installed getter is deleted before the new one is stored.
+void UnitTestImpl::set_os_stack_trace_getter(
+    OsStackTraceGetterInterface* getter) {
+  if (os_stack_trace_getter_ == getter) return;
+
+  delete os_stack_trace_getter_;
+  os_stack_trace_getter_ = getter;
+}
+
+// Returns the current OS stack trace getter, lazily creating a default
+// OsStackTraceGetter (and making it the current one) if none is installed.
+OsStackTraceGetterInterface* UnitTestImpl::os_stack_trace_getter() {
+  if (os_stack_trace_getter_ != NULL) {
+    return os_stack_trace_getter_;
+  }
+
+  os_stack_trace_getter_ = new OsStackTraceGetter;
+  return os_stack_trace_getter_;
+}
+
+// Returns the TestResult of the currently running test, or the ad hoc
+// TestResult when no test is running.
+TestResult* UnitTestImpl::current_test_result() {
+  if (current_test_info_) {
+    return &(current_test_info_->result_);
+  }
+  return &ad_hoc_test_result_;
+}
+
+// Randomizes the order of the test cases and of the tests inside each test
+// case.  Death test cases and non-death test cases are shuffled as two
+// separate index ranges, so death tests are still run first.
+void UnitTestImpl::ShuffleTests() {
+  // Death test cases: indices [0, last_death_test_case_ + 1).
+  ShuffleRange(random(), 0, last_death_test_case_ + 1, &test_case_indices_);
+
+  // Remaining test cases: from there to the end of the list.
+  ShuffleRange(random(), last_death_test_case_ + 1,
+               static_cast<int>(test_cases_.size()), &test_case_indices_);
+
+  // Finally, the tests within every test case.
+  for (size_t case_idx = 0; case_idx < test_cases_.size(); ++case_idx) {
+    TestCase* const test_case = test_cases_[case_idx];
+    test_case->ShuffleTests(random());
+  }
+}
+
+// Puts the test cases, and the tests within each of them, back into the
+// order they had before the first shuffle.
+void UnitTestImpl::UnshuffleTests() {
+  for (size_t case_idx = 0; case_idx < test_cases_.size(); ++case_idx) {
+    TestCase* const test_case = test_cases_[case_idx];
+    // Restore the order of the tests inside this test case...
+    test_case->UnshuffleTests();
+    // ...and map position case_idx back to test case case_idx.
+    test_case_indices_[case_idx] = static_cast<int>(case_idx);
+  }
+}
+
+// Returns the current OS stack trace as an std::string.
+//
+// The gtest_stack_trace_depth flag bounds how many frames are included.
+// skip_count is the number of top frames to leave out; skipped frames do
+// not count against that bound.
+//
+// Example: if Foo() calls Bar(), and Bar() calls
+// GetCurrentOsStackTraceExceptTop(..., 1), the trace includes Foo() but
+// neither Bar() nor GetCurrentOsStackTraceExceptTop().
+std::string GetCurrentOsStackTraceExceptTop(UnitTest* /*unit_test*/,
+                                            int skip_count) {
+  // One extra frame is skipped to hide this wrapper function itself.
+  const int frames_to_skip = skip_count + 1;
+  return GetUnitTestImpl()->CurrentOsStackTraceExceptTop(frames_to_skip);
+}
+
+// Helpers for the GTEST_SUPPRESS_UNREACHABLE_CODE_WARNING_BELOW_ macro,
+// which uses them to suppress unreachable code warnings.
+namespace {
+// Exception type private to this translation unit; see AlwaysTrue().
+class ClassUniqueToAlwaysTrue {};
+}  // namespace
+
+// Returns condition unchanged.
+bool IsTrue(bool condition) {
+  return condition;
+}
+
+// Always returns true.  When exceptions are enabled, the body contains a
+// throw guarded by a condition that is never satisfied, so the function
+// never actually throws but the compiler must assume that it may.
+bool AlwaysTrue() {
+#if GTEST_HAS_EXCEPTIONS
+  if (IsTrue(false)) {
+    throw ClassUniqueToAlwaysTrue();
+  }
+#endif  // GTEST_HAS_EXCEPTIONS
+  return true;
+}
+
+// Consumes prefix from the front of *pstr.
+//
+// When *pstr begins with prefix, advances *pstr just past it and returns
+// true; otherwise returns false and leaves *pstr untouched.  None of pstr,
+// *pstr, and prefix can be NULL.
+bool SkipPrefix(const char* prefix, const char** pstr) {
+  const size_t prefix_len = strlen(prefix);
+  const bool has_prefix = (strncmp(*pstr, prefix, prefix_len) == 0);
+  if (has_prefix) {
+    *pstr += prefix_len;
+  }
+  return has_prefix;
+}
+
+// Extracts the value part of a command line flag.
+//
+// str is expected to look like "--<GTEST_FLAG_PREFIX_><flag>=value".  When
+// def_optional is true, the "=value" part may be omitted.
+//
+// Returns a pointer into str at the start of the value (which is the empty
+// string at the end of str when "=value" was legitimately omitted), or NULL
+// if str or flag is NULL or str does not have the expected form.
+const char* ParseFlagValue(const char* str,
+                           const char* flag,
+                           bool def_optional) {
+  if (str == NULL || flag == NULL) return NULL;
+
+  // The full spelling is "--" + GTEST_FLAG_PREFIX_ + flag.
+  const std::string full_flag = std::string("--") + GTEST_FLAG_PREFIX_ + flag;
+  const size_t full_flag_len = full_flag.length();
+  if (strncmp(str, full_flag.c_str(), full_flag_len) != 0) return NULL;
+
+  // What follows the flag name decides the outcome.
+  const char* const after_name = str + full_flag_len;
+  switch (after_name[0]) {
+    case '\0':
+      // Bare "--flag": acceptable only when the value is optional.
+      return def_optional ? after_name : NULL;
+    case '=':
+      // "--flag=value": the value starts right after the '='.
+      return after_name + 1;
+    default:
+      // Something else follows, e.g. a longer flag sharing this prefix.
+      return NULL;
+  }
+}
+
+// Parses str as a bool flag written either as "--flag=value" or "--flag".
+//
+// With an explicit value, the flag is false when the value starts with
+// '0', 'f', or 'F', and true otherwise.  A bare "--flag" means true.
+//
+// On success, stores the result in *value and returns true.  On failure,
+// returns false and leaves *value unchanged.
+bool ParseBoolFlag(const char* str, const char* flag, bool* value) {
+  const char* const value_str = ParseFlagValue(str, flag, true);
+  if (value_str == NULL) return false;  // Not this flag, or malformed.
+
+  switch (*value_str) {
+    case '0':
+    case 'f':
+    case 'F':
+      *value = false;
+      break;
+    default:
+      *value = true;
+      break;
+  }
+  return true;
+}
+
+// Parses str as an Int32 flag of the form "--flag=value".
+//
+// Returns false when str is not this flag (or lacks "=value"); otherwise
+// returns whatever ParseInt32() returns after converting the value text
+// into *value.
+bool ParseInt32Flag(const char* str, const char* flag, Int32* value) {
+  const char* const value_str = ParseFlagValue(str, flag, false);
+  if (value_str != NULL) {
+    // The Message names the flag for ParseInt32().
+    return ParseInt32(Message() << "The value of flag --" << flag,
+                      value_str, value);
+  }
+  return false;
+}
+
+// Parses str as a string flag of the form "--flag=value".
+//
+// On success, copies the text after '=' into *value and returns true.  On
+// failure, returns false and leaves *value unchanged.
+bool ParseStringFlag(const char* str, const char* flag, std::string* value) {
+  const char* const value_str = ParseFlagValue(str, flag, false);
+  const bool parsed = (value_str != NULL);
+  if (parsed) {
+    *value = value_str;
+  }
+  return parsed;
+}
+
+// Determines whether str looks like a Google Test flag: a leading "--",
+// "-" or "/" followed by GTEST_FLAG_PREFIX_ or GTEST_FLAG_PREFIX_DASH_.
+// If Google Test detects that a command line flag has its prefix but is not
+// recognized, it will print its help message.  Flags whose name starts with
+// GTEST_FLAG_PREFIX_ followed by "internal_" are considered Google Test
+// internal flags; they yield false here and so do not trigger the help
+// message.
+static bool HasGoogleTestFlagPrefix(const char* str) {
+  // Each successful SkipPrefix() advances the local copy of str, so the
+  // checks below apply to what follows the switch character(s).
+  const bool has_switch_char = SkipPrefix("--", &str) ||
+                               SkipPrefix("-", &str) ||
+                               SkipPrefix("/", &str);
+  if (!has_switch_char) return false;
+
+  // Internal flags are deliberately not reported.
+  if (SkipPrefix(GTEST_FLAG_PREFIX_ "internal_", &str)) return false;
+
+  return SkipPrefix(GTEST_FLAG_PREFIX_, &str) ||
+         SkipPrefix(GTEST_FLAG_PREFIX_DASH_, &str);
+}
+
+// Prints a string containing color-encoded text.  The following escape
+// sequences can be used in the string to control the text color:
+//
+//   @@    prints a single '@' character.
+//   @R    changes the color to red.
+//   @G    changes the color to green.
+//   @Y    changes the color to yellow.
+//   @D    changes to the default terminal text color.
+//
+// An '@' followed by any other character is dropped and that character is
+// printed as ordinary text.
+//
+// TODO(wan@google.com): Write tests for this once we add stdout
+// capturing to Google Test.
+static void PrintColorEncoded(const char* str) {
+  GTestColor color = COLOR_DEFAULT;  // The color currently in effect.
+
+  // The string is consumed segment by segment, where segments are separated
+  // by '@' escape sequences.  After each pass str points at the start of
+  // the next segment.
+  for (;;) {
+    const char* const at_sign = strchr(str, '@');
+    if (at_sign == NULL) {
+      // No more escapes: print the remainder and finish.
+      ColoredPrintf(color, "%s", str);
+      return;
+    }
+
+    // Print the plain text that precedes the escape.
+    ColoredPrintf(color, "%s", std::string(str, at_sign).c_str());
+
+    const char code = at_sign[1];
+    str = at_sign + 2;  // Provisionally skip '@' and its code character.
+    switch (code) {
+      case '@':
+        ColoredPrintf(color, "@");
+        break;
+      case 'D':
+        color = COLOR_DEFAULT;
+        break;
+      case 'R':
+        color = COLOR_RED;
+        break;
+      case 'G':
+        color = COLOR_GREEN;
+        break;
+      case 'Y':
+        color = COLOR_YELLOW;
+        break;
+      default:
+        // Not an escape code: back up so this character is handled as part
+        // of the next segment (this also covers an '@' at the very end).
+        --str;
+        break;
+    }
+  }
+}
+
+// The help text printed by ParseGoogleTestFlagsOnlyImpl() when a help flag
+// or an unrecognized Google Test flag is seen.  It is written in the
+// color-encoded format understood by PrintColorEncoded(): @G, @Y and @D
+// switch to green, yellow and the default color.  Flag names are green and
+// value placeholders are yellow.
+static const char kColorEncodedHelpMessage[] =
+"This program contains tests written using " GTEST_NAME_ ". You can use the\n"
+"following command line flags to control its behavior:\n"
+"\n"
+"Test Selection:\n"
+"  @G--" GTEST_FLAG_PREFIX_ "list_tests@D\n"
+"      List the names of all tests instead of running them. The name of\n"
+"      TEST(Foo, Bar) is \"Foo.Bar\".\n"
+"  @G--" GTEST_FLAG_PREFIX_ "filter=@YPOSITIVE_PATTERNS"
+    "[@G-@YNEGATIVE_PATTERNS]@D\n"
+"      Run only the tests whose name matches one of the positive patterns but\n"
+"      none of the negative patterns. '?' matches any single character; '*'\n"
+"      matches any substring; ':' separates two patterns.\n"
+"  @G--" GTEST_FLAG_PREFIX_ "also_run_disabled_tests@D\n"
+"      Run all disabled tests too.\n"
+"\n"
+"Test Execution:\n"
+"  @G--" GTEST_FLAG_PREFIX_ "repeat=@Y[COUNT]@D\n"
+"      Run the tests repeatedly; use a negative count to repeat forever.\n"
+"  @G--" GTEST_FLAG_PREFIX_ "shuffle@D\n"
+"      Randomize tests' orders on every iteration.\n"
+"  @G--" GTEST_FLAG_PREFIX_ "random_seed=@Y[NUMBER]@D\n"
+"      Random number seed to use for shuffling test orders (between 1 and\n"
+"      99999, or 0 to use a seed based on the current time).\n"
+"\n"
+"Test Output:\n"
+"  @G--" GTEST_FLAG_PREFIX_ "color=@Y(@Gyes@Y|@Gno@Y|@Gauto@Y)@D\n"
+"      Enable/disable colored output. The default is @Gauto@D.\n"
+"  @G--" GTEST_FLAG_PREFIX_ "print_time=0@D\n"
+"      Don't print the elapsed time of each test.\n"
+"  @G--" GTEST_FLAG_PREFIX_ "output=xml@Y[@G:@YDIRECTORY_PATH@G"
+    GTEST_PATH_SEP_ "@Y|@G:@YFILE_PATH]@D\n"
+"      Generate an XML report in the given directory or with the given file\n"
+"      name. @YFILE_PATH@D defaults to @Gtest_details.xml@D.\n"
+#if GTEST_CAN_STREAM_RESULTS_
+"  @G--" GTEST_FLAG_PREFIX_ "stream_result_to=@YHOST@G:@YPORT@D\n"
+"      Stream test results to the given server.\n"
+#endif  // GTEST_CAN_STREAM_RESULTS_
+"\n"
+"Assertion Behavior:\n"
+#if GTEST_HAS_DEATH_TEST && !GTEST_OS_WINDOWS
+"  @G--" GTEST_FLAG_PREFIX_ "death_test_style=@Y(@Gfast@Y|@Gthreadsafe@Y)@D\n"
+"      Set the default death test style.\n"
+#endif  // GTEST_HAS_DEATH_TEST && !GTEST_OS_WINDOWS
+"  @G--" GTEST_FLAG_PREFIX_ "break_on_failure@D\n"
+"      Turn assertion failures into debugger break-points.\n"
+"  @G--" GTEST_FLAG_PREFIX_ "throw_on_failure@D\n"
+"      Turn assertion failures into C++ exceptions.\n"
+"  @G--" GTEST_FLAG_PREFIX_ "catch_exceptions=0@D\n"
+"      Do not report exceptions as test failures. Instead, allow them\n"
+"      to crash the program or throw a pop-up (on Windows).\n"
+"\n"
+"Except for @G--" GTEST_FLAG_PREFIX_ "list_tests@D, you can alternatively set "
+    "the corresponding\n"
+"environment variable of a flag (all letters in upper-case). For example, to\n"
+"disable colored text output, you can either specify @G--" GTEST_FLAG_PREFIX_
+    "color=no@D or set\n"
+"the @G" GTEST_FLAG_PREFIX_UPPER_ "COLOR@D environment variable to @Gno@D.\n"
+"\n"
+"For more information, please read the " GTEST_NAME_ " documentation at\n"
+"@G" GTEST_PROJECT_URL_ "@D. If you find a bug in " GTEST_NAME_ "\n"
+"(not one in your own code or tests), please report it to\n"
+"@G<" GTEST_DEV_EMAIL_ ">@D.\n";
+
+// Parses the command line for Google Test flags, without initializing
+// other parts of Google Test.  The type parameter CharType can be
+// instantiated to either char or wchar_t.
+//
+// Every recognized flag is stored into its GTEST_FLAG variable and removed
+// from argv, with *argc decremented accordingly.  Help flags and
+// unrecognized arguments that carry a Google Test flag prefix are left in
+// argv but cause the help message to be printed before returning.
+template <typename CharType>
+void ParseGoogleTestFlagsOnlyImpl(int* argc, CharType** argv) {
+  // Index 0 is skipped; parsing starts at argv[1].
+  for (int i = 1; i < *argc; i++) {
+    // Convert to a narrow string so the same parsers serve both CharTypes.
+    const std::string arg_string = StreamableToString(argv[i]);
+    const char* const arg = arg_string.c_str();
+
+    using internal::ParseBoolFlag;
+    using internal::ParseInt32Flag;
+    using internal::ParseStringFlag;
+
+    // Do we see a Google Test flag?
+    // The parsers are tried in turn; || short-circuits, so evaluation
+    // stops at the first parser that returns true.
+    if (ParseBoolFlag(arg, kAlsoRunDisabledTestsFlag,
+                      &GTEST_FLAG(also_run_disabled_tests)) ||
+        ParseBoolFlag(arg, kBreakOnFailureFlag,
+                      &GTEST_FLAG(break_on_failure)) ||
+        ParseBoolFlag(arg, kCatchExceptionsFlag,
+                      &GTEST_FLAG(catch_exceptions)) ||
+        ParseStringFlag(arg, kColorFlag, &GTEST_FLAG(color)) ||
+        ParseStringFlag(arg, kDeathTestStyleFlag,
+                        &GTEST_FLAG(death_test_style)) ||
+        ParseBoolFlag(arg, kDeathTestUseFork,
+                      &GTEST_FLAG(death_test_use_fork)) ||
+        ParseStringFlag(arg, kFilterFlag, &GTEST_FLAG(filter)) ||
+        ParseStringFlag(arg, kInternalRunDeathTestFlag,
+                        &GTEST_FLAG(internal_run_death_test)) ||
+        ParseBoolFlag(arg, kListTestsFlag, &GTEST_FLAG(list_tests)) ||
+        ParseStringFlag(arg, kOutputFlag, &GTEST_FLAG(output)) ||
+        ParseBoolFlag(arg, kPrintTimeFlag, &GTEST_FLAG(print_time)) ||
+        ParseInt32Flag(arg, kRandomSeedFlag, &GTEST_FLAG(random_seed)) ||
+        ParseInt32Flag(arg, kRepeatFlag, &GTEST_FLAG(repeat)) ||
+        ParseBoolFlag(arg, kShuffleFlag, &GTEST_FLAG(shuffle)) ||
+        ParseInt32Flag(arg, kStackTraceDepthFlag,
+                       &GTEST_FLAG(stack_trace_depth)) ||
+        ParseStringFlag(arg, kStreamResultToFlag,
+                        &GTEST_FLAG(stream_result_to)) ||
+        ParseBoolFlag(arg, kThrowOnFailureFlag,
+                      &GTEST_FLAG(throw_on_failure))
+        ) {
+      // Yes.  Shift the remainder of the argv list left by one.  Note
+      // that argv has (*argc + 1) elements, the last one always being
+      // NULL.  The following loop moves the trailing NULL element as
+      // well.
+      for (int j = i; j != *argc; j++) {
+        argv[j] = argv[j + 1];
+      }
+
+      // Decrements the argument count.
+      (*argc)--;
+
+      // We also need to decrement the iterator as we just removed
+      // an element.
+      i--;
+    } else if (arg_string == "--help" || arg_string == "-h" ||
+               arg_string == "-?" || arg_string == "/?" ||
+               HasGoogleTestFlagPrefix(arg)) {
+      // Both help flag and unrecognized Google Test flags (excluding
+      // internal ones) trigger help display.
+      // Note that the argument itself is not removed from argv here.
+      g_help_flag = true;
+    }
+  }
+
+  if (g_help_flag) {
+    // We print the help here instead of in RUN_ALL_TESTS(), as the
+    // latter may not be called at all if the user is using Google
+    // Test with another testing framework.
+    PrintColorEncoded(kColorEncodedHelpMessage);
+  }
+}
+
+// Parses the command line for Google Test flags, without initializing
+// other parts of Google Test.  Narrow-character command line.
+void ParseGoogleTestFlagsOnly(int* argc, char** argv) {
+  ParseGoogleTestFlagsOnlyImpl<char>(argc, argv);
+}
+
+// Same as above, for a wide-character command line.
+void ParseGoogleTestFlagsOnly(int* argc, wchar_t** argv) {
+  ParseGoogleTestFlagsOnlyImpl<wchar_t>(argc, argv);
+}
+
+// The internal implementation of InitGoogleTest().
+//
+// The type parameter CharType can be instantiated to either char or
+// wchar_t.  Only the first call does any work; every call is counted in
+// g_init_gtest_count.  A call with *argc <= 0 returns without parsing.
+template <typename CharType>
+void InitGoogleTestImpl(int* argc, CharType** argv) {
+  // We don't want to run the initialization code twice.
+  const bool is_first_call = (++g_init_gtest_count == 1);
+  if (!is_first_call) return;
+
+  // Without at least argv[0] there is nothing to record or parse.
+  if (*argc <= 0) return;
+
+  internal::g_executable_path = internal::StreamableToString(argv[0]);
+
+#if GTEST_HAS_DEATH_TEST
+
+  // Keep a copy of the complete original command line in g_argvs, taken
+  // before the Google Test flags are stripped from argv below.
+  g_argvs.clear();
+  for (int arg_idx = 0; arg_idx != *argc; ++arg_idx) {
+    g_argvs.push_back(StreamableToString(argv[arg_idx]));
+  }
+
+#endif  // GTEST_HAS_DEATH_TEST
+
+  ParseGoogleTestFlagsOnly(argc, argv);
+  GetUnitTestImpl()->PostFlagParsingInit();
+}
+
+}  // namespace internal
+
+// Initializes Google Test.  This must be called before calling
+// RUN_ALL_TESTS().  It parses the command line for the flags that Google
+// Test recognizes; each such flag is removed from argv and *argc is
+// decremented.
+//
+// Nothing is returned: the results are stored in the Google Test flag
+// variables.
+//
+// Calling the function for the second time has no user-visible effect.
+void InitGoogleTest(int* argc, char** argv) {
+  internal::InitGoogleTestImpl<char>(argc, argv);
+}
+
+// Wide-character overload, usable in Windows programs compiled in
+// UNICODE mode.
+void InitGoogleTest(int* argc, wchar_t** argv) {
+  internal::InitGoogleTestImpl<wchar_t>(argc, argv);
+}
+
+}  // namespace testing
+// Copyright 2005, Google Inc.
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+//     * Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//     * Redistributions in binary form must reproduce the above
+// copyright notice, this list of conditions and the following disclaimer
+// in the documentation and/or other materials provided with the
+// distribution.
+//     * Neither the name of Google Inc. nor the names of its
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Author: wan@google.com (Zhanyong Wan), vladl@google.com (Vlad Losev)
+//
+// This file implements death tests.
+
+
+#if GTEST_HAS_DEATH_TEST
+
+# if GTEST_OS_MAC
+#  include <crt_externs.h>
+# endif  // GTEST_OS_MAC
+
+# include <errno.h>
+# include <fcntl.h>
+# include <limits.h>
+
+# if GTEST_OS_LINUX
+#  include <signal.h>
+# endif  // GTEST_OS_LINUX
+
+# include <stdarg.h>
+
+# if GTEST_OS_WINDOWS
+#  include <windows.h>
+# else
+#  include <sys/mman.h>
+#  include <sys/wait.h>
+# endif  // GTEST_OS_WINDOWS
+
+# if GTEST_OS_QNX
+#  include <spawn.h>
+# endif  // GTEST_OS_QNX
+
+#endif  // GTEST_HAS_DEATH_TEST
+
+
+// Indicates that this translation unit is part of Google Test's
+// implementation.  It must come before gtest-internal-inl.h is
+// included, or there will be a compiler error.  This trick is to
+// prevent a user from accidentally including gtest-internal-inl.h in
+// his code.
+//
+// NOTE(review): in this file the macro is defined and then immediately
+// undefined with nothing in between, so the pair has no effect here.
+// Presumably the #include it used to bracket was inlined when the sources
+// were combined into one file -- confirm before removing the pair.
+#define GTEST_IMPLEMENTATION_ 1
+#undef GTEST_IMPLEMENTATION_
+
+namespace testing {
+
+// Constants.
+
+// The default death test style.
+static const char kDefaultDeathTestStyle[] = "fast";
+
+// Definitions of the death-test related flags.  Each definition gives the
+// flag name, its initial value and its help text.  The initial values of
+// the two public flags come from StringFromGTestEnv()/BoolFromGTestEnv(),
+// with the second argument as the fallback (presumably used when the
+// matching environment variable is unset -- see those helpers).
+// Note that these definitions sit outside the "#if GTEST_HAS_DEATH_TEST"
+// section that follows them.
+
+// How the child process of a death test is run: "threadsafe" or "fast".
+GTEST_DEFINE_string_(
+    death_test_style,
+    internal::StringFromGTestEnv("death_test_style", kDefaultDeathTestStyle),
+    "Indicates how to run a death test in a forked child process: "
+    "\"threadsafe\" (child process re-executes the test binary "
+    "from the beginning, running only the specific death test) or "
+    "\"fast\" (child process runs the death test immediately "
+    "after forking).");
+
+// Whether to use fork()/_exit() rather than clone(); off by default.
+GTEST_DEFINE_bool_(
+    death_test_use_fork,
+    internal::BoolFromGTestEnv("death_test_use_fork", false),
+    "Instructs to use fork()/_exit() instead of clone() in death tests. "
+    "Ignored and always uses fork() on POSIX systems where clone() is not "
+    "implemented. Useful when running under valgrind or similar tools if "
+    "those do not support clone(). Valgrind 3.3.1 will just fail if "
+    "it sees an unsupported combination of clone() flags. "
+    "It is not recommended to use this flag w/o valgrind though it will "
+    "work in 99% of the cases. Once valgrind is fixed, this flag will "
+    "most likely be removed.");
+
+namespace internal {
+// Internal flag; empty by default.  InDeathTestChild() treats a non-empty
+// value as "this process is a death test child".
+GTEST_DEFINE_string_(
+    internal_run_death_test, "",
+    "Indicates the file, line number, temporal index of "
+    "the single death test to run, and a file descriptor to "
+    "which a success code may be sent, all separated by "
+    "the '|' characters.  This flag is specified if and only if the current "
+    "process is a sub-process launched for running a thread-safe "
+    "death test.  FOR INTERNAL USE ONLY.");
+}  // namespace internal
+
+#if GTEST_HAS_DEATH_TEST
+
+namespace internal {
+
+// Valid only for fast death tests. Indicates the code is running in the
+// child process of a fast style death test.
+static bool g_in_fast_death_test_child = false;
+
+// Reports whether the caller is currently executing in the context of a
+// death test child process.  Tools such as Valgrind heap checkers may need
+// this to modify their behavior in death tests.
+//
+// IMPORTANT: This is an internal utility.  Using it may break the
+// implementation of death tests.  User code MUST NOT use it.
+bool InDeathTestChild() {
+# if GTEST_OS_WINDOWS
+
+  // On Windows, death tests are thread-safe regardless of the value of the
+  // death_test_style flag, so only the internal flag is consulted.
+  return !GTEST_FLAG(internal_run_death_test).empty();
+
+# else
+
+  // A "threadsafe" child is recognized by its non-empty internal flag; any
+  // other style relies on g_in_fast_death_test_child.
+  const bool threadsafe_style = (GTEST_FLAG(death_test_style) == "threadsafe");
+  return threadsafe_style ? !GTEST_FLAG(internal_run_death_test).empty()
+                          : g_in_fast_death_test_child;
+
+# endif  // GTEST_OS_WINDOWS
+}
+
+}  // namespace internal
+
+// Constructs a predicate that matches processes which exited with
+// exit_code.
+ExitedWithCode::ExitedWithCode(int exit_code)
+    : exit_code_(exit_code) {}
+
+// Returns true iff exit_status describes a process that exited with the
+// code this predicate was constructed with.
+bool ExitedWithCode::operator()(int exit_status) const {
+# if GTEST_OS_WINDOWS
+
+  // On Windows the status is compared with the expected code directly.
+  return exit_status == exit_code_;
+
+# else
+
+  // Elsewhere the status is decoded with the wait(2) macros first.
+  if (!WIFEXITED(exit_status)) {
+    return false;
+  }
+  return WEXITSTATUS(exit_status) == exit_code_;
+
+# endif  // GTEST_OS_WINDOWS
+}
+
+# if !GTEST_OS_WINDOWS
+// Constructs a predicate that matches processes terminated by signal
+// signum.
+KilledBySignal::KilledBySignal(int signum)
+    : signum_(signum) {}
+
+// Returns true iff exit_status describes a process that was terminated by
+// the signal this predicate was constructed with.
+bool KilledBySignal::operator()(int exit_status) const {
+  if (!WIFSIGNALED(exit_status)) {
+    return false;
+  }
+  return WTERMSIG(exit_status) == signum_;
+}
+# endif  // !GTEST_OS_WINDOWS
+
+namespace internal {
+
+// Utilities needed for death tests.
+
+// Builds a human-readable description of exit_code, a process status in
+// the format specified by wait(2) (on Windows, a plain exit status).
+static std::string ExitSummary(int exit_code) {
+  Message summary;
+
+# if GTEST_OS_WINDOWS
+
+  summary << "Exited with exit status " << exit_code;
+
+# else
+
+  // Normal exit and termination by signal are mutually exclusive cases.
+  if (WIFEXITED(exit_code)) {
+    summary << "Exited with exit status " << WEXITSTATUS(exit_code);
+  } else if (WIFSIGNALED(exit_code)) {
+    summary << "Terminated by signal " << WTERMSIG(exit_code);
+  }
+#  ifdef WCOREDUMP
+  // WCOREDUMP is not available everywhere, hence the guard.
+  if (WCOREDUMP(exit_code)) {
+    summary << " (core dumped)";
+  }
+#  endif  // WCOREDUMP
+# endif  // GTEST_OS_WINDOWS
+
+  return summary.GetString();
+}
+
+// Returns true unless exit_status describes a process that exited normally
+// with code 0; that is, true for termination by a signal or for a normal
+// exit with a nonzero exit code.
+bool ExitedUnsuccessfully(int exit_status) {
+  const ExitedWithCode exited_with_zero(0);
+  return !exited_with_zero(exit_status);
+}
+
+# if !GTEST_OS_WINDOWS
+// Builds the warning text used when a death test finds more than one
+// thread running, or cannot determine the number of threads, prior to
+// executing the given statement.  A thread_count of 0 means the count
+// could not be detected.  It is the responsibility of the caller not to
+// pass a thread_count of 1.
+static std::string DeathTestThreadWarning(size_t thread_count) {
+  Message warning;
+  warning << "Death tests use fork(), which is unsafe particularly"
+          << " in a threaded context. For this test, " << GTEST_NAME_ << " ";
+  if (thread_count != 0) {
+    warning << "detected " << thread_count << " threads.";
+  } else {
+    warning << "couldn't detect the number of threads.";
+  }
+  return warning.GetString();
+}
+# endif  // !GTEST_OS_WINDOWS
+
+// Flag characters for reporting a death test that did not die.  A death
+// test child process writes one of these bytes to its pipe, and the parent
+// decodes it in DeathTestImpl::ReadAndInterpretStatusByte().
+// Decoded as the LIVED outcome.
+static const char kDeathTestLived = 'L';
+// Decoded as the RETURNED outcome.
+static const char kDeathTestReturned = 'R';
+// Decoded as the THREW outcome.
+static const char kDeathTestThrew = 'T';
+// Written by DeathTestAbort(), followed by an error message; the parent
+// handles it in FailFromInternalError().
+static const char kDeathTestInternalError = 'I';
+
+// An enumeration describing all of the possible ways that a death test can
+// conclude:
+//   IN_PROGRESS - the test has not yet concluded;
+//   DIED        - the process died while executing the test code;
+//   LIVED       - the process lived beyond the end of the test code;
+//   RETURNED    - the test statement attempted to execute a return
+//                 statement, which is not allowed;
+//   THREW       - the test statement returned control by throwing an
+//                 exception.
+// TODO(vladl@google.com): Unify names and possibly values for
+// AbortReason, DeathTestOutcome, and flag characters above.
+enum DeathTestOutcome { IN_PROGRESS, DIED, LIVED, RETURNED, THREW };
+
+// Routine for aborting the program which is safe to call from an
+// exec-style death test child process, in which case the error
+// message is propagated back to the parent process.  Otherwise, the
+// message is simply printed to stderr.
+//
+// In a death test child process (the internal run-death-test flag is set)
+// the message is written to the parent's pipe, prefixed with the
+// kDeathTestInternalError flag byte, and the process then leaves through
+// _exit(1).  If no stream can be opened on the pipe descriptor, the message
+// is written to stderr instead so that it is not lost.  Outside of a child
+// process the message goes to stderr and posix::Abort() is called.
+void DeathTestAbort(const std::string& message) {
+  // On a POSIX system, this function may be called from a threadsafe-style
+  // death test child process, which operates on a very small stack.  Use
+  // the heap for any additional non-minuscule memory requirements.
+  const InternalRunDeathTestFlag* const flag =
+      GetUnitTestImpl()->internal_run_death_test_flag();
+  if (flag != NULL) {
+    FILE* parent = posix::FDOpen(flag->write_fd(), "w");
+    if (parent != NULL) {
+      fputc(kDeathTestInternalError, parent);
+      fprintf(parent, "%s", message.c_str());
+      fflush(parent);
+    } else {
+      // The stream could not be opened; passing NULL to fputc()/fprintf()
+      // would be undefined behavior, so report on stderr instead.
+      fprintf(stderr, "%s", message.c_str());
+      fflush(stderr);
+    }
+    _exit(1);
+  } else {
+    fprintf(stderr, "%s", message.c_str());
+    fflush(stderr);
+    posix::Abort();
+  }
+}
+
+// A replacement for CHECK that calls DeathTestAbort if the assertion
+// fails.  The message passed to DeathTestAbort has the form
+//   "CHECK failed: File <__FILE__>, line <__LINE__>: <expression text>"
+// where the expression text is the stringified macro argument.  The body
+// is wrapped in do { ... } while (...) so that the macro can be used as a
+// single statement.
+# define GTEST_DEATH_TEST_CHECK_(expression) \
+  do { \
+    if (!::testing::internal::IsTrue(expression)) { \
+      DeathTestAbort( \
+          ::std::string("CHECK failed: File ") + __FILE__ +  ", line " \
+          + ::testing::internal::StreamableToString(__LINE__) + ": " \
+          + #expression); \
+    } \
+  } while (::testing::internal::AlwaysFalse())
+
+// This macro is similar to GTEST_DEATH_TEST_CHECK_, but it is meant for
+// evaluating any system call that fulfills two conditions: it must return
+// -1 on failure, and set errno to EINTR when it is interrupted and
+// should be tried again.  The macro expands to a loop that repeatedly
+// evaluates the expression as long as it evaluates to -1 and sets
+// errno to EINTR.  If the expression evaluates to -1 but errno is
+// something other than EINTR, DeathTestAbort is called with a message of
+// the form
+//   "CHECK failed: File <__FILE__>, line <__LINE__>: <expression text> != -1".
+// Note that the expression may be evaluated more than once, and that its
+// result is stored in an int.
+# define GTEST_DEATH_TEST_CHECK_SYSCALL_(expression) \
+  do { \
+    int gtest_retval; \
+    do { \
+      gtest_retval = (expression); \
+    } while (gtest_retval == -1 && errno == EINTR); \
+    if (gtest_retval == -1) { \
+      DeathTestAbort( \
+          ::std::string("CHECK failed: File ") + __FILE__ + ", line " \
+          + ::testing::internal::StreamableToString(__LINE__) + ": " \
+          + #expression + " != -1"); \
+    } \
+  } while (::testing::internal::AlwaysFalse())
+
+// Describes the current value of errno: an empty string when errno is
+// zero, otherwise the text posix::StrError() produces for it.
+std::string GetLastErrnoDescription() {
+  if (errno == 0) {
+    return "";
+  }
+  return posix::StrError(errno);
+}
+
+// Called from a death test parent process after the child reported an
+// internal error.  Drains the failure message the child wrote to the file
+// descriptor fd and logs it with FATAL severity.  Reads interrupted by
+// EINTR are retried; any other read failure is itself logged with FATAL
+// severity together with the errno value.
+static void FailFromInternalError(int fd) {
+  Message child_message;
+  char chunk[256];
+  int bytes_read;
+
+  for (;;) {
+    bytes_read = posix::Read(fd, chunk, 255);
+    if (bytes_read > 0) {
+      // At most 255 bytes were read, so there is room for the terminator.
+      chunk[bytes_read] = '\0';
+      child_message << chunk;
+      continue;
+    }
+    if (bytes_read == -1 && errno == EINTR) {
+      continue;  // Interrupted read: try again.
+    }
+    break;  // End of data or a genuine error.
+  }
+
+  if (bytes_read != 0) {
+    // Capture errno before any further call has a chance to change it.
+    const int last_error = errno;
+    GTEST_LOG_(FATAL) << "Error while reading death test internal: "
+                      << GetLastErrnoDescription() << " [" << last_error << "]";
+  } else {
+    GTEST_LOG_(FATAL) << child_message.GetString();
+  }
+}
+
+// Death test constructor.  Verifies that a test is currently running and
+// aborts through DeathTestAbort() when there is none.
+DeathTest::DeathTest() {
+  const bool inside_a_test =
+      GetUnitTestImpl()->current_test_info() != NULL;
+  if (!inside_a_test) {
+    DeathTestAbort("Cannot run a death test outside of a TEST or "
+                   "TEST_F construct");
+  }
+}
+
+// Creates a death test by forwarding every argument unchanged to the death
+// test factory currently installed in the unit test implementation, and
+// returns that factory's result.
+bool DeathTest::Create(const char* statement, const RE* regex,
+                       const char* file, int line, DeathTest** test) {
+  UnitTestImpl* const impl = GetUnitTestImpl();
+  return impl->death_test_factory()->Create(statement, regex, file, line,
+                                            test);
+}
+
+// Returns the most recently stored death test message as a C string.  The
+// pointer refers to the static string below, so it is only valid until
+// that string is next modified.
+const char* DeathTest::LastMessage() {
+  return last_death_test_message_.c_str();
+}
+
+// Replaces the stored death test message with a copy of the given one.
+void DeathTest::set_last_death_test_message(const std::string& message) {
+  last_death_test_message_ = message;
+}
+
+// Storage for the message read by LastMessage() and written by
+// set_last_death_test_message().
+std::string DeathTest::last_death_test_message_;
+
+// Provides cross platform implementation for some death functionality.
+// It holds the state shared by the platform-specific death tests (the
+// statement text, the expected regex, the child's exit status, the test
+// outcome and the two pipe descriptors) and implements Abort(), Passed()
+// and ReadAndInterpretStatusByte() on top of that state.
+class DeathTestImpl : public DeathTest {
+ protected:
+  // Stores the statement text and regex (neither is owned) and puts every
+  // other member into its "nothing happened yet" state: not spawned,
+  // status -1, outcome IN_PROGRESS, both descriptors -1.
+  DeathTestImpl(const char* a_statement, const RE* a_regex)
+      : statement_(a_statement),
+        regex_(a_regex),
+        spawned_(false),
+        status_(-1),
+        outcome_(IN_PROGRESS),
+        read_fd_(-1),
+        write_fd_(-1) {}
+
+  // read_fd_ is expected to be closed and cleared by a derived class.
+  ~DeathTestImpl() { GTEST_DEATH_TEST_CHECK_(read_fd_ == -1); }
+
+  // Writes a status byte describing `reason` to write_fd_ and exits the
+  // process; see the definition below.
+  void Abort(AbortReason reason);
+  // Evaluates the concluded death test; see the definition below.
+  virtual bool Passed(bool status_ok);
+
+  // Plain accessors and mutators for the private state.
+  const char* statement() const { return statement_; }
+  const RE* regex() const { return regex_; }
+  bool spawned() const { return spawned_; }
+  void set_spawned(bool is_spawned) { spawned_ = is_spawned; }
+  int status() const { return status_; }
+  void set_status(int a_status) { status_ = a_status; }
+  DeathTestOutcome outcome() const { return outcome_; }
+  void set_outcome(DeathTestOutcome an_outcome) { outcome_ = an_outcome; }
+  int read_fd() const { return read_fd_; }
+  void set_read_fd(int fd) { read_fd_ = fd; }
+  int write_fd() const { return write_fd_; }
+  void set_write_fd(int fd) { write_fd_ = fd; }
+
+  // Called in the parent process only. Reads the result code of the death
+  // test child process via a pipe, interprets it to set the outcome_
+  // member, and closes read_fd_.  Outputs diagnostics and terminates in
+  // case of unexpected codes.
+  void ReadAndInterpretStatusByte();
+
+ private:
+  // The textual content of the code this object is testing.  This class
+  // doesn't own this string and should not attempt to delete it.
+  const char* const statement_;
+  // The regular expression which test output must match.  DeathTestImpl
+  // doesn't own this object and should not attempt to delete it.
+  const RE* const regex_;
+  // True if the death test child process has been successfully spawned.
+  bool spawned_;
+  // The exit status of the child process.
+  int status_;
+  // How the death test concluded.
+  DeathTestOutcome outcome_;
+  // Descriptor to the read end of the pipe to the child process.  It is
+  // always -1 in the child process.  The child keeps its write end of the
+  // pipe in write_fd_.
+  int read_fd_;
+  // Descriptor to the child's write end of the pipe to the parent process.
+  // It is always -1 in the parent process.  The parent keeps its end of the
+  // pipe in read_fd_.
+  int write_fd_;
+};
+
+// Called in the parent process only. Reads the result code of the death
+// test child process via a pipe, interprets it to set the outcome_
+// member, and closes read_fd_.  Outputs diagnostics and terminates in
+// case of unexpected codes.
+//
+// Interpretation of the read:
+//   0 bytes (pipe closed without data)  -> DIED
+//   kDeathTestReturned                  -> RETURNED
+//   kDeathTestThrew                     -> THREW
+//   kDeathTestLived                     -> LIVED
+//   kDeathTestInternalError             -> FailFromInternalError()
+//   any other byte, or a failed read    -> FATAL log message
+void DeathTestImpl::ReadAndInterpretStatusByte() {
+  char flag;
+  int bytes_read;
+
+  // The read() here blocks until data is available (signifying the
+  // failure of the death test) or until the pipe is closed (signifying
+  // its success), so it's okay to call this in the parent before
+  // the child process has exited.
+  // A read interrupted by a signal (EINTR) is simply retried.
+  do {
+    bytes_read = posix::Read(read_fd(), &flag, 1);
+  } while (bytes_read == -1 && errno == EINTR);
+
+  if (bytes_read == 0) {
+    set_outcome(DIED);
+  } else if (bytes_read == 1) {
+    switch (flag) {
+      case kDeathTestReturned:
+        set_outcome(RETURNED);
+        break;
+      case kDeathTestThrew:
+        set_outcome(THREW);
+        break;
+      case kDeathTestLived:
+        set_outcome(LIVED);
+        break;
+      case kDeathTestInternalError:
+        FailFromInternalError(read_fd());  // Does not return.
+        break;
+      default:
+        GTEST_LOG_(FATAL) << "Death test child process reported "
+                          << "unexpected status byte ("
+                          << static_cast<unsigned int>(flag) << ")";
+    }
+  } else {
+    GTEST_LOG_(FATAL) << "Read from death test child process failed: "
+                      << GetLastErrnoDescription();
+  }
+  // The pipe is no longer needed; close it and mark it as closed so that
+  // the ~DeathTestImpl() check on read_fd_ holds.
+  GTEST_DEATH_TEST_CHECK_SYSCALL_(posix::Close(read_fd()));
+  set_read_fd(-1);
+}
+
+// Reports that the death test code, which should have exited, did not.
+// Must only be called in a death test child process.  Sends one status
+// byte describing `reason` through the child's status file descriptor and
+// then leaves via _exit(1).
+void DeathTestImpl::Abort(AbortReason reason) {
+  // Any data found in the pipe makes the parent treat the death test as a
+  // failure, so a single flag byte is all that needs to be sent.
+  char status_ch;
+  if (reason == TEST_DID_NOT_DIE) {
+    status_ch = kDeathTestLived;
+  } else if (reason == TEST_THREW_EXCEPTION) {
+    status_ch = kDeathTestThrew;
+  } else {
+    status_ch = kDeathTestReturned;
+  }
+
+  GTEST_DEATH_TEST_CHECK_SYSCALL_(posix::Write(write_fd(), &status_ch, 1));
+  // The descriptor is deliberately left open.  On some platforms (i.e.,
+  // when built as a Windows DLL) destructors of global objects still run
+  // after _exit(), and there write_fd_ gets closed indirectly from the
+  // destructor of UnitTestImpl; closing it here as well would be a double
+  // close, which may assert in debug configurations.  No in-process
+  // buffers need flushing, so when the destructors do not run the OS is
+  // relied upon to close the descriptor once the process terminates.
+  _exit(1);  // Exits w/o any normal exit hooks (we were supposed to crash)
+}
+
+// Produces a copy of a death test's stderr output in which every line is
+// prefixed with "[  DEATH   ] ", so that death test output is easy to tell
+// apart from regular log lines.  The text after the last newline (possibly
+// empty) counts as a line and receives the prefix as well.
+static ::std::string FormatDeathTestOutput(const ::std::string& output) {
+  static const char kLinePrefix[] = "[  DEATH   ] ";
+  ::std::string indented;
+  ::std::string::size_type line_start = 0;
+  for (;;) {
+    indented += kLinePrefix;
+    const ::std::string::size_type newline_pos =
+        output.find('\n', line_start);
+    if (newline_pos == ::std::string::npos) {
+      // Last (unterminated) line: copy the remainder and finish.
+      indented.append(output, line_start, ::std::string::npos);
+      return indented;
+    }
+    // Copy the line including its trailing newline.
+    indented.append(output, line_start, newline_pos + 1 - line_start);
+    line_start = newline_pos + 1;
+  }
+}
+
+// Decides whether a concluded death test passed, based on state recorded
+// earlier plus one argument.
+//
+// State consulted:
+//   outcome:  How the death test concluded: DIED, LIVED, THREW, or
+//             RETURNED.  Only DIED can lead to success.
+//   status:   The exit status of the child process.  On *nix it is in the
+//             format specified by wait(2).  On Windows, this is the value
+//             supplied to the ExitProcess() API or a numeric code of the
+//             exception that terminated the program.
+//   regex:    The regular expression applied to the test's captured
+//             standard error output; the death test fails if it does not
+//             match.
+//
+// Argument:
+//   status_ok: true if the exit status is acceptable in the context of
+//              this particular death test, which fails if it is false.
+//
+// Returns true iff the child was spawned, died, had an acceptable status
+// and produced matching output.  Otherwise the first failing condition, in
+// the order given above, is the one that is reported.  Unless the child
+// was never spawned, the last death test message string is also set.
+bool DeathTestImpl::Passed(bool status_ok) {
+  if (!spawned())
+    return false;
+
+  const std::string captured_stderr = GetCapturedStderr();
+  const DeathTestOutcome how_it_ended = outcome();
+
+  bool passed = false;
+  Message report;
+  report << "Death test: " << statement() << "\n";
+
+  if (how_it_ended == LIVED) {
+    report << "    Result: failed to die.\n"
+           << " Error msg:\n" << FormatDeathTestOutput(captured_stderr);
+  } else if (how_it_ended == THREW) {
+    report << "    Result: threw an exception.\n"
+           << " Error msg:\n" << FormatDeathTestOutput(captured_stderr);
+  } else if (how_it_ended == RETURNED) {
+    report << "    Result: illegal return in test statement.\n"
+           << " Error msg:\n" << FormatDeathTestOutput(captured_stderr);
+  } else if (how_it_ended == DIED) {
+    if (!status_ok) {
+      report << "    Result: died but not with expected exit code:\n"
+             << "            " << ExitSummary(status()) << "\n"
+             << "Actual msg:\n" << FormatDeathTestOutput(captured_stderr);
+    } else if (RE::PartialMatch(captured_stderr.c_str(), *regex())) {
+      passed = true;
+    } else {
+      report << "    Result: died but not with expected error.\n"
+             << "  Expected: " << regex()->pattern() << "\n"
+             << "Actual msg:\n" << FormatDeathTestOutput(captured_stderr);
+    }
+  } else {
+    // IN_PROGRESS or any unexpected value.
+    GTEST_LOG_(FATAL)
+        << "DeathTest::Passed somehow called before conclusion of test";
+  }
+
+  DeathTest::set_last_death_test_message(report.GetString());
+  return passed;
+}
+
+# if GTEST_OS_WINDOWS
+// WindowsDeathTest implements death tests on Windows. Due to the
+// specifics of starting new processes on Windows, death tests there are
+// always threadsafe, and Google Test considers the
+// --gtest_death_test_style=fast setting to be equivalent to
+// --gtest_death_test_style=threadsafe there.
+//
+// A few implementation notes:  Like the Linux version, the Windows
+// implementation uses pipes for child-to-parent communication. But due to
+// the specifics of pipes on Windows, some extra steps are required:
+//
+// 1. The parent creates a communication pipe and stores handles to both
+//    ends of it.
+// 2. The parent starts the child and provides it with the information
+//    necessary to acquire the handle to the write end of the pipe.
+// 3. The child acquires the write end of the pipe and signals the parent
+//    using a Windows event.
+// 4. Now the parent can release the write end of the pipe on its side. If
+//    this is done before step 3, the object's reference count goes down to
+//    0 and it is destroyed, preventing the child from acquiring it. The
+//    parent now has to release it, or read operations on the read end of
+//    the pipe will not return when the child terminates.
+// 5. The parent reads the child's output (outcome code and any possible
+//    error messages) from the pipe, and its stderr, and then determines
+//    whether to fail the test.
+//
+// Note: to distinguish Win32 API calls from the local method and function
+// calls, the former are explicitly resolved in the global namespace.
+//
+class WindowsDeathTest : public DeathTestImpl {
+ public:
+  // Records the statement and regex in the base class together with the
+  // source location of the death test.  None of the pointers are copied,
+  // so they must outlive this object.
+  WindowsDeathTest(const char* a_statement,
+                   const RE* a_regex,
+                   const char* file,
+                   int line)
+      : DeathTestImpl(a_statement, a_regex), file_(file), line_(line) {}
+
+  // All of these virtual functions are inherited from DeathTest.
+  virtual int Wait();
+  virtual TestRole AssumeRole();
+
+ private:
+  // The name of the file in which the death test is located.
+  const char* const file_;
+  // The line number on which the death test is located.
+  const int line_;
+  // Handle to the write end of the pipe to the child process.
+  AutoHandle write_handle_;
+  // Child process handle.
+  AutoHandle child_handle_;
+  // Event the child process uses to signal the parent that it has
+  // acquired the handle to the write end of the pipe. After seeing this
+  // event the parent can release its own handles to make sure its
+  // ReadFile() calls return when the child terminates.
+  AutoHandle event_handle_;
+};
+
+// Waits for the child in a death test to exit, returning its exit
+// status, or 0 if no child process exists.  As a side effect, sets the
+// outcome data member (through ReadAndInterpretStatusByte()) and the
+// status data member.
+int WindowsDeathTest::Wait() {
+  if (!spawned())
+    return 0;
+
+  // Wait until the child either signals that it has acquired the write end
+  // of the pipe or it dies.
+  const HANDLE wait_handles[2] = { child_handle_.Get(), event_handle_.Get() };
+  switch (::WaitForMultipleObjects(2,
+                                   wait_handles,
+                                   FALSE,  // Waits for any of the handles.
+                                   INFINITE)) {
+    case WAIT_OBJECT_0:
+    case WAIT_OBJECT_0 + 1:
+      break;
+    default:
+      GTEST_DEATH_TEST_CHECK_(false);  // Should not get here.
+  }
+
+  // The child has acquired the write end of the pipe or exited.
+  // We release the handle on our side and continue.
+  write_handle_.Reset();
+  event_handle_.Reset();
+
+  ReadAndInterpretStatusByte();
+
+  // Waits for the child process to exit if it hasn't already. This
+  // returns immediately if the child has already exited, regardless of
+  // whether previous calls to WaitForMultipleObjects synchronized on this
+  // handle or not.
+  GTEST_DEATH_TEST_CHECK_(
+      WAIT_OBJECT_0 == ::WaitForSingleObject(child_handle_.Get(),
+                                             INFINITE));
+  DWORD status_code;
+  GTEST_DEATH_TEST_CHECK_(
+      ::GetExitCodeProcess(child_handle_.Get(), &status_code) != FALSE);
+  // The process handle is no longer needed once the exit code is known.
+  child_handle_.Reset();
+  set_status(static_cast<int>(status_code));
+  return status();
+}
+
+// The AssumeRole process for a Windows death test.  It creates a child
+// process with the same executable as the current process to run the
+// death test.  The child process is given the --gtest_filter and
+// --gtest_internal_run_death_test flags such that it knows to run the
+// current death test only.
+//
+// Returns EXECUTE_TEST when running inside such a child process (the
+// internal run-death-test flag is present) and OVERSEE_TEST in the parent
+// after the child has been created.  Any failing Win32 call aborts through
+// GTEST_DEATH_TEST_CHECK_.
+DeathTest::TestRole WindowsDeathTest::AssumeRole() {
+  const UnitTestImpl* const impl = GetUnitTestImpl();
+  const InternalRunDeathTestFlag* const flag =
+      impl->internal_run_death_test_flag();
+  const TestInfo* const info = impl->current_test_info();
+  const int death_test_index = info->result()->death_test_count();
+
+  if (flag != NULL) {
+    // ParseInternalRunDeathTestFlag() has performed all the necessary
+    // processing.
+    set_write_fd(flag->write_fd());
+    return EXECUTE_TEST;
+  }
+
+  // WindowsDeathTest uses an anonymous pipe to communicate results of
+  // a death test.
+  SECURITY_ATTRIBUTES handles_are_inheritable = {
+    sizeof(SECURITY_ATTRIBUTES), NULL, TRUE };
+  HANDLE read_handle, write_handle;
+  GTEST_DEATH_TEST_CHECK_(
+      ::CreatePipe(&read_handle, &write_handle, &handles_are_inheritable,
+                   0)  // Default buffer size.
+      != FALSE);
+  set_read_fd(::_open_osfhandle(reinterpret_cast<intptr_t>(read_handle),
+                                O_RDONLY));
+  write_handle_.Reset(write_handle);
+  event_handle_.Reset(::CreateEvent(
+      &handles_are_inheritable,
+      TRUE,    // Manual-reset event: stays signaled until explicitly reset.
+      FALSE,   // The initial state is non-signaled.
+      NULL));  // The event is unnamed.
+  GTEST_DEATH_TEST_CHECK_(event_handle_.Get() != NULL);
+  const std::string filter_flag =
+      std::string("--") + GTEST_FLAG_PREFIX_ + kFilterFlag + "=" +
+      info->test_case_name() + "." + info->name();
+  const std::string internal_flag =
+      std::string("--") + GTEST_FLAG_PREFIX_ + kInternalRunDeathTestFlag +
+      "=" + file_ + "|" + StreamableToString(line_) + "|" +
+      StreamableToString(death_test_index) + "|" +
+      StreamableToString(static_cast<unsigned int>(::GetCurrentProcessId())) +
+      // size_t has the same width as pointers on both 32-bit and 64-bit
+      // Windows platforms.
+      // See http://msdn.microsoft.com/en-us/library/tcxf1dw6.aspx.
+      "|" + StreamableToString(reinterpret_cast<size_t>(write_handle)) +
+      "|" + StreamableToString(reinterpret_cast<size_t>(event_handle_.Get()));
+
+  // GetModuleFileNameA() returns 0 on failure and the buffer size passed in
+  // (_MAX_PATH here) when the path had to be truncated; only a value
+  // strictly between the two denotes a complete path.
+  char executable_path[_MAX_PATH + 1];  // NOLINT
+  const DWORD executable_path_length =
+      ::GetModuleFileNameA(NULL, executable_path, _MAX_PATH);
+  GTEST_DEATH_TEST_CHECK_(
+      executable_path_length != 0 &&
+      executable_path_length < static_cast<DWORD>(_MAX_PATH));
+
+  std::string command_line =
+      std::string(::GetCommandLineA()) + " " + filter_flag + " \"" +
+      internal_flag + "\"";
+
+  DeathTest::set_last_death_test_message("");
+
+  CaptureStderr();
+  // Flush the log buffers since the log streams are shared with the child.
+  FlushInfoLog();
+
+  // The child process will share the standard handles with the parent.
+  STARTUPINFOA startup_info;
+  memset(&startup_info, 0, sizeof(startup_info));
+  // The structure's documented contract requires cb to hold its size.
+  startup_info.cb = sizeof(startup_info);
+  startup_info.dwFlags = STARTF_USESTDHANDLES;
+  startup_info.hStdInput = ::GetStdHandle(STD_INPUT_HANDLE);
+  startup_info.hStdOutput = ::GetStdHandle(STD_OUTPUT_HANDLE);
+  startup_info.hStdError = ::GetStdHandle(STD_ERROR_HANDLE);
+
+  PROCESS_INFORMATION process_info;
+  GTEST_DEATH_TEST_CHECK_(::CreateProcessA(
+      executable_path,
+      const_cast<char*>(command_line.c_str()),
+      NULL,   // Returned process handle is not inheritable.
+      NULL,   // Returned thread handle is not inheritable.
+      TRUE,   // Child inherits all inheritable handles (for write_handle_).
+      0x0,    // Default creation flags.
+      NULL,   // Inherit the parent's environment.
+      UnitTest::GetInstance()->original_working_dir(),
+      &startup_info,
+      &process_info) != FALSE);
+  child_handle_.Reset(process_info.hProcess);
+  // Only the process handle is kept; the thread handle is closed right away.
+  ::CloseHandle(process_info.hThread);
+  set_spawned(true);
+  return OVERSEE_TEST;
+}
+# else  // We are not on Windows.
+
+// ForkingDeathTest provides implementations for most of the abstract
+// methods of the DeathTest interface.  Only the AssumeRole method is
+// left undefined.  It adds the child's process ID to the state kept by
+// DeathTestImpl and implements Wait() in terms of it.
+class ForkingDeathTest : public DeathTestImpl {
+ public:
+  // Forwards both arguments to DeathTestImpl; see the definition below.
+  ForkingDeathTest(const char* statement, const RE* regex);
+
+  // All of these virtual functions are inherited from DeathTest.
+  virtual int Wait();
+
+ protected:
+  // Lets derived classes record the PID of the child they created.
+  void set_child_pid(pid_t child_pid) { child_pid_ = child_pid; }
+
+ private:
+  // PID of child process during death test; 0 in the child process itself.
+  pid_t child_pid_;
+};
+
+// Builds a ForkingDeathTest: the statement and regex go to DeathTestImpl,
+// and the child PID starts out as -1 (no child yet).
+ForkingDeathTest::ForkingDeathTest(const char* a_statement, const RE* a_regex)
+    : DeathTestImpl(a_statement, a_regex), child_pid_(-1) {
+}
+
+// Blocks until the death test child has exited and returns its exit
+// status as reported by waitpid(), or 0 when no child was spawned.  Along
+// the way the outcome and status data members are updated.
+int ForkingDeathTest::Wait() {
+  if (spawned()) {
+    // Learn how the test concluded before reaping the child.
+    ReadAndInterpretStatusByte();
+
+    int status_value;
+    GTEST_DEATH_TEST_CHECK_SYSCALL_(waitpid(child_pid_, &status_value, 0));
+    set_status(status_value);
+    return status_value;
+  }
+  return 0;
+}
+
+// A concrete death test class that forks, then immediately runs the test
+// in the child process.
+class NoExecDeathTest : public ForkingDeathTest {
+ public:
+  // Forwards the statement and regex to ForkingDeathTest unchanged.
+  NoExecDeathTest(const char* a_statement, const RE* a_regex) :
+      ForkingDeathTest(a_statement, a_regex) { }
+  // Forks the process; see the definition below.
+  virtual TestRole AssumeRole();
+};
+
+// The AssumeRole process for a fork-and-run death test.  It implements a
+// straightforward fork, with a simple pipe to transmit the status byte.
+// Returns EXECUTE_TEST in the child process and OVERSEE_TEST in the
+// parent.  A failing pipe(), fork() or close() aborts through the
+// GTEST_DEATH_TEST_CHECK_ macros.
+DeathTest::TestRole NoExecDeathTest::AssumeRole() {
+  // Forking is only warned about, not prevented, when the thread count is
+  // anything other than exactly one.
+  const size_t thread_count = GetThreadCount();
+  if (thread_count != 1) {
+    GTEST_LOG_(WARNING) << DeathTestThreadWarning(thread_count);
+  }
+
+  int pipe_fd[2];
+  GTEST_DEATH_TEST_CHECK_(pipe(pipe_fd) != -1);
+
+  DeathTest::set_last_death_test_message("");
+  CaptureStderr();
+  // When we fork the process below, the log file buffers are copied, but the
+  // file descriptors are shared.  We flush all log files here so that closing
+  // the file descriptors in the child process doesn't throw off the
+  // synchronization between descriptors and buffers in the parent process.
+  // This is as close to the fork as possible to avoid a race condition in case
+  // there are multiple threads running before the death test, and another
+  // thread writes to the log file.
+  FlushInfoLog();
+
+  const pid_t child_pid = fork();
+  GTEST_DEATH_TEST_CHECK_(child_pid != -1);
+  set_child_pid(child_pid);
+  if (child_pid == 0) {
+    // Child: keep only the write end of the pipe.
+    GTEST_DEATH_TEST_CHECK_SYSCALL_(close(pipe_fd[0]));
+    set_write_fd(pipe_fd[1]);
+    // Redirects all logging to stderr in the child process to prevent
+    // concurrent writes to the log files.  We capture stderr in the parent
+    // process and append the child process' output to a log.
+    LogToStderr();
+    // Event forwarding to the listeners of event listener API must be shut
+    // down in death test subprocesses.
+    GetUnitTestImpl()->listeners()->SuppressEventForwarding();
+    g_in_fast_death_test_child = true;
+    return EXECUTE_TEST;
+  } else {
+    // Parent: keep only the read end of the pipe.
+    GTEST_DEATH_TEST_CHECK_SYSCALL_(close(pipe_fd[1]));
+    set_read_fd(pipe_fd[0]);
+    set_spawned(true);
+    return OVERSEE_TEST;
+  }
+}
+
+// A concrete death test class that forks and then re-executes the main
+// program from the beginning, passing command-line flags that make the new
+// process run only this specific death test.
+class ExecDeathTest : public ForkingDeathTest {
+ public:
+  // Hands the statement and regex to ForkingDeathTest and remembers where
+  // in the sources the death test is located.
+  ExecDeathTest(const char* a_statement, const RE* a_regex,
+                const char* file, int line)
+      : ForkingDeathTest(a_statement, a_regex),
+        file_(file),
+        line_(line) {}
+  virtual TestRole AssumeRole();
+
+ private:
+  // Returns a copy of the argument vector obtained from
+  // GetInjectableArgvs().
+  static ::std::vector<testing::internal::string>
+  GetArgvsForDeathTestChildProcess() {
+    return GetInjectableArgvs();
+  }
+  // Source file containing the death test.
+  const char* const file_;
+  // Source line of the death test.
+  const int line_;
+};
+
+// Utility class for accumulating command-line arguments.
+//
+// The arguments are kept as a NULL-terminated array of heap-allocated C
+// strings (duplicated with posix::StrDup() and released with free() in the
+// destructor), so that Argv() can be handed to exec-style functions.
+// Because the object owns those strings, it must not be copied: a copy
+// would free every string a second time.  Copying is therefore disabled.
+class Arguments {
+ public:
+  // Starts with just the terminating NULL entry.
+  Arguments() {
+    args_.push_back(NULL);
+  }
+
+  // Releases every duplicated string; free(NULL) on the terminator is a
+  // no-op.
+  ~Arguments() {
+    for (std::vector<char*>::iterator i = args_.begin(); i != args_.end();
+         ++i) {
+      free(*i);
+    }
+  }
+
+  // Appends a copy of `argument`, keeping the terminating NULL last.
+  void AddArgument(const char* argument) {
+    args_.insert(args_.end() - 1, posix::StrDup(argument));
+  }
+
+  // Appends a copy of every string in `arguments`, in order.  Str must
+  // provide c_str().
+  template <typename Str>
+  void AddArguments(const ::std::vector<Str>& arguments) {
+    for (typename ::std::vector<Str>::const_iterator i = arguments.begin();
+         i != arguments.end();
+         ++i) {
+      AddArgument(i->c_str());
+    }
+  }
+
+  // Returns the NULL-terminated argument array.  The pointer is invalidated
+  // by any later call that adds arguments.
+  char* const* Argv() {
+    return &args_[0];
+  }
+
+ private:
+  // Declared but intentionally not defined: copying would lead to a double
+  // free of the owned strings.
+  Arguments(const Arguments&);
+  void operator=(const Arguments&);
+
+  std::vector<char*> args_;
+};
+
+// A struct that encompasses the arguments to the child process of a
+// threadsafe-style death test process.  It is passed to
+// ExecDeathTestChildMain() through a void pointer.
+struct ExecDeathTestArgs {
+  char* const* argv;  // Command-line arguments for the child's call to exec
+  int close_fd;       // File descriptor to close; the read end of a pipe
+};
+
+// GetEnviron() returns the environment array of the current process; it
+// is passed as the environment of the re-executed death test child.
+#  if GTEST_OS_MAC
+inline char** GetEnviron() {
+  // When Google Test is built as a framework on MacOS X, the environ variable
+  // is unavailable. Apple's documentation (man environ) recommends using
+  // _NSGetEnviron() instead.
+  return *_NSGetEnviron();
+}
+#  else
+// Some POSIX platforms expect you to declare environ. extern "C" makes
+// it reside in the global namespace.
+extern "C" char** environ;
+inline char** GetEnviron() { return environ; }
+#  endif  // GTEST_OS_MAC
+
+#  if !GTEST_OS_QNX
+// The main function for a threadsafe-style death test child process.
+// This function is called in a clone()-ed process and thus must avoid
+// any potentially unsafe operations like malloc or libc functions.
+//
+// child_arg points to an ExecDeathTestArgs.  The function closes
+// args->close_fd, changes to the original working directory and replaces
+// the process image with args->argv[0].  It only returns (EXIT_FAILURE)
+// when one of those steps fails and DeathTestAbort() itself returns.
+// NOTE(review): the failure paths build std::string messages, which
+// presumably allocate; confirm this is acceptable given the restriction
+// stated above.
+static int ExecDeathTestChildMain(void* child_arg) {
+  ExecDeathTestArgs* const args = static_cast<ExecDeathTestArgs*>(child_arg);
+  GTEST_DEATH_TEST_CHECK_SYSCALL_(close(args->close_fd));
+
+  // We need to execute the test program in the same environment where
+  // it was originally invoked.  Therefore we change to the original
+  // working directory first.
+  const char* const original_dir =
+      UnitTest::GetInstance()->original_working_dir();
+  // We can safely call chdir() as it's a direct system call.
+  if (chdir(original_dir) != 0) {
+    DeathTestAbort(std::string("chdir(\"") + original_dir + "\") failed: " +
+                   GetLastErrnoDescription());
+    return EXIT_FAILURE;
+  }
+
+  // We can safely call execve() as it's a direct system call.  We
+  // cannot use execvp() as it's a libc function and thus potentially
+  // unsafe.  Since execve() doesn't search the PATH, the user must
+  // invoke the test program via a valid path that contains at least
+  // one path separator.
+  execve(args->argv[0], args->argv, GetEnviron());
+  // execve() only returns on failure, so reaching this point is an error.
+  DeathTestAbort(std::string("execve(") + args->argv[0] + ", ...) in " +
+                 original_dir + " failed: " +
+                 GetLastErrnoDescription());
+  return EXIT_FAILURE;
+}
+#  endif  // !GTEST_OS_QNX
+
+// Two utility routines that together determine the direction the stack
+// grows.
+// This could be accomplished more elegantly by a single recursive
+// function, but we want to guard against the unlikely possibility of
+// a smart compiler optimizing the recursion away.
+//
+// GTEST_NO_INLINE_ is required to prevent GCC 4.6 from inlining
+// StackLowerThanAddress into StackGrowsDown, which then doesn't give
+// correct answer.
+//
+// StackLowerThanAddress stores in *result whether the address of one of
+// its own local variables is lower than ptr.
+void StackLowerThanAddress(const void* ptr, bool* result) GTEST_NO_INLINE_;
+void StackLowerThanAddress(const void* ptr, bool* result) {
+  int dummy;
+  *result = (&dummy < ptr);
+}
+
+// Returns true when a local variable of a called function lies at a lower
+// address than a local variable of its caller, i.e. when the stack grows
+// towards lower addresses.
+bool StackGrowsDown() {
+  int dummy;
+  bool result;
+  StackLowerThanAddress(&dummy, &result);
+  return result;
+}
+
+// Spawns a child process with the same executable as the current process in
+// a thread-safe manner and instructs it to run the death test.  The default
+// is fork(2) + exec; when GTEST_HAS_CLONE is set and death_test_use_fork is
+// off, clone(2) is used instead, being slightly more thread-safe.  On QNX,
+// fork supports only single-threaded environments, so spawn(2) is used and
+// close_fd is marked close-on-exec.  Returns the child's pid; dies with an
+// error message if anything goes wrong.
+static pid_t ExecDeathTestSpawnChild(char* const* argv, int close_fd) {
+  ExecDeathTestArgs args = { argv, close_fd };
+  pid_t child_pid = -1;  // Stays -1 unless spawn/clone/fork assigns it.
+
+#  if GTEST_OS_QNX
+  // Obtains the current directory and sets it to be closed in the child
+  // process.
+  const int cwd_fd = open(".", O_RDONLY);
+  GTEST_DEATH_TEST_CHECK_(cwd_fd != -1);
+  GTEST_DEATH_TEST_CHECK_SYSCALL_(fcntl(cwd_fd, F_SETFD, FD_CLOEXEC));
+  // We need to execute the test program in the same environment where
+  // it was originally invoked.  Therefore we change to the original
+  // working directory first.
+  const char* const original_dir =
+      UnitTest::GetInstance()->original_working_dir();
+  // We can safely call chdir() as it's a direct system call.
+  if (chdir(original_dir) != 0) {
+    DeathTestAbort(std::string("chdir(\"") + original_dir + "\") failed: " +
+                   GetLastErrnoDescription());
+    return EXIT_FAILURE;
+  }
+
+  int fd_flags;
+  // Set close_fd to be closed after spawn.
+  GTEST_DEATH_TEST_CHECK_SYSCALL_(fd_flags = fcntl(close_fd, F_GETFD));
+  GTEST_DEATH_TEST_CHECK_SYSCALL_(fcntl(close_fd, F_SETFD,
+                                        fd_flags | FD_CLOEXEC));
+  struct inheritance inherit = {0};
+  // spawn is a system call.
+  child_pid = spawn(args.argv[0], 0, NULL, &inherit, args.argv, GetEnviron());
+  // Restores the current working directory.
+  GTEST_DEATH_TEST_CHECK_(fchdir(cwd_fd) != -1);
+  GTEST_DEATH_TEST_CHECK_SYSCALL_(close(cwd_fd));
+
+#  else   // GTEST_OS_QNX
+#   if GTEST_OS_LINUX
+  // When a SIGPROF signal is received while fork() or clone() are executing,
+  // the process may hang. To avoid this, we ignore SIGPROF here and re-enable
+  // it after the call to fork()/clone() is complete.
+  struct sigaction saved_sigprof_action;
+  struct sigaction ignore_sigprof_action;
+  memset(&ignore_sigprof_action, 0, sizeof(ignore_sigprof_action));
+  sigemptyset(&ignore_sigprof_action.sa_mask);
+  ignore_sigprof_action.sa_handler = SIG_IGN;
+  GTEST_DEATH_TEST_CHECK_SYSCALL_(sigaction(
+      SIGPROF, &ignore_sigprof_action, &saved_sigprof_action));
+#   endif  // GTEST_OS_LINUX
+
+#   if GTEST_HAS_CLONE
+  const bool use_fork = GTEST_FLAG(death_test_use_fork);
+
+  if (!use_fork) {
+    static const bool stack_grows_down = StackGrowsDown();
+    const size_t stack_size = getpagesize();  // One page for the child stack.
+    // MAP_ANONYMOUS is not defined on Mac, so we use MAP_ANON instead.
+    void* const stack = mmap(NULL, stack_size, PROT_READ | PROT_WRITE,
+                             MAP_ANON | MAP_PRIVATE, -1, 0);
+    GTEST_DEATH_TEST_CHECK_(stack != MAP_FAILED);
+
+    // Maximum stack alignment in bytes:  For a downward-growing stack, this
+    // amount is subtracted from size of the stack space to get an address
+    // that is within the stack space and is aligned on all systems we care
+    // about.  As far as I know there is no ABI with stack alignment greater
+    // than 64.  We assume stack and stack_size already have alignment of
+    // kMaxStackAlignment.
+    const size_t kMaxStackAlignment = 64;
+    void* const stack_top =
+        static_cast<char*>(stack) +
+            (stack_grows_down ? stack_size - kMaxStackAlignment : 0);
+    GTEST_DEATH_TEST_CHECK_(stack_size > kMaxStackAlignment &&
+        reinterpret_cast<intptr_t>(stack_top) % kMaxStackAlignment == 0);
+
+    child_pid = clone(&ExecDeathTestChildMain, stack_top, SIGCHLD, &args);
+
+    GTEST_DEATH_TEST_CHECK_(munmap(stack, stack_size) != -1);
+  }
+#   else
+  const bool use_fork = true;  // clone() is unavailable: always fork.
+#   endif  // GTEST_HAS_CLONE
+
+  if (use_fork && (child_pid = fork()) == 0) {
+      ExecDeathTestChildMain(&args);  // Runs in the forked child only.
+      _exit(0);
+  }
+#  endif  // GTEST_OS_QNX
+#  if GTEST_OS_LINUX
+  GTEST_DEATH_TEST_CHECK_SYSCALL_(
+      sigaction(SIGPROF, &saved_sigprof_action, NULL));
+#  endif  // GTEST_OS_LINUX
+
+  GTEST_DEATH_TEST_CHECK_(child_pid != -1);
+  return child_pid;
+}
+
+// The AssumeRole process for a fork-and-exec death test.  When the internal
+// run-death-test flag is present it records the flag's write fd and returns
+// EXECUTE_TEST.  Otherwise it re-executes the main program, adding
+// --gtest_filter and --gtest_internal_run_death_test flags that name the
+// current death test, and returns OVERSEE_TEST.
+DeathTest::TestRole ExecDeathTest::AssumeRole() {
+  const UnitTestImpl* const impl = GetUnitTestImpl();
+  const InternalRunDeathTestFlag* const flag =
+      impl->internal_run_death_test_flag();
+  const TestInfo* const info = impl->current_test_info();
+  const int death_test_index = info->result()->death_test_count();
+
+  if (flag != NULL) {
+    set_write_fd(flag->write_fd());
+    return EXECUTE_TEST;
+  }
+
+  int pipe_fd[2];  // [0] is the read end, [1] the write end.
+  GTEST_DEATH_TEST_CHECK_(pipe(pipe_fd) != -1);
+  // Clear the close-on-exec flag on the write end of the pipe, lest
+  // it be closed when the child process does an exec:
+  GTEST_DEATH_TEST_CHECK_(fcntl(pipe_fd[1], F_SETFD, 0) != -1);
+
+  const std::string filter_flag =
+      std::string("--") + GTEST_FLAG_PREFIX_ + kFilterFlag + "="
+      + info->test_case_name() + "." + info->name();
+  const std::string internal_flag =
+      std::string("--") + GTEST_FLAG_PREFIX_ + kInternalRunDeathTestFlag + "="
+      + file_ + "|" + StreamableToString(line_) + "|"
+      + StreamableToString(death_test_index) + "|"
+      + StreamableToString(pipe_fd[1]);
+  Arguments args;
+  args.AddArguments(GetArgvsForDeathTestChildProcess());
+  args.AddArgument(filter_flag.c_str());
+  args.AddArgument(internal_flag.c_str());
+
+  DeathTest::set_last_death_test_message("");
+
+  CaptureStderr();
+  // See NoExecDeathTest::AssumeRole for why this flush is necessary.
+  FlushInfoLog();
+
+  const pid_t child_pid = ExecDeathTestSpawnChild(args.Argv(), pipe_fd[0]);
+  GTEST_DEATH_TEST_CHECK_SYSCALL_(close(pipe_fd[1]));  // Keep the read end.
+  set_child_pid(child_pid);
+  set_read_fd(pipe_fd[0]);
+  set_spawned(true);
+  return OVERSEE_TEST;
+}
+
+# endif  // !GTEST_OS_WINDOWS
+
+// Creates a concrete DeathTest-derived class that depends on the
+// --gtest_death_test_style flag, and sets the pointer pointed to
+// by the "test" argument to its address.  If the test should be
+// skipped, sets that pointer to NULL.  Returns false if the style is
+// unknown or the death test count exceeds the internal flag's index.
+bool DefaultDeathTestFactory::Create(const char* statement, const RE* regex,
+                                     const char* file, int line,
+                                     DeathTest** test) {
+  UnitTestImpl* const impl = GetUnitTestImpl();
+  const InternalRunDeathTestFlag* const flag =
+      impl->internal_run_death_test_flag();
+  const int death_test_index = impl->current_test_info()
+      ->increment_death_test_count();
+
+  if (flag != NULL) {
+    if (death_test_index > flag->index()) {
+      DeathTest::set_last_death_test_message(
+          "Death test count (" + StreamableToString(death_test_index)
+          + ") somehow exceeded expected maximum ("
+          + StreamableToString(flag->index()) + ")");
+      return false;
+    }
+
+    if (!(flag->file() == file && flag->line() == line &&
+          flag->index() == death_test_index)) {
+      *test = NULL;  // Not the death test named by the flag: skip it.
+      return true;
+    }
+  }
+
+# if GTEST_OS_WINDOWS
+
+  if (GTEST_FLAG(death_test_style) == "threadsafe" ||
+      GTEST_FLAG(death_test_style) == "fast") {
+    *test = new WindowsDeathTest(statement, regex, file, line);
+  }
+
+# else
+
+  if (GTEST_FLAG(death_test_style) == "threadsafe") {
+    *test = new ExecDeathTest(statement, regex, file, line);
+  } else if (GTEST_FLAG(death_test_style) == "fast") {
+    *test = new NoExecDeathTest(statement, regex);
+  }
+
+# endif  // GTEST_OS_WINDOWS
+
+  else {  // NOLINT - this is more readable than unbalanced brackets inside #if.
+    DeathTest::set_last_death_test_message(
+        "Unknown death test style \"" + GTEST_FLAG(death_test_style)
+        + "\" encountered");
+    return false;
+  }
+
+  return true;  // *test now points at the newly created DeathTest.
+}
+
+// Breaks str into the fields separated by delimiter and stores them in
+// *dest, replacing its previous contents.  Empty fields are kept, so an
+// input with k delimiters always produces k + 1 fields.
+static void SplitString(const ::std::string& str, char delimiter,
+                        ::std::vector< ::std::string>* dest) {
+  typedef ::std::string::size_type Index;
+  ::std::vector< ::std::string> fields;
+  Index begin = 0;
+  // One field for every delimiter found...
+  for (Index hit = str.find(delimiter, begin); hit != ::std::string::npos;
+       hit = str.find(delimiter, begin)) {
+    fields.push_back(str.substr(begin, hit - begin));
+    begin = hit + 1;
+  }
+  // ...plus whatever follows the last one (possibly the empty string).
+  fields.push_back(str.substr(begin));
+  // Filled in a scratch vector and swapped so *dest is replaced in one step.
+  dest->swap(fields);
+}
+
+# if GTEST_OS_WINDOWS
+// Recreates the pipe and event handles from the provided parameters,
+// signals the event, and returns a file descriptor wrapped around the pipe
+// handle. This function is called in the child process only.
+// The handle values are those of the parent process parent_process_id;
+// failing to open it, duplicate a handle or wrap it calls DeathTestAbort().
+int GetStatusFileDescriptor(unsigned int parent_process_id,
+                            size_t write_handle_as_size_t,
+                            size_t event_handle_as_size_t) {
+  AutoHandle parent_process_handle(::OpenProcess(PROCESS_DUP_HANDLE,
+                                                   FALSE,  // Non-inheritable.
+                                                   parent_process_id));
+  // OpenProcess() returns NULL on failure, so test for that value as well.
+  if (parent_process_handle.Get() == NULL ||
+      parent_process_handle.Get() == INVALID_HANDLE_VALUE) {
+    DeathTestAbort("Unable to open parent process " +
+                   StreamableToString(parent_process_id));
+  }
+
+  // TODO(vladl@google.com): Replace the following check with a
+  // compile-time assertion when available.
+  GTEST_CHECK_(sizeof(HANDLE) <= sizeof(size_t));
+
+  const HANDLE write_handle =
+      reinterpret_cast<HANDLE>(write_handle_as_size_t);
+  HANDLE dup_write_handle;
+
+  // The handle is accessible only in the parent process.  To obtain one
+  // accessible within the child, we need to use DuplicateHandle.
+  if (!::DuplicateHandle(parent_process_handle.Get(), write_handle,
+                         ::GetCurrentProcess(), &dup_write_handle,
+                         0x0,    // Ignored: DUPLICATE_SAME_ACCESS is used.
+                         FALSE,  // Request non-inheritable handler.
+                         DUPLICATE_SAME_ACCESS)) {
+    DeathTestAbort("Unable to duplicate the pipe handle " +
+                   StreamableToString(write_handle_as_size_t) +
+                   " from the parent process " +
+                   StreamableToString(parent_process_id));
+  }
+
+  const HANDLE event_handle = reinterpret_cast<HANDLE>(event_handle_as_size_t);
+  HANDLE dup_event_handle;
+
+  if (!::DuplicateHandle(parent_process_handle.Get(), event_handle,
+                         ::GetCurrentProcess(), &dup_event_handle,
+                         0x0, FALSE, DUPLICATE_SAME_ACCESS)) {
+    DeathTestAbort("Unable to duplicate the event handle " +
+                   StreamableToString(event_handle_as_size_t) +
+                   " from the parent process " +
+                   StreamableToString(parent_process_id));
+  }
+
+  const int write_fd =
+      ::_open_osfhandle(reinterpret_cast<intptr_t>(dup_write_handle), O_APPEND);
+  if (write_fd == -1) {
+    DeathTestAbort("Unable to convert pipe handle " +
+                   StreamableToString(write_handle_as_size_t) +
+                   " to a file descriptor");
+  }
+
+  // Signals the parent that the write end of the pipe has been acquired
+  // so the parent can release its own write end.
+  ::SetEvent(dup_event_handle);
+
+  return write_fd;
+}
+# endif  // GTEST_OS_WINDOWS
+
+// Returns a newly created InternalRunDeathTestFlag object with fields
+// initialized from the GTEST_FLAG(internal_run_death_test) flag if
+// the flag is specified; otherwise returns NULL.
+InternalRunDeathTestFlag* ParseInternalRunDeathTestFlag() {
+  if (GTEST_FLAG(internal_run_death_test) == "") return NULL;
+
+  // The flag is a '|'-separated list: file, line and death test index come
+  // first, followed by platform-specific fields.
+  int line = -1;
+  int index = -1;
+  ::std::vector< ::std::string> fields;
+  SplitString(GTEST_FLAG(internal_run_death_test).c_str(), '|', &fields);
+  int write_fd = -1;
+
+# if GTEST_OS_WINDOWS
+
+  unsigned int parent_process_id = 0;
+  size_t write_handle_as_size_t = 0;
+  size_t event_handle_as_size_t = 0;
+
+  if (fields.size() != 6  // file|line|index|parent pid|write handle|event
+      || !ParseNaturalNumber(fields[1], &line)
+      || !ParseNaturalNumber(fields[2], &index)
+      || !ParseNaturalNumber(fields[3], &parent_process_id)
+      || !ParseNaturalNumber(fields[4], &write_handle_as_size_t)
+      || !ParseNaturalNumber(fields[5], &event_handle_as_size_t)) {
+    DeathTestAbort("Bad --gtest_internal_run_death_test flag: " +
+                   GTEST_FLAG(internal_run_death_test));
+  }
+  write_fd = GetStatusFileDescriptor(parent_process_id,
+                                     write_handle_as_size_t,
+                                     event_handle_as_size_t);
+# else
+
+  if (fields.size() != 4  // file|line|index|write_fd
+      || !ParseNaturalNumber(fields[1], &line)
+      || !ParseNaturalNumber(fields[2], &index)
+      || !ParseNaturalNumber(fields[3], &write_fd)) {
+    DeathTestAbort("Bad --gtest_internal_run_death_test flag: "
+        + GTEST_FLAG(internal_run_death_test));
+  }
+
+# endif  // GTEST_OS_WINDOWS
+
+  return new InternalRunDeathTestFlag(fields[0], line, index, write_fd);
+}
+
+}  // namespace internal
+
+#endif  // GTEST_HAS_DEATH_TEST
+
+}  // namespace testing
+// Copyright 2008, Google Inc.
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+//     * Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//     * Redistributions in binary form must reproduce the above
+// copyright notice, this list of conditions and the following disclaimer
+// in the documentation and/or other materials provided with the
+// distribution.
+//     * Neither the name of Google Inc. nor the names of its
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Authors: keith.ray@gmail.com (Keith Ray)
+
+
+#include <stdlib.h>
+
+#if GTEST_OS_WINDOWS_MOBILE
+# include <windows.h>
+#elif GTEST_OS_WINDOWS
+# include <direct.h>
+# include <io.h>
+#elif GTEST_OS_SYMBIAN
+// Symbian OpenC has PATH_MAX in sys/syslimits.h
+# include <sys/syslimits.h>
+#else
+# include <limits.h>
+# include <climits>  // Some Linux distributions define PATH_MAX here.
+#endif  // GTEST_OS_WINDOWS_MOBILE
+
+#if GTEST_OS_WINDOWS
+# define GTEST_PATH_MAX_ _MAX_PATH
+#elif defined(PATH_MAX)
+# define GTEST_PATH_MAX_ PATH_MAX
+#elif defined(_XOPEN_PATH_MAX)
+# define GTEST_PATH_MAX_ _XOPEN_PATH_MAX
+#else
+# define GTEST_PATH_MAX_ _POSIX_PATH_MAX
+#endif  // GTEST_OS_WINDOWS
+
+
+namespace testing {
+namespace internal {
+
+#if GTEST_OS_WINDOWS
+// On Windows, '\\' is the standard path separator, but many tools and the
+// Windows API also accept '/' as an alternate path separator. Unless otherwise
+// noted, a file path can contain either kind of path separators, or a mixture
+// of them.
+const char kPathSeparator = '\\';  // The form FilePath::Normalize() emits.
+const char kAlternatePathSeparator = '/';
+//const char kPathSeparatorString[] = "\\";
+const char kAlternatePathSeparatorString[] = "/";
+# if GTEST_OS_WINDOWS_MOBILE
+// Windows CE doesn't have a current directory. You should not use
+// the current directory in tests on Windows CE, but this at least
+// provides a reasonable fallback.
+const char kCurrentDirectoryString[] = "\\";
+// Windows CE doesn't define INVALID_FILE_ATTRIBUTES
+const DWORD kInvalidFileAttributes = 0xffffffff;
+# else
+const char kCurrentDirectoryString[] = ".\\";  // Relative "here" directory.
+# endif  // GTEST_OS_WINDOWS_MOBILE
+#else
+const char kPathSeparator = '/';  // The only separator on this branch.
+//const char kPathSeparatorString[] = "/";
+const char kCurrentDirectoryString[] = "./";  // Relative "here" directory.
+#endif  // GTEST_OS_WINDOWS
+
+// Tells whether c is a path separator (the alternate one counts if enabled).
+static bool IsPathSeparator(char c) {
+  if (c == kPathSeparator) return true;
+#if GTEST_HAS_ALT_PATH_SEP_
+  if (c == kAlternatePathSeparator) return true;
+#endif
+  return false;
+}
+
+// Returns the current working directory ("" on failure; a stub on Win CE).
+FilePath FilePath::GetCurrentDir() {
+#if GTEST_OS_WINDOWS_MOBILE
+  return FilePath(kCurrentDirectoryString);
+#else
+  char buffer[GTEST_PATH_MAX_ + 1] = { '\0' };
+# if GTEST_OS_WINDOWS
+  const char* const cwd = _getcwd(buffer, sizeof(buffer));
+# else
+  const char* const cwd = getcwd(buffer, sizeof(buffer));
+# endif  // GTEST_OS_WINDOWS
+  return FilePath(cwd == NULL ? "" : cwd);
+#endif  // GTEST_OS_WINDOWS_MOBILE
+}
+
+// Strips a trailing "." + extension from the path, ignoring letter case.
+// Example: FilePath("dir/file.exe").RemoveExtension("EXE") yields
+// FilePath("dir/file").  A path that does not end with that extension is
+// returned as an unchanged copy.
+FilePath FilePath::RemoveExtension(const char* extension) const {
+  const std::string suffix = std::string(".") + extension;
+  if (!String::EndsWithCaseInsensitive(pathname_, suffix)) {
+    return *this;
+  }
+  const std::string::size_type kept = pathname_.length() - suffix.length();
+  return FilePath(pathname_.substr(0, kept));
+}
+
+// Locates the last occurrence of a valid path separator in the FilePath
+// and returns a pointer to it, or NULL when the path contains none.  On
+// Windows, for example, both '/' and '\' are valid path separators.
+const char* FilePath::FindLastPathSeparator() const {
+  const char* const path = c_str();
+  const char* result = strrchr(path, kPathSeparator);
+#if GTEST_HAS_ALT_PATH_SEP_
+  const char* const alt = strrchr(path, kAlternatePathSeparator);
+  // Never order-compare against a NULL pointer: that would be undefined.
+  if (alt != NULL && (result == NULL || alt > result)) {
+    result = alt;
+  }
+#endif
+  return result;
+}
+
+// Returns a copy of the FilePath reduced to its last component, i.e. with
+// the directory part removed: FilePath("path/to/file") gives FilePath("file").
+// A path without any separator ("just_a_file") is returned unmodified; a
+// path that ends in a separator ("just_a_dir/") gives an empty FilePath ("").
+// On Windows platform, '\' is the path separator, otherwise it is '/'.
+FilePath FilePath::RemoveDirectoryName() const {
+  const char* const separator = FindLastPathSeparator();
+  if (separator == NULL) return *this;
+  return FilePath(separator + 1);
+}
+
+// Returns the directory part of the path, i.e. everything up to and
+// including the last path separator.
+// Example: FilePath("path/to/file").RemoveFileName() returns "path/to/".
+// A path that contains no separator at all, like "a_file", yields
+// FilePath("./") or, on Windows, FilePath(".\\").  A path without a file
+// part, like "just/a/dir/", is returned unmodified.
+// On Windows platform, '\' is the path separator, otherwise it is '/'.
+FilePath FilePath::RemoveFileName() const {
+  const char* const separator = FindLastPathSeparator();
+  if (separator == NULL) {
+    return FilePath(std::string(kCurrentDirectoryString));
+  }
+  // Copy the prefix that ends with (and contains) the last separator.
+  const char* const begin = c_str();
+  return FilePath(std::string(begin, separator + 1 - begin));
+}
+
+// Helper functions for naming files in a directory for xml output.
+
+// Builds the path of a file named after base_name inside directory.
+// Given directory = "dir", base_name = "test", number = 0,
+// extension = "xml", returns "dir/test.xml". For any other number
+// (e.g., 12), returns "dir/test_12.xml".
+// On Windows platform, uses \ as the separator rather than /.
+FilePath FilePath::MakeFileName(const FilePath& directory,
+                                const FilePath& base_name,
+                                int number,
+                                const char* extension) {
+  std::string file = base_name.string();
+  if (number != 0) {
+    file += "_" + StreamableToString(number);
+  }
+  file += ".";
+  file += extension;
+  return ConcatPaths(directory, FilePath(file));
+}
+
+// Joins directory and relative_path with kPathSeparator, e.g. "dir" and
+// "test.xml" give "dir/test.xml" (on Windows \ is used rather than /).
+// An empty directory returns relative_path as is.
+FilePath FilePath::ConcatPaths(const FilePath& directory,
+                               const FilePath& relative_path) {
+  if (directory.IsEmpty()) return relative_path;
+  const std::string head = directory.RemoveTrailingPathSeparator().string();
+  return FilePath(head + kPathSeparator + relative_path.string());
+}
+
+// Returns true if pathname describes something findable in the file-system,
+// either a file, directory, or whatever (anything the platform query finds).
+bool FilePath::FileOrDirectoryExists() const {
+#if GTEST_OS_WINDOWS_MOBILE
+  LPCWSTR unicode = String::AnsiToUtf16(pathname_.c_str());
+  const DWORD attributes = GetFileAttributes(unicode);
+  delete [] unicode;  // The converted string is released by this function.
+  return attributes != kInvalidFileAttributes;
+#else
+  posix::StatStruct file_stat;  // Filled in by Stat(); contents unused here.
+  return posix::Stat(pathname_.c_str(), &file_stat) == 0;
+#endif  // GTEST_OS_WINDOWS_MOBILE
+}
+
+// Returns true if pathname describes a directory in the file-system
+// that exists.
+bool FilePath::DirectoryExists() const {
+  bool result = false;  // Stays false unless the checks below succeed.
+#if GTEST_OS_WINDOWS
+  // Don't strip off trailing separator if path is a root directory on
+  // Windows (like "C:\\").
+  const FilePath& path(IsRootDirectory() ? *this :
+                                           RemoveTrailingPathSeparator());
+#else
+  const FilePath& path(*this);  // Used as is, trailing separator included.
+#endif
+
+#if GTEST_OS_WINDOWS_MOBILE
+  LPCWSTR unicode = String::AnsiToUtf16(path.c_str());
+  const DWORD attributes = GetFileAttributes(unicode);
+  delete [] unicode;  // The converted string is released by this function.
+  if ((attributes != kInvalidFileAttributes) &&
+      (attributes & FILE_ATTRIBUTE_DIRECTORY)) {
+    result = true;
+  }
+#else
+  posix::StatStruct file_stat;
+  result = posix::Stat(path.c_str(), &file_stat) == 0 &&
+      posix::IsDir(file_stat);  // Must both exist and be a directory.
+#endif  // GTEST_OS_WINDOWS_MOBILE
+
+  return result;
+}
+
+// Tells whether the pathname names a root directory.  (Windows has one
+// root directory per disk drive.)
+bool FilePath::IsRootDirectory() const {
+#if GTEST_OS_WINDOWS
+  // TODO(wan@google.com): a network share such as \\server\share can also
+  // be a root directory (though never the current one); handle that here.
+  return IsAbsolutePath() && pathname_.length() == 3;
+#else
+  if (pathname_.length() != 1) return false;
+  return IsPathSeparator(pathname_[0]);
+#endif
+}
+
+// Tells whether the pathname is absolute (drive-letter rooted on Windows).
+bool FilePath::IsAbsolutePath() const {
+#if GTEST_OS_WINDOWS
+  if (pathname_.length() < 3) return false;
+  const char drive = pathname_[0];
+  const bool has_drive_letter =
+      ('a' <= drive && drive <= 'z') || ('A' <= drive && drive <= 'Z');
+  return has_drive_letter && pathname_[1] == ':' &&
+         IsPathSeparator(pathname_[2]);
+#else
+  return IsPathSeparator(pathname_.c_str()[0]);
+#endif
+}
+
+// Returns a pathname for a file that does not currently exist, trying
+// directory/base_name.extension first and then
+// directory/base_name_<number>.extension with number = 1, 2, 3, ... until
+// a name is found for which FileOrDirectoryExists() is false.
+// Examples: 'dir/foo_test.xml' or 'dir/foo_test_1.xml'.
+// The existence check and the later creation of the file are not atomic,
+// so there could be a race condition if two or more processes are calling
+// this function at the same time -- they could both pick the same
+// filename.
+FilePath FilePath::GenerateUniqueFileName(const FilePath& directory,
+                                          const FilePath& base_name,
+                                          const char* extension) {
+  for (int number = 0; ; ++number) {
+    const FilePath candidate(
+        MakeFileName(directory, base_name, number, extension));
+    if (!candidate.FileOrDirectoryExists()) return candidate;
+  }
+}
+
+// Tells whether the FilePath is written in directory form, i.e. is
+// non-empty and ends with a path separator.  This is a purely textual
+// test: it does NOT check that a directory (or file) actually exists.
+bool FilePath::IsDirectory() const {
+  if (pathname_.empty()) return false;
+  return IsPathSeparator(pathname_[pathname_.length() - 1]);
+}
+
+// Creates the directory named by this path together with any missing
+// ancestors.  The path must end with a path separator; otherwise nothing is
+// created and false is returned.  Returns true if every directory already
+// existed or was created, false if any creation failed.
+bool FilePath::CreateDirectoriesRecursively() const {
+  // Only paths written in directory form (trailing separator) are accepted.
+  if (!IsDirectory()) return false;
+  if (pathname_.empty() || DirectoryExists()) return true;
+
+  // Make sure the parent chain exists before creating this last component.
+  const FilePath without_separator(RemoveTrailingPathSeparator());
+  const FilePath parent(without_separator.RemoveFileName());
+  if (!parent.CreateDirectoriesRecursively()) return false;
+  return CreateFolder();
+}
+
+// Create the directory so that path exists. Returns true if successful or
+// if the directory already exists; returns false if unable to create the
+// directory for any reason, including if the parent directory does not
+// exist. Not named "CreateDirectory" because that's a macro on Windows.
+bool FilePath::CreateFolder() const {
+#if GTEST_OS_WINDOWS_MOBILE
+  FilePath removed_sep(this->RemoveTrailingPathSeparator());
+  LPCWSTR unicode = String::AnsiToUtf16(removed_sep.c_str());
+  int result = CreateDirectory(unicode, NULL) ? 0 : -1;  // Map BOOL to 0/-1.
+  delete [] unicode;  // The converted string is released by this function.
+#elif GTEST_OS_WINDOWS
+  int result = _mkdir(pathname_.c_str());
+#else
+  int result = mkdir(pathname_.c_str(), 0777);  // Mode 0777 is requested.
+#endif  // GTEST_OS_WINDOWS_MOBILE
+
+  if (result == -1) {
+    return this->DirectoryExists();  // An error is OK if the directory exists.
+  }
+  return true;  // No error.
+}
+
+// Returns a copy of the path without its final character when that
+// character is a path separator (as reported by IsDirectory()); any other
+// path is returned unmodified.  At most one separator is removed.
+// On Windows platform, uses \ as the separator, other platforms use /.
+FilePath FilePath::RemoveTrailingPathSeparator() const {
+  if (!IsDirectory()) return *this;
+  return FilePath(pathname_.substr(0, pathname_.length() - 1));
+}
+
+// Removes any redundant separators that might be in the pathname.
+// For example, "bar///foo" becomes "bar/foo". Does not eliminate other
+// redundancies that might be in a pathname involving "." or "..".
+//
+// Each run of separators is replaced by a single kPathSeparator, so an
+// alternate separator (where one is recognized) is canonicalized too.  The
+// pathname is read as a C string: text after an embedded '\0' is dropped.
+//
+// TODO(wan@google.com): handle Windows network shares (e.g. \\server\share).
+void FilePath::Normalize() {
+  // Assemble the result in a std::string so that its storage is released
+  // automatically, even when an allocation throws part-way through.
+  std::string normalized;
+  normalized.reserve(pathname_.length());
+
+  // True while the characters just read form a run of separators.
+  bool in_separator_run = false;
+  for (const char* src = pathname_.c_str(); *src != '\0'; ++src) {
+    const bool is_separator = IsPathSeparator(*src);
+    if (!is_separator) {
+      normalized += *src;
+    } else if (!in_separator_run) {
+      // First separator of a run: emit the canonical separator once.
+      normalized += kPathSeparator;
+    }
+    // Any further separator of the same run is simply skipped.
+    in_separator_run = is_separator;
+  }
+
+  // pathname_ is untouched until this point, so a failure above leaves the
+  // object in its previous state.
+  pathname_.swap(normalized);
+}
+
+}  // namespace internal
+}  // namespace testing
+// Copyright 2008, Google Inc.
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+//     * Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//     * Redistributions in binary form must reproduce the above
+// copyright notice, this list of conditions and the following disclaimer
+// in the documentation and/or other materials provided with the
+// distribution.
+//     * Neither the name of Google Inc. nor the names of its
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Author: wan@google.com (Zhanyong Wan)
+
+
+#include <limits.h>
+#include <stdlib.h>
+#include <stdio.h>
+#include <string.h>
+
+#if GTEST_OS_WINDOWS_MOBILE
+# include <windows.h>  // For TerminateProcess()
+#elif GTEST_OS_WINDOWS
+# include <io.h>
+# include <sys/stat.h>
+#else
+# include <unistd.h>
+#endif  // GTEST_OS_WINDOWS_MOBILE
+
+#if GTEST_OS_MAC
+# include <mach/mach_init.h>
+# include <mach/task.h>
+# include <mach/vm_map.h>
+#endif  // GTEST_OS_MAC
+
+#if GTEST_OS_QNX
+# include <devctl.h>
+# include <sys/procfs.h>
+#endif  // GTEST_OS_QNX
+
+
+// Indicates that this translation unit is part of Google Test's
+// implementation.  It must come before gtest-internal-inl.h is
+// included, or there will be a compiler error.  This trick is to
+// prevent a user from accidentally including gtest-internal-inl.h in
+// his code.
+#define GTEST_IMPLEMENTATION_ 1
+#undef GTEST_IMPLEMENTATION_
+
+namespace testing {
+namespace internal {
+
+#if defined(_MSC_VER) || defined(__BORLANDC__)
+// MSVC and C++Builder do not provide a definition of STDERR_FILENO.
+const int kStdOutFileno = 1;  // Hard-coded descriptor number for stdout.
+const int kStdErrFileno = 2;  // Hard-coded descriptor number for stderr.
+#else
+const int kStdOutFileno = STDOUT_FILENO;
+const int kStdErrFileno = STDERR_FILENO;
+#endif  // _MSC_VER
+
+#if GTEST_OS_MAC
+
+// Returns the number of threads running in the process, or 0 to indicate that
+// we cannot detect it.  (Mac version, based on task_threads().)
+size_t GetThreadCount() {
+  const task_t task = mach_task_self();
+  mach_msg_type_number_t thread_count;  // Filled in by task_threads().
+  thread_act_array_t thread_list;       // Filled in by task_threads().
+  const kern_return_t status = task_threads(task, &thread_list, &thread_count);
+  if (status == KERN_SUCCESS) {
+    // task_threads allocates resources in thread_list and we need to free them
+    // to avoid leaks.
+    vm_deallocate(task,
+                  reinterpret_cast<vm_address_t>(thread_list),
+                  sizeof(thread_t) * thread_count);
+    return static_cast<size_t>(thread_count);
+  } else {
+    return 0;  // The query failed: report "unknown".
+  }
+}
+
+#elif GTEST_OS_QNX
+
+// Returns the number of threads running in the process, or 0 to indicate that
+// we cannot detect it.  (QNX version, based on a DCMD_PROC_INFO devctl.)
+size_t GetThreadCount() {
+  const int fd = open("/proc/self/as", O_RDONLY);
+  if (fd < 0) {
+    return 0;  // Could not open the process entry: report "unknown".
+  }
+  procfs_info process_info;  // Filled in by the devctl() call below.
+  const int status =
+      devctl(fd, DCMD_PROC_INFO, &process_info, sizeof(process_info), NULL);
+  close(fd);  // Closed before inspecting status so it never leaks.
+  if (status == EOK) {
+    return static_cast<size_t>(process_info.num_threads);
+  } else {
+    return 0;  // The query failed: report "unknown".
+  }
+}
+
+#else
+
+size_t GetThreadCount() {
+  // Fallback for all other platforms: there's no portable way to detect the
+  // number of threads, so 0 is returned to indicate that it is unknown.
+  return 0;
+}
+
+#endif  // GTEST_OS_MAC
+
+#if GTEST_USES_POSIX_RE
+
+// Implements RE.  Currently only needed for death tests.
+
+RE::~RE() {  // Releases the compiled regexes and the pattern copy.
+  if (is_valid_) {
+    // regfree'ing an invalid regex might crash because the content
+    // of the regex is undefined. Since the regex's are essentially
+    // the same, one cannot be valid (or invalid) without the other
+    // being so too.
+    regfree(&partial_regex_);
+    regfree(&full_regex_);
+  }
+  free(const_cast<char*>(pattern_));  // pattern_ is set by Init() via StrDup.
+}
+
+// Returns true iff regular expression re is valid and matches the entire
+// str.
+bool RE::FullMatch(const char* str, const RE& re) {
+  regmatch_t unused_match;
+  return re.is_valid_ &&
+         regexec(&re.full_regex_, str, 1, &unused_match, 0) == 0;
+}
+
+// Returns true iff re is valid and matches some substring of str (str
+// itself counts as a substring).
+bool RE::PartialMatch(const char* str, const RE& re) {
+  // regexec() wants somewhere to store the match span; we never read it.
+  regmatch_t unused_match;
+  return re.is_valid_ &&
+      regexec(&re.partial_regex_, str, 1, &unused_match, 0) == 0;
+}
+
+// Initializes an RE from its string representation.
+//
+// Stores a copy of regex in pattern_ and compiles two POSIX Extended
+// regexes from it: full_regex_ ("^(regex)$", used by FullMatch) and
+// partial_regex_ (regex itself, used by PartialMatch).  is_valid_ ends up
+// true only when both compile; otherwise a non-fatal test failure is
+// recorded and neither compiled regex is left allocated.
+void RE::Init(const char* regex) {
+  pattern_ = posix::StrDup(regex);
+
+  // Reserves enough bytes to hold the regular expression used for a
+  // full match.
+  const size_t full_regex_len = strlen(regex) + 10;
+  char* const full_pattern = new char[full_regex_len];
+
+  snprintf(full_pattern, full_regex_len, "^(%s)$", regex);
+  is_valid_ = regcomp(&full_regex_, full_pattern, REG_EXTENDED) == 0;
+  // partial_regex_ is compiled only when full_regex_ compiled.  ~RE()
+  // frees both regexes only when is_valid_ is true, so an RE never has
+  // regfree() called on an uncompiled regex_t.
+  //
+  // Some implementations of POSIX regex (e.g. on at least some
+  // versions of Cygwin) don't accept the empty string as a valid
+  // regex.  We change it to an equivalent form "()" to be safe.
+  if (is_valid_) {
+    const char* const partial_regex = (*regex == '\0') ? "()" : regex;
+    is_valid_ = regcomp(&partial_regex_, partial_regex, REG_EXTENDED) == 0;
+    if (!is_valid_) {
+      // full_regex_ compiled but partial_regex_ did not (possible for
+      // patterns such as "a)(b").  Since ~RE() skips regfree() when
+      // is_valid_ is false, release full_regex_ here to avoid leaking it.
+      regfree(&full_regex_);
+    }
+  }
+  EXPECT_TRUE(is_valid_)
+      << "Regular expression \"" << regex
+      << "\" is not a valid POSIX Extended regular expression.";
+
+  delete[] full_pattern;
+}
+
+#elif GTEST_USES_SIMPLE_RE
+
+// Tells whether ch occurs in str.  The terminating '\0' of str is not
+// considered part of the set, so ch == '\0' never matches.
+bool IsInSet(char ch, const char* str) {
+  if (ch == '\0') {
+    // strchr() would happily find the terminator; rule it out up front.
+    return false;
+  }
+  return strchr(str, ch) != NULL;
+}
+
+// ASCII-only character classifiers.  Unlike their <ctype.h> counterparts
+// they compare against fixed character sets and so do not depend on the
+// current locale.
+bool IsAsciiDigit(char ch) { return ch >= '0' && ch <= '9'; }
+bool IsAsciiPunct(char ch) {
+  return IsInSet(ch, "^-!\"#$%&'()*+,./:;<=>?@[\\]_`{|}~");
+}
+// True for the repetition meta characters of the simple regex syntax.
+bool IsRepeat(char ch) { return IsInSet(ch, "?*+"); }
+bool IsAsciiWhiteSpace(char ch) { return IsInSet(ch, " \f\n\r\t\v"); }
+bool IsAsciiWordChar(char ch) {
+  if (ch == '_') return true;
+  const bool is_lower = ch >= 'a' && ch <= 'z';
+  const bool is_upper = ch >= 'A' && ch <= 'Z';
+  return is_lower || is_upper || IsAsciiDigit(ch);
+}
+
+// Tells whether "\\c" is an escape sequence the simple regex engine
+// understands: either an escaped punctuation character or one of the
+// character-class letters.
+bool IsValidEscape(char c) {
+  if (IsAsciiPunct(c)) return true;
+  return IsInSet(c, "dDfnrsStvwW");
+}
+
+// Tells whether the atom described by (escaped, pattern_char) matches the
+// single character ch.  The result is undefined if the atom is invalid.
+bool AtomMatchesChar(bool escaped, char pattern_char, char ch) {
+  if (!escaped) {
+    // A bare '.' matches anything except a newline; any other unescaped
+    // character matches only itself.
+    const bool wildcard_hit = (pattern_char == '.' && ch != '\n');
+    return wildcard_hit || pattern_char == ch;
+  }
+
+  // The atom is "\\p" where p is pattern_char.
+  switch (pattern_char) {
+    case 'd': return IsAsciiDigit(ch);
+    case 'D': return !IsAsciiDigit(ch);
+    case 'f': return ch == '\f';
+    case 'n': return ch == '\n';
+    case 'r': return ch == '\r';
+    case 's': return IsAsciiWhiteSpace(ch);
+    case 'S': return !IsAsciiWhiteSpace(ch);
+    case 't': return ch == '\t';
+    case 'v': return ch == '\v';
+    case 'w': return IsAsciiWordChar(ch);
+    case 'W': return !IsAsciiWordChar(ch);
+    default:
+      // An escaped punctuation character stands for itself.
+      return IsAsciiPunct(pattern_char) && pattern_char == ch;
+  }
+}
+
+// Helper function used by ValidateRegex() to format error messages.
+std::string FormatRegexSyntaxError(const char* regex, int index) {
+  return (Message() << "Syntax error at index " << index
+          << " in simple regular expression \"" << regex << "\": ").GetString();
+}
+
+// Generates non-fatal failures and returns false if regex is invalid;
+// otherwise returns true.
+bool ValidateRegex(const char* regex) {
+  if (regex == NULL) {
+    // TODO(wan@google.com): fix the source file location in the
+    // assertion failures to match where the regex is used in user
+    // code.
+    ADD_FAILURE() << "NULL is not a valid simple regular expression.";
+    return false;
+  }
+
+  // Stays true until a syntax error is found.  Except for a trailing
+  // backslash, scanning continues after an error so that every problem
+  // in the regex gets reported.
+  bool is_valid = true;
+
+  // True iff ?, *, or + can follow the previous atom.
+  bool prev_repeatable = false;
+  for (int i = 0; regex[i]; i++) {
+    if (regex[i] == '\\') {  // An escape sequence
+      // Step onto the escaped character; i - 1 is then the index of the
+      // backslash, which is what the error messages report.
+      i++;
+      if (regex[i] == '\0') {
+        ADD_FAILURE() << FormatRegexSyntaxError(regex, i - 1)
+                      << "'\\' cannot appear at the end.";
+        // Nothing is left to scan, so bail out immediately.
+        return false;
+      }
+
+      if (!IsValidEscape(regex[i])) {
+        ADD_FAILURE() << FormatRegexSyntaxError(regex, i - 1)
+                      << "invalid escape sequence \"\\" << regex[i] << "\".";
+        is_valid = false;
+      }
+      // An escape sequence is treated as repeatable, even an invalid one.
+      prev_repeatable = true;
+    } else {  // Not an escape sequence.
+      const char ch = regex[i];
+
+      // At most one of the following errors is reported per character.
+      if (ch == '^' && i > 0) {
+        ADD_FAILURE() << FormatRegexSyntaxError(regex, i)
+                      << "'^' can only appear at the beginning.";
+        is_valid = false;
+      } else if (ch == '$' && regex[i + 1] != '\0') {
+        ADD_FAILURE() << FormatRegexSyntaxError(regex, i)
+                      << "'$' can only appear at the end.";
+        is_valid = false;
+      } else if (IsInSet(ch, "()[]{}|")) {
+        ADD_FAILURE() << FormatRegexSyntaxError(regex, i)
+                      << "'" << ch << "' is unsupported.";
+        is_valid = false;
+      } else if (IsRepeat(ch) && !prev_repeatable) {
+        ADD_FAILURE() << FormatRegexSyntaxError(regex, i)
+                      << "'" << ch << "' can only follow a repeatable token.";
+        is_valid = false;
+      }
+
+      // Anchors and repetition characters cannot themselves be repeated.
+      prev_repeatable = !IsInSet(ch, "^$?*+");
+    }
+  }
+
+  return is_valid;
+}
+
+// Matches a repeated regex atom followed by a valid simple regular
+// expression.  The regex atom is defined as c if escaped is false,
+// or \c otherwise.  repeat is the repetition meta character (?, *,
+// or +).  The behavior is undefined if str contains too many
+// characters to be indexable by size_t, in which case the test will
+// probably time out anyway.  We are fine with this limitation as
+// std::string has it too.
+bool MatchRepetitionAndRegexAtHead(
+    bool escaped, char c, char repeat, const char* regex,
+    const char* str) {
+  // '+' requires at least one occurrence of the atom; '?' and '*' allow
+  // zero.
+  const size_t min_count = (repeat == '+') ? 1 : 0;
+  // '?' allows at most one occurrence; '*' and '+' are bounded only by the
+  // largest count the loop below can represent.
+  const size_t max_count = (repeat == '?') ? 1 :
+      static_cast<size_t>(-1) - 1;
+  // We cannot call numeric_limits::max() as it conflicts with the
+  // max() macro on Windows.
+
+  // Tries every repetition count i in [0, max_count], shortest first.
+  for (size_t i = 0; i <= max_count; ++i) {
+    // We know that the atom matches each of the first i characters in str.
+    if (i >= min_count && MatchRegexAtHead(regex, str + i)) {
+      // We have enough matches at the head, and the tail matches too.
+      // Since we only care about *whether* the pattern matches str
+      // (as opposed to *how* it matches), there is no need to find a
+      // greedy match.
+      return true;
+    }
+    // The atom must also match str[i] before a longer repetition can be
+    // tried; running out of input or hitting a mismatch ends the search.
+    if (str[i] == '\0' || !AtomMatchesChar(escaped, c, str[i]))
+      return false;
+  }
+  // Every allowed repetition count was tried without the tail matching.
+  return false;
+}
+
+// Tells whether regex matches a prefix of str.  regex must be a valid
+// simple regular expression that does not start with "^"; otherwise the
+// result is undefined.
+bool MatchRegexAtHead(const char* regex, const char* str) {
+  // An empty regex matches a prefix of anything.
+  if (*regex == '\0') return true;
+
+  // "$" only matches the end of a string.  A valid regex has nothing
+  // after "$", so this decides the whole match.
+  if (*regex == '$') return *str == '\0';
+
+  // Skip the backslash when the first atom is an escape sequence.
+  const bool escaped = (*regex == '\\');
+  const char* const atom = escaped ? regex + 1 : regex;
+
+  if (IsRepeat(atom[1])) {
+    // Indirect recursion: MatchRepetitionAndRegexAtHead() calls back into
+    // MatchRegexAtHead().  It terminates because the regex gets shorter
+    // with every recursion.
+    return MatchRepetitionAndRegexAtHead(
+        escaped, atom[0], atom[1], atom + 2, str);
+  }
+
+  // The first atom is a plain, unrepeated one: it has to consume exactly
+  // one character of str, and the rest of regex must match what follows.
+  if (*str == '\0') return false;
+  if (!AtomMatchesChar(escaped, *atom, *str)) return false;
+  return MatchRegexAtHead(atom + 1, str + 1);
+}
+
+// Tells whether regex matches any substring of str.  regex must be a
+// valid simple regular expression, or the result is undefined.  A NULL
+// regex or str never matches.
+//
+// The algorithm is recursive, but the recursion depth doesn't exceed
+// the regex length, so we won't need to worry about running out of
+// stack space normally.  In rare cases the time complexity can be
+// exponential with respect to the regex length + the string length,
+// but usually it's much faster (often close to linear).
+bool MatchRegexAnywhere(const char* regex, const char* str) {
+  if (regex == NULL || str == NULL) return false;
+
+  // An anchored regex can only match at the very start of str.
+  if (*regex == '^') return MatchRegexAtHead(regex + 1, str);
+
+  // Otherwise try every starting position, including the position of the
+  // terminating '\0' (an empty suffix can still match, e.g. "$").
+  for (const char* start = str; ; ++start) {
+    if (MatchRegexAtHead(regex, start)) return true;
+    if (*start == '\0') return false;
+  }
+}
+
+// Implements the RE class.
+
+RE::~RE() {
+  // Init() allocates full_pattern_ with malloc() and pattern_ with
+  // posix::StrDup(); either may still be NULL, which free() tolerates.
+  free(const_cast<char*>(full_pattern_));
+  free(const_cast<char*>(pattern_));
+}
+
+// Returns true iff re is valid and matches str in its entirety.  The
+// anchored full_pattern_ built by Init() does the work.
+bool RE::FullMatch(const char* str, const RE& re) {
+  if (!re.is_valid_) return false;
+  return MatchRegexAnywhere(re.full_pattern_, str);
+}
+
+// Returns true iff re is valid and matches some substring of str (str
+// itself counts as a substring).
+bool RE::PartialMatch(const char* str, const RE& re) {
+  if (!re.is_valid_) return false;
+  return MatchRegexAnywhere(re.pattern_, str);
+}
+
+// Initializes an RE from its string representation: keeps a copy of
+// regex in pattern_, validates it, and, when it is valid, derives
+// full_pattern_, a variant guaranteed to start with '^' and end with '$'.
+void RE::Init(const char* regex) {
+  full_pattern_ = NULL;
+  pattern_ = NULL;
+  if (regex != NULL) {
+    pattern_ = posix::StrDup(regex);
+  }
+
+  is_valid_ = ValidateRegex(regex);
+  // No need to calculate the full pattern when the regex is invalid.
+  if (!is_valid_) return;
+
+  const size_t regex_len = strlen(regex);
+  // Worst case we prepend a '^', append a '$', and terminate the string
+  // with '\0': three bytes on top of the regex itself.
+  char* const storage = static_cast<char*>(malloc(regex_len + 3));
+  full_pattern_ = storage;
+
+  char* out = storage;
+  if (regex[0] != '^') {
+    *out++ = '^';  // Makes sure full_pattern_ starts with '^'.
+  }
+
+  // We don't use snprintf or strncpy, as they trigger a warning when
+  // compiled with VC++ 8.0.
+  memcpy(out, regex, regex_len);
+  out += regex_len;
+
+  const bool ends_with_dollar = regex_len != 0 && regex[regex_len - 1] == '$';
+  if (!ends_with_dollar) {
+    *out++ = '$';  // Makes sure full_pattern_ ends with '$'.
+  }
+
+  *out = '\0';
+}
+
+#endif  // GTEST_USES_POSIX_RE
+
+// Placeholder file name used by FormatFileLocation() and
+// FormatCompilerIndependentFileLocation() when the file argument is NULL.
+const char kUnknownFile[] = "unknown file";
+
+// Formats a source file path and a line number the way the compiler used
+// to build this code prints them in its diagnostics: "file(line):" under
+// MSVC and "file:line:" elsewhere.  A NULL file is shown as kUnknownFile,
+// and a negative line yields just "file:".
+GTEST_API_ ::std::string FormatFileLocation(const char* file, int line) {
+  std::string location(file == NULL ? kUnknownFile : file);
+
+  if (line >= 0) {
+#ifdef _MSC_VER
+    location += "(" + StreamableToString(line) + ")";
+#else
+    location += ":" + StreamableToString(line);
+#endif  // _MSC_VER
+  }
+  // Every form ends with a colon.
+  location += ":";
+  return location;
+}
+
+// Formats a file location for compiler-independent XML output as
+// "file:line", or just "file" when line is negative.  A NULL file is
+// shown as kUnknownFile.
+// Although this function is not platform dependent, it sits next to
+// FormatFileLocation() to contrast the two: unlike FormatFileLocation(),
+// this one does NOT append a trailing colon.
+GTEST_API_ ::std::string FormatCompilerIndependentFileLocation(
+    const char* file, int line) {
+  std::string location(file == NULL ? kUnknownFile : file);
+
+  if (line >= 0) {
+    location += ":" + StreamableToString(line);
+  }
+  return location;
+}
+
+
+// Starts a log line: emits a newline, a severity marker and the formatted
+// file location to the log stream.
+GTestLog::GTestLog(GTestLogSeverity severity, const char* file, int line)
+    : severity_(severity) {
+  // Any severity other than INFO, WARNING and ERROR is labelled FATAL.
+  const char* marker = "[ FATAL ]";
+  if (severity == GTEST_INFO) {
+    marker = "[  INFO ]";
+  } else if (severity == GTEST_WARNING) {
+    marker = "[WARNING]";
+  } else if (severity == GTEST_ERROR) {
+    marker = "[ ERROR ]";
+  }
+
+  GetStream() << ::std::endl << marker << " "
+              << FormatFileLocation(file, line).c_str() << ": ";
+}
+
+// Terminates the log line and, if severity is GTEST_FATAL, flushes stderr
+// and aborts the program.
+GTestLog::~GTestLog() {
+  GetStream() << ::std::endl;
+  if (severity_ != GTEST_FATAL) return;
+
+  fflush(stderr);
+  posix::Abort();
+}
+// Disable Microsoft deprecation warnings for POSIX functions called from
+// this class (creat, dup, dup2, and close)
+#ifdef _MSC_VER
+# pragma warning(push)
+# pragma warning(disable: 4996)
+#endif  // _MSC_VER
+
+#if GTEST_HAS_STREAM_REDIRECTION
+
+// Object that captures an output stream (stdout/stderr).
+class CapturedStream {
+ public:
+  // The ctor redirects the stream to a temporary file.
+  //
+  // fd is the file descriptor to capture.  A duplicate of it is kept in
+  // uncaptured_fd_ so that GetCapturedString() can restore the original
+  // stream later.  The program is aborted (via GTEST_CHECK_) if the
+  // temporary file cannot be created.
+  explicit CapturedStream(int fd) : fd_(fd), uncaptured_fd_(dup(fd)) {
+# if GTEST_OS_WINDOWS
+    char temp_dir_path[MAX_PATH + 1] = { '\0' };  // NOLINT
+    char temp_file_path[MAX_PATH + 1] = { '\0' };  // NOLINT
+
+    ::GetTempPathA(sizeof(temp_dir_path), temp_dir_path);
+    const UINT success = ::GetTempFileNameA(temp_dir_path,
+                                            "gtest_redir",
+                                            0,  // Generate unique file name.
+                                            temp_file_path);
+    GTEST_CHECK_(success != 0)
+        << "Unable to create a temporary file in " << temp_dir_path;
+    const int captured_fd = creat(temp_file_path, _S_IREAD | _S_IWRITE);
+    GTEST_CHECK_(captured_fd != -1) << "Unable to open temporary file "
+                                    << temp_file_path;
+    filename_ = temp_file_path;
+# else
+    // There's no guarantee that a test has write access to the current
+    // directory, so we create the temporary file in the /tmp directory
+    // instead. We use /tmp on most systems, and /sdcard on Android.
+    // That's because Android doesn't have /tmp.
+#  if GTEST_OS_LINUX_ANDROID
+    // Note: Android applications are expected to call the framework's
+    // Context.getExternalStorageDirectory() method through JNI to get
+    // the location of the world-writable SD Card directory. However,
+    // this requires a Context handle, which cannot be retrieved
+    // globally from native code. Doing so also precludes running the
+    // code as part of a regular standalone executable, which doesn't
+    // run in a Dalvik process (e.g. when running it through 'adb shell').
+    //
+    // The location /sdcard is directly accessible from native code
+    // and is the only location (unofficially) supported by the Android
+    // team. It's generally a symlink to the real SD Card mount point
+    // which can be /mnt/sdcard, /mnt/sdcard0, /system/media/sdcard, or
+    // other OEM-customized locations. Never rely on these, and always
+    // use /sdcard.
+    char name_template[] = "/sdcard/gtest_captured_stream.XXXXXX";
+#  else
+    char name_template[] = "/tmp/captured_stream.XXXXXX";
+#  endif  // GTEST_OS_LINUX_ANDROID
+    const int captured_fd = mkstemp(name_template);
+    // Without this check a failed mkstemp() would go unnoticed: dup2()
+    // below would be handed -1 and nothing would be captured.  This
+    // mirrors the check made in the Windows branch.
+    GTEST_CHECK_(captured_fd != -1)
+        << "Unable to create a temporary file for the captured stream.";
+    filename_ = name_template;
+# endif  // GTEST_OS_WINDOWS
+    // Flush everything buffered so far so that it reaches the original
+    // destination rather than the temporary file.
+    fflush(NULL);
+    dup2(captured_fd, fd_);
+    // fd_ now refers to the temporary file; the extra descriptor is not
+    // needed any more.
+    close(captured_fd);
+  }
+
+  // Deletes the temporary file.
+  ~CapturedStream() {
+    remove(filename_.c_str());
+  }
+
+  // Restores the original stream (on the first call only) and returns
+  // everything that was written to the temporary file.  The program is
+  // aborted (via GTEST_CHECK_) if the temporary file cannot be opened.
+  std::string GetCapturedString() {
+    if (uncaptured_fd_ != -1) {
+      // Restores the original stream.
+      fflush(NULL);
+      dup2(uncaptured_fd_, fd_);
+      close(uncaptured_fd_);
+      uncaptured_fd_ = -1;
+    }
+
+    FILE* const file = posix::FOpen(filename_.c_str(), "r");
+    // ReadEntireFile() hands the FILE* straight to fseek()/fread(), so a
+    // NULL result must be caught here.
+    GTEST_CHECK_(file != NULL) << "Unable to open temporary file "
+                               << filename_;
+    const std::string content = ReadEntireFile(file);
+    posix::FClose(file);
+    return content;
+  }
+
+ private:
+  // Reads the entire content of a file as an std::string.
+  static std::string ReadEntireFile(FILE* file);
+
+  // Returns the size (in bytes) of a file.
+  static size_t GetFileSize(FILE* file);
+
+  const int fd_;  // A stream to capture.
+  // Duplicate of the original fd_, or -1 once the stream has been restored.
+  int uncaptured_fd_;
+  // Name of the temporary file holding the captured output.
+  ::std::string filename_;
+
+  GTEST_DISALLOW_COPY_AND_ASSIGN_(CapturedStream);
+};
+
+// Returns the size (in bytes) of a file, measured by seeking to its end.
+// The file position is left at the end of the file.
+size_t CapturedStream::GetFileSize(FILE* file) {
+  fseek(file, 0, SEEK_END);
+  const long end_offset = ftell(file);  // NOLINT
+  return static_cast<size_t>(end_offset);
+}
+
+// Reads the entire content of a file as a string.  The result may be
+// shorter than the measured file size if reading stops early.
+std::string CapturedStream::ReadEntireFile(FILE* file) {
+  const size_t total_size = GetFileSize(file);
+  char* const data = new char[total_size];
+
+  // GetFileSize() left the position at the end; rewind before reading.
+  fseek(file, 0, SEEK_SET);
+
+  // Keeps reading until fread() returns nothing more or the
+  // pre-determined file size is reached.
+  size_t filled = 0;  // # of bytes read so far
+  for (;;) {
+    const size_t chunk = fread(data + filled, 1, total_size - filled, file);
+    filled += chunk;
+    if (chunk == 0 || filled >= total_size) break;
+  }
+
+  const std::string content(data, filled);
+  delete[] data;
+
+  return content;
+}
+
+# ifdef _MSC_VER
+#  pragma warning(pop)
+# endif  // _MSC_VER
+
+// The active capturers for stderr and stdout.  Each is NULL while no
+// capture is in progress: CaptureStream() stores a new CapturedStream
+// here and GetCapturedStream() deletes it and resets the pointer to NULL.
+static CapturedStream* g_captured_stderr = NULL;
+static CapturedStream* g_captured_stdout = NULL;
+
+// Starts capturing an output stream (stdout/stderr).  fd is the file
+// descriptor to redirect, stream_name is used in the error message only,
+// and *stream receives the newly created capturer.  Logs a FATAL message
+// if *stream already holds a capturer.
+void CaptureStream(int fd, const char* stream_name, CapturedStream** stream) {
+  const bool already_capturing = (*stream != NULL);
+  if (already_capturing) {
+    GTEST_LOG_(FATAL) << "Only one " << stream_name
+                      << " capturer can exist at a time.";
+  }
+  *stream = new CapturedStream(fd);
+}
+
+// Stops capturing the output stream and returns the captured string.
+// The capturer is destroyed and *captured_stream is reset to NULL so that
+// a new capture can be started afterwards.
+std::string GetCapturedStream(CapturedStream** captured_stream) {
+  CapturedStream* const capturer = *captured_stream;
+  const std::string content = capturer->GetCapturedString();
+
+  *captured_stream = NULL;
+  delete capturer;
+
+  return content;
+}
+
+// Starts capturing stdout: redirects file descriptor kStdOutFileno and
+// records the capturer in g_captured_stdout.  CaptureStream() logs a
+// FATAL message if a stdout capture is already in progress.
+void CaptureStdout() {
+  CaptureStream(kStdOutFileno, "stdout", &g_captured_stdout);
+}
+
+// Starts capturing stderr: redirects file descriptor kStdErrFileno and
+// records the capturer in g_captured_stderr.  CaptureStream() logs a
+// FATAL message if a stderr capture is already in progress.
+void CaptureStderr() {
+  CaptureStream(kStdErrFileno, "stderr", &g_captured_stderr);
+}
+
+// Stops capturing stdout and returns the captured string.
+// GetCapturedStream() dereferences g_captured_stdout without a NULL
+// check, so a capture started by CaptureStdout() must be active.
+std::string GetCapturedStdout() {
+  return GetCapturedStream(&g_captured_stdout);
+}
+
+// Stops capturing stderr and returns the captured string.
+// GetCapturedStream() dereferences g_captured_stderr without a NULL
+// check, so a capture started by CaptureStderr() must be active.
+std::string GetCapturedStderr() {
+  return GetCapturedStream(&g_captured_stderr);
+}
+
+#endif  // GTEST_HAS_STREAM_REDIRECTION
+
+#if GTEST_HAS_DEATH_TEST
+
+// A copy of all command line arguments.  Set by InitGoogleTest().
+::std::vector<testing::internal::string> g_argvs;
+
+// Optional replacement for g_argvs, installed by SetInjectableArgvs().
+// While it is non-NULL, GetInjectableArgvs() returns it instead of g_argvs.
+// The vector is owned here: SetInjectableArgvs() deletes the previous one
+// when a different pointer is installed.
+static const ::std::vector<testing::internal::string>* g_injected_test_argvs =
+                                        NULL;  // Owned.
+
+// Installs argvs as the argument vector returned by GetInjectableArgvs(),
+// taking ownership of it and deleting the previously injected vector.
+// Passing the pointer that is already installed is a no-op.
+void SetInjectableArgvs(const ::std::vector<testing::internal::string>* argvs) {
+  if (argvs == g_injected_test_argvs) {
+    // Same vector as before: deleting it would leave a dangling pointer.
+    return;
+  }
+  delete g_injected_test_argvs;
+  g_injected_test_argvs = argvs;
+}
+
+// Returns the injected argument vector when one has been installed with
+// SetInjectableArgvs(), and the real command line (g_argvs) otherwise.
+const ::std::vector<testing::internal::string>& GetInjectableArgvs() {
+  if (g_injected_test_argvs == NULL) {
+    return g_argvs;
+  }
+  return *g_injected_test_argvs;
+}
+#endif  // GTEST_HAS_DEATH_TEST
+
+#if GTEST_OS_WINDOWS_MOBILE
+namespace posix {
+// Windows Mobile implementation of Abort(): first breaks into the
+// debugger, then terminates the current process with exit code 1.
+void Abort() {
+  DebugBreak();
+  TerminateProcess(GetCurrentProcess(), 1);
+}
+}  // namespace posix
+#endif  // GTEST_OS_WINDOWS_MOBILE
+
+// Returns the name of the environment variable corresponding to the
+// given flag: GTEST_FLAG_PREFIX_ followed by the flag name, converted to
+// upper case.  For example, FlagToEnvVar("foo") will return "GTEST_FOO"
+// in the open-source version.
+static std::string FlagToEnvVar(const char* flag) {
+  const std::string prefixed_flag =
+      (Message() << GTEST_FLAG_PREFIX_ << flag).GetString();
+
+  Message upper_cased;
+  for (std::string::const_iterator it = prefixed_flag.begin();
+       it != prefixed_flag.end(); ++it) {
+    upper_cased << ToUpper(*it);
+  }
+
+  return upper_cased.GetString();
+}
+
+// Parses 'str' as a decimal 32-bit signed integer.  On success writes the
+// result to *value and returns true.  On failure prints a warning that
+// mentions src_text to stdout, leaves *value unchanged and returns false.
+bool ParseInt32(const Message& src_text, const char* str, Int32* value) {
+  char* parse_end = NULL;
+  const long parsed = strtol(str, &parse_end, 10);  // NOLINT
+
+  // strtol() stops at the first character it cannot use; anything left
+  // over means the text is not a plain integer.
+  if (*parse_end != '\0') {
+    Message warning;
+    warning << "WARNING: " << src_text
+            << " is expected to be a 32-bit integer, but actually"
+            << " has value \"" << str << "\".\n";
+    printf("%s", warning.GetString().c_str());
+    fflush(stdout);
+    return false;
+  }
+
+  const Int32 narrowed = static_cast<Int32>(parsed);
+  // strtol() returns LONG_MAX or LONG_MIN when the input overflows a long.
+  const bool overflows_long = (parsed == LONG_MAX || parsed == LONG_MIN);
+  // The value fits in a long but does not survive narrowing to Int32.
+  const bool overflows_int32 = (narrowed != parsed);
+  if (overflows_long || overflows_int32) {
+    Message warning;
+    warning << "WARNING: " << src_text
+            << " is expected to be a 32-bit integer, but actually"
+            << " has value " << str << ", which overflows.\n";
+    printf("%s", warning.GetString().c_str());
+    fflush(stdout);
+    return false;
+  }
+
+  *value = narrowed;
+  return true;
+}
+
+// Reads the Boolean environment variable corresponding to the given
+// flag.  Returns default_value when the variable is not set; otherwise
+// the value is true iff the variable's text is not exactly "0".
+bool BoolFromGTestEnv(const char* flag, bool default_value) {
+  const std::string env_var = FlagToEnvVar(flag);
+  const char* const raw_value = posix::GetEnv(env_var.c_str());
+  if (raw_value == NULL) {
+    return default_value;
+  }
+  return strcmp(raw_value, "0") != 0;
+}
+
+// Reads the 32-bit integer stored in the environment variable
+// corresponding to the given flag.  Returns default_value when the
+// variable is not set or does not hold a valid 32-bit integer; in the
+// latter case a notice is also printed to stdout.
+Int32 Int32FromGTestEnv(const char* flag, Int32 default_value) {
+  const std::string env_var = FlagToEnvVar(flag);
+  const char* const raw_value = posix::GetEnv(env_var.c_str());
+  // The environment variable is not set.
+  if (raw_value == NULL) return default_value;
+
+  Int32 parsed = default_value;
+  const bool parsed_ok = ParseInt32(
+      Message() << "Environment variable " << env_var, raw_value, &parsed);
+  if (parsed_ok) return parsed;
+
+  // ParseInt32() has already printed what was wrong with the value.
+  printf("The default value %s is used.\n",
+         (Message() << default_value).GetString().c_str());
+  fflush(stdout);
+  return default_value;
+}
+
+// Reads the string environment variable corresponding to the given flag.
+// Returns default_value when the variable is not set; otherwise returns
+// the pointer obtained from posix::GetEnv().
+const char* StringFromGTestEnv(const char* flag, const char* default_value) {
+  const std::string env_var = FlagToEnvVar(flag);
+  const char* const env_value = posix::GetEnv(env_var.c_str());
+  if (env_value != NULL) {
+    return env_value;
+  }
+  return default_value;
+}
+
+}  // namespace internal
+}  // namespace testing
+// Copyright 2007, Google Inc.
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+//     * Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//     * Redistributions in binary form must reproduce the above
+// copyright notice, this list of conditions and the following disclaimer
+// in the documentation and/or other materials provided with the
+// distribution.
+//     * Neither the name of Google Inc. nor the names of its
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Author: wan@google.com (Zhanyong Wan)
+
+// Google Test - The Google C++ Testing Framework
+//
+// This file implements a universal value printer that can print a
+// value of any type T:
+//
+//   void ::testing::internal::UniversalPrinter<T>::Print(value, ostream_ptr);
+//
+// It uses the << operator when possible, and prints the bytes in the
+// object otherwise.  A user can override its behavior for a class
+// type Foo by defining either operator<<(::std::ostream&, const Foo&)
+// or void PrintTo(const Foo&, ::std::ostream*) in the namespace that
+// defines Foo.
+
+#include <ctype.h>
+#include <stdio.h>
+#include <ostream>  // NOLINT
+#include <string>
+
+namespace testing {
+
+namespace {
+
+using ::std::ostream;
+
+// Prints count bytes of the given object, beginning at offset start, as
+// two-digit upper-case hex numbers.  Within a 2-byte group the bytes are
+// joined by '-'; groups are separated by a space.
+void PrintByteSegmentInObjectTo(const unsigned char* obj_bytes, size_t start,
+                                size_t count, ostream* os) {
+  char hex[5] = "";
+  const size_t stop = start + count;
+  for (size_t pos = start; pos != stop; ++pos) {
+    if (pos != start) {
+      // Even offsets open a new 2-byte group; odd ones continue the
+      // current group.
+      *os << ((pos % 2 == 0) ? ' ' : '-');
+    }
+    GTEST_SNPRINTF_(hex, sizeof(hex), "%02X", obj_bytes[pos]);
+    *os << hex;
+  }
+}
+
+// Prints the bytes in the given value to the given ostream, as
+// "<count>-byte object <...>".  Objects of kThreshold bytes or more are
+// abbreviated to their leading and trailing bytes.
+void PrintBytesInObjectToImpl(const unsigned char* obj_bytes, size_t count,
+                              ostream* os) {
+  const size_t kThreshold = 132;
+  const size_t kChunkSize = 64;
+
+  // Tells the user how big the object is.
+  *os << count << "-byte object <";
+
+  // TODO(wan): let the user control the threshold using a flag.
+  if (count >= kThreshold) {
+    // Too big to show in full: print the first kChunkSize bytes, an
+    // ellipsis, and then roughly the last kChunkSize bytes.
+    PrintByteSegmentInObjectTo(obj_bytes, 0, kChunkSize, os);
+    *os << " ... ";
+    // Rounds up to 2-byte boundary.
+    const size_t tail_start = (count - kChunkSize + 1)/2*2;
+    PrintByteSegmentInObjectTo(obj_bytes, tail_start, count - tail_start, os);
+  } else {
+    PrintByteSegmentInObjectTo(obj_bytes, 0, count, os);
+  }
+  *os << ">";
+}
+
+}  // namespace
+
+namespace internal2 {
+
+// Prints the count bytes starting at obj_bytes to *os by delegating to
+// PrintBytesInObjectToImpl().  The delegation simplifies the
+// implementation, which uses the << operator and thus is easier done
+// outside of the ::testing::internal namespace, which contains a <<
+// operator that sometimes conflicts with the one in STL.
+void PrintBytesInObjectTo(const unsigned char* obj_bytes, size_t count,
+                          ostream* os) {
+  PrintBytesInObjectToImpl(obj_bytes, count, os);
+}
+
+}  // namespace internal2
+
+namespace internal {
+
+// Depending on the value of a char (or wchar_t), we print it in one
+// of three formats:
+//   - as is if it's a printable ASCII (e.g. 'a', '2', ' '),
+//   - as a hexadecimal escape sequence (e.g. '\x7F'), or
+//   - as a special escape sequence (e.g. '\r', '\n').
+enum CharFormat {
+  kAsIs,          // Printed unchanged.
+  kHexEscape,     // Printed as "\x" followed by hex digits.
+  kSpecialEscape  // Printed as a backslash escape such as "\n".
+};
+
+// Tells whether c is a printable ASCII character, i.e. lies in the range
+// 0x20 (space) through 0x7E ('~').  The value is tested directly rather
+// than through isprint(), which is buggy on Windows Mobile.
+inline bool IsPrintableAscii(wchar_t c) {
+  const wchar_t kFirstPrintable = 0x20;
+  const wchar_t kLastPrintable = 0x7E;
+  return c >= kFirstPrintable && c <= kLastPrintable;
+}
+
+// Prints a wide or narrow char c the way it would appear inside a
+// character literal (without the quotes), escaping it when necessary;
+// returns how c was formatted.  The template argument UnsignedChar is the
+// unsigned version of Char, which is the type of c.
+template <typename UnsignedChar, typename Char>
+static CharFormat PrintAsCharLiteralTo(Char c, ostream* os) {
+  // Look up the dedicated escape sequence for c, if it has one.
+  const char* special_escape = NULL;
+  switch (static_cast<wchar_t>(c)) {
+    case L'\0': special_escape = "\\0"; break;
+    case L'\'': special_escape = "\\'"; break;
+    case L'\\': special_escape = "\\\\"; break;
+    case L'\a': special_escape = "\\a"; break;
+    case L'\b': special_escape = "\\b"; break;
+    case L'\f': special_escape = "\\f"; break;
+    case L'\n': special_escape = "\\n"; break;
+    case L'\r': special_escape = "\\r"; break;
+    case L'\t': special_escape = "\\t"; break;
+    case L'\v': special_escape = "\\v"; break;
+    default: break;
+  }
+  if (special_escape != NULL) {
+    *os << special_escape;
+    return kSpecialEscape;
+  }
+
+  if (IsPrintableAscii(c)) {
+    *os << static_cast<char>(c);
+    return kAsIs;
+  }
+
+  // Everything else is shown as a hex escape of its unsigned value.
+  *os << "\\x" + String::FormatHexInt(static_cast<UnsignedChar>(c));
+  return kHexEscape;
+}
+
+// Prints a wchar_t c the way it would appear inside a string literal,
+// escaping it when necessary; returns how c was formatted.  The rules
+// differ from character literals for the two quote characters only: a
+// single quote needs no escape here, whereas a double quote does.
+static CharFormat PrintAsStringLiteralTo(wchar_t c, ostream* os) {
+  if (c == L'\'') {
+    *os << "'";
+    return kAsIs;
+  }
+  if (c == L'"') {
+    *os << "\\\"";
+    return kSpecialEscape;
+  }
+  return PrintAsCharLiteralTo<wchar_t>(c, os);
+}
+
+// Prints a char c the way it would appear inside a string literal,
+// escaping it when necessary; returns how c was formatted.
+static CharFormat PrintAsStringLiteralTo(char c, ostream* os) {
+  // Go through unsigned char first so that the widened value is the
+  // character's unsigned code.
+  const unsigned char code = static_cast<unsigned char>(c);
+  const wchar_t widened = static_cast<wchar_t>(code);
+  return PrintAsStringLiteralTo(widened, os);
+}
+
+// Prints a wide or narrow character c as a quoted literal followed by
+// its numeric code in parentheses.  '\0' is printed as "'\\0'" with no
+// code; other unprintable characters are escaped using the standard C++
+// escape sequences.  The template argument UnsignedChar is the unsigned
+// version of Char, which is the type of c.
+template <typename UnsignedChar, typename Char>
+void PrintCharAndCodeTo(Char c, ostream* os) {
+  // First, print c as a literal in the most readable form we can find.
+  // Character types wider than one byte get the L prefix.
+  *os << ((sizeof(c) > 1) ? "L'" : "'");
+  const CharFormat format = PrintAsCharLiteralTo<UnsignedChar>(c, os);
+  *os << "'";
+
+  // To aid user debugging, we also print c's code in decimal, unless
+  // it's 0 (in which case c was printed as '\\0', making the code
+  // obvious).
+  if (c == 0)
+    return;
+  *os << " (" << static_cast<int>(c);
+
+  // For more convenience, we print c's code again in hexadecimal,
+  // unless c was already printed in the form '\x##' or the code is in
+  // [1, 9] (where the decimal and hexadecimal forms coincide).
+  const bool code_is_single_digit = (1 <= c && c <= 9);
+  if (format != kHexEscape && !code_is_single_digit) {
+    *os << ", 0x" << String::FormatHexInt(static_cast<UnsignedChar>(c));
+  }
+  *os << ")";
+}
+
+// Prints an unsigned char as a character literal followed by its code,
+// via PrintCharAndCodeTo().
+void PrintTo(unsigned char c, ::std::ostream* os) {
+  PrintCharAndCodeTo<unsigned char>(c, os);
+}
+// Prints a signed char the same way; unsigned char is passed as the
+// unsigned counterpart used for the hexadecimal output.
+void PrintTo(signed char c, ::std::ostream* os) {
+  PrintCharAndCodeTo<unsigned char>(c, os);
+}
+
+// Prints a wchar_t as a symbol if it is printable or as its internal
+// code otherwise and also as its code.  L'\0' is printed as "L'\\0'".
+void PrintTo(wchar_t wc, ostream* os) {
+  PrintCharAndCodeTo<wchar_t>(wc, os);
+}
+
+// Prints the given array of characters to the ostream as a quoted string
+// literal.  CharType must be either char or wchar_t; wide strings get the
+// L prefix.  The array starts at begin and holds len characters; it may
+// include '\0' characters and may not be NUL-terminated.
+template <typename CharType>
+static void PrintCharsAsStringTo(
+    const CharType* begin, size_t len, ostream* os) {
+  const char* const opening_quote = sizeof(CharType) == 1 ? "\"" : "L\"";
+  *os << opening_quote;
+
+  bool prev_was_hex_escape = false;
+  const CharType* const end = begin + len;
+  for (const CharType* p = begin; p != end; ++p) {
+    if (prev_was_hex_escape && IsXDigit(*p)) {
+      // The previous character was printed as '\x..' and this one could
+      // be read as one more hexadecimal digit of that escape.  Close the
+      // literal and open a new one to disambiguate.
+      *os << "\" " << opening_quote;
+    }
+    prev_was_hex_escape = (PrintAsStringLiteralTo(*p, os) == kHexEscape);
+  }
+
+  *os << "\"";
+}
+
+// Prints the 'len' elements of a (const) char/wchar_t array that starts
+// at address 'begin'.  CharType must be either char or wchar_t.
+template <typename CharType>
+static void UniversalPrintCharArray(
+    const CharType* begin, size_t len, ostream* os) {
+  // The declaration
+  //   const char kFoo[] = "foo";
+  // creates an array of 4 elements, not 3, the last one being '\0'.
+  // A trailing '\0' is therefore left out when printing, so that the
+  // output matches the string literal as written in the source code.
+  //
+  // If the last element is not '\0', e.g.
+  //    const char kFoo[] = { 'f', 'o', 'o' };
+  // the entire array is printed, followed by a note saying that the
+  // array is not NUL-terminated.
+  const bool ends_with_nul = len > 0 && begin[len - 1] == '\0';
+
+  if (ends_with_nul) {
+    PrintCharsAsStringTo(begin, len - 1, os);
+  } else {
+    PrintCharsAsStringTo(begin, len, os);
+    *os << " (no terminating NUL)";
+  }
+}
+
+// Prints a (const) char array of 'len' elements, starting at address 'begin'.
+void UniversalPrintArray(const char* begin, size_t len, ostream* os) {
+  UniversalPrintCharArray(begin, len, os);  // a trailing '\0' is not printed
+}
+
+// Prints a (const) wchar_t array of 'len' elements, starting at address
+// 'begin'.  Delegates to the same helper as the char overload.
+void UniversalPrintArray(const wchar_t* begin, size_t len, ostream* os) {
+  UniversalPrintCharArray(begin, len, os);
+}
+
+// Prints the address of the C string s and its contents, or "NULL".
+void PrintTo(const char* s, ostream* os) {
+  if (s == NULL) {
+    *os << "NULL";
+    return;
+  }
+  *os << ImplicitCast_<const void*>(s) << " pointing to ";
+  PrintCharsAsStringTo(s, strlen(s), os);
+}
+
+// The MSVC compiler can be configured to define wchar_t as a typedef
+// of unsigned short.  Defining an overload for const wchar_t* in that case
+// would cause pointers to unsigned shorts to be printed as wide strings,
+// possibly accessing more memory than intended and causing invalid
+// memory accesses.  MSVC defines the _NATIVE_WCHAR_T_DEFINED symbol when
+// wchar_t is implemented as a native type.
+#if !defined(_MSC_VER) || defined(_NATIVE_WCHAR_T_DEFINED)
+// Prints the address of the wide C string s and its contents, or "NULL".
+void PrintTo(const wchar_t* s, ostream* os) {
+  if (s == NULL) {
+    *os << "NULL";
+    return;
+  }
+  *os << ImplicitCast_<const void*>(s) << " pointing to ";
+  PrintCharsAsStringTo(s, wcslen(s), os);
+}
+#endif  // wchar_t is native
+
+// Prints a ::string object as a quoted string literal.
+#if GTEST_HAS_GLOBAL_STRING
+void PrintStringTo(const ::string& s, ostream* os) {
+  PrintCharsAsStringTo(s.data(), s.size(), os);
+}
+#endif  // GTEST_HAS_GLOBAL_STRING
+
+void PrintStringTo(const ::std::string& s, ostream* os) {
+  PrintCharsAsStringTo(s.data(), s.size(), os);  // length comes from size()
+}
+
+// Prints a ::wstring object as a quoted string literal.
+#if GTEST_HAS_GLOBAL_WSTRING
+void PrintWideStringTo(const ::wstring& s, ostream* os) {
+  PrintCharsAsStringTo(s.data(), s.size(), os);
+}
+#endif  // GTEST_HAS_GLOBAL_WSTRING
+
+#if GTEST_HAS_STD_WSTRING
+void PrintWideStringTo(const ::std::wstring& s, ostream* os) {
+  PrintCharsAsStringTo(s.data(), s.size(), os);  // length comes from size()
+}
+#endif  // GTEST_HAS_STD_WSTRING
+
+}  // namespace internal
+
+}  // namespace testing
+// Copyright 2008, Google Inc.
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+//     * Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//     * Redistributions in binary form must reproduce the above
+// copyright notice, this list of conditions and the following disclaimer
+// in the documentation and/or other materials provided with the
+// distribution.
+//     * Neither the name of Google Inc. nor the names of its
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Author: mheule@google.com (Markus Heule)
+//
+// The Google C++ Testing Framework (Google Test)
+
+
+// Indicates that this translation unit is part of Google Test's
+// implementation.  It must come before gtest-internal-inl.h is
+// included, or there will be a compiler error.  This trick is to
+// prevent a user from accidentally including gtest-internal-inl.h in
+// his code.
+#define GTEST_IMPLEMENTATION_ 1
+#undef GTEST_IMPLEMENTATION_
+
+namespace testing {
+
+using internal::GetUnitTestImpl;
+
+// Returns the summary of a failure message: the part of message that
+// precedes internal::kStackTraceMarker, or all of it if there is none.
+std::string TestPartResult::ExtractSummary(const char* message) {
+  const char* const marker = strstr(message, internal::kStackTraceMarker);
+  if (marker == NULL) return message;
+  return std::string(message, marker);
+}
+
+// Prints a TestPartResult as "file:line: <kind>:", then its message.
+std::ostream& operator<<(std::ostream& os, const TestPartResult& result) {
+  const char* const kind =
+      result.type() == TestPartResult::kSuccess ? "Success" :
+      result.type() == TestPartResult::kFatalFailure ? "Fatal failure" :
+      "Non-fatal failure";
+  return os << result.file_name() << ":" << result.line_number() << ": "
+            << kind << ":\n" << result.message() << std::endl;
+}
+
+// Appends the given TestPartResult to the end of the array.
+void TestPartResultArray::Append(const TestPartResult& result) {
+  array_.push_back(result);
+}
+
+// Returns the TestPartResult at the given index (0-based).  An index
+// outside [0, size()) prints a message and aborts the program.
+const TestPartResult& TestPartResultArray::GetTestPartResult(int index) const {
+  if (!(0 <= index && index < size())) {
+    printf("\nInvalid index (%d) into TestPartResultArray.\n", index);
+    internal::posix::Abort();
+  }
+  return array_[index];
+}
+
+// Returns the number of TestPartResult objects in the array, as an int.
+int TestPartResultArray::size() const {
+  return static_cast<int>(array_.size());
+}
+
+namespace internal {
+
+HasNewFatalFailureHelper::HasNewFatalFailureHelper()
+    : has_new_fatal_failure_(false),  // no fatal failure seen yet
+      original_reporter_(GetUnitTestImpl()->
+                         GetTestPartResultReporterForCurrentThread()) {
+  GetUnitTestImpl()->SetTestPartResultReporterForCurrentThread(this);
+}
+// Puts back the reporter that was saved by the constructor.
+HasNewFatalFailureHelper::~HasNewFatalFailureHelper() {
+  GetUnitTestImpl()->SetTestPartResultReporterForCurrentThread(
+      original_reporter_);
+}
+// Flags fatal failures, then passes the result to the saved reporter.
+void HasNewFatalFailureHelper::ReportTestPartResult(
+    const TestPartResult& result) {
+  if (result.fatally_failed())
+    has_new_fatal_failure_ = true;
+  original_reporter_->ReportTestPartResult(result);
+}
+
+}  // namespace internal
+
+}  // namespace testing
+// Copyright 2008 Google Inc.
+// All Rights Reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+//     * Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//     * Redistributions in binary form must reproduce the above
+// copyright notice, this list of conditions and the following disclaimer
+// in the documentation and/or other materials provided with the
+// distribution.
+//     * Neither the name of Google Inc. nor the names of its
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Author: wan@google.com (Zhanyong Wan)
+
+
+namespace testing {
+namespace internal {
+
+#if GTEST_HAS_TYPED_TEST_P
+
+// Returns a pointer to the first non-space char in str.  The result is
+// an empty string if str contains only whitespace characters.
+static const char* SkipSpaces(const char* str) {
+  const char* cursor = str;
+  for (; IsSpace(*cursor); ++cursor) {}
+  return cursor;
+}
+
+// Verifies that the comma-separated names in registered_tests match the
+// test names in defined_test_names_; returns registered_tests (past any
+// leading whitespace) if successful, or aborts the program otherwise.
+const char* TypedTestCasePState::VerifyRegisteredTestNames(
+    const char* file, int line, const char* registered_tests) {
+  typedef ::std::set<const char*>::const_iterator DefinedTestIter;
+  registered_ = true;
+
+  // Some preprocessors prefix stringized literals with whitespace, so
+  // skip any initial whitespace in registered_tests.
+  registered_tests = SkipSpaces(registered_tests);
+
+  Message errors;
+  ::std::set<std::string> tests;
+  // Walk the list one name at a time until SkipComma() returns NULL.
+  const char* names = registered_tests;
+  while (names != NULL) {
+    const std::string name = GetPrefixUntilComma(names);
+    names = SkipComma(names);
+
+    if (tests.count(name) != 0) {
+      errors << "Test " << name << " is listed more than once.\n";
+      continue;
+    }
+
+    // defined_test_names_ stores C strings, so compare by value.
+    DefinedTestIter match = defined_test_names_.begin();
+    while (match != defined_test_names_.end() && !(name == *match)) {
+      ++match;
+    }
+
+    if (match != defined_test_names_.end()) {
+      tests.insert(name);
+    } else {
+      errors << "No test named " << name
+             << " can be found in this test case.\n";
+    }
+  }
+
+  // Report every defined test that did not make it into the accepted
+  // set above.
+  for (DefinedTestIter defined = defined_test_names_.begin();
+       defined != defined_test_names_.end();
+       ++defined) {
+    if (tests.count(*defined) != 0) continue;
+    errors << "You forgot to list test " << *defined << ".\n";
+  }
+
+  const std::string& errors_str = errors.GetString();
+  if (!errors_str.empty()) {
+    fprintf(stderr, "%s %s", FormatFileLocation(file, line).c_str(),
+            errors_str.c_str());
+    fflush(stderr);
+    posix::Abort();
+  }
+
+  return registered_tests;
+}
+
+#endif  // GTEST_HAS_TYPED_TEST_P
+
+}  // namespace internal
+}  // namespace testing
diff --git a/lib/kokkos/tpls/gtest/gtest/gtest-test-part.h b/lib/kokkos/tpls/gtest/gtest/gtest-test-part.h
new file mode 120000
index 0000000000000000000000000000000000000000..48d39090f1cabfc4a852d54e0e1f186362eeb1f5
--- /dev/null
+++ b/lib/kokkos/tpls/gtest/gtest/gtest-test-part.h
@@ -0,0 +1 @@
+gtest.h
\ No newline at end of file
diff --git a/lib/kokkos/tpls/gtest/gtest/gtest.h b/lib/kokkos/tpls/gtest/gtest/gtest.h
new file mode 100644
index 0000000000000000000000000000000000000000..c74d098fa9b179ea87a57a4a42b735e430b83c6d
--- /dev/null
+++ b/lib/kokkos/tpls/gtest/gtest/gtest.h
@@ -0,0 +1,20065 @@
+// Copyright 2005, Google Inc.
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+//     * Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//     * Redistributions in binary form must reproduce the above
+// copyright notice, this list of conditions and the following disclaimer
+// in the documentation and/or other materials provided with the
+// distribution.
+//     * Neither the name of Google Inc. nor the names of its
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Author: wan@google.com (Zhanyong Wan)
+//
+// The Google C++ Testing Framework (Google Test)
+//
+// This header file defines the public API for Google Test.  It should be
+// included by any test program that uses Google Test.
+//
+// IMPORTANT NOTE: Due to limitations of the C++ language, we have to
+// leave some internal implementation details in this header file.
+// They are clearly marked by comments like this:
+//
+//   // INTERNAL IMPLEMENTATION - DO NOT USE IN A USER PROGRAM.
+//
+// Such code is NOT meant to be used by a user directly, and is subject
+// to CHANGE WITHOUT NOTICE.  Therefore DO NOT DEPEND ON IT in a user
+// program!
+//
+// Acknowledgment: Google Test borrowed the idea of automatic test
+// registration from Barthelemy Dagenais' (barthelemy@prologique.com)
+// easyUnit framework.
+
+#ifdef __GNUC__
+#pragma GCC system_header
+#endif
+
+#ifndef GTEST_INCLUDE_GTEST_GTEST_H_
+#define GTEST_INCLUDE_GTEST_GTEST_H_
+
+#include <limits>
+#include <ostream>
+#include <vector>
+
+// Copyright 2005, Google Inc.
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+//     * Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//     * Redistributions in binary form must reproduce the above
+// copyright notice, this list of conditions and the following disclaimer
+// in the documentation and/or other materials provided with the
+// distribution.
+//     * Neither the name of Google Inc. nor the names of its
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Authors: wan@google.com (Zhanyong Wan), eefacm@gmail.com (Sean Mcafee)
+//
+// The Google C++ Testing Framework (Google Test)
+//
+// This header file declares functions and macros used internally by
+// Google Test.  They are subject to change without notice.
+
+#ifndef GTEST_INCLUDE_GTEST_INTERNAL_GTEST_INTERNAL_H_
+#define GTEST_INCLUDE_GTEST_INTERNAL_GTEST_INTERNAL_H_
+
+// Copyright 2005, Google Inc.
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+//     * Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//     * Redistributions in binary form must reproduce the above
+// copyright notice, this list of conditions and the following disclaimer
+// in the documentation and/or other materials provided with the
+// distribution.
+//     * Neither the name of Google Inc. nor the names of its
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Authors: wan@google.com (Zhanyong Wan)
+//
+// Low-level types and utilities for porting Google Test to various
+// platforms.  They are subject to change without notice.  DO NOT USE
+// THEM IN USER CODE.
+//
+// This file is fundamental to Google Test.  All other Google Test source
+// files are expected to #include this.  Therefore, it cannot #include
+// any other Google Test header.
+
+#ifndef GTEST_INCLUDE_GTEST_INTERNAL_GTEST_PORT_H_
+#define GTEST_INCLUDE_GTEST_INTERNAL_GTEST_PORT_H_
+
+// The user can define the following macros in the build script to
+// control Google Test's behavior.  If the user doesn't define a macro
+// in this list, Google Test will define it.
+//
+//   GTEST_HAS_CLONE          - Define it to 1/0 to indicate that clone(2)
+//                              is/isn't available.
+//   GTEST_HAS_EXCEPTIONS     - Define it to 1/0 to indicate that exceptions
+//                              are enabled.
+//   GTEST_HAS_GLOBAL_STRING  - Define it to 1/0 to indicate that ::string
+//                              is/isn't available (some systems define
+//                              ::string, which is different to std::string).
+//   GTEST_HAS_GLOBAL_WSTRING - Define it to 1/0 to indicate that ::wstring
+//                              is/isn't available (some systems define
+//                              ::wstring, which is different to std::wstring).
+//   GTEST_HAS_POSIX_RE       - Define it to 1/0 to indicate that POSIX regular
+//                              expressions are/aren't available.
+//   GTEST_HAS_PTHREAD        - Define it to 1/0 to indicate that <pthread.h>
+//                              is/isn't available.
+//   GTEST_HAS_RTTI           - Define it to 1/0 to indicate that RTTI is/isn't
+//                              enabled.
+//   GTEST_HAS_STD_WSTRING    - Define it to 1/0 to indicate that
+//                              std::wstring does/doesn't work (Google Test can
+//                              be used where std::wstring is unavailable).
+//   GTEST_HAS_TR1_TUPLE      - Define it to 1/0 to indicate tr1::tuple
+//                              is/isn't available.
+//   GTEST_HAS_SEH            - Define it to 1/0 to indicate whether the
+//                              compiler supports Microsoft's "Structured
+//                              Exception Handling".
+//   GTEST_HAS_STREAM_REDIRECTION
+//                            - Define it to 1/0 to indicate whether the
+//                              platform supports I/O stream redirection using
+//                              dup() and dup2().
+//   GTEST_USE_OWN_TR1_TUPLE  - Define it to 1/0 to indicate whether Google
+//                              Test's own tr1 tuple implementation should be
+//                              used.  Unused when the user sets
+//                              GTEST_HAS_TR1_TUPLE to 0.
+//   GTEST_LANG_CXX11         - Define it to 1/0 to indicate that Google Test
+//                              is building in C++11/C++98 mode.
+//   GTEST_LINKED_AS_SHARED_LIBRARY
+//                            - Define to 1 when compiling tests that use
+//                              Google Test as a shared library (known as
+//                              DLL on Windows).
+//   GTEST_CREATE_SHARED_LIBRARY
+//                            - Define to 1 when compiling Google Test itself
+//                              as a shared library.
+
+// This header defines the following utilities:
+//
+// Macros indicating the current platform (defined to 1 if compiled on
+// the given platform; otherwise undefined):
+//   GTEST_OS_AIX      - IBM AIX
+//   GTEST_OS_CYGWIN   - Cygwin
+//   GTEST_OS_HPUX     - HP-UX
+//   GTEST_OS_LINUX    - Linux
+//     GTEST_OS_LINUX_ANDROID - Google Android
+//   GTEST_OS_MAC      - Mac OS X
+//     GTEST_OS_IOS    - iOS
+//       GTEST_OS_IOS_SIMULATOR - iOS simulator
+//   GTEST_OS_NACL     - Google Native Client (NaCl)
+//   GTEST_OS_OPENBSD  - OpenBSD
+//   GTEST_OS_QNX      - QNX
+//   GTEST_OS_SOLARIS  - Sun Solaris
+//   GTEST_OS_SYMBIAN  - Symbian
+//   GTEST_OS_WINDOWS  - Windows (Desktop, MinGW, or Mobile)
+//     GTEST_OS_WINDOWS_DESKTOP  - Windows Desktop
+//     GTEST_OS_WINDOWS_MINGW    - MinGW
+//     GTEST_OS_WINDOWS_MOBILE   - Windows Mobile
+//   GTEST_OS_ZOS      - z/OS
+//
+// Among the platforms, Cygwin, Linux, Mac OS X, and Windows have the
+// most stable support.  Since core members of the Google Test project
+// don't have access to other platforms, support for them may be less
+// stable.  If you notice any problems on your platform, please notify
+// googletestframework@googlegroups.com (patches for fixing them are
+// even more welcome!).
+//
+// Note that it is possible that none of the GTEST_OS_* macros are defined.
+//
+// Macros indicating available Google Test features (defined to 1 if
+// the corresponding feature is supported; otherwise undefined):
+//   GTEST_HAS_COMBINE      - the Combine() function (for value-parameterized
+//                            tests)
+//   GTEST_HAS_DEATH_TEST   - death tests
+//   GTEST_HAS_PARAM_TEST   - value-parameterized tests
+//   GTEST_HAS_TYPED_TEST   - typed tests
+//   GTEST_HAS_TYPED_TEST_P - type-parameterized tests
+//   GTEST_USES_POSIX_RE    - enhanced POSIX regex is used. Do not confuse with
+//                            GTEST_HAS_POSIX_RE (see above) which users can
+//                            define themselves.
+//   GTEST_USES_SIMPLE_RE   - our own simple regex is used;
+//                            the above two are mutually exclusive.
+//   GTEST_CAN_COMPARE_NULL - accepts untyped NULL in EXPECT_EQ().
+//
+// Macros for basic C++ coding:
+//   GTEST_AMBIGUOUS_ELSE_BLOCKER_ - for disabling a gcc warning.
+//   GTEST_ATTRIBUTE_UNUSED_  - declares that a class' instances or a
+//                              variable don't have to be used.
+//   GTEST_DISALLOW_ASSIGN_   - disables operator=.
+//   GTEST_DISALLOW_COPY_AND_ASSIGN_ - disables copy ctor and operator=.
+//   GTEST_MUST_USE_RESULT_   - declares that a function's result must be used.
+//
+// Synchronization:
+//   Mutex, MutexLock, ThreadLocal, GetThreadCount()
+//                  - synchronization primitives.
+//   GTEST_IS_THREADSAFE - defined to 1 to indicate that the above
+//                         synchronization primitives have real implementations
+//                         and Google Test is thread-safe; or 0 otherwise.
+//
+// Template meta programming:
+//   is_pointer     - as in TR1; needed on Symbian and IBM XL C/C++ only.
+//   IteratorTraits - partial implementation of std::iterator_traits, which
+//                    is not available in libCstd when compiled with Sun C++.
+//
+// Smart pointers:
+//   scoped_ptr     - as in TR2.
+//
+// Regular expressions:
+//   RE             - a simple regular expression class using the POSIX
+//                    Extended Regular Expression syntax on UNIX-like
+//                    platforms, or a reduced regular expression syntax on
+//                    other platforms, including Windows.
+//
+// Logging:
+//   GTEST_LOG_()   - logs messages at the specified severity level.
+//   LogToStderr()  - directs all log messages to stderr.
+//   FlushInfoLog() - flushes informational log messages.
+//
+// Stdout and stderr capturing:
+//   CaptureStdout()     - starts capturing stdout.
+//   GetCapturedStdout() - stops capturing stdout and returns the captured
+//                         string.
+//   CaptureStderr()     - starts capturing stderr.
+//   GetCapturedStderr() - stops capturing stderr and returns the captured
+//                         string.
+//
+// Integer types:
+//   TypeWithSize   - maps an integer to an int type.
+//   Int32, UInt32, Int64, UInt64, TimeInMillis
+//                  - integers of known sizes.
+//   BiggestInt     - the biggest signed integer type.
+//
+// Command-line utilities:
+//   GTEST_FLAG()       - references a flag.
+//   GTEST_DECLARE_*()  - declares a flag.
+//   GTEST_DEFINE_*()   - defines a flag.
+//   GetInjectableArgvs() - returns the command line as a vector of strings.
+//
+// Environment variable utilities:
+//   GetEnv()             - gets the value of an environment variable.
+//   BoolFromGTestEnv()   - parses a bool environment variable.
+//   Int32FromGTestEnv()  - parses an Int32 environment variable.
+//   StringFromGTestEnv() - parses a string environment variable.
+
+#include <ctype.h>   // for isspace, etc
+#include <stddef.h>  // for ptrdiff_t
+#include <stdlib.h>
+#include <stdio.h>
+#include <string.h>
+#ifndef _WIN32_WCE
+# include <sys/types.h>
+# include <sys/stat.h>
+#endif  // !_WIN32_WCE
+
+#if defined __APPLE__
+# include <AvailabilityMacros.h>
+# include <TargetConditionals.h>
+#endif
+
+#include <iostream>  // NOLINT
+#include <sstream>  // NOLINT
+#include <string>  // NOLINT
+
+#define GTEST_DEV_EMAIL_ "googletestframework@@googlegroups.com"  // NOTE(review): "@@" is presumably an escaped "@" - confirm before changing
+#define GTEST_FLAG_PREFIX_ "gtest_"
+#define GTEST_FLAG_PREFIX_DASH_ "gtest-"
+#define GTEST_FLAG_PREFIX_UPPER_ "GTEST_"
+#define GTEST_NAME_ "Google Test"
+#define GTEST_PROJECT_URL_ "http://code.google.com/p/googletest/"
+
+// Encodes the version of gcc that is used to compile this as one integer.
+#ifdef __GNUC__
+// Computed as major*10000 + minor*100 + patch, so 40302 means version 4.3.2.
+# define GTEST_GCC_VER_ \
+    (__GNUC__*10000 + __GNUC_MINOR__*100 + __GNUC_PATCHLEVEL__)
+#endif  // __GNUC__
+
+// Determines the platform on which Google Test is compiled (first match wins).
+#ifdef __CYGWIN__
+# define GTEST_OS_CYGWIN 1
+#elif defined __SYMBIAN32__
+# define GTEST_OS_SYMBIAN 1
+#elif defined _WIN32
+# define GTEST_OS_WINDOWS 1
+# ifdef _WIN32_WCE
+#  define GTEST_OS_WINDOWS_MOBILE 1
+# elif defined(__MINGW__) || defined(__MINGW32__)
+#  define GTEST_OS_WINDOWS_MINGW 1
+# else
+#  define GTEST_OS_WINDOWS_DESKTOP 1
+# endif  // _WIN32_WCE
+#elif defined __APPLE__
+# define GTEST_OS_MAC 1
+# if TARGET_OS_IPHONE
+#  define GTEST_OS_IOS 1
+#  if TARGET_IPHONE_SIMULATOR
+#   define GTEST_OS_IOS_SIMULATOR 1
+#  endif
+# endif
+#elif defined __linux__
+# define GTEST_OS_LINUX 1
+# if defined __ANDROID__
+#  define GTEST_OS_LINUX_ANDROID 1
+# endif
+#elif defined __MVS__
+# define GTEST_OS_ZOS 1
+#elif defined(__sun) && defined(__SVR4)
+# define GTEST_OS_SOLARIS 1
+#elif defined(_AIX)
+# define GTEST_OS_AIX 1
+#elif defined(__hpux)
+# define GTEST_OS_HPUX 1
+#elif defined __native_client__
+# define GTEST_OS_NACL 1
+#elif defined __OpenBSD__
+# define GTEST_OS_OPENBSD 1
+#elif defined __QNX__
+# define GTEST_OS_QNX 1
+#endif  // __CYGWIN__
+
+#ifndef GTEST_LANG_CXX11
+// gcc and clang define __GXX_EXPERIMENTAL_CXX0X__ when
+// -std={c,gnu}++{0x,11} is passed.  The C++11 standard specifies a
+// value for __cplusplus, and recent versions of clang, gcc, and
+// probably other compilers set that too in C++11 mode.
+# if __GXX_EXPERIMENTAL_CXX0X__ || __cplusplus >= 201103L
+// Compiling in at least C++11 mode.
+#  define GTEST_LANG_CXX11 1
+# else
+#  define GTEST_LANG_CXX11 0
+# endif
+#endif  // GTEST_LANG_CXX11
+
+// Brings in definitions for functions used in the testing::internal::posix
+// namespace (read, write, close, chdir, isatty, stat). We do not currently
+// use them on Windows Mobile.
+#if !GTEST_OS_WINDOWS
+// This assumes that non-Windows OSes provide unistd.h. For OSes where this
+// is not the case, we need to include headers that provide the functions
+// mentioned above.
+# include <unistd.h>
+# include <strings.h>
+#elif !GTEST_OS_WINDOWS_MOBILE
+# include <direct.h>
+# include <io.h>
+#endif  // !GTEST_OS_WINDOWS
+
+#if GTEST_OS_LINUX_ANDROID
+// Used to define __ANDROID_API__ matching the target NDK API level.
+#  include <android/api-level.h>  // NOLINT
+#endif
+
+// Defines GTEST_HAS_POSIX_RE to true iff Google Test can use POSIX regexes.
+#ifndef GTEST_HAS_POSIX_RE
+# if GTEST_OS_LINUX_ANDROID
+// On Android, <regex.h> is only available starting with Gingerbread.
+#  define GTEST_HAS_POSIX_RE (__ANDROID_API__ >= 9)
+# else
+#  define GTEST_HAS_POSIX_RE (!GTEST_OS_WINDOWS)
+# endif
+#endif
+
+#if GTEST_HAS_POSIX_RE
+
+// On some platforms, <regex.h> needs someone to define size_t, and
+// won't compile otherwise.  We can #include it here as we already
+// included <stdlib.h>, which is guaranteed to define size_t through
+// <stddef.h>.
+# include <regex.h>  // NOLINT
+
+# define GTEST_USES_POSIX_RE 1
+
+#elif GTEST_OS_WINDOWS
+
+// <regex.h> is not available on Windows.  Use our own simple regex
+// implementation instead.
+# define GTEST_USES_SIMPLE_RE 1
+
+#else
+
+// <regex.h> may not be available on this platform.  Use our own
+// simple regex implementation instead (same choice as on Windows).
+# define GTEST_USES_SIMPLE_RE 1
+
+#endif  // GTEST_HAS_POSIX_RE
+
+#ifndef GTEST_HAS_EXCEPTIONS
+// The user didn't tell us whether exceptions are enabled, so we need
+// to figure it out.
+# if defined(_MSC_VER) || defined(__BORLANDC__)
+// MSVC's and C++Builder's implementations of the STL use the _HAS_EXCEPTIONS
+// macro to enable exceptions, so we'll do the same.
+// Assumes that exceptions are enabled by default.
+#  ifndef _HAS_EXCEPTIONS
+#   define _HAS_EXCEPTIONS 1
+#  endif  // _HAS_EXCEPTIONS
+#  define GTEST_HAS_EXCEPTIONS _HAS_EXCEPTIONS
+# elif defined(__GNUC__) && __EXCEPTIONS
+// gcc defines __EXCEPTIONS to 1 iff exceptions are enabled.
+#  define GTEST_HAS_EXCEPTIONS 1
+# elif defined(__SUNPRO_CC)
+// Sun Pro CC supports exceptions.  However, there is no compile-time way of
+// detecting whether they are enabled or not.  Therefore, we assume that
+// they are enabled unless the user tells us otherwise.
+#  define GTEST_HAS_EXCEPTIONS 1
+# elif defined(__IBMCPP__) && __EXCEPTIONS
+// xlC defines __EXCEPTIONS to 1 iff exceptions are enabled.
+#  define GTEST_HAS_EXCEPTIONS 1
+# elif defined(__HP_aCC)
+// Exception handling is in effect by default in the HP aCC compiler. It has
+// to be turned off by the +noeh compiler option if desired.
+#  define GTEST_HAS_EXCEPTIONS 1
+# else
+// For other compilers, we assume exceptions are disabled to be
+// conservative.
+#  define GTEST_HAS_EXCEPTIONS 0
+# endif  // defined(_MSC_VER) || defined(__BORLANDC__)
+#endif  // GTEST_HAS_EXCEPTIONS
+
+#if !defined(GTEST_HAS_STD_STRING)
+// Even though we don't use this macro any longer, we keep it in case
+// some clients still depend on it.
+# define GTEST_HAS_STD_STRING 1
+#elif !GTEST_HAS_STD_STRING
+// The user told us that ::std::string isn't available.
+# error "Google Test cannot be used where ::std::string isn't available."
+#endif  // !defined(GTEST_HAS_STD_STRING)
+
+#ifndef GTEST_HAS_GLOBAL_STRING
+// The user didn't tell us whether ::string is available, so we
+// assume that it is not.
+
+# define GTEST_HAS_GLOBAL_STRING 0
+
+#endif  // GTEST_HAS_GLOBAL_STRING
+
+#ifndef GTEST_HAS_STD_WSTRING
+// The user didn't tell us whether ::std::wstring is available, so we need
+// to figure it out.
+// TODO(wan@google.com): use autoconf to detect whether ::std::wstring
+//   is available.
+
+// Cygwin 1.7 and below doesn't support ::std::wstring.
+// Solaris' libc++ doesn't support it either.  Android has
+// no support for it at least as recent as Froyo (2.2).
+# define GTEST_HAS_STD_WSTRING \
+    (!(GTEST_OS_LINUX_ANDROID || GTEST_OS_CYGWIN || GTEST_OS_SOLARIS))
+
+#endif  // GTEST_HAS_STD_WSTRING
+
+#ifndef GTEST_HAS_GLOBAL_WSTRING
+// The user didn't tell us whether ::wstring is available, so we assume it
+// is available only when both ::std::wstring and ::string are.
+# define GTEST_HAS_GLOBAL_WSTRING \
+    (GTEST_HAS_STD_WSTRING && GTEST_HAS_GLOBAL_STRING)
+#endif  // GTEST_HAS_GLOBAL_WSTRING
+
+// Determines whether RTTI is available.
+#ifndef GTEST_HAS_RTTI
+// The user didn't tell us whether RTTI is enabled, so we need to
+// figure it out.
+
+# ifdef _MSC_VER
+
+#  ifdef _CPPRTTI  // MSVC defines this macro iff RTTI is enabled.
+#   define GTEST_HAS_RTTI 1
+#  else
+#   define GTEST_HAS_RTTI 0
+#  endif  // _CPPRTTI
+
+// Starting with version 4.3.2, gcc defines __GXX_RTTI iff RTTI is enabled.
+# elif defined(__GNUC__) && (GTEST_GCC_VER_ >= 40302)
+
+#  ifdef __GXX_RTTI
+// When building against STLport with the Android NDK and with
+// -frtti -fno-exceptions, the build fails at link time with undefined
+// references to __cxa_bad_typeid. Not sure if STL or toolchain bug,
+// so disable RTTI when detected.
+#   if GTEST_OS_LINUX_ANDROID && defined(_STLPORT_MAJOR) && \
+       !defined(__EXCEPTIONS)
+#    define GTEST_HAS_RTTI 0
+#   else
+#    define GTEST_HAS_RTTI 1
+#   endif  // GTEST_OS_LINUX_ANDROID && _STLPORT_MAJOR && !__EXCEPTIONS
+#  else
+#   define GTEST_HAS_RTTI 0
+#  endif  // __GXX_RTTI
+
+// Clang defines __GXX_RTTI starting with version 3.0, but its manual recommends
+// using has_feature instead. has_feature(cxx_rtti) is supported since 2.7, the
+// first version with C++ support.
+# elif defined(__clang__)
+
+#  define GTEST_HAS_RTTI __has_feature(cxx_rtti)
+
+// Starting with version 9.0 IBM Visual Age defines __RTTI_ALL__ to 1 if
+// both the typeid and dynamic_cast features are present.
+# elif defined(__IBMCPP__) && (__IBMCPP__ >= 900)
+
+#  ifdef __RTTI_ALL__
+#   define GTEST_HAS_RTTI 1
+#  else
+#   define GTEST_HAS_RTTI 0
+#  endif  // __RTTI_ALL__
+
+# else
+
+// For all other compilers, we assume RTTI is enabled.
+#  define GTEST_HAS_RTTI 1
+
+# endif  // _MSC_VER
+
+#endif  // GTEST_HAS_RTTI
+
+// It's this header's responsibility to #include <typeinfo> when RTTI
+// is enabled.
+#if GTEST_HAS_RTTI
+# include <typeinfo>
+#endif  // GTEST_HAS_RTTI
+
+// Determines whether Google Test can use the pthreads library.
+#ifndef GTEST_HAS_PTHREAD
+// The user didn't tell us explicitly, so we assume pthreads support is
+// available on Linux, Mac, HP-UX and QNX.
+//
+// To disable threading support in Google Test, add -DGTEST_HAS_PTHREAD=0
+// to your compiler flags.
+# define GTEST_HAS_PTHREAD (GTEST_OS_LINUX || GTEST_OS_MAC || GTEST_OS_HPUX \
+    || GTEST_OS_QNX)
+#endif  // GTEST_HAS_PTHREAD
+
+#if GTEST_HAS_PTHREAD
+// gtest-port.h guarantees to #include <pthread.h> when GTEST_HAS_PTHREAD is
+// true.
+# include <pthread.h>  // NOLINT
+
+// For timespec and nanosleep, used below.
+# include <time.h>  // NOLINT
+#endif  // GTEST_HAS_PTHREAD
+
+// Determines whether Google Test can use tr1/tuple.  You can define
+// this macro to 0 to prevent Google Test from using tuple (any
+// feature depending on tuple will be disabled in this mode).
+#ifndef GTEST_HAS_TR1_TUPLE
+# if GTEST_OS_LINUX_ANDROID && defined(_STLPORT_MAJOR)
+// STLport, provided with the Android NDK, has neither <tr1/tuple> nor <tuple>.
+#  define GTEST_HAS_TR1_TUPLE 0
+# else
+// The user didn't tell us not to do it, so we assume it's OK.
+#  define GTEST_HAS_TR1_TUPLE 1
+# endif  // GTEST_OS_LINUX_ANDROID && defined(_STLPORT_MAJOR)
+#endif  // GTEST_HAS_TR1_TUPLE
+
+// Determines whether Google Test's own tr1 tuple implementation
+// should be used.
+#ifndef GTEST_USE_OWN_TR1_TUPLE
+// The user didn't tell us, so we need to figure it out.
+
+// We use our own TR1 tuple if we aren't sure the user has an
+// implementation of it already.  At this time, libstdc++ 4.0.0+ and
+// MSVC 2010 are the only mainstream standard libraries that come
+// with a TR1 tuple implementation.  NVIDIA's CUDA NVCC compiler
+// pretends to be GCC by defining __GNUC__ and friends, but cannot
+// compile GCC's tuple implementation.  MSVC 2008 (9.0) provides TR1
+// tuple in a 323 MB Feature Pack download, which we cannot assume the
+// user has.  QNX's QCC compiler is a modified GCC but it doesn't
+// support TR1 tuple.  libc++ only provides std::tuple, in C++11 mode,
+// and it can be used with some compilers that define __GNUC__.
+# if (defined(__GNUC__) && !defined(__CUDACC__) && (GTEST_GCC_VER_ >= 40000) \
+      && !GTEST_OS_QNX && !defined(_LIBCPP_VERSION)) || _MSC_VER >= 1600
+#  define GTEST_ENV_HAS_TR1_TUPLE_ 1
+# endif  // Toolchain is assumed to ship a TR1 tuple.
+
+// C++11 specifies that <tuple> provides std::tuple. Use that if gtest is used
+// in C++11 mode and libstdc++ isn't very old (binaries targeting OS X 10.6
+// can build with clang but need to use gcc4.2's libstdc++).
+# if GTEST_LANG_CXX11 && (!defined(__GLIBCXX__) || __GLIBCXX__ > 20110325)
+#  define GTEST_ENV_HAS_STD_TUPLE_ 1
+# endif  // C++11 mode with a usable <tuple>.
+
+# if GTEST_ENV_HAS_TR1_TUPLE_ || GTEST_ENV_HAS_STD_TUPLE_
+#  define GTEST_USE_OWN_TR1_TUPLE 0  // The environment supplies a tuple.
+# else
+#  define GTEST_USE_OWN_TR1_TUPLE 1  // Fall back to the bundled one.
+# endif  // GTEST_ENV_HAS_TR1_TUPLE_ || GTEST_ENV_HAS_STD_TUPLE_
+
+#endif  // GTEST_USE_OWN_TR1_TUPLE
+
+// To avoid conditional compilation everywhere, we make it
+// gtest-port.h's responsibility to #include the header implementing
+// tr1/tuple.
+#if GTEST_HAS_TR1_TUPLE
+
+# if GTEST_USE_OWN_TR1_TUPLE
+// This file was GENERATED by command:
+//     pump.py gtest-tuple.h.pump
+// DO NOT EDIT BY HAND!!!
+
+// Copyright 2009 Google Inc.
+// All Rights Reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+//     * Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//     * Redistributions in binary form must reproduce the above
+// copyright notice, this list of conditions and the following disclaimer
+// in the documentation and/or other materials provided with the
+// distribution.
+//     * Neither the name of Google Inc. nor the names of its
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Author: wan@google.com (Zhanyong Wan)
+
+// Implements a subset of TR1 tuple needed by Google Test and Google Mock.
+
+#ifndef GTEST_INCLUDE_GTEST_INTERNAL_GTEST_TUPLE_H_
+#define GTEST_INCLUDE_GTEST_INTERNAL_GTEST_TUPLE_H_
+
+#include <utility>  // For ::std::pair.
+
+// The compiler used in Symbian has a bug that prevents us from declaring the
+// tuple template as a friend (it complains that tuple is redefined).  This
+// hack bypasses the bug by declaring the members that should otherwise be
+// private as public.  Otherwise the macro befriends every tuple<U0..U9>
+// and switches to private:.  Sun Studio versions < 12 also have the above bug.
+#if defined(__SYMBIAN32__) || (defined(__SUNPRO_CC) && __SUNPRO_CC < 0x590)
+# define GTEST_DECLARE_TUPLE_AS_FRIEND_ public:
+#else
+# define GTEST_DECLARE_TUPLE_AS_FRIEND_ \
+    template <GTEST_10_TYPENAMES_(U)> friend class tuple; \
+   private:
+#endif  // defined(__SYMBIAN32__) || old Sun Studio
+
+// GTEST_n_TUPLE_(T) is the n-tuple type tuple<T0..T(n-1)>, padded with void.
+#define GTEST_0_TUPLE_(T) tuple<>
+#define GTEST_1_TUPLE_(T) tuple<T##0, void, void, void, void, void, void, \
+    void, void, void>
+#define GTEST_2_TUPLE_(T) tuple<T##0, T##1, void, void, void, void, void, \
+    void, void, void>
+#define GTEST_3_TUPLE_(T) tuple<T##0, T##1, T##2, void, void, void, void, \
+    void, void, void>
+#define GTEST_4_TUPLE_(T) tuple<T##0, T##1, T##2, T##3, void, void, void, \
+    void, void, void>
+#define GTEST_5_TUPLE_(T) tuple<T##0, T##1, T##2, T##3, T##4, void, void, \
+    void, void, void>
+#define GTEST_6_TUPLE_(T) tuple<T##0, T##1, T##2, T##3, T##4, T##5, void, \
+    void, void, void>
+#define GTEST_7_TUPLE_(T) tuple<T##0, T##1, T##2, T##3, T##4, T##5, T##6, \
+    void, void, void>
+#define GTEST_8_TUPLE_(T) tuple<T##0, T##1, T##2, T##3, T##4, T##5, T##6, \
+    T##7, void, void>
+#define GTEST_9_TUPLE_(T) tuple<T##0, T##1, T##2, T##3, T##4, T##5, T##6, \
+    T##7, T##8, void>
+#define GTEST_10_TUPLE_(T) tuple<T##0, T##1, T##2, T##3, T##4, T##5, T##6, \
+    T##7, T##8, T##9>
+
+// GTEST_n_TYPENAMES_(T) declares n template parameters: typename T0..T(n-1).
+#define GTEST_0_TYPENAMES_(T)
+#define GTEST_1_TYPENAMES_(T) typename T##0
+#define GTEST_2_TYPENAMES_(T) typename T##0, typename T##1
+#define GTEST_3_TYPENAMES_(T) typename T##0, typename T##1, typename T##2
+#define GTEST_4_TYPENAMES_(T) typename T##0, typename T##1, typename T##2, \
+    typename T##3
+#define GTEST_5_TYPENAMES_(T) typename T##0, typename T##1, typename T##2, \
+    typename T##3, typename T##4
+#define GTEST_6_TYPENAMES_(T) typename T##0, typename T##1, typename T##2, \
+    typename T##3, typename T##4, typename T##5
+#define GTEST_7_TYPENAMES_(T) typename T##0, typename T##1, typename T##2, \
+    typename T##3, typename T##4, typename T##5, typename T##6
+#define GTEST_8_TYPENAMES_(T) typename T##0, typename T##1, typename T##2, \
+    typename T##3, typename T##4, typename T##5, typename T##6, typename T##7
+#define GTEST_9_TYPENAMES_(T) typename T##0, typename T##1, typename T##2, \
+    typename T##3, typename T##4, typename T##5, typename T##6, \
+    typename T##7, typename T##8
+#define GTEST_10_TYPENAMES_(T) typename T##0, typename T##1, typename T##2, \
+    typename T##3, typename T##4, typename T##5, typename T##6, \
+    typename T##7, typename T##8, typename T##9
+
+// In theory, defining stuff in the ::std namespace is undefined
+// behavior.  We can do this as we are playing the role of a standard
+// library vendor.
+namespace std {
+namespace tr1 {
+
+template <typename T0 = void, typename T1 = void, typename T2 = void,
+    typename T3 = void, typename T4 = void, typename T5 = void,
+    typename T6 = void, typename T7 = void, typename T8 = void,
+    typename T9 = void>  // Unused trailing slots default to void.
+class tuple;  // Forward declaration; up to 10 element types are supported.
+
+// Anything in namespace gtest_internal is Google Test's INTERNAL
+// IMPLEMENTATION DETAIL and MUST NOT BE USED DIRECTLY in user code.
+namespace gtest_internal {
+
+// ByRef<T>::type is T if T is a reference; otherwise it's const T&.
+template <typename T>
+struct ByRef { typedef const T& type; };  // NOLINT
+template <typename T>
+struct ByRef<T&> { typedef T& type; };  // NOLINT
+
+// A handy wrapper for ByRef.
+#define GTEST_BY_REF_(T) typename ::std::tr1::gtest_internal::ByRef<T>::type
+
+// AddRef<T>::type is T if T is a reference; otherwise it's T&.  This
+// is the same as tr1::add_reference<T>::type.
+template <typename T>
+struct AddRef { typedef T& type; };  // NOLINT
+template <typename T>
+struct AddRef<T&> { typedef T& type; };  // NOLINT
+
+// A handy wrapper for AddRef.
+#define GTEST_ADD_REF_(T) typename ::std::tr1::gtest_internal::AddRef<T>::type
+
+// A helper for implementing get<k>().
+template <int k> class Get;  // Declared only; Get<0>..Get<9> come later.
+
+// A helper for implementing tuple_element<k, T>.  kIndexValid is true
+// iff k < the number of fields in tuple type T.
+template <bool kIndexValid, int kIndex, class Tuple>
+struct TupleElement;  // Only kIndexValid == true is specialized below.
+
+template <GTEST_10_TYPENAMES_(T)>
+struct TupleElement<true, 0, GTEST_10_TUPLE_(T) > {
+  typedef T0 type;  // Type of element 0.
+};
+
+template <GTEST_10_TYPENAMES_(T)>
+struct TupleElement<true, 1, GTEST_10_TUPLE_(T) > {
+  typedef T1 type;  // Type of element 1.
+};
+
+template <GTEST_10_TYPENAMES_(T)>
+struct TupleElement<true, 2, GTEST_10_TUPLE_(T) > {
+  typedef T2 type;  // Type of element 2.
+};
+
+template <GTEST_10_TYPENAMES_(T)>
+struct TupleElement<true, 3, GTEST_10_TUPLE_(T) > {
+  typedef T3 type;  // Type of element 3.
+};
+
+template <GTEST_10_TYPENAMES_(T)>
+struct TupleElement<true, 4, GTEST_10_TUPLE_(T) > {
+  typedef T4 type;  // Type of element 4.
+};
+
+template <GTEST_10_TYPENAMES_(T)>
+struct TupleElement<true, 5, GTEST_10_TUPLE_(T) > {
+  typedef T5 type;  // Type of element 5.
+};
+
+template <GTEST_10_TYPENAMES_(T)>
+struct TupleElement<true, 6, GTEST_10_TUPLE_(T) > {
+  typedef T6 type;  // Type of element 6.
+};
+
+template <GTEST_10_TYPENAMES_(T)>
+struct TupleElement<true, 7, GTEST_10_TUPLE_(T) > {
+  typedef T7 type;  // Type of element 7.
+};
+
+template <GTEST_10_TYPENAMES_(T)>
+struct TupleElement<true, 8, GTEST_10_TUPLE_(T) > {
+  typedef T8 type;  // Type of element 8.
+};
+
+template <GTEST_10_TYPENAMES_(T)>
+struct TupleElement<true, 9, GTEST_10_TUPLE_(T) > {
+  typedef T9 type;  // Type of element 9.
+};
+
+}  // namespace gtest_internal
+
+template <>
+class tuple<> {  // The empty tuple: no fields, so every operation is a no-op.
+ public:
+  tuple() {}
+  tuple(const tuple& /* t */)  {}
+  tuple& operator=(const tuple& /* t */) { return *this; }
+};
+
+template <GTEST_1_TYPENAMES_(T)>
+class GTEST_1_TUPLE_(T) {  // 1-element specialization; stores field f0_.
+ public:
+  template <int k> friend class gtest_internal::Get;  // Get<k> reads fields.
+
+  tuple() : f0_() {}  // Value-initializes the field.
+
+  explicit tuple(GTEST_BY_REF_(T0) f0) : f0_(f0) {}  // From an element value.
+
+  tuple(const tuple& t) : f0_(t.f0_) {}  // Copy constructor.
+
+  template <GTEST_1_TYPENAMES_(U)>  // Converting copy from tuple<U0>.
+  tuple(const GTEST_1_TUPLE_(U)& t) : f0_(t.f0_) {}
+
+  tuple& operator=(const tuple& t) { return CopyFrom(t); }
+
+  template <GTEST_1_TYPENAMES_(U)>  // Converting assignment from tuple<U0>.
+  tuple& operator=(const GTEST_1_TUPLE_(U)& t) {
+    return CopyFrom(t);
+  }
+
+  GTEST_DECLARE_TUPLE_AS_FRIEND_  // friend + private: (public: on old compilers)
+
+  template <GTEST_1_TYPENAMES_(U)>
+  tuple& CopyFrom(const GTEST_1_TUPLE_(U)& t) {  // Field-wise assignment.
+    f0_ = t.f0_;
+    return *this;
+  }
+
+  T0 f0_;
+};
+
+template <GTEST_2_TYPENAMES_(T)>
+class GTEST_2_TUPLE_(T) {  // 2-element specialization; fields f0_ and f1_.
+ public:
+  template <int k> friend class gtest_internal::Get;  // Get<k> reads fields.
+
+  tuple() : f0_(), f1_() {}  // Value-initializes every field.
+
+  explicit tuple(GTEST_BY_REF_(T0) f0, GTEST_BY_REF_(T1) f1) : f0_(f0),
+      f1_(f1) {}
+
+  tuple(const tuple& t) : f0_(t.f0_), f1_(t.f1_) {}  // Copy constructor.
+
+  template <GTEST_2_TYPENAMES_(U)>  // Converting copy from tuple<U0, U1>.
+  tuple(const GTEST_2_TUPLE_(U)& t) : f0_(t.f0_), f1_(t.f1_) {}
+  template <typename U0, typename U1>  // Conversion from ::std::pair.
+  tuple(const ::std::pair<U0, U1>& p) : f0_(p.first), f1_(p.second) {}
+
+  tuple& operator=(const tuple& t) { return CopyFrom(t); }
+
+  template <GTEST_2_TYPENAMES_(U)>
+  tuple& operator=(const GTEST_2_TUPLE_(U)& t) {
+    return CopyFrom(t);
+  }
+  template <typename U0, typename U1>  // Assignment from ::std::pair.
+  tuple& operator=(const ::std::pair<U0, U1>& p) {
+    f0_ = p.first;
+    f1_ = p.second;
+    return *this;
+  }
+
+  GTEST_DECLARE_TUPLE_AS_FRIEND_  // friend + private: (public: on old compilers)
+
+  template <GTEST_2_TYPENAMES_(U)>
+  tuple& CopyFrom(const GTEST_2_TUPLE_(U)& t) {  // Field-wise assignment.
+    f0_ = t.f0_;
+    f1_ = t.f1_;
+    return *this;
+  }
+
+  T0 f0_;
+  T1 f1_;
+};
+
+template <GTEST_3_TYPENAMES_(T)>
+class GTEST_3_TUPLE_(T) {  // 3-element specialization; fields f0_..f2_.
+ public:
+  template <int k> friend class gtest_internal::Get;  // Get<k> reads fields.
+
+  tuple() : f0_(), f1_(), f2_() {}  // Value-initializes every field.
+
+  explicit tuple(GTEST_BY_REF_(T0) f0, GTEST_BY_REF_(T1) f1,
+      GTEST_BY_REF_(T2) f2) : f0_(f0), f1_(f1), f2_(f2) {}
+
+  tuple(const tuple& t) : f0_(t.f0_), f1_(t.f1_), f2_(t.f2_) {}
+
+  template <GTEST_3_TYPENAMES_(U)>  // Converting copy from tuple<U0..U2>.
+  tuple(const GTEST_3_TUPLE_(U)& t) : f0_(t.f0_), f1_(t.f1_), f2_(t.f2_) {}
+
+  tuple& operator=(const tuple& t) { return CopyFrom(t); }
+
+  template <GTEST_3_TYPENAMES_(U)>
+  tuple& operator=(const GTEST_3_TUPLE_(U)& t) {
+    return CopyFrom(t);
+  }
+
+  GTEST_DECLARE_TUPLE_AS_FRIEND_  // friend + private: (public: on old compilers)
+
+  template <GTEST_3_TYPENAMES_(U)>
+  tuple& CopyFrom(const GTEST_3_TUPLE_(U)& t) {  // Field-wise assignment.
+    f0_ = t.f0_;
+    f1_ = t.f1_;
+    f2_ = t.f2_;
+    return *this;
+  }
+
+  T0 f0_;
+  T1 f1_;
+  T2 f2_;
+};
+
+template <GTEST_4_TYPENAMES_(T)>
+class GTEST_4_TUPLE_(T) {  // 4-element specialization; fields f0_..f3_.
+ public:
+  template <int k> friend class gtest_internal::Get;  // Get<k> reads fields.
+
+  tuple() : f0_(), f1_(), f2_(), f3_() {}  // Value-initializes every field.
+
+  explicit tuple(GTEST_BY_REF_(T0) f0, GTEST_BY_REF_(T1) f1,
+      GTEST_BY_REF_(T2) f2, GTEST_BY_REF_(T3) f3) : f0_(f0), f1_(f1), f2_(f2),
+      f3_(f3) {}
+
+  tuple(const tuple& t) : f0_(t.f0_), f1_(t.f1_), f2_(t.f2_), f3_(t.f3_) {}
+
+  template <GTEST_4_TYPENAMES_(U)>  // Converting copy from tuple<U0..U3>.
+  tuple(const GTEST_4_TUPLE_(U)& t) : f0_(t.f0_), f1_(t.f1_), f2_(t.f2_),
+      f3_(t.f3_) {}
+
+  tuple& operator=(const tuple& t) { return CopyFrom(t); }
+
+  template <GTEST_4_TYPENAMES_(U)>
+  tuple& operator=(const GTEST_4_TUPLE_(U)& t) {
+    return CopyFrom(t);
+  }
+
+  GTEST_DECLARE_TUPLE_AS_FRIEND_  // friend + private: (public: on old compilers)
+
+  template <GTEST_4_TYPENAMES_(U)>
+  tuple& CopyFrom(const GTEST_4_TUPLE_(U)& t) {  // Field-wise assignment.
+    f0_ = t.f0_;
+    f1_ = t.f1_;
+    f2_ = t.f2_;
+    f3_ = t.f3_;
+    return *this;
+  }
+
+  T0 f0_;
+  T1 f1_;
+  T2 f2_;
+  T3 f3_;
+};
+
+template <GTEST_5_TYPENAMES_(T)>
+class GTEST_5_TUPLE_(T) {  // 5-element specialization; fields f0_..f4_.
+ public:
+  template <int k> friend class gtest_internal::Get;  // Get<k> reads fields.
+
+  tuple() : f0_(), f1_(), f2_(), f3_(), f4_() {}  // Value-initializes fields.
+
+  explicit tuple(GTEST_BY_REF_(T0) f0, GTEST_BY_REF_(T1) f1,
+      GTEST_BY_REF_(T2) f2, GTEST_BY_REF_(T3) f3,
+      GTEST_BY_REF_(T4) f4) : f0_(f0), f1_(f1), f2_(f2), f3_(f3), f4_(f4) {}
+
+  tuple(const tuple& t) : f0_(t.f0_), f1_(t.f1_), f2_(t.f2_), f3_(t.f3_),
+      f4_(t.f4_) {}
+
+  template <GTEST_5_TYPENAMES_(U)>  // Converting copy from tuple<U0..U4>.
+  tuple(const GTEST_5_TUPLE_(U)& t) : f0_(t.f0_), f1_(t.f1_), f2_(t.f2_),
+      f3_(t.f3_), f4_(t.f4_) {}
+
+  tuple& operator=(const tuple& t) { return CopyFrom(t); }
+
+  template <GTEST_5_TYPENAMES_(U)>
+  tuple& operator=(const GTEST_5_TUPLE_(U)& t) {
+    return CopyFrom(t);
+  }
+
+  GTEST_DECLARE_TUPLE_AS_FRIEND_  // friend + private: (public: on old compilers)
+
+  template <GTEST_5_TYPENAMES_(U)>
+  tuple& CopyFrom(const GTEST_5_TUPLE_(U)& t) {  // Field-wise assignment.
+    f0_ = t.f0_;
+    f1_ = t.f1_;
+    f2_ = t.f2_;
+    f3_ = t.f3_;
+    f4_ = t.f4_;
+    return *this;
+  }
+
+  T0 f0_;
+  T1 f1_;
+  T2 f2_;
+  T3 f3_;
+  T4 f4_;
+};
+
+template <GTEST_6_TYPENAMES_(T)>
+class GTEST_6_TUPLE_(T) {  // 6-element specialization; fields f0_..f5_.
+ public:
+  template <int k> friend class gtest_internal::Get;  // Get<k> reads fields.
+
+  tuple() : f0_(), f1_(), f2_(), f3_(), f4_(), f5_() {}  // Value-init all.
+
+  explicit tuple(GTEST_BY_REF_(T0) f0, GTEST_BY_REF_(T1) f1,
+      GTEST_BY_REF_(T2) f2, GTEST_BY_REF_(T3) f3, GTEST_BY_REF_(T4) f4,
+      GTEST_BY_REF_(T5) f5) : f0_(f0), f1_(f1), f2_(f2), f3_(f3), f4_(f4),
+      f5_(f5) {}
+
+  tuple(const tuple& t) : f0_(t.f0_), f1_(t.f1_), f2_(t.f2_), f3_(t.f3_),
+      f4_(t.f4_), f5_(t.f5_) {}
+
+  template <GTEST_6_TYPENAMES_(U)>  // Converting copy from tuple<U0..U5>.
+  tuple(const GTEST_6_TUPLE_(U)& t) : f0_(t.f0_), f1_(t.f1_), f2_(t.f2_),
+      f3_(t.f3_), f4_(t.f4_), f5_(t.f5_) {}
+
+  tuple& operator=(const tuple& t) { return CopyFrom(t); }
+
+  template <GTEST_6_TYPENAMES_(U)>
+  tuple& operator=(const GTEST_6_TUPLE_(U)& t) {
+    return CopyFrom(t);
+  }
+
+  GTEST_DECLARE_TUPLE_AS_FRIEND_  // friend + private: (public: on old compilers)
+
+  template <GTEST_6_TYPENAMES_(U)>
+  tuple& CopyFrom(const GTEST_6_TUPLE_(U)& t) {  // Field-wise assignment.
+    f0_ = t.f0_;
+    f1_ = t.f1_;
+    f2_ = t.f2_;
+    f3_ = t.f3_;
+    f4_ = t.f4_;
+    f5_ = t.f5_;
+    return *this;
+  }
+
+  T0 f0_;
+  T1 f1_;
+  T2 f2_;
+  T3 f3_;
+  T4 f4_;
+  T5 f5_;
+};
+
+template <GTEST_7_TYPENAMES_(T)>
+class GTEST_7_TUPLE_(T) {  // 7-element specialization; fields f0_..f6_.
+ public:
+  template <int k> friend class gtest_internal::Get;  // Get<k> reads fields.
+
+  tuple() : f0_(), f1_(), f2_(), f3_(), f4_(), f5_(), f6_() {}  // Value-init.
+
+  explicit tuple(GTEST_BY_REF_(T0) f0, GTEST_BY_REF_(T1) f1,
+      GTEST_BY_REF_(T2) f2, GTEST_BY_REF_(T3) f3, GTEST_BY_REF_(T4) f4,
+      GTEST_BY_REF_(T5) f5, GTEST_BY_REF_(T6) f6) : f0_(f0), f1_(f1), f2_(f2),
+      f3_(f3), f4_(f4), f5_(f5), f6_(f6) {}
+
+  tuple(const tuple& t) : f0_(t.f0_), f1_(t.f1_), f2_(t.f2_), f3_(t.f3_),
+      f4_(t.f4_), f5_(t.f5_), f6_(t.f6_) {}
+
+  template <GTEST_7_TYPENAMES_(U)>  // Converting copy from tuple<U0..U6>.
+  tuple(const GTEST_7_TUPLE_(U)& t) : f0_(t.f0_), f1_(t.f1_), f2_(t.f2_),
+      f3_(t.f3_), f4_(t.f4_), f5_(t.f5_), f6_(t.f6_) {}
+
+  tuple& operator=(const tuple& t) { return CopyFrom(t); }
+
+  template <GTEST_7_TYPENAMES_(U)>
+  tuple& operator=(const GTEST_7_TUPLE_(U)& t) {
+    return CopyFrom(t);
+  }
+
+  GTEST_DECLARE_TUPLE_AS_FRIEND_  // friend + private: (public: on old compilers)
+
+  template <GTEST_7_TYPENAMES_(U)>
+  tuple& CopyFrom(const GTEST_7_TUPLE_(U)& t) {  // Field-wise assignment.
+    f0_ = t.f0_;
+    f1_ = t.f1_;
+    f2_ = t.f2_;
+    f3_ = t.f3_;
+    f4_ = t.f4_;
+    f5_ = t.f5_;
+    f6_ = t.f6_;
+    return *this;
+  }
+
+  T0 f0_;
+  T1 f1_;
+  T2 f2_;
+  T3 f3_;
+  T4 f4_;
+  T5 f5_;
+  T6 f6_;
+};
+
+template <GTEST_8_TYPENAMES_(T)>
+class GTEST_8_TUPLE_(T) {  // 8-element specialization; fields f0_..f7_.
+ public:
+  template <int k> friend class gtest_internal::Get;  // Get<k> reads fields.
+
+  tuple() : f0_(), f1_(), f2_(), f3_(), f4_(), f5_(), f6_(), f7_() {}
+
+  explicit tuple(GTEST_BY_REF_(T0) f0, GTEST_BY_REF_(T1) f1,
+      GTEST_BY_REF_(T2) f2, GTEST_BY_REF_(T3) f3, GTEST_BY_REF_(T4) f4,
+      GTEST_BY_REF_(T5) f5, GTEST_BY_REF_(T6) f6,
+      GTEST_BY_REF_(T7) f7) : f0_(f0), f1_(f1), f2_(f2), f3_(f3), f4_(f4),
+      f5_(f5), f6_(f6), f7_(f7) {}
+
+  tuple(const tuple& t) : f0_(t.f0_), f1_(t.f1_), f2_(t.f2_), f3_(t.f3_),
+      f4_(t.f4_), f5_(t.f5_), f6_(t.f6_), f7_(t.f7_) {}
+
+  template <GTEST_8_TYPENAMES_(U)>  // Converting copy from tuple<U0..U7>.
+  tuple(const GTEST_8_TUPLE_(U)& t) : f0_(t.f0_), f1_(t.f1_), f2_(t.f2_),
+      f3_(t.f3_), f4_(t.f4_), f5_(t.f5_), f6_(t.f6_), f7_(t.f7_) {}
+
+  tuple& operator=(const tuple& t) { return CopyFrom(t); }
+
+  template <GTEST_8_TYPENAMES_(U)>
+  tuple& operator=(const GTEST_8_TUPLE_(U)& t) {
+    return CopyFrom(t);
+  }
+
+  GTEST_DECLARE_TUPLE_AS_FRIEND_  // friend + private: (public: on old compilers)
+
+  template <GTEST_8_TYPENAMES_(U)>
+  tuple& CopyFrom(const GTEST_8_TUPLE_(U)& t) {  // Field-wise assignment.
+    f0_ = t.f0_;
+    f1_ = t.f1_;
+    f2_ = t.f2_;
+    f3_ = t.f3_;
+    f4_ = t.f4_;
+    f5_ = t.f5_;
+    f6_ = t.f6_;
+    f7_ = t.f7_;
+    return *this;
+  }
+
+  T0 f0_;
+  T1 f1_;
+  T2 f2_;
+  T3 f3_;
+  T4 f4_;
+  T5 f5_;
+  T6 f6_;
+  T7 f7_;
+};
+
+template <GTEST_9_TYPENAMES_(T)>
+class GTEST_9_TUPLE_(T) {  // 9-element specialization; fields f0_..f8_.
+ public:
+  template <int k> friend class gtest_internal::Get;  // Get<k> reads fields.
+
+  tuple() : f0_(), f1_(), f2_(), f3_(), f4_(), f5_(), f6_(), f7_(), f8_() {}
+
+  explicit tuple(GTEST_BY_REF_(T0) f0, GTEST_BY_REF_(T1) f1,
+      GTEST_BY_REF_(T2) f2, GTEST_BY_REF_(T3) f3, GTEST_BY_REF_(T4) f4,
+      GTEST_BY_REF_(T5) f5, GTEST_BY_REF_(T6) f6, GTEST_BY_REF_(T7) f7,
+      GTEST_BY_REF_(T8) f8) : f0_(f0), f1_(f1), f2_(f2), f3_(f3), f4_(f4),
+      f5_(f5), f6_(f6), f7_(f7), f8_(f8) {}
+
+  tuple(const tuple& t) : f0_(t.f0_), f1_(t.f1_), f2_(t.f2_), f3_(t.f3_),
+      f4_(t.f4_), f5_(t.f5_), f6_(t.f6_), f7_(t.f7_), f8_(t.f8_) {}
+
+  template <GTEST_9_TYPENAMES_(U)>  // Converting copy from tuple<U0..U8>.
+  tuple(const GTEST_9_TUPLE_(U)& t) : f0_(t.f0_), f1_(t.f1_), f2_(t.f2_),
+      f3_(t.f3_), f4_(t.f4_), f5_(t.f5_), f6_(t.f6_), f7_(t.f7_), f8_(t.f8_) {}
+
+  tuple& operator=(const tuple& t) { return CopyFrom(t); }
+
+  template <GTEST_9_TYPENAMES_(U)>
+  tuple& operator=(const GTEST_9_TUPLE_(U)& t) {
+    return CopyFrom(t);
+  }
+
+  GTEST_DECLARE_TUPLE_AS_FRIEND_  // friend + private: (public: on old compilers)
+
+  template <GTEST_9_TYPENAMES_(U)>
+  tuple& CopyFrom(const GTEST_9_TUPLE_(U)& t) {  // Field-wise assignment.
+    f0_ = t.f0_;
+    f1_ = t.f1_;
+    f2_ = t.f2_;
+    f3_ = t.f3_;
+    f4_ = t.f4_;
+    f5_ = t.f5_;
+    f6_ = t.f6_;
+    f7_ = t.f7_;
+    f8_ = t.f8_;
+    return *this;
+  }
+
+  T0 f0_;
+  T1 f1_;
+  T2 f2_;
+  T3 f3_;
+  T4 f4_;
+  T5 f5_;
+  T6 f6_;
+  T7 f7_;
+  T8 f8_;
+};
+
+template <GTEST_10_TYPENAMES_(T)>
+class tuple {  // Primary template: the full 10-element tuple, fields f0_..f9_.
+ public:
+  template <int k> friend class gtest_internal::Get;  // Get<k> reads fields.
+
+  tuple() : f0_(), f1_(), f2_(), f3_(), f4_(), f5_(), f6_(), f7_(), f8_(),
+      f9_() {}  // Value-initializes every field.
+
+  explicit tuple(GTEST_BY_REF_(T0) f0, GTEST_BY_REF_(T1) f1,
+      GTEST_BY_REF_(T2) f2, GTEST_BY_REF_(T3) f3, GTEST_BY_REF_(T4) f4,
+      GTEST_BY_REF_(T5) f5, GTEST_BY_REF_(T6) f6, GTEST_BY_REF_(T7) f7,
+      GTEST_BY_REF_(T8) f8, GTEST_BY_REF_(T9) f9) : f0_(f0), f1_(f1), f2_(f2),
+      f3_(f3), f4_(f4), f5_(f5), f6_(f6), f7_(f7), f8_(f8), f9_(f9) {}
+
+  tuple(const tuple& t) : f0_(t.f0_), f1_(t.f1_), f2_(t.f2_), f3_(t.f3_),
+      f4_(t.f4_), f5_(t.f5_), f6_(t.f6_), f7_(t.f7_), f8_(t.f8_), f9_(t.f9_) {}
+
+  template <GTEST_10_TYPENAMES_(U)>  // Converting copy from tuple<U0..U9>.
+  tuple(const GTEST_10_TUPLE_(U)& t) : f0_(t.f0_), f1_(t.f1_), f2_(t.f2_),
+      f3_(t.f3_), f4_(t.f4_), f5_(t.f5_), f6_(t.f6_), f7_(t.f7_), f8_(t.f8_),
+      f9_(t.f9_) {}
+
+  tuple& operator=(const tuple& t) { return CopyFrom(t); }
+
+  template <GTEST_10_TYPENAMES_(U)>
+  tuple& operator=(const GTEST_10_TUPLE_(U)& t) {
+    return CopyFrom(t);
+  }
+
+  GTEST_DECLARE_TUPLE_AS_FRIEND_  // friend + private: (public: on old compilers)
+
+  template <GTEST_10_TYPENAMES_(U)>
+  tuple& CopyFrom(const GTEST_10_TUPLE_(U)& t) {  // Field-wise assignment.
+    f0_ = t.f0_;
+    f1_ = t.f1_;
+    f2_ = t.f2_;
+    f3_ = t.f3_;
+    f4_ = t.f4_;
+    f5_ = t.f5_;
+    f6_ = t.f6_;
+    f7_ = t.f7_;
+    f8_ = t.f8_;
+    f9_ = t.f9_;
+    return *this;
+  }
+
+  T0 f0_;
+  T1 f1_;
+  T2 f2_;
+  T3 f3_;
+  T4 f4_;
+  T5 f5_;
+  T6 f6_;
+  T7 f7_;
+  T8 f8_;
+  T9 f9_;
+};
+
+// 6.1.3.2 Tuple creation functions: make_tuple() overloads for 0..10 values.
+
+// Known limitations: we don't support passing an
+// std::tr1::reference_wrapper<T> to make_tuple().  And we don't
+// implement tie().
+
+inline tuple<> make_tuple() { return tuple<>(); }  // The empty tuple.
+
+template <GTEST_1_TYPENAMES_(T)>
+inline GTEST_1_TUPLE_(T) make_tuple(const T0& f0) {
+  return GTEST_1_TUPLE_(T)(f0);  // Arguments are copied into the new tuple.
+}
+
+template <GTEST_2_TYPENAMES_(T)>
+inline GTEST_2_TUPLE_(T) make_tuple(const T0& f0, const T1& f1) {
+  return GTEST_2_TUPLE_(T)(f0, f1);
+}
+
+template <GTEST_3_TYPENAMES_(T)>
+inline GTEST_3_TUPLE_(T) make_tuple(const T0& f0, const T1& f1, const T2& f2) {
+  return GTEST_3_TUPLE_(T)(f0, f1, f2);
+}
+
+template <GTEST_4_TYPENAMES_(T)>
+inline GTEST_4_TUPLE_(T) make_tuple(const T0& f0, const T1& f1, const T2& f2,
+    const T3& f3) {
+  return GTEST_4_TUPLE_(T)(f0, f1, f2, f3);
+}
+
+template <GTEST_5_TYPENAMES_(T)>
+inline GTEST_5_TUPLE_(T) make_tuple(const T0& f0, const T1& f1, const T2& f2,
+    const T3& f3, const T4& f4) {
+  return GTEST_5_TUPLE_(T)(f0, f1, f2, f3, f4);
+}
+
+template <GTEST_6_TYPENAMES_(T)>
+inline GTEST_6_TUPLE_(T) make_tuple(const T0& f0, const T1& f1, const T2& f2,
+    const T3& f3, const T4& f4, const T5& f5) {
+  return GTEST_6_TUPLE_(T)(f0, f1, f2, f3, f4, f5);
+}
+
+template <GTEST_7_TYPENAMES_(T)>
+inline GTEST_7_TUPLE_(T) make_tuple(const T0& f0, const T1& f1, const T2& f2,
+    const T3& f3, const T4& f4, const T5& f5, const T6& f6) {
+  return GTEST_7_TUPLE_(T)(f0, f1, f2, f3, f4, f5, f6);
+}
+
+template <GTEST_8_TYPENAMES_(T)>
+inline GTEST_8_TUPLE_(T) make_tuple(const T0& f0, const T1& f1, const T2& f2,
+    const T3& f3, const T4& f4, const T5& f5, const T6& f6, const T7& f7) {
+  return GTEST_8_TUPLE_(T)(f0, f1, f2, f3, f4, f5, f6, f7);
+}
+
+template <GTEST_9_TYPENAMES_(T)>
+inline GTEST_9_TUPLE_(T) make_tuple(const T0& f0, const T1& f1, const T2& f2,
+    const T3& f3, const T4& f4, const T5& f5, const T6& f6, const T7& f7,
+    const T8& f8) {
+  return GTEST_9_TUPLE_(T)(f0, f1, f2, f3, f4, f5, f6, f7, f8);
+}
+
+template <GTEST_10_TYPENAMES_(T)>
+inline GTEST_10_TUPLE_(T) make_tuple(const T0& f0, const T1& f1, const T2& f2,
+    const T3& f3, const T4& f4, const T5& f5, const T6& f6, const T7& f7,
+    const T8& f8, const T9& f9) {
+  return GTEST_10_TUPLE_(T)(f0, f1, f2, f3, f4, f5, f6, f7, f8, f9);
+}
+
+// 6.1.3.3 Tuple helper classes: tuple_size<Tuple>::value is the arity.
+
+template <typename Tuple> struct tuple_size;  // Specialized per arity below.
+
+template <GTEST_0_TYPENAMES_(T)>  // Expands to template <>.
+struct tuple_size<GTEST_0_TUPLE_(T) > {
+  static const int value = 0;
+};
+
+template <GTEST_1_TYPENAMES_(T)>
+struct tuple_size<GTEST_1_TUPLE_(T) > {
+  static const int value = 1;
+};
+
+template <GTEST_2_TYPENAMES_(T)>
+struct tuple_size<GTEST_2_TUPLE_(T) > {
+  static const int value = 2;
+};
+
+template <GTEST_3_TYPENAMES_(T)>
+struct tuple_size<GTEST_3_TUPLE_(T) > {
+  static const int value = 3;
+};
+
+template <GTEST_4_TYPENAMES_(T)>
+struct tuple_size<GTEST_4_TUPLE_(T) > {
+  static const int value = 4;
+};
+
+template <GTEST_5_TYPENAMES_(T)>
+struct tuple_size<GTEST_5_TUPLE_(T) > {
+  static const int value = 5;
+};
+
+template <GTEST_6_TYPENAMES_(T)>
+struct tuple_size<GTEST_6_TUPLE_(T) > {
+  static const int value = 6;
+};
+
+template <GTEST_7_TYPENAMES_(T)>
+struct tuple_size<GTEST_7_TUPLE_(T) > {
+  static const int value = 7;
+};
+
+template <GTEST_8_TYPENAMES_(T)>
+struct tuple_size<GTEST_8_TUPLE_(T) > {
+  static const int value = 8;
+};
+
+template <GTEST_9_TYPENAMES_(T)>
+struct tuple_size<GTEST_9_TUPLE_(T) > {
+  static const int value = 9;
+};
+
+template <GTEST_10_TYPENAMES_(T)>
+struct tuple_size<GTEST_10_TUPLE_(T) > {
+  static const int value = 10;
+};
+
+template <int k, class Tuple>
+struct tuple_element {  // tuple_element<k, Tuple>::type is the k-th field type.
+  typedef typename gtest_internal::TupleElement<
+      k < (tuple_size<Tuple>::value), k, Tuple>::type type;
+};
+
+#define GTEST_TUPLE_ELEMENT_(k, Tuple) typename tuple_element<k, Tuple >::type
+
+// 6.1.3.4 Element access: Get<k> exposes field fk_ of any tuple type.
+
+namespace gtest_internal {
+
+template <>
+class Get<0> {  // Accessors for field f0_.
+ public:
+  template <class Tuple>
+  static GTEST_ADD_REF_(GTEST_TUPLE_ELEMENT_(0, Tuple))
+  Field(Tuple& t) { return t.f0_; }  // NOLINT
+
+  template <class Tuple>
+  static GTEST_BY_REF_(GTEST_TUPLE_ELEMENT_(0, Tuple))
+  ConstField(const Tuple& t) { return t.f0_; }
+};
+
+template <>
+class Get<1> {  // Accessors for field f1_.
+ public:
+  template <class Tuple>
+  static GTEST_ADD_REF_(GTEST_TUPLE_ELEMENT_(1, Tuple))
+  Field(Tuple& t) { return t.f1_; }  // NOLINT
+
+  template <class Tuple>
+  static GTEST_BY_REF_(GTEST_TUPLE_ELEMENT_(1, Tuple))
+  ConstField(const Tuple& t) { return t.f1_; }
+};
+
+template <>
+class Get<2> {  // Accessors for field f2_.
+ public:
+  template <class Tuple>
+  static GTEST_ADD_REF_(GTEST_TUPLE_ELEMENT_(2, Tuple))
+  Field(Tuple& t) { return t.f2_; }  // NOLINT
+
+  template <class Tuple>
+  static GTEST_BY_REF_(GTEST_TUPLE_ELEMENT_(2, Tuple))
+  ConstField(const Tuple& t) { return t.f2_; }
+};
+
+template <>
+class Get<3> {  // Accessors for field f3_.
+ public:
+  template <class Tuple>
+  static GTEST_ADD_REF_(GTEST_TUPLE_ELEMENT_(3, Tuple))
+  Field(Tuple& t) { return t.f3_; }  // NOLINT
+
+  template <class Tuple>
+  static GTEST_BY_REF_(GTEST_TUPLE_ELEMENT_(3, Tuple))
+  ConstField(const Tuple& t) { return t.f3_; }
+};
+
+template <>
+class Get<4> {  // Accessors for field f4_.
+ public:
+  template <class Tuple>
+  static GTEST_ADD_REF_(GTEST_TUPLE_ELEMENT_(4, Tuple))
+  Field(Tuple& t) { return t.f4_; }  // NOLINT
+
+  template <class Tuple>
+  static GTEST_BY_REF_(GTEST_TUPLE_ELEMENT_(4, Tuple))
+  ConstField(const Tuple& t) { return t.f4_; }
+};
+
+template <>
+class Get<5> {  // Accessors for field f5_.
+ public:
+  template <class Tuple>
+  static GTEST_ADD_REF_(GTEST_TUPLE_ELEMENT_(5, Tuple))
+  Field(Tuple& t) { return t.f5_; }  // NOLINT
+
+  template <class Tuple>
+  static GTEST_BY_REF_(GTEST_TUPLE_ELEMENT_(5, Tuple))
+  ConstField(const Tuple& t) { return t.f5_; }
+};
+
+template <>
+class Get<6> {  // Accessors for field f6_.
+ public:
+  template <class Tuple>
+  static GTEST_ADD_REF_(GTEST_TUPLE_ELEMENT_(6, Tuple))
+  Field(Tuple& t) { return t.f6_; }  // NOLINT
+
+  template <class Tuple>
+  static GTEST_BY_REF_(GTEST_TUPLE_ELEMENT_(6, Tuple))
+  ConstField(const Tuple& t) { return t.f6_; }
+};
+
+template <>
+class Get<7> {  // Accessors for field f7_.
+ public:
+  template <class Tuple>
+  static GTEST_ADD_REF_(GTEST_TUPLE_ELEMENT_(7, Tuple))
+  Field(Tuple& t) { return t.f7_; }  // NOLINT
+
+  template <class Tuple>
+  static GTEST_BY_REF_(GTEST_TUPLE_ELEMENT_(7, Tuple))
+  ConstField(const Tuple& t) { return t.f7_; }
+};
+
+template <>
+class Get<8> {  // Accessors for field f8_.
+ public:
+  template <class Tuple>
+  static GTEST_ADD_REF_(GTEST_TUPLE_ELEMENT_(8, Tuple))
+  Field(Tuple& t) { return t.f8_; }  // NOLINT
+
+  template <class Tuple>
+  static GTEST_BY_REF_(GTEST_TUPLE_ELEMENT_(8, Tuple))
+  ConstField(const Tuple& t) { return t.f8_; }
+};
+
+template <>
+class Get<9> {  // Accessors for field f9_.
+ public:
+  template <class Tuple>
+  static GTEST_ADD_REF_(GTEST_TUPLE_ELEMENT_(9, Tuple))
+  Field(Tuple& t) { return t.f9_; }  // NOLINT
+
+  template <class Tuple>
+  static GTEST_BY_REF_(GTEST_TUPLE_ELEMENT_(9, Tuple))
+  ConstField(const Tuple& t) { return t.f9_; }
+};
+
+}  // namespace gtest_internal
+
+// Accesses field k of a non-const tuple by delegating to
+// gtest_internal::Get<k>::Field().
+template <int k, GTEST_10_TYPENAMES_(T)>
+GTEST_ADD_REF_(GTEST_TUPLE_ELEMENT_(k, GTEST_10_TUPLE_(T)))
+get(GTEST_10_TUPLE_(T)& tup) {
+  typedef gtest_internal::Get<k> Accessor;
+  return Accessor::Field(tup);
+}
+
+// Accesses field k of a const tuple by delegating to
+// gtest_internal::Get<k>::ConstField().
+template <int k, GTEST_10_TYPENAMES_(T)>
+GTEST_BY_REF_(GTEST_TUPLE_ELEMENT_(k,  GTEST_10_TUPLE_(T)))
+get(const GTEST_10_TUPLE_(T)& tup) {
+  typedef gtest_internal::Get<k> Accessor;
+  return Accessor::ConstField(tup);
+}
+
+// 6.1.3.5 Relational operators
+
+// We only implement == and !=, as we don't have a need for the rest yet.
+
+namespace gtest_internal {
+
+// SameSizeTuplePrefixComparator<k, k>::Eq(a, b) is true when the first k
+// fields of a compare equal to the first k fields of b.  Only the <k, k>
+// forms are defined, so naming the comparator with two different sizes
+// does not compile.
+template <int kSize1, int kSize2>
+struct SameSizeTuplePrefixComparator;
+
+// Base case: two empty prefixes are always equal.
+template <>
+struct SameSizeTuplePrefixComparator<0, 0> {
+  template <class Lhs, class Rhs>
+  static bool Eq(const Lhs& /* lhs */, const Rhs& /* rhs */) {
+    return true;
+  }
+};
+
+// Recursive case: checks the first k - 1 fields, then field k - 1.  The
+// last field is only compared when the shorter prefix already matched.
+template <int k>
+struct SameSizeTuplePrefixComparator<k, k> {
+  template <class Lhs, class Rhs>
+  static bool Eq(const Lhs& lhs, const Rhs& rhs) {
+    const bool shorter_prefix_equal =
+        SameSizeTuplePrefixComparator<k - 1, k - 1>::Eq(lhs, rhs);
+    return shorter_prefix_equal &&
+        ::std::tr1::get<k - 1>(lhs) == ::std::tr1::get<k - 1>(rhs);
+  }
+};
+
+}  // namespace gtest_internal
+
+// Compares two tuples field by field.  The comparator is only defined for
+// equal sizes, so comparing tuples of different sizes fails to compile.
+template <GTEST_10_TYPENAMES_(T), GTEST_10_TYPENAMES_(U)>
+inline bool operator==(const GTEST_10_TUPLE_(T)& lhs,
+                       const GTEST_10_TUPLE_(U)& rhs) {
+  typedef gtest_internal::SameSizeTuplePrefixComparator<
+      tuple_size<GTEST_10_TUPLE_(T) >::value,
+      tuple_size<GTEST_10_TUPLE_(U) >::value> Comparator;
+  return Comparator::Eq(lhs, rhs);
+}
+
+// Inequality is the negation of operator== on the same operands.
+template <GTEST_10_TYPENAMES_(T), GTEST_10_TYPENAMES_(U)>
+inline bool operator!=(const GTEST_10_TUPLE_(T)& lhs,
+                       const GTEST_10_TUPLE_(U)& rhs) {
+  return !(lhs == rhs);
+}
+
+// 6.1.4 Pairs.
+// Unimplemented.
+
+}  // namespace tr1
+}  // namespace std
+
+#undef GTEST_0_TUPLE_
+#undef GTEST_1_TUPLE_
+#undef GTEST_2_TUPLE_
+#undef GTEST_3_TUPLE_
+#undef GTEST_4_TUPLE_
+#undef GTEST_5_TUPLE_
+#undef GTEST_6_TUPLE_
+#undef GTEST_7_TUPLE_
+#undef GTEST_8_TUPLE_
+#undef GTEST_9_TUPLE_
+#undef GTEST_10_TUPLE_
+
+#undef GTEST_0_TYPENAMES_
+#undef GTEST_1_TYPENAMES_
+#undef GTEST_2_TYPENAMES_
+#undef GTEST_3_TYPENAMES_
+#undef GTEST_4_TYPENAMES_
+#undef GTEST_5_TYPENAMES_
+#undef GTEST_6_TYPENAMES_
+#undef GTEST_7_TYPENAMES_
+#undef GTEST_8_TYPENAMES_
+#undef GTEST_9_TYPENAMES_
+#undef GTEST_10_TYPENAMES_
+
+#undef GTEST_DECLARE_TUPLE_AS_FRIEND_
+#undef GTEST_BY_REF_
+#undef GTEST_ADD_REF_
+#undef GTEST_TUPLE_ELEMENT_
+
+#endif  // GTEST_INCLUDE_GTEST_INTERNAL_GTEST_TUPLE_H_
+# elif GTEST_ENV_HAS_STD_TUPLE_
+#  include <tuple>
+// C++11 puts its tuple into the ::std namespace rather than
+// ::std::tr1.  gtest expects tuple to live in ::std::tr1, so put it there.
+// This causes undefined behavior, but supported compilers react in
+// the way we intend.
+namespace std {
+namespace tr1 {
+// Re-export the ::std tuple facilities under ::std::tr1, where gtest looks
+// for them.
+using ::std::get;
+using ::std::make_tuple;
+using ::std::tuple;
+using ::std::tuple_element;
+using ::std::tuple_size;
+}  // namespace tr1
+}  // namespace std
+
+# elif GTEST_OS_SYMBIAN
+
+// On Symbian, BOOST_HAS_TR1_TUPLE causes Boost's TR1 tuple library to
+// use STLport's tuple implementation, which unfortunately doesn't
+// work as the copy of STLport distributed with Symbian is incomplete.
+// By making sure BOOST_HAS_TR1_TUPLE is undefined, we force Boost to
+// use its own tuple implementation.
+#  ifdef BOOST_HAS_TR1_TUPLE
+#   undef BOOST_HAS_TR1_TUPLE
+#  endif  // BOOST_HAS_TR1_TUPLE
+
+// This prevents <boost/tr1/detail/config.hpp>, which defines
+// BOOST_HAS_TR1_TUPLE, from being #included by Boost's <tuple>.
+#  define BOOST_TR1_DETAIL_CONFIG_HPP_INCLUDED
+#  include <tuple>
+
+# elif defined(__GNUC__) && (GTEST_GCC_VER_ >= 40000)
+// GCC 4.0+ implements tr1/tuple in the <tr1/tuple> header.  This does
+// not conform to the TR1 spec, which requires the header to be <tuple>.
+
+#  if !GTEST_HAS_RTTI && GTEST_GCC_VER_ < 40302
+// Until version 4.3.2, gcc has a bug that causes <tr1/functional>,
+// which is #included by <tr1/tuple>, to not compile when RTTI is
+// disabled.  _TR1_FUNCTIONAL is the header guard for
+// <tr1/functional>.  Hence the following #define is a hack to prevent
+// <tr1/functional> from being included.
+#   define _TR1_FUNCTIONAL 1
+#   include <tr1/tuple>
+#   undef _TR1_FUNCTIONAL  // Allows the user to #include
+                        // <tr1/functional> if he chooses to.
+#  else
+#   include <tr1/tuple>  // NOLINT
+#  endif  // !GTEST_HAS_RTTI && GTEST_GCC_VER_ < 40302
+
+# else
+// If the compiler is not GCC 4.0+, we assume the user is using a
+// spec-conforming TR1 implementation.
+#  include <tuple>  // NOLINT
+# endif  // GTEST_USE_OWN_TR1_TUPLE
+
+#endif  // GTEST_HAS_TR1_TUPLE
+
+// Determines whether clone(2) is supported.
+// Usually it will only be available on Linux, excluding
+// Linux on the Itanium architecture.
+// Also see http://linux.die.net/man/2/clone.
+#ifndef GTEST_HAS_CLONE
+// No user-provided value, so derive one from the target platform.
+
+# if !GTEST_OS_LINUX || defined(__ia64__)
+// Not Linux, or Linux on Itanium: no clone().
+#  define GTEST_HAS_CLONE 0
+# elif !GTEST_OS_LINUX_ANDROID
+// Regular (non-Android) Linux.
+#  define GTEST_HAS_CLONE 1
+# elif defined(__arm__) && __ANDROID_API__ >= 9
+// On Android, clone() is only available on ARM starting with Gingerbread.
+#  define GTEST_HAS_CLONE 1
+# else
+#  define GTEST_HAS_CLONE 0
+# endif  // !GTEST_OS_LINUX || defined(__ia64__)
+
+#endif  // GTEST_HAS_CLONE
+
+// Determines whether to support stream redirection. This is used to test
+// output correctness and to implement death tests.
+#ifndef GTEST_HAS_STREAM_REDIRECTION
+// A value supplied by the user takes precedence.  Otherwise, we assume
+// that stream redirection is supported on all platforms except known
+// mobile ones.
+# if GTEST_OS_WINDOWS_MOBILE || GTEST_OS_SYMBIAN
+#  define GTEST_HAS_STREAM_REDIRECTION 0
+# else
+#  define GTEST_HAS_STREAM_REDIRECTION 1
+# endif  // GTEST_OS_WINDOWS_MOBILE || GTEST_OS_SYMBIAN
+#endif  // GTEST_HAS_STREAM_REDIRECTION
+
+// Determines whether to support death tests.
+// Google Test does not support death tests for VC 7.1 and earlier as
+// abort() in a VC 7.1 application compiled as GUI in debug config
+// pops up a dialog window that cannot be suppressed programmatically.
+#if (GTEST_OS_LINUX || GTEST_OS_CYGWIN || GTEST_OS_SOLARIS || \
+     (GTEST_OS_MAC && !GTEST_OS_IOS) || GTEST_OS_IOS_SIMULATOR || \
+     (GTEST_OS_WINDOWS_DESKTOP && _MSC_VER >= 1400) || \
+     GTEST_OS_WINDOWS_MINGW || GTEST_OS_AIX || GTEST_OS_HPUX || \
+     GTEST_OS_OPENBSD || GTEST_OS_QNX)
+// On any platform not listed above GTEST_HAS_DEATH_TEST stays undefined.
+# define GTEST_HAS_DEATH_TEST 1
+// <vector> is used by the death-test declarations further down this file.
+# include <vector>  // NOLINT
+#endif  // platforms with death-test support
+
+// We don't support MSVC 7.1 with exceptions disabled now.  Therefore
+// all the compilers we care about are adequate for supporting
+// value-parameterized tests.
+#define GTEST_HAS_PARAM_TEST 1
+
+// Determines whether to support type-driven tests.
+
+// Typed tests need <typeinfo> and variadic macros, which GCC, VC++ 8.0,
+// Sun Pro CC, IBM Visual Age, and HP aCC support.
+// On other compilers both macros are left undefined.
+#if defined(__GNUC__) || (_MSC_VER >= 1400) || defined(__SUNPRO_CC) || \
+    defined(__IBMCPP__) || defined(__HP_aCC)
+# define GTEST_HAS_TYPED_TEST 1
+# define GTEST_HAS_TYPED_TEST_P 1
+#endif  // compilers known to support typed tests
+
+// Determines whether to support Combine(). This only makes sense when
+// value-parameterized tests are enabled.  The implementation doesn't
+// work on Sun Studio since it doesn't understand templated conversion
+// operators.
+// GTEST_HAS_COMBINE is left undefined when any of the conditions fails.
+#if GTEST_HAS_PARAM_TEST && GTEST_HAS_TR1_TUPLE && !defined(__SUNPRO_CC)
+# define GTEST_HAS_COMBINE 1
+#endif  // GTEST_HAS_PARAM_TEST && GTEST_HAS_TR1_TUPLE && !__SUNPRO_CC
+
+// Determines whether the system compiler uses UTF-16 for encoding wide strings.
+#define GTEST_WIDE_STRING_USES_UTF16_ \
+    (GTEST_OS_WINDOWS || GTEST_OS_CYGWIN || GTEST_OS_SYMBIAN || GTEST_OS_AIX)
+
+// Determines whether test results can be streamed to a socket.
+// Only defined on Linux; left undefined everywhere else.
+#if GTEST_OS_LINUX
+# define GTEST_CAN_STREAM_RESULTS_ 1
+#endif  // GTEST_OS_LINUX
+
+// Defines some utility macros.
+
+// The GNU compiler emits a warning if nested "if" statements are followed by
+// an "else" statement and braces are not used to explicitly disambiguate the
+// "else" binding.  This leads to problems with code like:
+//
+//   if (gate)
+//     ASSERT_*(condition) << "Some message";
+//
+// The "switch (0) case 0:" idiom is used to suppress this.
+#ifdef __INTEL_COMPILER
+// With the Intel compiler the blocker expands to nothing.
+# define GTEST_AMBIGUOUS_ELSE_BLOCKER_
+#else
+// Everywhere else it is a switch whose only labels precede the statement
+// that follows the macro.
+# define GTEST_AMBIGUOUS_ELSE_BLOCKER_ switch (0) case 0: default:  // NOLINT
+#endif  // __INTEL_COMPILER
+
+// Use this annotation at the end of a struct/class definition to
+// prevent the compiler from optimizing away instances that are never
+// used.  This is useful when all interesting logic happens inside the
+// c'tor and / or d'tor.  Example:
+//
+//   struct Foo {
+//     Foo() { ... }
+//   } GTEST_ATTRIBUTE_UNUSED_;
+//
+// Also use it after a variable or parameter declaration to tell the
+// compiler the variable/parameter does not have to be used.
+// The attribute is emitted for GNU-compatible compilers unless COMPILER_ICC
+// is defined; otherwise the macro expands to nothing.
+#if defined(__GNUC__) && !defined(COMPILER_ICC)
+# define GTEST_ATTRIBUTE_UNUSED_ __attribute__ ((unused))
+#else
+# define GTEST_ATTRIBUTE_UNUSED_
+#endif  // __GNUC__ && !COMPILER_ICC
+
+// A macro to disallow operator=
+// This should be used in the private: declarations for a class.
+// It declares a copy-assignment operator without defining it; the
+// expansion has no trailing semicolon, so the user supplies one.
+#define GTEST_DISALLOW_ASSIGN_(type)\
+  void operator=(type const &)
+
+// A macro to disallow copy constructor and operator=
+// This should be used in the private: declarations for a class.
+// It declares a copy constructor without defining it, then reuses
+// GTEST_DISALLOW_ASSIGN_ for the assignment operator; the expansion has no
+// trailing semicolon, so the user supplies one.
+#define GTEST_DISALLOW_COPY_AND_ASSIGN_(type)\
+  type(type const &);\
+  GTEST_DISALLOW_ASSIGN_(type)
+
+// Tell the compiler to warn about unused return values for functions declared
+// with this macro.  The macro should be used on function declarations
+// following the argument list:
+//
+//   Sprocket* AllocateSprocket() GTEST_MUST_USE_RESULT_;
+// The attribute needs a GNU-compatible compiler of version 3.4 or later
+// that is not ICC; in every other configuration the macro is empty.
+#if !defined(__GNUC__) || (GTEST_GCC_VER_ < 30400) || defined(COMPILER_ICC)
+# define GTEST_MUST_USE_RESULT_
+#else
+# define GTEST_MUST_USE_RESULT_ __attribute__ ((warn_unused_result))
+#endif  // !__GNUC__ || (GTEST_GCC_VER_ < 30400) || COMPILER_ICC
+
+// Determine whether the compiler supports Microsoft's Structured Exception
+// Handling.  This is supported by several Windows compilers but generally
+// does not exist on any other system.
+#if !defined(GTEST_HAS_SEH)
+// No user-provided value, so pick one based on the compiler.
+
+# if !defined(_MSC_VER) && !defined(__BORLANDC__)
+// Neither of the two compilers known to support SEH: assume no SEH.
+#  define GTEST_HAS_SEH 0
+# else
+// MSVC and Borland are known to support SEH.
+#  define GTEST_HAS_SEH 1
+# endif
+
+#endif  // GTEST_HAS_SEH
+
+// GTEST_API_ marks symbols that cross a shared-library boundary.  Under
+// MSVC it becomes dllimport when linking against the shared library and
+// dllexport when building it.
+#ifdef _MSC_VER
+
+# if GTEST_LINKED_AS_SHARED_LIBRARY
+#  define GTEST_API_ __declspec(dllimport)
+# elif GTEST_CREATE_SHARED_LIBRARY
+#  define GTEST_API_ __declspec(dllexport)
+# endif
+
+#endif  // _MSC_VER
+
+// In every other case (including MSVC with neither flag set) it is empty.
+#ifndef GTEST_API_
+# define GTEST_API_
+#endif
+
+// GTEST_NO_INLINE_ asks the compiler to never inline a given function.
+#if !defined(__GNUC__)
+// No known spelling for this compiler: expand to nothing.
+# define GTEST_NO_INLINE_
+#else
+# define GTEST_NO_INLINE_ __attribute__((noinline))
+#endif  // !defined(__GNUC__)
+
+// GTEST_HAS_CXXABI_H_ is 1 when the standard library is libstdc++
+// (__GLIBCXX__) or LLVM's libc++ (_LIBCPP_VERSION), and 0 otherwise.
+#if !defined(__GLIBCXX__) && !defined(_LIBCPP_VERSION)
+# define GTEST_HAS_CXXABI_H_ 0
+#else
+# define GTEST_HAS_CXXABI_H_ 1
+#endif
+
+namespace testing {
+
+class Message;
+
+namespace internal {
+
+// A secret type that Google Test users don't know about.  It has no
+// definition on purpose.  Therefore it's impossible to create a
+// Secret object, which is what we want.
+class Secret;
+
+// The GTEST_COMPILE_ASSERT_ macro can be used to verify that a compile time
+// expression is true. For example, you could use it to verify the
+// size of a static array:
+//
+//   GTEST_COMPILE_ASSERT_(ARRAYSIZE(content_type_names) == CONTENT_NUM_TYPES,
+//                         content_type_names_incorrect_size);
+//
+// or to make sure a struct is smaller than a certain size:
+//
+//   GTEST_COMPILE_ASSERT_(sizeof(foo) < 128, foo_too_large);
+//
+// The second argument to the macro is the name of the variable. If
+// the expression is false, most compilers will issue a warning/error
+// containing the name of the variable.
+
+// Empty helper template; it exists only so that GTEST_COMPILE_ASSERT_ can
+// use the asserted expression as a template argument.
+template <bool>
+struct CompileAssert {
+};
+
+// Declares a typedef named msg for an array of CompileAssert objects whose
+// length is 1 when expr is true and -1 (ill-formed) when it is false.
+#define GTEST_COMPILE_ASSERT_(expr, msg) \
+  typedef ::testing::internal::CompileAssert<(static_cast<bool>(expr))> \
+      msg[static_cast<bool>(expr) ? 1 : -1] GTEST_ATTRIBUTE_UNUSED_
+
+// Implementation details of GTEST_COMPILE_ASSERT_:
+//
+// - GTEST_COMPILE_ASSERT_ works by defining an array type that has -1
+//   elements (and thus is invalid) when the expression is false.
+//
+// - The simpler definition
+//
+//    #define GTEST_COMPILE_ASSERT_(expr, msg) typedef char msg[(expr) ? 1 : -1]
+//
+//   does not work, as gcc supports variable-length arrays whose sizes
+//   are determined at run-time (this is gcc's extension and not part
+//   of the C++ standard).  As a result, gcc fails to reject the
+//   following code with the simple definition:
+//
+//     int foo;
+//     GTEST_COMPILE_ASSERT_(foo, msg); // not supposed to compile as foo is
+//                                      // not a compile-time constant.
+//
+// - By using the type CompileAssert<(bool(expr))>, we ensure that
+//   expr is a compile-time constant.  (Template arguments must be
+//   determined at compile-time.)
+//
+// - The outer parentheses in CompileAssert<(bool(expr))> are necessary
+//   to work around a bug in gcc 3.4.4 and 4.0.1.  If we had written
+//
+//     CompileAssert<bool(expr)>
+//
+//   instead, these compilers will refuse to compile
+//
+//     GTEST_COMPILE_ASSERT_(5 > 0, some_message);
+//
+//   (They seem to think the ">" in "5 > 0" marks the end of the
+//   template argument list.)
+//
+// - The array size is (bool(expr) ? 1 : -1), instead of simply
+//
+//     ((expr) ? 1 : -1).
+//
+//   This is to avoid running into a bug in MS VC 7.1, which
+//   causes ((0.0) ? 1 : -1) to incorrectly evaluate to 1.
+
+// StaticAssertTypeEqHelper is used by StaticAssertTypeEq defined in gtest.h.
+//
+// This template is declared, but intentionally undefined.
+// Primary template: declared only, so using it with two different types
+// names an incomplete type.
+template <typename T1, typename T2>
+struct StaticAssertTypeEqHelper;
+
+// Specialization for identical types: this one is defined (and empty).
+template <typename T>
+struct StaticAssertTypeEqHelper<T, T> {};
+
+// testing::internal::string aliases the global ::string when
+// GTEST_HAS_GLOBAL_STRING is set, and ::std::string otherwise.
+#if GTEST_HAS_GLOBAL_STRING
+typedef ::string string;
+#else
+typedef ::std::string string;
+#endif  // GTEST_HAS_GLOBAL_STRING
+
+// testing::internal::wstring is chosen the same way, but is not declared
+// at all when neither wide-string flag is set.
+#if GTEST_HAS_GLOBAL_WSTRING
+typedef ::wstring wstring;
+#elif GTEST_HAS_STD_WSTRING
+typedef ::std::wstring wstring;
+#endif  // GTEST_HAS_GLOBAL_WSTRING
+
+// A helper for suppressing warnings on constant condition.  It just
+// returns 'condition'.
+GTEST_API_ bool IsTrue(bool condition);
+
+// Defines scoped_ptr.
+
+// This implementation of scoped_ptr is PARTIAL - it only contains
+// enough stuff to satisfy Google Test's need.
+// Owns a single heap object and deletes it on destruction or reset().
+// Copying and assignment are disallowed.
+template <typename T>
+class scoped_ptr {
+ public:
+  typedef T element_type;
+
+  // Takes ownership of p (which may be NULL).
+  explicit scoped_ptr(T* p = NULL) : ptr_(p) {}
+  // Deletes the owned object, if any.
+  ~scoped_ptr() { reset(); }
+
+  // Observers; none of them transfers ownership.
+  T& operator*() const { return *ptr_; }
+  T* operator->() const { return ptr_; }
+  T* get() const { return ptr_; }
+
+  // Gives up ownership: returns the held pointer and leaves this object
+  // holding NULL.  Nothing is deleted.
+  T* release() {
+    T* const released = ptr_;
+    ptr_ = NULL;
+    return released;
+  }
+
+  // Replaces the held pointer with p, deleting the previous object.
+  // Resetting to the pointer already held does nothing.
+  void reset(T* p = NULL) {
+    if (p == ptr_) {
+      return;
+    }
+    // sizeof(T) makes sure T is a complete type at the point of deletion.
+    if (IsTrue(sizeof(T) > 0)) {
+      delete ptr_;
+    }
+    ptr_ = p;
+  }
+
+ private:
+  T* ptr_;
+
+  GTEST_DISALLOW_COPY_AND_ASSIGN_(scoped_ptr);
+};
+
+// Defines RE.
+
+// A simple C++ wrapper for <regex.h>.  It uses the POSIX Extended
+// Regular Expression syntax.
+class GTEST_API_ RE {
+ public:
+  // A copy constructor is required by the Standard to initialize object
+  // references from r-values.
+  // The copy is built by re-running Init() on the other object's pattern.
+  RE(const RE& other) { Init(other.pattern()); }
+
+  // Constructs an RE from a string.
+  RE(const ::std::string& regex) { Init(regex.c_str()); }  // NOLINT
+
+#if GTEST_HAS_GLOBAL_STRING
+
+  // Same as above, for the global ::string type.
+  RE(const ::string& regex) { Init(regex.c_str()); }  // NOLINT
+
+#endif  // GTEST_HAS_GLOBAL_STRING
+
+  // Constructs an RE from a C string.
+  RE(const char* regex) { Init(regex); }  // NOLINT
+  // Declared here, defined outside this header.
+  ~RE();
+
+  // Returns the string representation of the regex.
+  const char* pattern() const { return pattern_; }
+
+  // FullMatch(str, re) returns true iff regular expression re matches
+  // the entire str.
+  // PartialMatch(str, re) returns true iff regular expression re
+  // matches a substring of str (including str itself).
+  //
+  // TODO(wan@google.com): make FullMatch() and PartialMatch() work
+  // when str contains NUL characters.
+  // The string overloads forward to the const char* overloads via c_str().
+  static bool FullMatch(const ::std::string& str, const RE& re) {
+    return FullMatch(str.c_str(), re);
+  }
+  static bool PartialMatch(const ::std::string& str, const RE& re) {
+    return PartialMatch(str.c_str(), re);
+  }
+
+#if GTEST_HAS_GLOBAL_STRING
+
+  static bool FullMatch(const ::string& str, const RE& re) {
+    return FullMatch(str.c_str(), re);
+  }
+  static bool PartialMatch(const ::string& str, const RE& re) {
+    return PartialMatch(str.c_str(), re);
+  }
+
+#endif  // GTEST_HAS_GLOBAL_STRING
+
+  // The matching itself is implemented outside this header.
+  static bool FullMatch(const char* str, const RE& re);
+  static bool PartialMatch(const char* str, const RE& re);
+
+ private:
+  // Shared set-up routine called by every constructor; defined elsewhere.
+  void Init(const char* regex);
+
+  // We use a const char* instead of an std::string, as Google Test used to be
+  // used where std::string is not available.  TODO(wan@google.com): change to
+  // std::string.
+  const char* pattern_;
+  // NOTE(review): presumably records whether the pattern compiled; it is
+  // only written outside this header -- confirm in Init().
+  bool is_valid_;
+
+#if GTEST_USES_POSIX_RE
+
+  regex_t full_regex_;     // For FullMatch().
+  regex_t partial_regex_;  // For PartialMatch().
+
+#else  // GTEST_USES_SIMPLE_RE
+
+  const char* full_pattern_;  // For FullMatch();
+
+#endif  // GTEST_USES_POSIX_RE
+
+  // Copy construction is allowed (see above); assignment is not.
+  GTEST_DISALLOW_ASSIGN_(RE);
+};
+
+// Formats a source file path and a line number as they would appear
+// in an error message from the compiler used to compile this code.
+GTEST_API_ ::std::string FormatFileLocation(const char* file, int line);
+
+// Formats a file location for compiler-independent XML output.
+// Although this function is not platform dependent, we put it next to
+// FormatFileLocation in order to contrast the two functions.
+GTEST_API_ ::std::string FormatCompilerIndependentFileLocation(const char* file,
+                                                               int line);
+
+// Defines logging utilities:
+//   GTEST_LOG_(severity) - logs messages at the specified severity level. The
+//                          message itself is streamed into the macro.
+//   LogToStderr()  - directs all log messages to stderr.
+//   FlushInfoLog() - flushes informational log messages.
+
+// Severity levels accepted by GTEST_LOG_.  The macro pastes its argument
+// onto "GTEST_", so GTEST_LOG_(FATAL) selects GTEST_FATAL.
+enum GTestLogSeverity {
+  GTEST_INFO,
+  GTEST_WARNING,
+  GTEST_ERROR,
+  GTEST_FATAL  // Logging at this level aborts the program (see ~GTestLog).
+};
+
+// Formats log entry severity, provides a stream object for streaming the
+// log message, and terminates the message with a newline when going out of
+// scope.
+class GTEST_API_ GTestLog {
+ public:
+  // Starts a log entry for the given severity and source location; defined
+  // outside this header.
+  GTestLog(GTestLogSeverity severity, const char* file, int line);
+
+  // Flushes the buffers and, if severity is GTEST_FATAL, aborts the program.
+  ~GTestLog();
+
+  // Stream the message is written to; this is always ::std::cerr.
+  ::std::ostream& GetStream() { return ::std::cerr; }
+
+ private:
+  // Severity this entry was created with.
+  const GTestLogSeverity severity_;
+
+  GTEST_DISALLOW_COPY_AND_ASSIGN_(GTestLog);
+};
+
+// Creates a temporary GTestLog tagged with the caller's file and line and
+// yields its stream, so the message can be appended with <<.  The temporary
+// is destroyed at the end of the full expression.
+#define GTEST_LOG_(severity) \
+    ::testing::internal::GTestLog(::testing::internal::GTEST_##severity, \
+                                  __FILE__, __LINE__).GetStream()
+
+// Does nothing: GTestLog already writes to ::std::cerr.
+inline void LogToStderr() {}
+// fflush(NULL) flushes every open output stream.
+inline void FlushInfoLog() { fflush(NULL); }
+
+// INTERNAL IMPLEMENTATION - DO NOT USE.
+//
+// GTEST_CHECK_ is an all-mode assert. It aborts the program if the condition
+// is not satisfied.
+//  Synopsis:
+//    GTEST_CHECK_(boolean_condition);
+//     or
+//    GTEST_CHECK_(boolean_condition) << "Additional message";
+//
+//    This checks the condition and if the condition is not satisfied
+//    it prints message about the condition violation, including the
+//    condition itself, plus additional message streamed into it, if any,
+//    and then it aborts the program. It aborts the program irrespective of
+//    whether it is built in the debug mode or not.
+// The success branch is an empty statement and the failure branch ends in
+// an open stream expression, so the caller may append "<< message".
+// GTEST_AMBIGUOUS_ELSE_BLOCKER_ keeps this if/else from pairing with an
+// enclosing 'if'.
+#define GTEST_CHECK_(condition) \
+    GTEST_AMBIGUOUS_ELSE_BLOCKER_ \
+    if (::testing::internal::IsTrue(condition)) \
+      ; \
+    else \
+      GTEST_LOG_(FATAL) << "Condition " #condition " failed. "
+
+// An all-mode assert to verify that the given POSIX-style function
+// call returns 0 (indicating success).  Known limitation: this
+// doesn't expand to a balanced 'if' statement, so enclose the macro
+// in {} if you need to use it as the only statement in an 'if'
+// branch.
+#define GTEST_CHECK_POSIX_SUCCESS_(posix_call) \
+  if (const int gtest_error = (posix_call)) \
+    GTEST_LOG_(FATAL) << #posix_call << "failed with error " \
+                      << gtest_error
+
+// INTERNAL IMPLEMENTATION - DO NOT USE IN USER CODE.
+//
+// Use ImplicitCast_ as a safe version of static_cast for upcasting in
+// the type hierarchy (e.g. casting a Foo* to a SuperclassOfFoo* or a
+// const Foo*).  When you use ImplicitCast_, the compiler checks that
+// the cast is safe.  Such explicit ImplicitCast_s are necessary in
+// surprisingly many situations where C++ demands an exact type match
+// instead of an argument type convertible to a target type.
+//
+// The syntax for using ImplicitCast_ is the same as for static_cast:
+//
+//   ImplicitCast_<ToType>(expr)
+//
+// ImplicitCast_ would have been part of the C++ standard library,
+// but the proposal was submitted too late.  It will probably make
+// its way into the language in the future.
+//
+// This relatively ugly name is intentional. It prevents clashes with
+// similar functions users may have (e.g., implicit_cast). The internal
+// namespace alone is not enough because the function can be found by ADL.
+// Returns its argument unchanged.  Because the parameter already has type
+// To, any conversion happens implicitly at the call site.
+template <typename To>
+inline To ImplicitCast_(To x) {
+  return x;
+}
+
+// When you upcast (that is, cast a pointer from type Foo to type
+// SuperclassOfFoo), it's fine to use ImplicitCast_<>, since upcasts
+// always succeed.  When you downcast (that is, cast a pointer from
+// type Foo to type SubclassOfFoo), static_cast<> isn't safe, because
+// how do you know the pointer is really of type SubclassOfFoo?  It
+// could be a bare Foo, or of type DifferentSubclassOfFoo.  Thus,
+// when you downcast, you should use this macro.  In debug mode, we
+// use dynamic_cast<> to double-check the downcast is legal (we die
+// if it's not).  In normal mode, we do the efficient static_cast<>
+// instead.  Thus, it's important to test in debug mode to make sure
+// the cast is legal!
+//    This is the only place in the code we should use dynamic_cast<>.
+// In particular, you SHOULDN'T be using dynamic_cast<> in order to
+// do RTTI, e.g. code like this:
+//    if (dynamic_cast<Subclass1>(foo)) HandleASubclass1Object(foo);
+//    if (dynamic_cast<Subclass2>(foo)) HandleASubclass2Object(foo);
+// You should design the code some other way not to need this.
+//
+// This relatively ugly name is intentional. It prevents clashes with
+// similar functions users may have (e.g., down_cast). The internal
+// namespace alone is not enough because the function can be found by ADL.
+template<typename To, typename From>  // use like this: DownCast_<T*>(foo);
+inline To DownCast_(From* f) {  // so we only accept pointers
+  // Compile-time check only: this branch never executes, but it must still
+  // compile, which requires To to be implicitly convertible to From*.
+  if (false) {
+    const To checked_target = NULL;
+    ::testing::internal::ImplicitCast_<From*>(checked_target);
+  }
+
+#if GTEST_HAS_RTTI
+  // With RTTI available, verify at run time that a non-NULL f really
+  // points at a To.
+  GTEST_CHECK_(f == NULL || dynamic_cast<To>(f) != NULL);
+#endif
+  return static_cast<To>(f);
+}
+
+// Downcasts the pointer of type Base to Derived.
+// Derived must be a subclass of Base. The parameter MUST
+// point to a class of type Derived, not any subclass of it.
+// When RTTI is available, the function performs a runtime
+// check to enforce this.
+template <class Derived, class Base>
+Derived* CheckedDowncastToActualType(Base* base) {
+#if GTEST_HAS_RTTI
+  // The dynamic type must be exactly Derived, not a subclass of it.
+  GTEST_CHECK_(typeid(*base) == typeid(Derived));
+  Derived* const derived = dynamic_cast<Derived*>(base);  // NOLINT
+#else
+  // Without RTTI the cast cannot be verified.
+  Derived* const derived = static_cast<Derived*>(base);  // Poor man's downcast.
+#endif
+  return derived;
+}
+
+#if GTEST_HAS_STREAM_REDIRECTION
+
+// Defines the stdout and stderr capturers:
+//   CaptureStdout     - starts capturing stdout.
+//   GetCapturedStdout - stops capturing stdout and returns the captured string.
+//   CaptureStderr     - starts capturing stderr.
+//   GetCapturedStderr - stops capturing stderr and returns the captured string.
+//
+GTEST_API_ void CaptureStdout();
+GTEST_API_ std::string GetCapturedStdout();
+GTEST_API_ void CaptureStderr();
+GTEST_API_ std::string GetCapturedStderr();
+
+#endif  // GTEST_HAS_STREAM_REDIRECTION
+
+
+#if GTEST_HAS_DEATH_TEST
+
+const ::std::vector<testing::internal::string>& GetInjectableArgvs();
+void SetInjectableArgvs(const ::std::vector<testing::internal::string>*
+                             new_argvs);
+
+// A copy of all command line arguments.  Set by InitGoogleTest().
+extern ::std::vector<testing::internal::string> g_argvs;
+
+#endif  // GTEST_HAS_DEATH_TEST
+
+// Defines synchronization primitives.
+
+#if GTEST_HAS_PTHREAD
+
+// Sleeps for (roughly) n milli-seconds.  This function is only for
+// testing Google Test's own constructs.  Don't use it in user tests,
+// either directly or indirectly.
+//
+// The delay is split into whole seconds and a sub-second remainder because
+// nanosleep() rejects a tv_nsec outside [0, 999999999]: putting all of n
+// into tv_nsec would make any n >= 1000 fail with EINVAL without sleeping.
+inline void SleepMilliseconds(int n) {
+  const timespec time = {
+    n / 1000,                    // Whole seconds.
+    (n % 1000) * 1000L * 1000L,  // Remaining milliseconds, as nanoseconds.
+  };
+  nanosleep(&time, NULL);
+}
+
+// Allows a controller thread to pause execution of newly created
+// threads until notified.  Instances of this class must be created
+// and destroyed in the controller thread.
+//
+// This class is only for testing Google Test's own constructs. Do not
+// use it in user tests, either directly or indirectly.
+class Notification {
+ public:
+  // Starts out un-notified with a freshly initialized mutex.
+  Notification() : notified_(false) {
+    GTEST_CHECK_POSIX_SUCCESS_(pthread_mutex_init(&mutex_, NULL));
+  }
+  ~Notification() {
+    pthread_mutex_destroy(&mutex_);
+  }
+
+  // Releases every thread waiting on this notification.  Must be called
+  // from the controller thread.
+  void Notify() {
+    pthread_mutex_lock(&mutex_);
+    notified_ = true;
+    pthread_mutex_unlock(&mutex_);
+  }
+
+  // Polls the flag under the mutex until Notify() has been called,
+  // sleeping 10 ms between polls.  Must be called from a test thread.
+  void WaitForNotification() {
+    bool seen_notification = false;
+    while (!seen_notification) {
+      pthread_mutex_lock(&mutex_);
+      seen_notification = notified_;
+      pthread_mutex_unlock(&mutex_);
+      if (!seen_notification) {
+        SleepMilliseconds(10);
+      }
+    }
+  }
+
+ private:
+  pthread_mutex_t mutex_;  // Guards notified_.
+  bool notified_;          // Set to true by Notify().
+
+  GTEST_DISALLOW_COPY_AND_ASSIGN_(Notification);
+};
+
+// As a C-function, ThreadFuncWithCLinkage cannot be templated itself.
+// Consequently, it cannot select a correct instantiation of ThreadWithParam
+// in order to call its Run(). Introducing ThreadWithParamBase as a
+// non-templated base class for ThreadWithParam allows us to bypass this
+// problem.
+class ThreadWithParamBase {
+ public:
+  // Virtual so that derived objects can be destroyed through this base.
+  virtual ~ThreadWithParamBase() {}
+  // Body of the thread; invoked by ThreadFuncWithCLinkage().
+  virtual void Run() = 0;
+};
+
+// pthread_create() accepts a pointer to a function type with the C linkage.
+// According to the Standard (7.5/1), function types with different linkages
+// are different even if they are otherwise identical.  Some compilers (for
+// example, SunStudio) treat them as different types.  Since class methods
+// cannot be defined with C-linkage we need to define a free C-function to
+// pass into pthread_create().
+// Thread entry point: 'thread' must point at a ThreadWithParamBase, whose
+// Run() is executed.  Always returns NULL.
+extern "C" inline void* ThreadFuncWithCLinkage(void* thread) {
+  ThreadWithParamBase* const runnable =
+      static_cast<ThreadWithParamBase*>(thread);
+  runnable->Run();
+  return NULL;
+}
+
+// Helper class for testing Google Test's multi-threading constructs.
+// To use it, write:
+//
+//   void ThreadFunc(int param) { /* Do things with param */ }
+//   Notification thread_can_start;
+//   ...
+//   // The thread_can_start parameter is optional; you can supply NULL.
+//   ThreadWithParam<int> thread(&ThreadFunc, 5, &thread_can_start);
+//   thread_can_start.Notify();
+//
+// These classes are only for testing Google Test's own constructs. Do
+// not use them in user tests, either directly or indirectly.
+template <typename T>
+class ThreadWithParam : public ThreadWithParamBase {
+ public:
+  typedef void (*UserThreadFunc)(T);
+
+  ThreadWithParam(
+      UserThreadFunc func, T param, Notification* thread_can_start)
+      : func_(func),
+        param_(param),
+        thread_can_start_(thread_can_start),
+        finished_(false) {
+    ThreadWithParamBase* const base = this;
+    // The thread can be created only after all fields except thread_
+    // have been initialized.
+    GTEST_CHECK_POSIX_SUCCESS_(
+        pthread_create(&thread_, 0, &ThreadFuncWithCLinkage, base));
+  }
+  ~ThreadWithParam() { Join(); }
+
+  void Join() {
+    if (!finished_) {
+      GTEST_CHECK_POSIX_SUCCESS_(pthread_join(thread_, 0));
+      finished_ = true;
+    }
+  }
+
+  virtual void Run() {
+    if (thread_can_start_ != NULL)
+      thread_can_start_->WaitForNotification();
+    func_(param_);
+  }
+
+ private:
+  const UserThreadFunc func_;  // User-supplied thread function.
+  const T param_;  // User-supplied parameter to the thread function.
+  // When non-NULL, used to block execution until the controller thread
+  // notifies.
+  Notification* const thread_can_start_;
+  bool finished_;  // true iff we know that the thread function has finished.
+  pthread_t thread_;  // The native thread object.
+
+  GTEST_DISALLOW_COPY_AND_ASSIGN_(ThreadWithParam);
+};
+
+// MutexBase and Mutex implement mutex on pthreads-based platforms. They
+// are used in conjunction with class MutexLock:
+//
+//   Mutex mutex;
+//   ...
+//   MutexLock lock(&mutex);  // Acquires the mutex and releases it at the end
+//                            // of the current scope.
+//
+// MutexBase implements behavior for both statically and dynamically
+// allocated mutexes.  Do not use MutexBase directly.  Instead, write
+// the following to define a static mutex:
+//
+//   GTEST_DEFINE_STATIC_MUTEX_(g_some_mutex);
+//
+// You can forward declare a static mutex like this:
+//
+//   GTEST_DECLARE_STATIC_MUTEX_(g_some_mutex);
+//
+// To create a dynamic mutex, just define an object of type Mutex.
+class MutexBase {
+ public:
+  // Acquires this mutex.
+  void Lock() {
+    GTEST_CHECK_POSIX_SUCCESS_(pthread_mutex_lock(&mutex_));
+    // Ownership is recorded only after the underlying lock is held.
+    owner_ = pthread_self();
+    has_owner_ = true;
+  }
+
+  // Releases this mutex.
+  void Unlock() {
+    // Since the lock is being released the owner_ field should no longer be
+    // considered valid. We don't protect writing to has_owner_ here, as it's
+    // the caller's responsibility to ensure that the current thread holds the
+    // mutex when this is called.
+    has_owner_ = false;
+    GTEST_CHECK_POSIX_SUCCESS_(pthread_mutex_unlock(&mutex_));
+  }
+
+  // Does nothing if the current thread holds the mutex. Otherwise, crashes
+  // with high probability.
+  // owner_ is only inspected when has_owner_ is true.
+  void AssertHeld() const {
+    GTEST_CHECK_(has_owner_ && pthread_equal(owner_, pthread_self()))
+        << "The current thread is not holding the mutex @" << this;
+  }
+
+  // A static mutex may be used before main() is entered.  It may even
+  // be used before the dynamic initialization stage.  Therefore we
+  // must be able to initialize a static mutex object at link time.
+  // This means MutexBase has to be a POD and its member variables
+  // have to be public.
+ public:
+  pthread_mutex_t mutex_;  // The underlying pthread mutex.
+  // has_owner_ indicates whether the owner_ field below contains a valid thread
+  // ID and is therefore safe to inspect (e.g., to use in pthread_equal()). All
+  // accesses to the owner_ field should be protected by a check of this field.
+  // An alternative might be to memset() owner_ to all zeros, but there's no
+  // guarantee that a zero'd pthread_t is necessarily invalid or even different
+  // from pthread_self().
+  bool has_owner_;
+  pthread_t owner_;  // The thread holding the mutex.
+};
+
+// Forward-declares a static mutex (an extern MutexBase named |mutex|).
+# define GTEST_DECLARE_STATIC_MUTEX_(mutex) \
+    extern ::testing::internal::MutexBase mutex
+
+// Defines and statically (i.e. at link time) initializes a static mutex:
+// mutex_ gets PTHREAD_MUTEX_INITIALIZER and has_owner_ starts out false.
+// The initializer list deliberately omits the owner_ field (a pthread_t),
+// leaving it to default initialization, so that the definition works
+// whether pthread_t is a scalar or a struct.
+// The flag -Wmissing-field-initializers must not be specified for this to work.
+# define GTEST_DEFINE_STATIC_MUTEX_(mutex) \
+    ::testing::internal::MutexBase mutex = { PTHREAD_MUTEX_INITIALIZER, false }
+
+// The Mutex class can only be used for mutexes created at runtime. It
+// shares its API with MutexBase otherwise.
+class Mutex : public MutexBase {
+ public:
+  // Initializes the pthread mutex with default attributes; no owner yet.
+  Mutex() {
+    GTEST_CHECK_POSIX_SUCCESS_(pthread_mutex_init(&mutex_, NULL));
+    has_owner_ = false;
+  }
+  ~Mutex() { GTEST_CHECK_POSIX_SUCCESS_(pthread_mutex_destroy(&mutex_)); }
+
+ private:
+  // A Mutex is neither copyable nor assignable.
+  GTEST_DISALLOW_COPY_AND_ASSIGN_(Mutex);
+};
+
+// We cannot name this class MutexLock as the ctor declaration would
+// conflict with a macro named MutexLock, which is defined on some
+// platforms.  Hence the typedef trick below.
+class GTestMutexLock {
+ public:
+  // Locks *mutex; it stays locked for the lifetime of this object.
+  explicit GTestMutexLock(MutexBase* mutex) : mutex_(mutex) { mutex_->Lock(); }
+  // Unlocks the mutex that the constructor locked.
+  ~GTestMutexLock() { mutex_->Unlock(); }
+
+ private:
+  MutexBase* const mutex_;  // The guarded mutex; never deleted by this class.
+
+  GTEST_DISALLOW_COPY_AND_ASSIGN_(GTestMutexLock);
+};
+
+typedef GTestMutexLock MutexLock;
+
+// Helpers for ThreadLocal.
+
+// pthread_key_create() requires DeleteThreadLocalValue() to have
+// C-linkage.  Therefore it cannot be templatized to access
+// ThreadLocal<T>.  Hence the need for class
+// ThreadLocalValueHolderBase.
+class ThreadLocalValueHolderBase {
+ public:
+  virtual ~ThreadLocalValueHolderBase() {}  // Virtual: deleted via base pointer.
+};
+
+// Called by pthread to delete thread-local data stored by
+// pthread_setspecific().  |value_holder| is a ThreadLocalValueHolderBase*.
+extern "C" inline void DeleteThreadLocalValue(void* value_holder) {
+  delete static_cast<ThreadLocalValueHolderBase*>(value_holder);
+}
+
+// Implements thread-local storage on pthreads-based systems.
+//
+//   // Thread 1
+//   ThreadLocal<int> tl(100);  // 100 is the default value for each thread.
+//
+//   // Thread 2
+//   tl.set(150);  // Changes the value for thread 2 only.
+//   EXPECT_EQ(150, tl.get());
+//
+//   // Thread 1
+//   EXPECT_EQ(100, tl.get());  // In thread 1, tl has the original value.
+//   tl.set(200);
+//   EXPECT_EQ(200, tl.get());
+//
+// The template type argument T must have a public copy constructor.
+// In addition, the default ThreadLocal constructor requires T to have
+// a public default constructor.
+//
+// An object managed for a thread by a ThreadLocal instance is deleted
+// when the thread exits.  Or, if the ThreadLocal instance dies in
+// that thread, when the ThreadLocal dies.  It's the user's
+// responsibility to ensure that all other threads using a ThreadLocal
+// have exited when it dies, or the per-thread objects for those
+// threads will not be deleted.
+//
+// Google Test only uses global ThreadLocal objects.  That means they
+// will die after main() has returned.  Therefore, no per-thread
+// object managed by Google Test will be leaked as long as all threads
+// using Google Test have exited when main() returns.
+template <typename T>
+class ThreadLocal {
+ public:
+  // Every thread starts out with a value-initialized T.
+  ThreadLocal() : key_(CreateKey()), default_() {}
+  // Every thread starts out with a copy of |value|.
+  explicit ThreadLocal(const T& value) : key_(CreateKey()), default_(value) {}
+
+  ~ThreadLocal() {
+    // The calling thread's object (if any) is destroyed right here ...
+    DeleteThreadLocalValue(pthread_getspecific(key_));
+
+    // ... and then the key itself is released.  Objects that belong to
+    // other threads are *not* deleted by this.
+    GTEST_CHECK_POSIX_SUCCESS_(pthread_key_delete(key_));
+  }
+
+  T* pointer() { return GetOrCreateValue(); }
+  const T* pointer() const { return GetOrCreateValue(); }
+  const T& get() const { return *GetOrCreateValue(); }
+  void set(const T& value) { *GetOrCreateValue() = value; }
+
+ private:
+  // Heap-allocated wrapper around one thread's T.
+  class ValueHolder : public ThreadLocalValueHolderBase {
+   public:
+    explicit ValueHolder(const T& value) : value_(value) {}
+
+    T* pointer() { return &value_; }
+
+   private:
+    T value_;
+    GTEST_DISALLOW_COPY_AND_ASSIGN_(ValueHolder);
+  };
+
+  static pthread_key_t CreateKey() {
+    pthread_key_t key;
+    // Registering DeleteThreadLocalValue() as the key's destructor makes
+    // pthreads call it on a thread's managed object when that thread exits.
+    GTEST_CHECK_POSIX_SUCCESS_(
+        pthread_key_create(&key, &DeleteThreadLocalValue));
+    return key;
+  }
+
+  T* GetOrCreateValue() const {
+    ThreadLocalValueHolderBase* const existing =
+        static_cast<ThreadLocalValueHolderBase*>(pthread_getspecific(key_));
+    if (existing == NULL) {
+      // First access from this thread: create its copy of the default.
+      ValueHolder* const created = new ValueHolder(default_);
+      ThreadLocalValueHolderBase* const holder_base = created;
+      GTEST_CHECK_POSIX_SUCCESS_(pthread_setspecific(key_, holder_base));
+      return created->pointer();
+    }
+    return CheckedDowncastToActualType<ValueHolder>(existing)->pointer();
+  }
+
+  // A key pthreads uses for looking up per-thread values.
+  const pthread_key_t key_;
+  const T default_;  // The default value for each thread.
+
+  GTEST_DISALLOW_COPY_AND_ASSIGN_(ThreadLocal);
+};
+
+# define GTEST_IS_THREADSAFE 1
+
+#else  // GTEST_HAS_PTHREAD
+
+// A dummy implementation of synchronization primitives (mutex, lock,
+// and thread-local variable).  Necessary for compiling Google Test where
+// mutex is not supported - using Google Test in multiple threads is not
+// supported on such platforms.
+
+class Mutex {  // Dummy mutex: every operation below is a no-op.
+ public:
+  Mutex() {}
+  void Lock() {}  // Acquires nothing.
+  void Unlock() {}  // Releases nothing.
+  void AssertHeld() const {}  // Performs no check.
+};
+
+# define GTEST_DECLARE_STATIC_MUTEX_(mutex) \
+  extern ::testing::internal::Mutex mutex
+
+# define GTEST_DEFINE_STATIC_MUTEX_(mutex) ::testing::internal::Mutex mutex
+
+class GTestMutexLock {  // Dummy lock: constructing it does nothing.
+ public:
+  explicit GTestMutexLock(Mutex*) {}  // NOLINT
+};
+
+typedef GTestMutexLock MutexLock;
+
+template <typename T>
+class ThreadLocal {
+ public:
+  ThreadLocal() : value_() {}
+  explicit ThreadLocal(const T& value) : value_(value) {}
+  T* pointer() { return &value_; }
+  const T* pointer() const { return &value_; }
+  const T& get() const { return *pointer(); }
+  void set(const T& value) { *pointer() = value; }
+ private:
+  T value_;  // The one stored value; this dummy keeps no per-thread copies.
+};
+
+// The above synchronization primitives have dummy implementations.
+// Therefore Google Test is not thread-safe.
+# define GTEST_IS_THREADSAFE 0
+
+#endif  // GTEST_HAS_PTHREAD
+
+// Returns the number of threads running in the process, or 0 to indicate that
+// we cannot detect it.
+GTEST_API_ size_t GetThreadCount();
+
+// Passing non-POD classes through ellipsis (...) crashes the ARM
+// compiler and generates a warning in Sun Studio.  The Nokia Symbian
+// and the IBM XL C/C++ compiler try to instantiate a copy constructor
+// for objects passed through ellipsis (...), failing for uncopyable
+// objects.  We define this to ensure that only POD is passed through
+// ellipsis on these systems.
+#if defined(__SYMBIAN32__) || defined(__IBMCPP__) || defined(__SUNPRO_CC)
+// We lose support for NULL detection where the compiler doesn't like
+// passing non-POD classes through ellipsis (...).
+# define GTEST_ELLIPSIS_NEEDS_POD_ 1
+#else
+# define GTEST_CAN_COMPARE_NULL 1
+#endif
+
+// The Nokia Symbian and IBM XL C/C++ compilers cannot decide between
+// const T& and const T* in a function template.  These compilers
+// _can_ decide between class template specializations for T and T*,
+// so a tr1::type_traits-like is_pointer works.
+#if defined(__SYMBIAN32__) || defined(__IBMCPP__)
+# define GTEST_NEEDS_IS_POINTER_ 1
+#endif
+
+template <bool bool_value>
+struct bool_constant {
+  typedef bool_constant<bool_value> type;  // This very specialization.
+  static const bool value = bool_value;  // The wrapped compile-time boolean.
+};
+template <bool bool_value> const bool bool_constant<bool_value>::value;
+
+typedef bool_constant<false> false_type;
+typedef bool_constant<true> true_type;
+
+template <typename T>
+struct is_pointer : public false_type {};  // General case: not a pointer.
+
+template <typename T>
+struct is_pointer<T*> : public true_type {};  // Partial specialization for T*.
+
+template <typename Iterator>
+struct IteratorTraits {
+  typedef typename Iterator::value_type value_type;  // Taken from the iterator.
+};
+
+template <typename T>
+struct IteratorTraits<T*> {
+  typedef T value_type;  // A raw pointer iterates over T.
+};
+
+template <typename T>
+struct IteratorTraits<const T*> {
+  typedef T value_type;  // const is dropped from the element type.
+};
+
+#if GTEST_OS_WINDOWS
+# define GTEST_PATH_SEP_ "\\"
+# define GTEST_HAS_ALT_PATH_SEP_ 1
+// The biggest signed integer type the compiler supports.
+typedef __int64 BiggestInt;
+#else
+# define GTEST_PATH_SEP_ "/"
+# define GTEST_HAS_ALT_PATH_SEP_ 0
+typedef long long BiggestInt;  // NOLINT
+#endif  // GTEST_OS_WINDOWS
+
+// Utilities for char.
+
+// isspace(int ch) and friends accept an unsigned char or EOF.  char
+// may be signed, depending on the compiler (or compiler flags).
+// Therefore we need to cast a char to unsigned char before calling
+// isspace(), etc.
+
+inline bool IsAlpha(char ch) {
+  return isalpha(static_cast<int>(static_cast<unsigned char>(ch))) != 0;
+}
+inline bool IsAlNum(char ch) {
+  return isalnum(static_cast<int>(static_cast<unsigned char>(ch))) != 0;
+}
+inline bool IsDigit(char ch) {
+  return isdigit(static_cast<int>(static_cast<unsigned char>(ch))) != 0;
+}
+inline bool IsLower(char ch) {
+  return islower(static_cast<int>(static_cast<unsigned char>(ch))) != 0;
+}
+inline bool IsSpace(char ch) {
+  return isspace(static_cast<int>(static_cast<unsigned char>(ch))) != 0;
+}
+inline bool IsUpper(char ch) {
+  return isupper(static_cast<int>(static_cast<unsigned char>(ch))) != 0;
+}
+inline bool IsXDigit(char ch) {
+  return isxdigit(static_cast<int>(static_cast<unsigned char>(ch))) != 0;
+}
+inline bool IsXDigit(wchar_t ch) {
+  const unsigned char narrowed = static_cast<unsigned char>(ch);
+  return (ch != narrowed) ? false : (isxdigit(narrowed) != 0);
+}
+
+inline char ToLower(char ch) {
+  return static_cast<char>(tolower(static_cast<unsigned char>(ch)));
+}
+inline char ToUpper(char ch) {
+  return static_cast<char>(toupper(static_cast<unsigned char>(ch)));
+}
+
+// The testing::internal::posix namespace holds wrappers for common
+// POSIX functions.  These wrappers hide the differences between
+// Windows/MSVC and POSIX systems.  Since some compilers define these
+// standard functions as macros, the wrapper cannot have the same name
+// as the wrapped function.
+
+namespace posix {
+
+// Functions with a different name on Windows; each forwards to the native call.
+
+#if GTEST_OS_WINDOWS
+
+typedef struct _stat StatStruct;  // The record type taken by Stat()/IsDir().
+
+# ifdef __BORLANDC__
+inline int IsATTY(int fd) { return isatty(fd); }
+inline int StrCaseCmp(const char* s1, const char* s2) {
+  return stricmp(s1, s2);
+}
+inline char* StrDup(const char* src) { return strdup(src); }
+# else  // !__BORLANDC__
+#  if GTEST_OS_WINDOWS_MOBILE
+inline int IsATTY(int /* fd */) { return 0; }  // Always 0 on this platform.
+#  else
+inline int IsATTY(int fd) { return _isatty(fd); }
+#  endif  // GTEST_OS_WINDOWS_MOBILE
+inline int StrCaseCmp(const char* s1, const char* s2) {
+  return _stricmp(s1, s2);
+}
+inline char* StrDup(const char* src) { return _strdup(src); }
+# endif  // __BORLANDC__
+
+# if GTEST_OS_WINDOWS_MOBILE
+inline int FileNo(FILE* file) { return reinterpret_cast<int>(_fileno(file)); }
+// Stat(), RmDir(), and IsDir() are not needed on Windows CE at this
+// time and thus not defined there.
+# else  // !GTEST_OS_WINDOWS_MOBILE
+inline int FileNo(FILE* file) { return _fileno(file); }
+inline int Stat(const char* path, StatStruct* buf) { return _stat(path, buf); }
+inline int RmDir(const char* dir) { return _rmdir(dir); }
+inline bool IsDir(const StatStruct& st) {
+  return (_S_IFDIR & st.st_mode) != 0;
+}
+# endif  // GTEST_OS_WINDOWS_MOBILE
+
+#else  // !GTEST_OS_WINDOWS
+
+typedef struct stat StatStruct;  // The record type taken by Stat()/IsDir().
+
+inline int FileNo(FILE* file) { return fileno(file); }
+inline int IsATTY(int fd) { return isatty(fd); }
+inline int Stat(const char* path, StatStruct* buf) { return stat(path, buf); }
+inline int StrCaseCmp(const char* s1, const char* s2) {
+  return strcasecmp(s1, s2);
+}
+inline char* StrDup(const char* src) { return strdup(src); }
+inline int RmDir(const char* dir) { return rmdir(dir); }
+inline bool IsDir(const StatStruct& st) { return S_ISDIR(st.st_mode); }
+
+#endif  // GTEST_OS_WINDOWS
+
+// Functions deprecated by MSVC 8.0.
+
+#ifdef _MSC_VER
+// Temporarily disable warning 4996 (deprecated function).
+# pragma warning(push)
+# pragma warning(disable:4996)
+#endif
+
+inline const char* StrNCpy(char* dest, const char* src, size_t n) {
+  return strncpy(dest, src, n);
+}
+
+// ChDir(), FReopen(), FDOpen(), Read(), Write(), Close(), and
+// StrError() aren't needed on Windows CE at this time and thus not
+// defined there.
+
+#if !GTEST_OS_WINDOWS_MOBILE
+inline int ChDir(const char* dir) { return chdir(dir); }
+#endif
+inline FILE* FOpen(const char* path, const char* mode) {
+  return fopen(path, mode);
+}
+#if !GTEST_OS_WINDOWS_MOBILE
+inline FILE *FReopen(const char* path, const char* mode, FILE* stream) {
+  return freopen(path, mode, stream);
+}
+inline FILE* FDOpen(int fd, const char* mode) { return fdopen(fd, mode); }
+#endif
+inline int FClose(FILE* fp) { return fclose(fp); }
+#if !GTEST_OS_WINDOWS_MOBILE
+inline int Read(int fd, void* buf, unsigned int count) {
+  return static_cast<int>(read(fd, buf, count));  // Result narrowed to int.
+}
+inline int Write(int fd, const void* buf, unsigned int count) {
+  return static_cast<int>(write(fd, buf, count));  // Result narrowed to int.
+}
+inline int Close(int fd) { return close(fd); }
+inline const char* StrError(int errnum) { return strerror(errnum); }
+#endif
+inline const char* GetEnv(const char* name) {
+#if GTEST_OS_WINDOWS_MOBILE
+  // We are on Windows CE, which has no environment variables.
+  return NULL;
+#elif defined(__BORLANDC__) || defined(__SunOS_5_8) || defined(__SunOS_5_9)
+  // Environment variables which we programmatically clear will be set to the
+  // empty string rather than unset (NULL).  Handle that case.
+  const char* const env = getenv(name);
+  return (env != NULL && env[0] != '\0') ? env : NULL;
+#else
+  return getenv(name);
+#endif
+}
+
+#ifdef _MSC_VER
+# pragma warning(pop)  // Restores the warning state.
+#endif
+
+#if GTEST_OS_WINDOWS_MOBILE
+// Windows CE has no C library. The abort() function is used in
+// several places in Google Test. This implementation provides a reasonable
+// imitation of standard behaviour.
+void Abort();  // Declaration only; no body is given at this point.
+#else
+inline void Abort() { abort(); }
+#endif  // GTEST_OS_WINDOWS_MOBILE
+
+}  // namespace posix
+
+// MSVC "deprecates" snprintf and issues warnings wherever it is used.  In
+// order to avoid these warnings, we need to use _snprintf or _snprintf_s on
+// MSVC-based platforms.  We map the GTEST_SNPRINTF_ macro to the appropriate
+// function in order to achieve that.  We use macro definition here because
+// snprintf is a variadic function.
+#if _MSC_VER >= 1400 && !GTEST_OS_WINDOWS_MOBILE
+// MSVC 2005 and above support variadic macros.
+# define GTEST_SNPRINTF_(buffer, size, format, ...) \
+     _snprintf_s(buffer, size, size, format, __VA_ARGS__)
+#elif defined(_MSC_VER)
+// Windows CE does not define _snprintf_s and MSVC prior to 2005 doesn't
+// complain about _snprintf.
+# define GTEST_SNPRINTF_ _snprintf
+#else
+# define GTEST_SNPRINTF_ snprintf
+#endif
+
+// The maximum number a BiggestInt can represent.  This definition
+// works whether BiggestInt is represented in one's complement or
+// two's complement.
+//
+// We cannot rely on numeric_limits in STL, as __int64 and long long
+// are not part of standard C++ and numeric_limits doesn't need to be
+// defined for them.
+const BiggestInt kMaxBiggestInt =  // (2^(N-2) - 1) * 2 + 1: no sign-bit shift.
+    ((static_cast<BiggestInt>(1) << (8*sizeof(BiggestInt) - 2)) - 1) * 2 + 1;
+
+// This template class serves as a compile-time function from size to
+// type.  It maps a size in bytes to a primitive type with that
+// size. e.g.
+//
+//   TypeWithSize<4>::UInt
+//
+// is typedef-ed to be unsigned int (unsigned integer made up of 4
+// bytes).
+//
+// Such functionality should belong to STL, but I cannot find it
+// there.
+//
+// Google Test uses this class in the implementation of floating-point
+// comparison.
+//
+// For now it only provides the integer types Int and UInt, for sizes 4
+// and 8, as that's all Google Test needs.  Other types can be easily
+// added in the future if need arises.
+template <size_t size>
+class TypeWithSize {
+ public:
+  // Sizes without a specialization map UInt to void, which prevents the
+  // user from using TypeWithSize<N> with incorrect values of N.
+  typedef void UInt;
+};
+
+// The specialization for size 4.
+template <>
+class TypeWithSize<4> {
+ public:
+  // unsigned int has size 4 in both gcc and MSVC.
+  //
+  // As base/basictypes.h doesn't compile on Windows, we cannot use
+  // uint32, uint64, etc. here.
+  typedef int Int;
+  typedef unsigned int UInt;
+};
+
+// The specialization for size 8: __int64 on Windows, long long elsewhere.
+template <>
+class TypeWithSize<8> {
+ public:
+#if GTEST_OS_WINDOWS
+  typedef __int64 Int;
+  typedef unsigned __int64 UInt;
+#else
+  typedef long long Int;  // NOLINT
+  typedef unsigned long long UInt;  // NOLINT
+#endif  // GTEST_OS_WINDOWS
+};
+
+// Integer types of known sizes.
+typedef TypeWithSize<4>::Int Int32;
+typedef TypeWithSize<4>::UInt UInt32;
+typedef TypeWithSize<8>::Int Int64;
+typedef TypeWithSize<8>::UInt UInt64;
+typedef TypeWithSize<8>::Int TimeInMillis;  // Represents time in milliseconds.
+
+// Utilities for command line flags and environment variables.
+
+// Macro for referencing flags.
+#define GTEST_FLAG(name) FLAGS_gtest_##name
+
+// Macros for declaring flags.
+#define GTEST_DECLARE_bool_(name) GTEST_API_ extern bool GTEST_FLAG(name)
+#define GTEST_DECLARE_int32_(name) \
+    GTEST_API_ extern ::testing::internal::Int32 GTEST_FLAG(name)
+#define GTEST_DECLARE_string_(name) \
+    GTEST_API_ extern ::std::string GTEST_FLAG(name)
+
+// Macros for defining flags.  The doc argument is accepted but not used.
+#define GTEST_DEFINE_bool_(name, default_val, doc) \
+    GTEST_API_ bool GTEST_FLAG(name) = (default_val)
+#define GTEST_DEFINE_int32_(name, default_val, doc) \
+    GTEST_API_ ::testing::internal::Int32 GTEST_FLAG(name) = (default_val)
+#define GTEST_DEFINE_string_(name, default_val, doc) \
+    GTEST_API_ ::std::string GTEST_FLAG(name) = (default_val)
+
+// Thread annotations
+#define GTEST_EXCLUSIVE_LOCK_REQUIRED_(locks)
+#define GTEST_LOCK_EXCLUDED_(locks)
+
+// Parses 'str' for a 32-bit signed integer.  If successful, writes the result
+// to *value and returns true; otherwise leaves *value unchanged and returns
+// false.
+// TODO(chandlerc): Find a better way to refactor flag and environment parsing
+// out of both gtest-port.cc and gtest.cc to avoid exporting this utility
+// function.
+bool ParseInt32(const Message& src_text, const char* str, Int32* value);
+
+// Parses a bool/Int32/string from the environment variable
+// corresponding to the given Google Test flag.
+bool BoolFromGTestEnv(const char* flag, bool default_val);
+GTEST_API_ Int32 Int32FromGTestEnv(const char* flag, Int32 default_val);
+const char* StringFromGTestEnv(const char* flag, const char* default_val);
+
+}  // namespace internal
+}  // namespace testing
+
+#endif  // GTEST_INCLUDE_GTEST_INTERNAL_GTEST_PORT_H_
+
+#if GTEST_OS_LINUX
+# include <stdlib.h>
+# include <sys/types.h>
+# include <sys/wait.h>
+# include <unistd.h>
+#endif  // GTEST_OS_LINUX
+
+#if GTEST_HAS_EXCEPTIONS
+# include <stdexcept>
+#endif
+
+#include <ctype.h>
+#include <float.h>
+#include <string.h>
+#include <iomanip>
+#include <limits>
+#include <set>
+
+// Copyright 2005, Google Inc.
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+//     * Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//     * Redistributions in binary form must reproduce the above
+// copyright notice, this list of conditions and the following disclaimer
+// in the documentation and/or other materials provided with the
+// distribution.
+//     * Neither the name of Google Inc. nor the names of its
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Author: wan@google.com (Zhanyong Wan)
+//
+// The Google C++ Testing Framework (Google Test)
+//
+// This header file defines the Message class.
+//
+// IMPORTANT NOTE: Due to limitation of the C++ language, we have to
+// leave some internal implementation details in this header file.
+// They are clearly marked by comments like this:
+//
+//   // INTERNAL IMPLEMENTATION - DO NOT USE IN A USER PROGRAM.
+//
+// Such code is NOT meant to be used by a user directly, and is subject
+// to CHANGE WITHOUT NOTICE.  Therefore DO NOT DEPEND ON IT in a user
+// program!
+
+#ifndef GTEST_INCLUDE_GTEST_GTEST_MESSAGE_H_
+#define GTEST_INCLUDE_GTEST_GTEST_MESSAGE_H_
+
+#include <limits>
+
+
+// Ensures that there is at least one operator<< in the global namespace.
+// See Message& operator<<(...) below for why.
+void operator<<(const testing::internal::Secret&, int);
+
+namespace testing {
+
+// The Message class works like an ostream repeater.
+//
+// Typical usage:
+//
+//   1. You stream a bunch of values to a Message object.
+//      It will remember the text in a stringstream.
+//   2. Then you stream the Message object to an ostream.
+//      This causes the text in the Message to be streamed
+//      to the ostream.
+//
+// For example:
+//
+//   testing::Message foo;
+//   foo << 1 << " != " << 2;
+//   std::cout << foo;
+//
+// will print "1 != 2".
+//
+// Message is not intended to be inherited from.  In particular, its
+// destructor is not virtual.
+//
+// Note that stringstream behaves differently in gcc and in MSVC.  You
+// can stream a NULL char pointer to it in the former, but not in the
+// latter (it causes an access violation if you do).  The Message
+// class hides this difference by treating a NULL char pointer as
+// "(null)".
+class GTEST_API_ Message {
+ private:
+  // The type of basic IO manipulators (endl, ends, and flush) for
+  // narrow streams.
+  typedef std::ostream& (*BasicNarrowIoManip)(std::ostream&);
+
+ public:
+  // Constructs an empty Message.
+  Message();
+
+  // Copy constructor.
+  Message(const Message& msg) : ss_(new ::std::stringstream) {  // NOLINT
+    *ss_ << msg.GetString();
+  }
+
+  // Constructs a Message from a C-string; NULL is rendered as "(null)".
+  explicit Message(const char* str) : ss_(new ::std::stringstream) {
+    *ss_ << (str == NULL ? "(null)" : str);
+  }
+
+#if GTEST_OS_SYMBIAN
+  // Streams a value (either a pointer or not) to this object.
+  template <typename T>
+  inline Message& operator <<(const T& value) {
+    StreamHelper(typename internal::is_pointer<T>::type(), value);
+    return *this;
+  }
+#else
+  // Streams a non-pointer value to this object.
+  template <typename T>
+  inline Message& operator <<(const T& val) {
+    // Some libraries overload << for STL containers.  These
+    // overloads are defined in the global namespace instead of ::std.
+    //
+    // C++'s symbol lookup rule (i.e. Koenig lookup) says that these
+    // overloads are visible in either the std namespace or the global
+    // namespace, but not other namespaces, including the testing
+    // namespace which Google Test's Message class is in.
+    //
+    // To allow STL containers (and other types that have a << operator
+    // defined in the global namespace) to be used in Google Test
+    // assertions, testing::Message must access the custom << operator
+    // from the global namespace.  With this using declaration,
+    // overloads of << defined in the global namespace and those
+    // visible via Koenig lookup are both exposed in this function.
+    using ::operator <<;
+    *ss_ << val;
+    return *this;
+  }
+
+  // Streams a pointer value to this object.
+  //
+  // This function is an overload of the previous one.  When you
+  // stream a pointer to a Message, this definition will be used as it
+  // is more specialized.  (The C++ Standard, section
+  // [temp.func.order].)  If you stream a non-pointer, then the
+  // previous definition will be used.
+  //
+  // The reason for this overload is that streaming a NULL pointer to
+  // ostream is undefined behavior.  Depending on the compiler, you
+  // may get "0", "(nil)", "(null)", or an access violation.  To
+  // ensure consistent result across compilers, we always treat NULL
+  // as "(null)".
+  template <typename T>
+  inline Message& operator <<(T* const& pointer) {  // NOLINT
+    if (pointer == NULL) {
+      *ss_ << "(null)";
+    } else {
+      *ss_ << pointer;
+    }
+    return *this;
+  }
+#endif  // GTEST_OS_SYMBIAN
+
+  // Since the basic IO manipulators are overloaded for both narrow
+  // and wide streams, we have to provide this specialized definition
+  // of operator <<, even though its body is the same as the
+  // templatized version above.  Without this definition, streaming
+  // endl or other basic IO manipulators to Message will confuse the
+  // compiler.
+  Message& operator <<(BasicNarrowIoManip val) {
+    *ss_ << val;
+    return *this;
+  }
+
+  // Instead of 1/0, we want to see true/false for bool values.
+  Message& operator <<(bool b) {
+    return *this << (b ? "true" : "false");
+  }
+
+  // These two overloads allow streaming a wide C string to a Message
+  // using the UTF-8 encoding.
+  Message& operator <<(const wchar_t* wide_c_str);
+  Message& operator <<(wchar_t* wide_c_str);
+
+#if GTEST_HAS_STD_WSTRING
+  // Converts the given wide string to a narrow string using the UTF-8
+  // encoding, and streams the result to this Message object.
+  Message& operator <<(const ::std::wstring& wstr);
+#endif  // GTEST_HAS_STD_WSTRING
+
+#if GTEST_HAS_GLOBAL_WSTRING
+  // Converts the given wide string to a narrow string using the UTF-8
+  // encoding, and streams the result to this Message object.
+  Message& operator <<(const ::wstring& wstr);
+#endif  // GTEST_HAS_GLOBAL_WSTRING
+
+  // Gets the text streamed to this object so far as an std::string.
+  // Each '\0' character in the buffer is replaced with "\\0".
+  //
+  // INTERNAL IMPLEMENTATION - DO NOT USE IN A USER PROGRAM.
+  std::string GetString() const;
+
+ private:
+
+#if GTEST_OS_SYMBIAN
+  // These are needed as the Nokia Symbian Compiler cannot decide between
+  // const T& and const T* in a function template. The Nokia compiler _can_
+  // decide between class template specializations for T and T*, so a
+  // tr1::type_traits-like is_pointer works, and we can overload on that.
+  template <typename T>
+  inline void StreamHelper(internal::true_type /*is_pointer*/, T* pointer) {
+    if (pointer == NULL) {
+      *ss_ << "(null)";
+    } else {
+      *ss_ << pointer;
+    }
+  }
+  template <typename T>
+  inline void StreamHelper(internal::false_type /*is_pointer*/,
+                           const T& value) {
+    // See the comments in Message& operator <<(const T&) above for why
+    // we need this using statement.
+    using ::operator <<;
+    *ss_ << value;
+  }
+#endif  // GTEST_OS_SYMBIAN
+
+  // We'll hold the text streamed to this object here.
+  const internal::scoped_ptr< ::std::stringstream> ss_;
+
+  // We declare (but don't implement) this to prevent the compiler
+  // from implementing the assignment operator.
+  void operator=(const Message&);
+};
+
+// Streams a Message to an ostream by writing the text of its GetString().
+inline std::ostream& operator <<(std::ostream& os, const Message& sb) {
+  return os << sb.GetString();
+}
+
+namespace internal {
+
+// Converts a streamable value to a std::string.  A NULL pointer is
+// converted to "(null)".  When the input value is a ::string,
+// ::std::string, ::wstring, or ::std::wstring object, each NUL
+// character in it is replaced with "\\0".
+template <typename T>
+std::string StreamableToString(const T& streamable) {
+  return (Message() << streamable).GetString();
+}
+
+}  // namespace internal
+}  // namespace testing
+
+#endif  // GTEST_INCLUDE_GTEST_GTEST_MESSAGE_H_
+// Copyright 2005, Google Inc.
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+//     * Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//     * Redistributions in binary form must reproduce the above
+// copyright notice, this list of conditions and the following disclaimer
+// in the documentation and/or other materials provided with the
+// distribution.
+//     * Neither the name of Google Inc. nor the names of its
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Authors: wan@google.com (Zhanyong Wan), eefacm@gmail.com (Sean Mcafee)
+//
+// The Google C++ Testing Framework (Google Test)
+//
+// This header file declares the String class and functions used internally by
+// Google Test.  They are subject to change without notice. They should not be
+// used by code external to Google Test.
+//
+// This header file is #included by <gtest/internal/gtest-internal.h>.
+// It should not be #included by other files.
+
+#ifndef GTEST_INCLUDE_GTEST_INTERNAL_GTEST_STRING_H_
+#define GTEST_INCLUDE_GTEST_INTERNAL_GTEST_STRING_H_
+
+#ifdef __BORLANDC__
+// string.h is not guaranteed to provide strcpy on C++ Builder.
+# include <mem.h>
+#endif
+
+#include <string.h>
+#include <string>
+
+
+namespace testing {
+namespace internal {
+
+// String - a utility class that only groups static string helpers.  Its
+// constructor is private, so it is not meant to be instantiated.
+class GTEST_API_ String {
+ public:
+  // Static utility methods
+
+  // Clones a 0-terminated C string, allocating memory using new.  The
+  // caller is responsible for deleting the return value using
+  // delete[].  Returns the cloned string, or NULL if the input is
+  // NULL.
+  //
+  // This is different from strdup() in string.h, which allocates
+  // memory using malloc().
+  static const char* CloneCString(const char* c_str);
+
+#if GTEST_OS_WINDOWS_MOBILE
+  // Windows CE does not have the 'ANSI' versions of Win32 APIs. To be
+  // able to pass strings to Win32 APIs on CE we need to convert them
+  // to 'Unicode', UTF-16.
+
+  // Creates a UTF-16 wide string from the given ANSI string, allocating
+  // memory using new. The caller is responsible for deleting the return
+  // value using delete[]. Returns the wide string, or NULL if the
+  // input is NULL.
+  //
+  // The wide string is created using the ANSI codepage (CP_ACP) to
+  // match the behaviour of the ANSI versions of Win32 calls and the
+  // C runtime.
+  static LPCWSTR AnsiToUtf16(const char* c_str);
+
+  // Creates an ANSI string from the given wide string, allocating
+  // memory using new. The caller is responsible for deleting the return
+  // value using delete[]. Returns the ANSI string, or NULL if the
+  // input is NULL.
+  //
+  // The returned string is created using the ANSI codepage (CP_ACP) to
+  // match the behaviour of the ANSI versions of Win32 calls and the
+  // C runtime.
+  static const char* Utf16ToAnsi(LPCWSTR utf16_str);
+#endif
+
+  // Compares two C strings.  Returns true iff they have the same content.
+  //
+  // Unlike strcmp(), this function can handle NULL argument(s).  A
+  // NULL C string is considered different to any non-NULL C string,
+  // including the empty string.
+  static bool CStringEquals(const char* lhs, const char* rhs);
+
+  // Converts a wide C string to an std::string using the UTF-8 encoding.
+  // NULL will be converted to "(null)".  If an error occurred during
+  // the conversion, "(failed to convert from wide string)" is
+  // returned.
+  static std::string ShowWideCString(const wchar_t* wide_c_str);
+
+  // Compares two wide C strings.  Returns true iff they have the same
+  // content.
+  //
+  // Unlike wcscmp(), this function can handle NULL argument(s).  A
+  // NULL C string is considered different to any non-NULL C string,
+  // including the empty string.
+  static bool WideCStringEquals(const wchar_t* lhs, const wchar_t* rhs);
+
+  // Compares two C strings, ignoring case.  Returns true iff they
+  // have the same content.
+  //
+  // Unlike strcasecmp(), this function can handle NULL argument(s).
+  // A NULL C string is considered different to any non-NULL C string,
+  // including the empty string.
+  static bool CaseInsensitiveCStringEquals(const char* lhs,
+                                           const char* rhs);
+
+  // Compares two wide C strings, ignoring case.  Returns true iff they
+  // have the same content.
+  //
+  // Unlike wcscasecmp(), this function can handle NULL argument(s).
+  // A NULL C string is considered different to any non-NULL wide C string,
+  // including the empty string.
+  // NB: The implementations on different platforms slightly differ.
+  // On Windows, this method uses _wcsicmp which compares according to LC_CTYPE
+  // environment variable. On GNU platform this method uses wcscasecmp
+  // which compares according to LC_CTYPE category of the current locale.
+  // On MacOS X, it uses towlower, which also uses LC_CTYPE category of the
+  // current locale.
+  static bool CaseInsensitiveWideCStringEquals(const wchar_t* lhs,
+                                               const wchar_t* rhs);
+
+  // Returns true iff the given string ends with the given suffix, ignoring
+  // case. Any string is considered to end with an empty suffix.
+  static bool EndsWithCaseInsensitive(
+      const std::string& str, const std::string& suffix);
+
+  // Formats an int value as "%02d".
+  static std::string FormatIntWidth2(int value);  // "%02d" for width == 2
+
+  // Formats an int value as "%X".
+  static std::string FormatHexInt(int value);
+
+  // Formats a byte as "%02X".
+  static std::string FormatByte(unsigned char value);
+
+ private:
+  // Declared private so that the class cannot be instantiated from outside.
+  String();  // Not meant to be instantiated.
+};  // class String
+
+// Gets the content of the stringstream's buffer as an std::string.  Each '\0'
+// character in the buffer is replaced with "\\0".
+//
+// Only the declaration appears in this header.
+// NOTE(review): this header includes only <string.h> and <string>, so
+// ::std::stringstream is presumably made visible by an earlier include --
+// confirm.
+GTEST_API_ std::string StringStreamToString(::std::stringstream* stream);
+
+}  // namespace internal
+}  // namespace testing
+
+#endif  // GTEST_INCLUDE_GTEST_INTERNAL_GTEST_STRING_H_
+// Copyright 2008, Google Inc.
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+//     * Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//     * Redistributions in binary form must reproduce the above
+// copyright notice, this list of conditions and the following disclaimer
+// in the documentation and/or other materials provided with the
+// distribution.
+//     * Neither the name of Google Inc. nor the names of its
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Author: keith.ray@gmail.com (Keith Ray)
+//
+// Google Test filepath utilities
+//
+// This header file declares classes and functions used internally by
+// Google Test.  They are subject to change without notice.
+//
+// This file is #included in <gtest/internal/gtest-internal.h>.
+// Do not include this header file separately!
+
+#ifndef GTEST_INCLUDE_GTEST_INTERNAL_GTEST_FILEPATH_H_
+#define GTEST_INCLUDE_GTEST_INTERNAL_GTEST_FILEPATH_H_
+
+
+namespace testing {
+namespace internal {
+
+// FilePath models a file or directory pathname and hides platform-specific
+// conventions such as the pathname separator.  It backs the helper functions
+// that name files in a directory for xml output.
+//
+// Apart from the Set methods, every method is const or static, so a FilePath
+// behaves like an "immutable value object".
+//
+// A value that ends in a path separator ("like/this/") denotes a directory;
+// any other value is assumed to denote a file.  In either case the path may
+// or may not correspond to an actual file or directory in the file system.
+//
+// Names are NOT checked for syntax correctness: there is no check for
+// illegal characters, malformed paths, etc.
+
+class GTEST_API_ FilePath {
+ public:
+  // Creates an empty path.
+  FilePath() : pathname_() {}
+
+  // Copies the pathname of rhs as-is.
+  FilePath(const FilePath& rhs) : pathname_(rhs.pathname_) {}
+
+  // Wraps the given pathname and normalizes it (see Normalize()).
+  explicit FilePath(const std::string& pathname) : pathname_(pathname) {
+    Normalize();
+  }
+
+  // Takes over the pathname of rhs and returns this object.
+  FilePath& operator=(const FilePath& rhs) {
+    pathname_ = rhs.pathname_;
+    return *this;
+  }
+
+  // Replaces the stored pathname with the one held by rhs.
+  void Set(const FilePath& rhs) {
+    pathname_.assign(rhs.pathname_);
+  }
+
+  // Read-only access to the stored pathname.
+  const std::string& string() const { return pathname_; }
+  const char* c_str() const { return pathname_.c_str(); }
+
+  // Returns the current working directory, or "" if unsuccessful.
+  static FilePath GetCurrentDir();
+
+  // Builds a file name from its parts.  With directory = "dir",
+  // base_name = "test", number = 0 and extension = "xml" the result is
+  // "dir/test.xml".  A number greater than zero (e.g., 12) is appended to
+  // the base name, giving "dir/test_12.xml".
+  // On Windows platform, uses \ as the separator rather than /.
+  static FilePath MakeFileName(const FilePath& directory,
+                               const FilePath& base_name,
+                               int number,
+                               const char* extension);
+
+  // Joins a directory and a relative path: directory = "dir" and
+  // relative_path = "test.xml" give "dir/test.xml".
+  // On Windows, uses \ as the separator rather than /.
+  static FilePath ConcatPaths(const FilePath& directory,
+                              const FilePath& relative_path);
+
+  // Returns a pathname for a file that does not currently exist.  The result
+  // is directory/base_name.extension, or
+  // directory/base_name_<number>.extension when
+  // directory/base_name.extension already exists; the number is incremented
+  // until a pathname is found that does not already exist.
+  // Examples: 'dir/foo_test.xml' or 'dir/foo_test_1.xml'.
+  // There could be a race condition if two or more processes call this
+  // function at the same time -- they could both pick the same filename.
+  static FilePath GenerateUniqueFileName(const FilePath& directory,
+                                         const FilePath& base_name,
+                                         const char* extension);
+
+  // Returns true iff the path is "".
+  bool IsEmpty() const { return pathname_.length() == 0; }
+
+  // Returns the name without its trailing separator character if it has
+  // one; otherwise returns the name unmodified.
+  // On Windows platform, uses \ as the separator, other platforms use /.
+  FilePath RemoveTrailingPathSeparator() const;
+
+  // Returns a copy of the FilePath with the directory part removed.
+  // Example: FilePath("path/to/file").RemoveDirectoryName() returns
+  // FilePath("file").  With no directory part ("just_a_file") the FilePath
+  // is returned unmodified; with no file part ("just_a_dir/") an empty
+  // FilePath ("") is returned.
+  // On Windows platform, '\' is the path separator, otherwise it is '/'.
+  FilePath RemoveDirectoryName() const;
+
+  // Returns the directory path with the filename removed.
+  // Example: FilePath("path/to/file").RemoveFileName() returns "path/to/".
+  // If the FilePath is "a_file" or "/a_file", RemoveFileName returns
+  // FilePath("./") or, on Windows, FilePath(".\\").  A filepath without a
+  // file, like "just/a/dir/", is returned unmodified.
+  // On Windows platform, '\' is the path separator, otherwise it is '/'.
+  FilePath RemoveFileName() const;
+
+  // Returns a copy of the FilePath with the extension removed; the extension
+  // is matched case-insensitively.
+  // Example: FilePath("dir/file.exe").RemoveExtension("EXE") returns
+  // FilePath("dir/file").  If no such extension is found, a copy of the
+  // original FilePath is returned.
+  FilePath RemoveExtension(const char* extension) const;
+
+  // Creates directories so that path exists.  Returns true if successful or
+  // if the directories already exist; returns false if the directories could
+  // not be created for any reason.  Also returns false if the FilePath does
+  // not represent a directory (that is, it doesn't end with a path
+  // separator).
+  bool CreateDirectoriesRecursively() const;
+
+  // Creates the directory so that path exists.  Returns true if successful
+  // or if the directory already exists; returns false if the directory could
+  // not be created for any reason, including a missing parent directory.
+  // Not named "CreateDirectory" because that's a macro on Windows.
+  bool CreateFolder() const;
+
+  // Returns true if FilePath describes something in the file-system,
+  // either a file, directory, or whatever, and that something exists.
+  bool FileOrDirectoryExists() const;
+
+  // Returns true if pathname describes a directory in the file-system
+  // that exists.
+  bool DirectoryExists() const;
+
+  // Returns true if FilePath ends with a path separator, which indicates
+  // that it is intended to represent a directory; returns false otherwise.
+  // This does NOT check that a directory (or file) actually exists.
+  bool IsDirectory() const;
+
+  // Returns true if pathname describes a root directory. (Windows has one
+  // root directory per disk drive.)
+  bool IsRootDirectory() const;
+
+  // Returns true if pathname describes an absolute path.
+  bool IsAbsolutePath() const;
+
+ private:
+  // Collapses runs of consecutive separators into a single separator, so
+  // that "bar///foo" becomes "bar/foo".  Other redundancies in a pathname,
+  // such as "." or "..", are left alone.
+  //
+  // Consecutive separators can come from user error, or from scripts or APIs
+  // that emit a pathname with a trailing separator on some platforms but not
+  // on others; elsewhere another "/" and further components may then be
+  // appended without checking whether the separator is already there.
+  // The script language and operating system may accept paths like
+  // "foo//bar", but some FilePath functions do not handle them correctly.
+  // In particular, RemoveTrailingPathSeparator() removes only one separator,
+  // and CreateDirectoriesRecursively() calls it assuming that the result
+  // changes from directory syntax (trailing separator) to filename syntax.
+  //
+  // On Windows this method also replaces the alternate path separator '/'
+  // with the primary path separator '\\', so that for example "bar\\/\\foo"
+  // becomes "bar\\foo".
+
+  void Normalize();
+
+  // Returns a pointer to the last occurrence of a valid path separator in
+  // the FilePath.  On Windows, for example, both '/' and '\' are valid path
+  // separators.  Returns NULL if no path separator was found.
+  const char* FindLastPathSeparator() const;
+
+  std::string pathname_;
+};  // class FilePath
+
+}  // namespace internal
+}  // namespace testing
+
+#endif  // GTEST_INCLUDE_GTEST_INTERNAL_GTEST_FILEPATH_H_
+// This file was GENERATED by command:
+//     pump.py gtest-type-util.h.pump
+// DO NOT EDIT BY HAND!!!
+
+// Copyright 2008 Google Inc.
+// All Rights Reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+//     * Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//     * Redistributions in binary form must reproduce the above
+// copyright notice, this list of conditions and the following disclaimer
+// in the documentation and/or other materials provided with the
+// distribution.
+//     * Neither the name of Google Inc. nor the names of its
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Author: wan@google.com (Zhanyong Wan)
+
+// Type utilities needed for implementing typed and type-parameterized
+// tests.  This file is generated by a SCRIPT.  DO NOT EDIT BY HAND!
+//
+// Currently we support at most 50 types in a list, and at most 50
+// type-parameterized tests in one type-parameterized test case.
+// Please contact googletestframework@googlegroups.com if you need
+// more.
+
+#ifndef GTEST_INCLUDE_GTEST_INTERNAL_GTEST_TYPE_UTIL_H_
+#define GTEST_INCLUDE_GTEST_INTERNAL_GTEST_TYPE_UTIL_H_
+
+
+// Pull in the header that declares the demangler used by GetTypeName().
+// #ifdef __GNUC__ is too general here.  It is possible to use gcc without using
+// libstdc++ (which is where cxxabi.h comes from).
+# if GTEST_HAS_CXXABI_H_
+#  include <cxxabi.h>
+# elif defined(__HP_aCC)
+#  include <acxx_demangle.h>
+# endif  // GTEST_HAS_CXXABI_H_
+
+namespace testing {
+namespace internal {
+
+// GetTypeName<T>() produces a human-readable name for the type T.
+//
+// With RTTI enabled the name comes from typeid(T).name(); when a demangler
+// is available (cxxabi.h, or HP aCC) that name is demangled first.  Without
+// RTTI the fixed placeholder "<type>" is returned.
+//
+// NB: This function is also used in Google Mock, so don't move it inside of
+// the typed-test-only section below.
+template <typename T>
+std::string GetTypeName() {
+# if GTEST_HAS_RTTI
+
+  const char* const mangled = typeid(T).name();
+#  if GTEST_HAS_CXXABI_H_ || defined(__HP_aCC)
+  // gcc's implementation of typeid(T).name() mangles the type name,
+  // so it has to be run through the demangler.
+#   if GTEST_HAS_CXXABI_H_
+  using abi::__cxa_demangle;
+#   endif  // GTEST_HAS_CXXABI_H_
+  int demangle_status = 0;
+  char* const demangled = __cxa_demangle(mangled, 0, 0, &demangle_status);
+  // A non-zero status means demangling did not succeed; keep the raw name.
+  const std::string result(demangle_status != 0 ? mangled : demangled);
+  free(demangled);
+  return result;
+#  else
+  return mangled;
+#  endif  // GTEST_HAS_CXXABI_H_ || __HP_aCC
+
+# else
+
+  return "<type>";
+
+# endif  // GTEST_HAS_RTTI
+}
+
+#if GTEST_HAS_TYPED_TEST || GTEST_HAS_TYPED_TEST_P
+
+// AssertTypeEq<T1, T2>::type is defined iff T1 and T2 are the same
+// type.  This can be used as a compile-time assertion to ensure that
+// two types are equal.
+
+// Primary template: only declared here, so it provides no ::type member.
+template <typename T1, typename T2>
+struct AssertTypeEq;
+
+// Specialization selected when both template arguments are the same type.
+template <typename T>
+struct AssertTypeEq<T, T> {
+  typedef bool type;
+};
+
+// A unique type used as the default value for the arguments of class
+// template Types.  This allows us to simulate variadic templates
+// (e.g. Types<int>, Types<int, double>, etc.), which C++ doesn't
+// support directly.
+struct None {};
+
+// The following family of structs and struct templates is used to
+// represent type lists.  In particular, TypesN<T1, T2, ..., TN>
+// represents a type list with N types (T1, T2, ..., and TN) in it.
+// Except for Types0, every struct in the family has two member types:
+// Head for the first type in the list, and Tail for the rest of the
+// list (itself a type list that is one element shorter).
+
+// The empty type list.
+struct Types0 {};
+
+// Type lists of length 1, 2, 3, and so on.
+
+template <typename T1>
+struct Types1 {
+  typedef T1 Head;
+  typedef Types0 Tail;
+};
+template <typename T1, typename T2>
+struct Types2 {
+  typedef T1 Head;
+  typedef Types1<T2> Tail;
+};
+
+template <typename T1, typename T2, typename T3>
+struct Types3 {
+  typedef T1 Head;
+  typedef Types2<T2, T3> Tail;
+};
+
+template <typename T1, typename T2, typename T3, typename T4>
+struct Types4 {
+  typedef T1 Head;
+  typedef Types3<T2, T3, T4> Tail;
+};
+
+template <typename T1, typename T2, typename T3, typename T4, typename T5>
+struct Types5 {
+  typedef T1 Head;
+  typedef Types4<T2, T3, T4, T5> Tail;
+};
+
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+    typename T6>
+struct Types6 {
+  typedef T1 Head;
+  typedef Types5<T2, T3, T4, T5, T6> Tail;
+};
+
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+    typename T6, typename T7>
+struct Types7 {
+  typedef T1 Head;
+  typedef Types6<T2, T3, T4, T5, T6, T7> Tail;
+};
+
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+    typename T6, typename T7, typename T8>
+struct Types8 {
+  typedef T1 Head;
+  typedef Types7<T2, T3, T4, T5, T6, T7, T8> Tail;
+};
+
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+    typename T6, typename T7, typename T8, typename T9>
+struct Types9 {
+  typedef T1 Head;
+  typedef Types8<T2, T3, T4, T5, T6, T7, T8, T9> Tail;
+};
+
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+    typename T6, typename T7, typename T8, typename T9, typename T10>
+struct Types10 {
+  typedef T1 Head;
+  typedef Types9<T2, T3, T4, T5, T6, T7, T8, T9, T10> Tail;
+};
+
+// Type lists of length 11 through 20.  In each one, Head is T1 and Tail is
+// the next-shorter list built from the remaining types.
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+    typename T6, typename T7, typename T8, typename T9, typename T10,
+    typename T11>
+struct Types11 {
+  typedef T1 Head;
+  typedef Types10<T2, T3, T4, T5, T6, T7, T8, T9, T10, T11> Tail;
+};
+
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+    typename T6, typename T7, typename T8, typename T9, typename T10,
+    typename T11, typename T12>
+struct Types12 {
+  typedef T1 Head;
+  typedef Types11<T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12> Tail;
+};
+
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+    typename T6, typename T7, typename T8, typename T9, typename T10,
+    typename T11, typename T12, typename T13>
+struct Types13 {
+  typedef T1 Head;
+  typedef Types12<T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13> Tail;
+};
+
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+    typename T6, typename T7, typename T8, typename T9, typename T10,
+    typename T11, typename T12, typename T13, typename T14>
+struct Types14 {
+  typedef T1 Head;
+  typedef Types13<T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14> Tail;
+};
+
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+    typename T6, typename T7, typename T8, typename T9, typename T10,
+    typename T11, typename T12, typename T13, typename T14, typename T15>
+struct Types15 {
+  typedef T1 Head;
+  typedef Types14<T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14,
+      T15> Tail;
+};
+
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+    typename T6, typename T7, typename T8, typename T9, typename T10,
+    typename T11, typename T12, typename T13, typename T14, typename T15,
+    typename T16>
+struct Types16 {
+  typedef T1 Head;
+  typedef Types15<T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, T15,
+      T16> Tail;
+};
+
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+    typename T6, typename T7, typename T8, typename T9, typename T10,
+    typename T11, typename T12, typename T13, typename T14, typename T15,
+    typename T16, typename T17>
+struct Types17 {
+  typedef T1 Head;
+  typedef Types16<T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, T15,
+      T16, T17> Tail;
+};
+
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+    typename T6, typename T7, typename T8, typename T9, typename T10,
+    typename T11, typename T12, typename T13, typename T14, typename T15,
+    typename T16, typename T17, typename T18>
+struct Types18 {
+  typedef T1 Head;
+  typedef Types17<T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, T15,
+      T16, T17, T18> Tail;
+};
+
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+    typename T6, typename T7, typename T8, typename T9, typename T10,
+    typename T11, typename T12, typename T13, typename T14, typename T15,
+    typename T16, typename T17, typename T18, typename T19>
+struct Types19 {
+  typedef T1 Head;
+  typedef Types18<T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, T15,
+      T16, T17, T18, T19> Tail;
+};
+
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+    typename T6, typename T7, typename T8, typename T9, typename T10,
+    typename T11, typename T12, typename T13, typename T14, typename T15,
+    typename T16, typename T17, typename T18, typename T19, typename T20>
+struct Types20 {
+  typedef T1 Head;
+  typedef Types19<T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, T15,
+      T16, T17, T18, T19, T20> Tail;
+};
+
+// Type lists of length 21 through 30.  In each one, Head is T1 and Tail is
+// the next-shorter list built from the remaining types.
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+    typename T6, typename T7, typename T8, typename T9, typename T10,
+    typename T11, typename T12, typename T13, typename T14, typename T15,
+    typename T16, typename T17, typename T18, typename T19, typename T20,
+    typename T21>
+struct Types21 {
+  typedef T1 Head;
+  typedef Types20<T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, T15,
+      T16, T17, T18, T19, T20, T21> Tail;
+};
+
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+    typename T6, typename T7, typename T8, typename T9, typename T10,
+    typename T11, typename T12, typename T13, typename T14, typename T15,
+    typename T16, typename T17, typename T18, typename T19, typename T20,
+    typename T21, typename T22>
+struct Types22 {
+  typedef T1 Head;
+  typedef Types21<T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, T15,
+      T16, T17, T18, T19, T20, T21, T22> Tail;
+};
+
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+    typename T6, typename T7, typename T8, typename T9, typename T10,
+    typename T11, typename T12, typename T13, typename T14, typename T15,
+    typename T16, typename T17, typename T18, typename T19, typename T20,
+    typename T21, typename T22, typename T23>
+struct Types23 {
+  typedef T1 Head;
+  typedef Types22<T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, T15,
+      T16, T17, T18, T19, T20, T21, T22, T23> Tail;
+};
+
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+    typename T6, typename T7, typename T8, typename T9, typename T10,
+    typename T11, typename T12, typename T13, typename T14, typename T15,
+    typename T16, typename T17, typename T18, typename T19, typename T20,
+    typename T21, typename T22, typename T23, typename T24>
+struct Types24 {
+  typedef T1 Head;
+  typedef Types23<T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, T15,
+      T16, T17, T18, T19, T20, T21, T22, T23, T24> Tail;
+};
+
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+    typename T6, typename T7, typename T8, typename T9, typename T10,
+    typename T11, typename T12, typename T13, typename T14, typename T15,
+    typename T16, typename T17, typename T18, typename T19, typename T20,
+    typename T21, typename T22, typename T23, typename T24, typename T25>
+struct Types25 {
+  typedef T1 Head;
+  typedef Types24<T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, T15,
+      T16, T17, T18, T19, T20, T21, T22, T23, T24, T25> Tail;
+};
+
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+    typename T6, typename T7, typename T8, typename T9, typename T10,
+    typename T11, typename T12, typename T13, typename T14, typename T15,
+    typename T16, typename T17, typename T18, typename T19, typename T20,
+    typename T21, typename T22, typename T23, typename T24, typename T25,
+    typename T26>
+struct Types26 {
+  typedef T1 Head;
+  typedef Types25<T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, T15,
+      T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26> Tail;
+};
+
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+    typename T6, typename T7, typename T8, typename T9, typename T10,
+    typename T11, typename T12, typename T13, typename T14, typename T15,
+    typename T16, typename T17, typename T18, typename T19, typename T20,
+    typename T21, typename T22, typename T23, typename T24, typename T25,
+    typename T26, typename T27>
+struct Types27 {
+  typedef T1 Head;
+  typedef Types26<T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, T15,
+      T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27> Tail;
+};
+
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+    typename T6, typename T7, typename T8, typename T9, typename T10,
+    typename T11, typename T12, typename T13, typename T14, typename T15,
+    typename T16, typename T17, typename T18, typename T19, typename T20,
+    typename T21, typename T22, typename T23, typename T24, typename T25,
+    typename T26, typename T27, typename T28>
+struct Types28 {
+  typedef T1 Head;
+  typedef Types27<T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, T15,
+      T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28> Tail;
+};
+
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+    typename T6, typename T7, typename T8, typename T9, typename T10,
+    typename T11, typename T12, typename T13, typename T14, typename T15,
+    typename T16, typename T17, typename T18, typename T19, typename T20,
+    typename T21, typename T22, typename T23, typename T24, typename T25,
+    typename T26, typename T27, typename T28, typename T29>
+struct Types29 {
+  typedef T1 Head;
+  typedef Types28<T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, T15,
+      T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28,
+      T29> Tail;
+};
+
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+    typename T6, typename T7, typename T8, typename T9, typename T10,
+    typename T11, typename T12, typename T13, typename T14, typename T15,
+    typename T16, typename T17, typename T18, typename T19, typename T20,
+    typename T21, typename T22, typename T23, typename T24, typename T25,
+    typename T26, typename T27, typename T28, typename T29, typename T30>
+struct Types30 {
+  typedef T1 Head;
+  typedef Types29<T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, T15,
+      T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28, T29,
+      T30> Tail;
+};
+
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+    typename T6, typename T7, typename T8, typename T9, typename T10,
+    typename T11, typename T12, typename T13, typename T14, typename T15,
+    typename T16, typename T17, typename T18, typename T19, typename T20,
+    typename T21, typename T22, typename T23, typename T24, typename T25,
+    typename T26, typename T27, typename T28, typename T29, typename T30,
+    typename T31>
+struct Types31 {
+  typedef T1 Head;
+  typedef Types30<T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, T15,
+      T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28, T29,
+      T30, T31> Tail;
+};
+
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+    typename T6, typename T7, typename T8, typename T9, typename T10,
+    typename T11, typename T12, typename T13, typename T14, typename T15,
+    typename T16, typename T17, typename T18, typename T19, typename T20,
+    typename T21, typename T22, typename T23, typename T24, typename T25,
+    typename T26, typename T27, typename T28, typename T29, typename T30,
+    typename T31, typename T32>
+struct Types32 {  // Type list of 32 types, exposed as a Head type plus a Tail list.
+  typedef T1 Head;  // First type in the list.
+  typedef Types31<T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, T15,
+      T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28, T29,
+      T30, T31, T32> Tail;  // Remaining 31 types (T2..T32) as a Types31 list.
+};
+
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+    typename T6, typename T7, typename T8, typename T9, typename T10,
+    typename T11, typename T12, typename T13, typename T14, typename T15,
+    typename T16, typename T17, typename T18, typename T19, typename T20,
+    typename T21, typename T22, typename T23, typename T24, typename T25,
+    typename T26, typename T27, typename T28, typename T29, typename T30,
+    typename T31, typename T32, typename T33>
+struct Types33 {  // Type list of 33 types, exposed as a Head type plus a Tail list.
+  typedef T1 Head;  // First type in the list.
+  typedef Types32<T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, T15,
+      T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28, T29,
+      T30, T31, T32, T33> Tail;  // Remaining 32 types (T2..T33) as a Types32 list.
+};
+
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+    typename T6, typename T7, typename T8, typename T9, typename T10,
+    typename T11, typename T12, typename T13, typename T14, typename T15,
+    typename T16, typename T17, typename T18, typename T19, typename T20,
+    typename T21, typename T22, typename T23, typename T24, typename T25,
+    typename T26, typename T27, typename T28, typename T29, typename T30,
+    typename T31, typename T32, typename T33, typename T34>
+struct Types34 {  // Type list of 34 types, exposed as a Head type plus a Tail list.
+  typedef T1 Head;  // First type in the list.
+  typedef Types33<T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, T15,
+      T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28, T29,
+      T30, T31, T32, T33, T34> Tail;  // Remaining 33 types (T2..T34) as a Types33 list.
+};
+
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+    typename T6, typename T7, typename T8, typename T9, typename T10,
+    typename T11, typename T12, typename T13, typename T14, typename T15,
+    typename T16, typename T17, typename T18, typename T19, typename T20,
+    typename T21, typename T22, typename T23, typename T24, typename T25,
+    typename T26, typename T27, typename T28, typename T29, typename T30,
+    typename T31, typename T32, typename T33, typename T34, typename T35>
+struct Types35 {  // Type list of 35 types, exposed as a Head type plus a Tail list.
+  typedef T1 Head;  // First type in the list.
+  typedef Types34<T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, T15,
+      T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28, T29,
+      T30, T31, T32, T33, T34, T35> Tail;  // Remaining 34 types (T2..T35) as a Types34 list.
+};
+
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+    typename T6, typename T7, typename T8, typename T9, typename T10,
+    typename T11, typename T12, typename T13, typename T14, typename T15,
+    typename T16, typename T17, typename T18, typename T19, typename T20,
+    typename T21, typename T22, typename T23, typename T24, typename T25,
+    typename T26, typename T27, typename T28, typename T29, typename T30,
+    typename T31, typename T32, typename T33, typename T34, typename T35,
+    typename T36>
+struct Types36 {  // Type list of 36 types, exposed as a Head type plus a Tail list.
+  typedef T1 Head;  // First type in the list.
+  typedef Types35<T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, T15,
+      T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28, T29,
+      T30, T31, T32, T33, T34, T35, T36> Tail;  // Remaining 35 types (T2..T36) as a Types35 list.
+};
+
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+    typename T6, typename T7, typename T8, typename T9, typename T10,
+    typename T11, typename T12, typename T13, typename T14, typename T15,
+    typename T16, typename T17, typename T18, typename T19, typename T20,
+    typename T21, typename T22, typename T23, typename T24, typename T25,
+    typename T26, typename T27, typename T28, typename T29, typename T30,
+    typename T31, typename T32, typename T33, typename T34, typename T35,
+    typename T36, typename T37>
+struct Types37 {  // Type list of 37 types, exposed as a Head type plus a Tail list.
+  typedef T1 Head;  // First type in the list.
+  typedef Types36<T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, T15,
+      T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28, T29,
+      T30, T31, T32, T33, T34, T35, T36, T37> Tail;  // Remaining 36 types (T2..T37) as a Types36 list.
+};
+
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+    typename T6, typename T7, typename T8, typename T9, typename T10,
+    typename T11, typename T12, typename T13, typename T14, typename T15,
+    typename T16, typename T17, typename T18, typename T19, typename T20,
+    typename T21, typename T22, typename T23, typename T24, typename T25,
+    typename T26, typename T27, typename T28, typename T29, typename T30,
+    typename T31, typename T32, typename T33, typename T34, typename T35,
+    typename T36, typename T37, typename T38>
+struct Types38 {  // Type list of 38 types, exposed as a Head type plus a Tail list.
+  typedef T1 Head;  // First type in the list.
+  typedef Types37<T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, T15,
+      T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28, T29,
+      T30, T31, T32, T33, T34, T35, T36, T37, T38> Tail;  // Remaining 37 types (T2..T38) as a Types37 list.
+};
+
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+    typename T6, typename T7, typename T8, typename T9, typename T10,
+    typename T11, typename T12, typename T13, typename T14, typename T15,
+    typename T16, typename T17, typename T18, typename T19, typename T20,
+    typename T21, typename T22, typename T23, typename T24, typename T25,
+    typename T26, typename T27, typename T28, typename T29, typename T30,
+    typename T31, typename T32, typename T33, typename T34, typename T35,
+    typename T36, typename T37, typename T38, typename T39>
+struct Types39 {  // Type list of 39 types, exposed as a Head type plus a Tail list.
+  typedef T1 Head;  // First type in the list.
+  typedef Types38<T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, T15,
+      T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28, T29,
+      T30, T31, T32, T33, T34, T35, T36, T37, T38, T39> Tail;  // Remaining 38 types (T2..T39) as a Types38 list.
+};
+
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+    typename T6, typename T7, typename T8, typename T9, typename T10,
+    typename T11, typename T12, typename T13, typename T14, typename T15,
+    typename T16, typename T17, typename T18, typename T19, typename T20,
+    typename T21, typename T22, typename T23, typename T24, typename T25,
+    typename T26, typename T27, typename T28, typename T29, typename T30,
+    typename T31, typename T32, typename T33, typename T34, typename T35,
+    typename T36, typename T37, typename T38, typename T39, typename T40>
+struct Types40 {  // Type list of 40 types, exposed as a Head type plus a Tail list.
+  typedef T1 Head;  // First type in the list.
+  typedef Types39<T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, T15,
+      T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28, T29,
+      T30, T31, T32, T33, T34, T35, T36, T37, T38, T39, T40> Tail;  // Remaining 39 types (T2..T40) as a Types39 list.
+};
+
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+    typename T6, typename T7, typename T8, typename T9, typename T10,
+    typename T11, typename T12, typename T13, typename T14, typename T15,
+    typename T16, typename T17, typename T18, typename T19, typename T20,
+    typename T21, typename T22, typename T23, typename T24, typename T25,
+    typename T26, typename T27, typename T28, typename T29, typename T30,
+    typename T31, typename T32, typename T33, typename T34, typename T35,
+    typename T36, typename T37, typename T38, typename T39, typename T40,
+    typename T41>
+struct Types41 {  // Type list of 41 types, exposed as a Head type plus a Tail list.
+  typedef T1 Head;  // First type in the list.
+  typedef Types40<T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, T15,
+      T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28, T29,
+      T30, T31, T32, T33, T34, T35, T36, T37, T38, T39, T40, T41> Tail;  // Remaining 40 types (T2..T41) as a Types40 list.
+};
+
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+    typename T6, typename T7, typename T8, typename T9, typename T10,
+    typename T11, typename T12, typename T13, typename T14, typename T15,
+    typename T16, typename T17, typename T18, typename T19, typename T20,
+    typename T21, typename T22, typename T23, typename T24, typename T25,
+    typename T26, typename T27, typename T28, typename T29, typename T30,
+    typename T31, typename T32, typename T33, typename T34, typename T35,
+    typename T36, typename T37, typename T38, typename T39, typename T40,
+    typename T41, typename T42>
+struct Types42 {  // Type list of 42 types, exposed as a Head type plus a Tail list.
+  typedef T1 Head;  // First type in the list.
+  typedef Types41<T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, T15,
+      T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28, T29,
+      T30, T31, T32, T33, T34, T35, T36, T37, T38, T39, T40, T41, T42> Tail;  // Remaining 41 types (T2..T42) as a Types41 list.
+};
+
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+    typename T6, typename T7, typename T8, typename T9, typename T10,
+    typename T11, typename T12, typename T13, typename T14, typename T15,
+    typename T16, typename T17, typename T18, typename T19, typename T20,
+    typename T21, typename T22, typename T23, typename T24, typename T25,
+    typename T26, typename T27, typename T28, typename T29, typename T30,
+    typename T31, typename T32, typename T33, typename T34, typename T35,
+    typename T36, typename T37, typename T38, typename T39, typename T40,
+    typename T41, typename T42, typename T43>
+struct Types43 {  // Type list of 43 types, exposed as a Head type plus a Tail list.
+  typedef T1 Head;  // First type in the list.
+  typedef Types42<T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, T15,
+      T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28, T29,
+      T30, T31, T32, T33, T34, T35, T36, T37, T38, T39, T40, T41, T42,
+      T43> Tail;  // Remaining 42 types (T2..T43) as a Types42 list.
+};
+
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+    typename T6, typename T7, typename T8, typename T9, typename T10,
+    typename T11, typename T12, typename T13, typename T14, typename T15,
+    typename T16, typename T17, typename T18, typename T19, typename T20,
+    typename T21, typename T22, typename T23, typename T24, typename T25,
+    typename T26, typename T27, typename T28, typename T29, typename T30,
+    typename T31, typename T32, typename T33, typename T34, typename T35,
+    typename T36, typename T37, typename T38, typename T39, typename T40,
+    typename T41, typename T42, typename T43, typename T44>
+struct Types44 {  // Type list of 44 types, exposed as a Head type plus a Tail list.
+  typedef T1 Head;  // First type in the list.
+  typedef Types43<T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, T15,
+      T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28, T29,
+      T30, T31, T32, T33, T34, T35, T36, T37, T38, T39, T40, T41, T42, T43,
+      T44> Tail;  // Remaining 43 types (T2..T44) as a Types43 list.
+};
+
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+    typename T6, typename T7, typename T8, typename T9, typename T10,
+    typename T11, typename T12, typename T13, typename T14, typename T15,
+    typename T16, typename T17, typename T18, typename T19, typename T20,
+    typename T21, typename T22, typename T23, typename T24, typename T25,
+    typename T26, typename T27, typename T28, typename T29, typename T30,
+    typename T31, typename T32, typename T33, typename T34, typename T35,
+    typename T36, typename T37, typename T38, typename T39, typename T40,
+    typename T41, typename T42, typename T43, typename T44, typename T45>
+struct Types45 {  // Type list of 45 types, exposed as a Head type plus a Tail list.
+  typedef T1 Head;  // First type in the list.
+  typedef Types44<T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, T15,
+      T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28, T29,
+      T30, T31, T32, T33, T34, T35, T36, T37, T38, T39, T40, T41, T42, T43,
+      T44, T45> Tail;  // Remaining 44 types (T2..T45) as a Types44 list.
+};
+
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+    typename T6, typename T7, typename T8, typename T9, typename T10,
+    typename T11, typename T12, typename T13, typename T14, typename T15,
+    typename T16, typename T17, typename T18, typename T19, typename T20,
+    typename T21, typename T22, typename T23, typename T24, typename T25,
+    typename T26, typename T27, typename T28, typename T29, typename T30,
+    typename T31, typename T32, typename T33, typename T34, typename T35,
+    typename T36, typename T37, typename T38, typename T39, typename T40,
+    typename T41, typename T42, typename T43, typename T44, typename T45,
+    typename T46>
+struct Types46 {  // Type list of 46 types, exposed as a Head type plus a Tail list.
+  typedef T1 Head;  // First type in the list.
+  typedef Types45<T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, T15,
+      T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28, T29,
+      T30, T31, T32, T33, T34, T35, T36, T37, T38, T39, T40, T41, T42, T43,
+      T44, T45, T46> Tail;  // Remaining 45 types (T2..T46) as a Types45 list.
+};
+
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+    typename T6, typename T7, typename T8, typename T9, typename T10,
+    typename T11, typename T12, typename T13, typename T14, typename T15,
+    typename T16, typename T17, typename T18, typename T19, typename T20,
+    typename T21, typename T22, typename T23, typename T24, typename T25,
+    typename T26, typename T27, typename T28, typename T29, typename T30,
+    typename T31, typename T32, typename T33, typename T34, typename T35,
+    typename T36, typename T37, typename T38, typename T39, typename T40,
+    typename T41, typename T42, typename T43, typename T44, typename T45,
+    typename T46, typename T47>
+struct Types47 {  // Type list of 47 types, exposed as a Head type plus a Tail list.
+  typedef T1 Head;  // First type in the list.
+  typedef Types46<T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, T15,
+      T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28, T29,
+      T30, T31, T32, T33, T34, T35, T36, T37, T38, T39, T40, T41, T42, T43,
+      T44, T45, T46, T47> Tail;  // Remaining 46 types (T2..T47) as a Types46 list.
+};
+
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+    typename T6, typename T7, typename T8, typename T9, typename T10,
+    typename T11, typename T12, typename T13, typename T14, typename T15,
+    typename T16, typename T17, typename T18, typename T19, typename T20,
+    typename T21, typename T22, typename T23, typename T24, typename T25,
+    typename T26, typename T27, typename T28, typename T29, typename T30,
+    typename T31, typename T32, typename T33, typename T34, typename T35,
+    typename T36, typename T37, typename T38, typename T39, typename T40,
+    typename T41, typename T42, typename T43, typename T44, typename T45,
+    typename T46, typename T47, typename T48>
+struct Types48 {  // Type list of 48 types, exposed as a Head type plus a Tail list.
+  typedef T1 Head;  // First type in the list.
+  typedef Types47<T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, T15,
+      T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28, T29,
+      T30, T31, T32, T33, T34, T35, T36, T37, T38, T39, T40, T41, T42, T43,
+      T44, T45, T46, T47, T48> Tail;  // Remaining 47 types (T2..T48) as a Types47 list.
+};
+
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+    typename T6, typename T7, typename T8, typename T9, typename T10,
+    typename T11, typename T12, typename T13, typename T14, typename T15,
+    typename T16, typename T17, typename T18, typename T19, typename T20,
+    typename T21, typename T22, typename T23, typename T24, typename T25,
+    typename T26, typename T27, typename T28, typename T29, typename T30,
+    typename T31, typename T32, typename T33, typename T34, typename T35,
+    typename T36, typename T37, typename T38, typename T39, typename T40,
+    typename T41, typename T42, typename T43, typename T44, typename T45,
+    typename T46, typename T47, typename T48, typename T49>
+struct Types49 {  // Type list of 49 types, exposed as a Head type plus a Tail list.
+  typedef T1 Head;  // First type in the list.
+  typedef Types48<T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, T15,
+      T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28, T29,
+      T30, T31, T32, T33, T34, T35, T36, T37, T38, T39, T40, T41, T42, T43,
+      T44, T45, T46, T47, T48, T49> Tail;  // Remaining 48 types (T2..T49) as a Types48 list.
+};
+
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+    typename T6, typename T7, typename T8, typename T9, typename T10,
+    typename T11, typename T12, typename T13, typename T14, typename T15,
+    typename T16, typename T17, typename T18, typename T19, typename T20,
+    typename T21, typename T22, typename T23, typename T24, typename T25,
+    typename T26, typename T27, typename T28, typename T29, typename T30,
+    typename T31, typename T32, typename T33, typename T34, typename T35,
+    typename T36, typename T37, typename T38, typename T39, typename T40,
+    typename T41, typename T42, typename T43, typename T44, typename T45,
+    typename T46, typename T47, typename T48, typename T49, typename T50>
+struct Types50 {  // Type list of 50 types, exposed as a Head type plus a Tail list.
+  typedef T1 Head;  // First type in the list.
+  typedef Types49<T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, T15,
+      T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28, T29,
+      T30, T31, T32, T33, T34, T35, T36, T37, T38, T39, T40, T41, T42, T43,
+      T44, T45, T46, T47, T48, T49, T50> Tail;  // Remaining 49 types (T2..T50) as a Types49 list.
+};
+
+
+}  // namespace internal
+
+// We don't want to require the users to write TypesN<...> directly,
+// as that would require them to count the length.  Types<...> is much
+// easier to write, but generates horrible messages when there is a
+// compiler error, as gcc insists on printing out each template
+// argument, even if it has the default value (this means Types<int>
+// will appear as Types<int, None, None, ..., None> in the compiler
+// errors).
+//
+// Our solution is to combine the best part of the two approaches: a
+// user would write Types<T1, ..., TN>, and Google Test will translate
+// that to TypesN<T1, ..., TN> internally to make error messages
+// readable.  The translation is done by the 'type' member of the
+// Types template.
+template <typename T1 = internal::None, typename T2 = internal::None,
+    typename T3 = internal::None, typename T4 = internal::None,
+    typename T5 = internal::None, typename T6 = internal::None,
+    typename T7 = internal::None, typename T8 = internal::None,
+    typename T9 = internal::None, typename T10 = internal::None,
+    typename T11 = internal::None, typename T12 = internal::None,
+    typename T13 = internal::None, typename T14 = internal::None,
+    typename T15 = internal::None, typename T16 = internal::None,
+    typename T17 = internal::None, typename T18 = internal::None,
+    typename T19 = internal::None, typename T20 = internal::None,
+    typename T21 = internal::None, typename T22 = internal::None,
+    typename T23 = internal::None, typename T24 = internal::None,
+    typename T25 = internal::None, typename T26 = internal::None,
+    typename T27 = internal::None, typename T28 = internal::None,
+    typename T29 = internal::None, typename T30 = internal::None,
+    typename T31 = internal::None, typename T32 = internal::None,
+    typename T33 = internal::None, typename T34 = internal::None,
+    typename T35 = internal::None, typename T36 = internal::None,
+    typename T37 = internal::None, typename T38 = internal::None,
+    typename T39 = internal::None, typename T40 = internal::None,
+    typename T41 = internal::None, typename T42 = internal::None,
+    typename T43 = internal::None, typename T44 = internal::None,
+    typename T45 = internal::None, typename T46 = internal::None,
+    typename T47 = internal::None, typename T48 = internal::None,
+    typename T49 = internal::None, typename T50 = internal::None>
+struct Types {  // Primary template: every parameter defaults to internal::None.
+  typedef internal::Types50<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12,
+      T13, T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26,
+      T27, T28, T29, T30, T31, T32, T33, T34, T35, T36, T37, T38, T39, T40,
+      T41, T42, T43, T44, T45, T46, T47, T48, T49, T50> type;  // Forwards all 50 arguments to internal::Types50.
+};
+
+template <>  // Full specialization: every argument is internal::None.
+struct Types<internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None> {
+  typedef internal::Types0 type;  // Maps to internal::Types0.
+};
+template <typename T1>  // Matches Types<T1>: every later argument is internal::None.
+struct Types<T1, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None> {
+  typedef internal::Types1<T1> type;  // Maps to the one-element list internal::Types1.
+};
+template <typename T1, typename T2>  // Matches Types<T1, T2>: every later argument is internal::None.
+struct Types<T1, T2, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None> {
+  typedef internal::Types2<T1, T2> type;  // Maps to the two-element list internal::Types2.
+};
+template <typename T1, typename T2, typename T3>  // Matches Types<T1, T2, T3>: every later argument is internal::None.
+struct Types<T1, T2, T3, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None> {
+  typedef internal::Types3<T1, T2, T3> type;  // Maps to the three-element list internal::Types3.
+};
+template <typename T1, typename T2, typename T3, typename T4>  // Matches Types<T1, ..., T4>: every later argument is internal::None.
+struct Types<T1, T2, T3, T4, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None> {
+  typedef internal::Types4<T1, T2, T3, T4> type;  // Maps to the four-element list internal::Types4.
+};
+template <typename T1, typename T2, typename T3, typename T4, typename T5>  // Matches Types<T1, ..., T5>: every later argument is internal::None.
+struct Types<T1, T2, T3, T4, T5, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None> {
+  typedef internal::Types5<T1, T2, T3, T4, T5> type;  // Maps to the five-element list internal::Types5.
+};
+template <typename T1, typename T2, typename T3, typename T4, typename T5,  // Matches Types<T1, ..., T6>: every later argument is internal::None.
+    typename T6>
+struct Types<T1, T2, T3, T4, T5, T6, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None> {
+  typedef internal::Types6<T1, T2, T3, T4, T5, T6> type;  // Maps to the six-element list internal::Types6.
+};
+template <typename T1, typename T2, typename T3, typename T4, typename T5,  // Matches Types<T1, ..., T7>: every later argument is internal::None.
+    typename T6, typename T7>
+struct Types<T1, T2, T3, T4, T5, T6, T7, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None> {
+  typedef internal::Types7<T1, T2, T3, T4, T5, T6, T7> type;  // Maps to the seven-element list internal::Types7.
+};
+template <typename T1, typename T2, typename T3, typename T4, typename T5,  // Matches Types<T1, ..., T8>: every later argument is internal::None.
+    typename T6, typename T7, typename T8>
+struct Types<T1, T2, T3, T4, T5, T6, T7, T8, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None> {
+  typedef internal::Types8<T1, T2, T3, T4, T5, T6, T7, T8> type;  // Maps to the eight-element list internal::Types8.
+};
+template <typename T1, typename T2, typename T3, typename T4, typename T5,  // Matches Types<T1, ..., T9>: every later argument is internal::None.
+    typename T6, typename T7, typename T8, typename T9>
+struct Types<T1, T2, T3, T4, T5, T6, T7, T8, T9, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None> {
+  typedef internal::Types9<T1, T2, T3, T4, T5, T6, T7, T8, T9> type;  // Maps to the nine-element list internal::Types9.
+};
+template <typename T1, typename T2, typename T3, typename T4, typename T5,  // Matches Types<T1, ..., T10>: every later argument is internal::None.
+    typename T6, typename T7, typename T8, typename T9, typename T10>
+struct Types<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None> {
+  typedef internal::Types10<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10> type;  // Maps to the ten-element list internal::Types10.
+};
+template <typename T1, typename T2, typename T3, typename T4, typename T5,  // Matches Types<T1, ..., T11>: every later argument is internal::None.
+    typename T6, typename T7, typename T8, typename T9, typename T10,
+    typename T11>
+struct Types<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None> {
+  typedef internal::Types11<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11> type;  // Maps to the 11-element list internal::Types11.
+};
+template <typename T1, typename T2, typename T3, typename T4, typename T5,  // Matches Types<T1, ..., T12>: every later argument is internal::None.
+    typename T6, typename T7, typename T8, typename T9, typename T10,
+    typename T11, typename T12>
+struct Types<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None> {
+  typedef internal::Types12<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11,
+      T12> type;  // Maps to the 12-element list internal::Types12.
+};
+template <typename T1, typename T2, typename T3, typename T4, typename T5,  // Matches Types<T1, ..., T13>: every later argument is internal::None.
+    typename T6, typename T7, typename T8, typename T9, typename T10,
+    typename T11, typename T12, typename T13>
+struct Types<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None> {
+  typedef internal::Types13<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12,
+      T13> type;  // Maps to the 13-element list internal::Types13.
+};
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+    typename T6, typename T7, typename T8, typename T9, typename T10,
+    typename T11, typename T12, typename T13, typename T14>
+struct Types<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None> {  // Selected when every argument after T14 is internal::None.
+  typedef internal::Types14<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12,
+      T13, T14> type;
+};  // Types<T1..T14>::type names internal::Types14<T1..T14>.
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+    typename T6, typename T7, typename T8, typename T9, typename T10,
+    typename T11, typename T12, typename T13, typename T14, typename T15>
+struct Types<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, T15,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None> {  // Selected when every argument after T15 is internal::None.
+  typedef internal::Types15<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12,
+      T13, T14, T15> type;
+};  // Types<T1..T15>::type names internal::Types15<T1..T15>.
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+    typename T6, typename T7, typename T8, typename T9, typename T10,
+    typename T11, typename T12, typename T13, typename T14, typename T15,
+    typename T16>
+struct Types<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, T15,
+    T16, internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None> {  // Selected when every argument after T16 is internal::None.
+  typedef internal::Types16<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12,
+      T13, T14, T15, T16> type;
+};  // Types<T1..T16>::type names internal::Types16<T1..T16>.
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+    typename T6, typename T7, typename T8, typename T9, typename T10,
+    typename T11, typename T12, typename T13, typename T14, typename T15,
+    typename T16, typename T17>
+struct Types<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, T15,
+    T16, T17, internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None> {  // Selected when every argument after T17 is internal::None.
+  typedef internal::Types17<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12,
+      T13, T14, T15, T16, T17> type;
+};  // Types<T1..T17>::type names internal::Types17<T1..T17>.
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+    typename T6, typename T7, typename T8, typename T9, typename T10,
+    typename T11, typename T12, typename T13, typename T14, typename T15,
+    typename T16, typename T17, typename T18>
+struct Types<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, T15,
+    T16, T17, T18, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None> {  // Selected when every argument after T18 is internal::None.
+  typedef internal::Types18<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12,
+      T13, T14, T15, T16, T17, T18> type;
+};  // Types<T1..T18>::type names internal::Types18<T1..T18>.
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+    typename T6, typename T7, typename T8, typename T9, typename T10,
+    typename T11, typename T12, typename T13, typename T14, typename T15,
+    typename T16, typename T17, typename T18, typename T19>
+struct Types<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, T15,
+    T16, T17, T18, T19, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None> {  // Selected when every argument after T19 is internal::None.
+  typedef internal::Types19<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12,
+      T13, T14, T15, T16, T17, T18, T19> type;
+};  // Types<T1..T19>::type names internal::Types19<T1..T19>.
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+    typename T6, typename T7, typename T8, typename T9, typename T10,
+    typename T11, typename T12, typename T13, typename T14, typename T15,
+    typename T16, typename T17, typename T18, typename T19, typename T20>
+struct Types<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, T15,
+    T16, T17, T18, T19, T20, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None> {  // Selected when every argument after T20 is internal::None.
+  typedef internal::Types20<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12,
+      T13, T14, T15, T16, T17, T18, T19, T20> type;
+};  // Types<T1..T20>::type names internal::Types20<T1..T20>.
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+    typename T6, typename T7, typename T8, typename T9, typename T10,
+    typename T11, typename T12, typename T13, typename T14, typename T15,
+    typename T16, typename T17, typename T18, typename T19, typename T20,
+    typename T21>
+struct Types<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, T15,
+    T16, T17, T18, T19, T20, T21, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None> {  // Selected when every argument after T21 is internal::None.
+  typedef internal::Types21<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12,
+      T13, T14, T15, T16, T17, T18, T19, T20, T21> type;
+};  // Types<T1..T21>::type names internal::Types21<T1..T21>.
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+    typename T6, typename T7, typename T8, typename T9, typename T10,
+    typename T11, typename T12, typename T13, typename T14, typename T15,
+    typename T16, typename T17, typename T18, typename T19, typename T20,
+    typename T21, typename T22>
+struct Types<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, T15,
+    T16, T17, T18, T19, T20, T21, T22, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None> {  // Selected when every argument after T22 is internal::None.
+  typedef internal::Types22<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12,
+      T13, T14, T15, T16, T17, T18, T19, T20, T21, T22> type;
+};  // Types<T1..T22>::type names internal::Types22<T1..T22>.
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+    typename T6, typename T7, typename T8, typename T9, typename T10,
+    typename T11, typename T12, typename T13, typename T14, typename T15,
+    typename T16, typename T17, typename T18, typename T19, typename T20,
+    typename T21, typename T22, typename T23>
+struct Types<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, T15,
+    T16, T17, T18, T19, T20, T21, T22, T23, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None> {  // Selected when every argument after T23 is internal::None.
+  typedef internal::Types23<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12,
+      T13, T14, T15, T16, T17, T18, T19, T20, T21, T22, T23> type;
+};  // Types<T1..T23>::type names internal::Types23<T1..T23>.
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+    typename T6, typename T7, typename T8, typename T9, typename T10,
+    typename T11, typename T12, typename T13, typename T14, typename T15,
+    typename T16, typename T17, typename T18, typename T19, typename T20,
+    typename T21, typename T22, typename T23, typename T24>
+struct Types<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, T15,
+    T16, T17, T18, T19, T20, T21, T22, T23, T24, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None> {  // Selected when every argument after T24 is internal::None.
+  typedef internal::Types24<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12,
+      T13, T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24> type;
+};  // Types<T1..T24>::type names internal::Types24<T1..T24>.
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+    typename T6, typename T7, typename T8, typename T9, typename T10,
+    typename T11, typename T12, typename T13, typename T14, typename T15,
+    typename T16, typename T17, typename T18, typename T19, typename T20,
+    typename T21, typename T22, typename T23, typename T24, typename T25>
+struct Types<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, T15,
+    T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None> {  // Selected when every argument after T25 is internal::None.
+  typedef internal::Types25<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12,
+      T13, T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25> type;
+};  // Types<T1..T25>::type names internal::Types25<T1..T25>.
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+    typename T6, typename T7, typename T8, typename T9, typename T10,
+    typename T11, typename T12, typename T13, typename T14, typename T15,
+    typename T16, typename T17, typename T18, typename T19, typename T20,
+    typename T21, typename T22, typename T23, typename T24, typename T25,
+    typename T26>
+struct Types<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, T15,
+    T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None> {  // Selected when every argument after T26 is internal::None.
+  typedef internal::Types26<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12,
+      T13, T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25,
+      T26> type;
+};  // Types<T1..T26>::type names internal::Types26<T1..T26>.
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+    typename T6, typename T7, typename T8, typename T9, typename T10,
+    typename T11, typename T12, typename T13, typename T14, typename T15,
+    typename T16, typename T17, typename T18, typename T19, typename T20,
+    typename T21, typename T22, typename T23, typename T24, typename T25,
+    typename T26, typename T27>
+struct Types<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, T15,
+    T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None> {  // Selected when every argument after T27 is internal::None.
+  typedef internal::Types27<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12,
+      T13, T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26,
+      T27> type;
+};  // Types<T1..T27>::type names internal::Types27<T1..T27>.
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+    typename T6, typename T7, typename T8, typename T9, typename T10,
+    typename T11, typename T12, typename T13, typename T14, typename T15,
+    typename T16, typename T17, typename T18, typename T19, typename T20,
+    typename T21, typename T22, typename T23, typename T24, typename T25,
+    typename T26, typename T27, typename T28>
+struct Types<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, T15,
+    T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None> {  // Selected when every argument after T28 is internal::None.
+  typedef internal::Types28<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12,
+      T13, T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26,
+      T27, T28> type;
+};  // Types<T1..T28>::type names internal::Types28<T1..T28>.
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+    typename T6, typename T7, typename T8, typename T9, typename T10,
+    typename T11, typename T12, typename T13, typename T14, typename T15,
+    typename T16, typename T17, typename T18, typename T19, typename T20,
+    typename T21, typename T22, typename T23, typename T24, typename T25,
+    typename T26, typename T27, typename T28, typename T29>
+struct Types<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, T15,
+    T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28, T29,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None> {  // Selected when every argument after T29 is internal::None.
+  typedef internal::Types29<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12,
+      T13, T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26,
+      T27, T28, T29> type;
+};  // Types<T1..T29>::type names internal::Types29<T1..T29>.
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+    typename T6, typename T7, typename T8, typename T9, typename T10,
+    typename T11, typename T12, typename T13, typename T14, typename T15,
+    typename T16, typename T17, typename T18, typename T19, typename T20,
+    typename T21, typename T22, typename T23, typename T24, typename T25,
+    typename T26, typename T27, typename T28, typename T29, typename T30>
+struct Types<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, T15,
+    T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28, T29, T30,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None> {  // Selected when every argument after T30 is internal::None.
+  typedef internal::Types30<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12,
+      T13, T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26,
+      T27, T28, T29, T30> type;
+};  // Types<T1..T30>::type names internal::Types30<T1..T30>.
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+    typename T6, typename T7, typename T8, typename T9, typename T10,
+    typename T11, typename T12, typename T13, typename T14, typename T15,
+    typename T16, typename T17, typename T18, typename T19, typename T20,
+    typename T21, typename T22, typename T23, typename T24, typename T25,
+    typename T26, typename T27, typename T28, typename T29, typename T30,
+    typename T31>
+struct Types<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, T15,
+    T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28, T29, T30,
+    T31, internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None> {  // Selected when every argument after T31 is internal::None.
+  typedef internal::Types31<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12,
+      T13, T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26,
+      T27, T28, T29, T30, T31> type;
+};  // Types<T1..T31>::type names internal::Types31<T1..T31>.
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+    typename T6, typename T7, typename T8, typename T9, typename T10,
+    typename T11, typename T12, typename T13, typename T14, typename T15,
+    typename T16, typename T17, typename T18, typename T19, typename T20,
+    typename T21, typename T22, typename T23, typename T24, typename T25,
+    typename T26, typename T27, typename T28, typename T29, typename T30,
+    typename T31, typename T32>
+struct Types<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, T15,
+    T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28, T29, T30,
+    T31, T32, internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None> {  // Selected when every argument after T32 is internal::None.
+  typedef internal::Types32<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12,
+      T13, T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26,
+      T27, T28, T29, T30, T31, T32> type;
+};  // Types<T1..T32>::type names internal::Types32<T1..T32>.
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+    typename T6, typename T7, typename T8, typename T9, typename T10,
+    typename T11, typename T12, typename T13, typename T14, typename T15,
+    typename T16, typename T17, typename T18, typename T19, typename T20,
+    typename T21, typename T22, typename T23, typename T24, typename T25,
+    typename T26, typename T27, typename T28, typename T29, typename T30,
+    typename T31, typename T32, typename T33>
+struct Types<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, T15,
+    T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28, T29, T30,
+    T31, T32, T33, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None> {  // Selected when every argument after T33 is internal::None.
+  typedef internal::Types33<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12,
+      T13, T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26,
+      T27, T28, T29, T30, T31, T32, T33> type;
+};  // Types<T1..T33>::type names internal::Types33<T1..T33>.
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+    typename T6, typename T7, typename T8, typename T9, typename T10,
+    typename T11, typename T12, typename T13, typename T14, typename T15,
+    typename T16, typename T17, typename T18, typename T19, typename T20,
+    typename T21, typename T22, typename T23, typename T24, typename T25,
+    typename T26, typename T27, typename T28, typename T29, typename T30,
+    typename T31, typename T32, typename T33, typename T34>
+struct Types<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, T15,
+    T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28, T29, T30,
+    T31, T32, T33, T34, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None> {  // Selected when every argument after T34 is internal::None.
+  typedef internal::Types34<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12,
+      T13, T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26,
+      T27, T28, T29, T30, T31, T32, T33, T34> type;
+};  // Types<T1..T34>::type names internal::Types34<T1..T34>.
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+    typename T6, typename T7, typename T8, typename T9, typename T10,
+    typename T11, typename T12, typename T13, typename T14, typename T15,
+    typename T16, typename T17, typename T18, typename T19, typename T20,
+    typename T21, typename T22, typename T23, typename T24, typename T25,
+    typename T26, typename T27, typename T28, typename T29, typename T30,
+    typename T31, typename T32, typename T33, typename T34, typename T35>
+struct Types<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, T15,
+    T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28, T29, T30,
+    T31, T32, T33, T34, T35, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None> {  // Selected when every argument after T35 is internal::None.
+  typedef internal::Types35<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12,
+      T13, T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26,
+      T27, T28, T29, T30, T31, T32, T33, T34, T35> type;
+};  // Types<T1..T35>::type names internal::Types35<T1..T35>.
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+    typename T6, typename T7, typename T8, typename T9, typename T10,
+    typename T11, typename T12, typename T13, typename T14, typename T15,
+    typename T16, typename T17, typename T18, typename T19, typename T20,
+    typename T21, typename T22, typename T23, typename T24, typename T25,
+    typename T26, typename T27, typename T28, typename T29, typename T30,
+    typename T31, typename T32, typename T33, typename T34, typename T35,
+    typename T36>
+struct Types<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, T15,
+    T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28, T29, T30,
+    T31, T32, T33, T34, T35, T36, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None> {  // Selected when every argument after T36 is internal::None.
+  typedef internal::Types36<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12,
+      T13, T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26,
+      T27, T28, T29, T30, T31, T32, T33, T34, T35, T36> type;
+};  // Types<T1..T36>::type names internal::Types36<T1..T36>.
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+    typename T6, typename T7, typename T8, typename T9, typename T10,
+    typename T11, typename T12, typename T13, typename T14, typename T15,
+    typename T16, typename T17, typename T18, typename T19, typename T20,
+    typename T21, typename T22, typename T23, typename T24, typename T25,
+    typename T26, typename T27, typename T28, typename T29, typename T30,
+    typename T31, typename T32, typename T33, typename T34, typename T35,
+    typename T36, typename T37>
+struct Types<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, T15,
+    T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28, T29, T30,
+    T31, T32, T33, T34, T35, T36, T37, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None> {  // Selected when every argument after T37 is internal::None.
+  typedef internal::Types37<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12,
+      T13, T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26,
+      T27, T28, T29, T30, T31, T32, T33, T34, T35, T36, T37> type;
+};  // Types<T1..T37>::type names internal::Types37<T1..T37>.
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+    typename T6, typename T7, typename T8, typename T9, typename T10,
+    typename T11, typename T12, typename T13, typename T14, typename T15,
+    typename T16, typename T17, typename T18, typename T19, typename T20,
+    typename T21, typename T22, typename T23, typename T24, typename T25,
+    typename T26, typename T27, typename T28, typename T29, typename T30,
+    typename T31, typename T32, typename T33, typename T34, typename T35,
+    typename T36, typename T37, typename T38>
+struct Types<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, T15,
+    T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28, T29, T30,
+    T31, T32, T33, T34, T35, T36, T37, T38, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None> {  // Selected when every argument after T38 is internal::None.
+  typedef internal::Types38<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12,
+      T13, T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26,
+      T27, T28, T29, T30, T31, T32, T33, T34, T35, T36, T37, T38> type;
+};  // Types<T1..T38>::type names internal::Types38<T1..T38>.
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+    typename T6, typename T7, typename T8, typename T9, typename T10,
+    typename T11, typename T12, typename T13, typename T14, typename T15,
+    typename T16, typename T17, typename T18, typename T19, typename T20,
+    typename T21, typename T22, typename T23, typename T24, typename T25,
+    typename T26, typename T27, typename T28, typename T29, typename T30,
+    typename T31, typename T32, typename T33, typename T34, typename T35,
+    typename T36, typename T37, typename T38, typename T39>
+struct Types<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, T15,
+    T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28, T29, T30,
+    T31, T32, T33, T34, T35, T36, T37, T38, T39, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None> {  // Selected when every argument after T39 is internal::None.
+  typedef internal::Types39<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12,
+      T13, T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26,
+      T27, T28, T29, T30, T31, T32, T33, T34, T35, T36, T37, T38, T39> type;
+};  // Types<T1..T39>::type names internal::Types39<T1..T39>.
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+    typename T6, typename T7, typename T8, typename T9, typename T10,
+    typename T11, typename T12, typename T13, typename T14, typename T15,
+    typename T16, typename T17, typename T18, typename T19, typename T20,
+    typename T21, typename T22, typename T23, typename T24, typename T25,
+    typename T26, typename T27, typename T28, typename T29, typename T30,
+    typename T31, typename T32, typename T33, typename T34, typename T35,
+    typename T36, typename T37, typename T38, typename T39, typename T40>
+struct Types<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, T15,
+    T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28, T29, T30,
+    T31, T32, T33, T34, T35, T36, T37, T38, T39, T40, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None> {  // Selected when every argument after T40 is internal::None.
+  typedef internal::Types40<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12,
+      T13, T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26,
+      T27, T28, T29, T30, T31, T32, T33, T34, T35, T36, T37, T38, T39,
+      T40> type;
+};  // Types<T1..T40>::type names internal::Types40<T1..T40>.
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+    typename T6, typename T7, typename T8, typename T9, typename T10,
+    typename T11, typename T12, typename T13, typename T14, typename T15,
+    typename T16, typename T17, typename T18, typename T19, typename T20,
+    typename T21, typename T22, typename T23, typename T24, typename T25,
+    typename T26, typename T27, typename T28, typename T29, typename T30,
+    typename T31, typename T32, typename T33, typename T34, typename T35,
+    typename T36, typename T37, typename T38, typename T39, typename T40,
+    typename T41>
+struct Types<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, T15,
+    T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28, T29, T30,
+    T31, T32, T33, T34, T35, T36, T37, T38, T39, T40, T41, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None, internal::None> {  // Selected when every argument after T41 is internal::None.
+  typedef internal::Types41<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12,
+      T13, T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26,
+      T27, T28, T29, T30, T31, T32, T33, T34, T35, T36, T37, T38, T39, T40,
+      T41> type;
+};  // Types<T1..T41>::type names internal::Types41<T1..T41>.
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+    typename T6, typename T7, typename T8, typename T9, typename T10,
+    typename T11, typename T12, typename T13, typename T14, typename T15,
+    typename T16, typename T17, typename T18, typename T19, typename T20,
+    typename T21, typename T22, typename T23, typename T24, typename T25,
+    typename T26, typename T27, typename T28, typename T29, typename T30,
+    typename T31, typename T32, typename T33, typename T34, typename T35,
+    typename T36, typename T37, typename T38, typename T39, typename T40,
+    typename T41, typename T42>
+struct Types<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, T15,  // partial specialization: 42 leading types
+    T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28, T29, T30,
+    T31, T32, T33, T34, T35, T36, T37, T38, T39, T40, T41, T42, internal::None,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None> {  // the 8 trailing arguments are internal::None
+  typedef internal::Types42<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12,
+      T13, T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26,
+      T27, T28, T29, T30, T31, T32, T33, T34, T35, T36, T37, T38, T39, T40,
+      T41, T42> type;  // 'type' names internal::Types42 instantiated with T1..T42
+};
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+    typename T6, typename T7, typename T8, typename T9, typename T10,
+    typename T11, typename T12, typename T13, typename T14, typename T15,
+    typename T16, typename T17, typename T18, typename T19, typename T20,
+    typename T21, typename T22, typename T23, typename T24, typename T25,
+    typename T26, typename T27, typename T28, typename T29, typename T30,
+    typename T31, typename T32, typename T33, typename T34, typename T35,
+    typename T36, typename T37, typename T38, typename T39, typename T40,
+    typename T41, typename T42, typename T43>
+struct Types<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, T15,  // partial specialization: 43 leading types
+    T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28, T29, T30,
+    T31, T32, T33, T34, T35, T36, T37, T38, T39, T40, T41, T42, T43,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None, internal::None> {  // the 7 trailing arguments are internal::None
+  typedef internal::Types43<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12,
+      T13, T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26,
+      T27, T28, T29, T30, T31, T32, T33, T34, T35, T36, T37, T38, T39, T40,
+      T41, T42, T43> type;  // 'type' names internal::Types43 instantiated with T1..T43
+};
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+    typename T6, typename T7, typename T8, typename T9, typename T10,
+    typename T11, typename T12, typename T13, typename T14, typename T15,
+    typename T16, typename T17, typename T18, typename T19, typename T20,
+    typename T21, typename T22, typename T23, typename T24, typename T25,
+    typename T26, typename T27, typename T28, typename T29, typename T30,
+    typename T31, typename T32, typename T33, typename T34, typename T35,
+    typename T36, typename T37, typename T38, typename T39, typename T40,
+    typename T41, typename T42, typename T43, typename T44>
+struct Types<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, T15,  // partial specialization: 44 leading types
+    T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28, T29, T30,
+    T31, T32, T33, T34, T35, T36, T37, T38, T39, T40, T41, T42, T43, T44,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None, internal::None> {  // the 6 trailing arguments are internal::None
+  typedef internal::Types44<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12,
+      T13, T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26,
+      T27, T28, T29, T30, T31, T32, T33, T34, T35, T36, T37, T38, T39, T40,
+      T41, T42, T43, T44> type;  // 'type' names internal::Types44 instantiated with T1..T44
+};
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+    typename T6, typename T7, typename T8, typename T9, typename T10,
+    typename T11, typename T12, typename T13, typename T14, typename T15,
+    typename T16, typename T17, typename T18, typename T19, typename T20,
+    typename T21, typename T22, typename T23, typename T24, typename T25,
+    typename T26, typename T27, typename T28, typename T29, typename T30,
+    typename T31, typename T32, typename T33, typename T34, typename T35,
+    typename T36, typename T37, typename T38, typename T39, typename T40,
+    typename T41, typename T42, typename T43, typename T44, typename T45>
+struct Types<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, T15,  // partial specialization: 45 leading types
+    T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28, T29, T30,
+    T31, T32, T33, T34, T35, T36, T37, T38, T39, T40, T41, T42, T43, T44, T45,
+    internal::None, internal::None, internal::None, internal::None,
+    internal::None> {  // the 5 trailing arguments are internal::None
+  typedef internal::Types45<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12,
+      T13, T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26,
+      T27, T28, T29, T30, T31, T32, T33, T34, T35, T36, T37, T38, T39, T40,
+      T41, T42, T43, T44, T45> type;  // 'type' names internal::Types45 instantiated with T1..T45
+};
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+    typename T6, typename T7, typename T8, typename T9, typename T10,
+    typename T11, typename T12, typename T13, typename T14, typename T15,
+    typename T16, typename T17, typename T18, typename T19, typename T20,
+    typename T21, typename T22, typename T23, typename T24, typename T25,
+    typename T26, typename T27, typename T28, typename T29, typename T30,
+    typename T31, typename T32, typename T33, typename T34, typename T35,
+    typename T36, typename T37, typename T38, typename T39, typename T40,
+    typename T41, typename T42, typename T43, typename T44, typename T45,
+    typename T46>
+struct Types<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, T15,  // partial specialization: 46 leading types
+    T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28, T29, T30,
+    T31, T32, T33, T34, T35, T36, T37, T38, T39, T40, T41, T42, T43, T44, T45,
+    T46, internal::None, internal::None, internal::None, internal::None> {  // the 4 trailing arguments are internal::None
+  typedef internal::Types46<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12,
+      T13, T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26,
+      T27, T28, T29, T30, T31, T32, T33, T34, T35, T36, T37, T38, T39, T40,
+      T41, T42, T43, T44, T45, T46> type;  // 'type' names internal::Types46 instantiated with T1..T46
+};
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+    typename T6, typename T7, typename T8, typename T9, typename T10,
+    typename T11, typename T12, typename T13, typename T14, typename T15,
+    typename T16, typename T17, typename T18, typename T19, typename T20,
+    typename T21, typename T22, typename T23, typename T24, typename T25,
+    typename T26, typename T27, typename T28, typename T29, typename T30,
+    typename T31, typename T32, typename T33, typename T34, typename T35,
+    typename T36, typename T37, typename T38, typename T39, typename T40,
+    typename T41, typename T42, typename T43, typename T44, typename T45,
+    typename T46, typename T47>
+struct Types<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, T15,  // partial specialization: 47 leading types
+    T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28, T29, T30,
+    T31, T32, T33, T34, T35, T36, T37, T38, T39, T40, T41, T42, T43, T44, T45,
+    T46, T47, internal::None, internal::None, internal::None> {  // the 3 trailing arguments are internal::None
+  typedef internal::Types47<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12,
+      T13, T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26,
+      T27, T28, T29, T30, T31, T32, T33, T34, T35, T36, T37, T38, T39, T40,
+      T41, T42, T43, T44, T45, T46, T47> type;  // 'type' names internal::Types47 instantiated with T1..T47
+};
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+    typename T6, typename T7, typename T8, typename T9, typename T10,
+    typename T11, typename T12, typename T13, typename T14, typename T15,
+    typename T16, typename T17, typename T18, typename T19, typename T20,
+    typename T21, typename T22, typename T23, typename T24, typename T25,
+    typename T26, typename T27, typename T28, typename T29, typename T30,
+    typename T31, typename T32, typename T33, typename T34, typename T35,
+    typename T36, typename T37, typename T38, typename T39, typename T40,
+    typename T41, typename T42, typename T43, typename T44, typename T45,
+    typename T46, typename T47, typename T48>
+struct Types<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, T15,  // partial specialization: 48 leading types
+    T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28, T29, T30,
+    T31, T32, T33, T34, T35, T36, T37, T38, T39, T40, T41, T42, T43, T44, T45,
+    T46, T47, T48, internal::None, internal::None> {  // the 2 trailing arguments are internal::None
+  typedef internal::Types48<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12,
+      T13, T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26,
+      T27, T28, T29, T30, T31, T32, T33, T34, T35, T36, T37, T38, T39, T40,
+      T41, T42, T43, T44, T45, T46, T47, T48> type;  // 'type' names internal::Types48 instantiated with T1..T48
+};
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+    typename T6, typename T7, typename T8, typename T9, typename T10,
+    typename T11, typename T12, typename T13, typename T14, typename T15,
+    typename T16, typename T17, typename T18, typename T19, typename T20,
+    typename T21, typename T22, typename T23, typename T24, typename T25,
+    typename T26, typename T27, typename T28, typename T29, typename T30,
+    typename T31, typename T32, typename T33, typename T34, typename T35,
+    typename T36, typename T37, typename T38, typename T39, typename T40,
+    typename T41, typename T42, typename T43, typename T44, typename T45,
+    typename T46, typename T47, typename T48, typename T49>
+struct Types<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, T15,  // partial specialization: 49 leading types
+    T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28, T29, T30,
+    T31, T32, T33, T34, T35, T36, T37, T38, T39, T40, T41, T42, T43, T44, T45,
+    T46, T47, T48, T49, internal::None> {  // the single trailing argument is internal::None
+  typedef internal::Types49<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12,
+      T13, T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26,
+      T27, T28, T29, T30, T31, T32, T33, T34, T35, T36, T37, T38, T39, T40,
+      T41, T42, T43, T44, T45, T46, T47, T48, T49> type;  // 'type' names internal::Types49 instantiated with T1..T49
+};
+
+namespace internal {
+
+# define GTEST_TEMPLATE_ template <typename T> class  // shorthand for declaring a template-template parameter that takes one type argument
+
+// The template "selector" struct TemplateSel<Tmpl> is used to
+// represent Tmpl, which must be a class template with one type
+// parameter, as a type.  TemplateSel<Tmpl>::Bind<T>::type is defined
+// as the type Tmpl<T>.  This allows us to actually instantiate the
+// template "selected" by TemplateSel<Tmpl>.
+//
+// This trick is necessary for simulating typedef for class templates,
+// which C++ doesn't support directly.
+template <GTEST_TEMPLATE_ Tmpl>  // Tmpl: a class template with one type parameter
+struct TemplateSel {  // wraps the template Tmpl in an ordinary type
+  template <typename T>
+  struct Bind {  // Bind<T> applies the wrapped template to T
+    typedef Tmpl<T> type;  // i.e. TemplateSel<Tmpl>::Bind<T>::type is Tmpl<T>
+  };
+};
+
+# define GTEST_BIND_(TmplSel, T) \
+  TmplSel::template Bind<T>::type  // expands to the member Bind<T>::type of TmplSel; for TmplSel = TemplateSel<Tmpl> that is Tmpl<T>
+
+// A unique struct template used as the default value for the
+// arguments of class template Templates.  This allows us to simulate
+// variadic templates (e.g. Templates<A>, Templates<A, B>, etc., where A
+// and B are class templates), which C++ doesn't support directly.
+template <typename T>
+struct NoneT {};  // empty placeholder template; it has no members
+
+// The following family of structs and struct templates is used to
+// represent template lists.  In particular, TemplatesN<T1, T2, ...,
+// TN> represents a list of N templates (T1, T2, ..., and TN).  Except
+// for Templates0, every struct in the family has two member types:
+// Head for the selector of the first template in the list, and Tail
+// for the rest of the list.
+
+// The empty template list.
+struct Templates0 {};  // terminator of the list family: defines neither Head nor Tail
+
+// Template lists of length 1, 2, 3, and so on.
+
+template <GTEST_TEMPLATE_ T1>
+struct Templates1 {  // list of 1 class template
+  typedef TemplateSel<T1> Head;  // selector for the only template, T1
+  typedef Templates0 Tail;  // nothing remains: the empty list
+};
+template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2>
+struct Templates2 {  // list of 2 class templates
+  typedef TemplateSel<T1> Head;  // selector for the first template, T1
+  typedef Templates1<T2> Tail;  // list of the remaining 1 template
+};
+
+template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3>
+struct Templates3 {  // list of 3 class templates
+  typedef TemplateSel<T1> Head;  // selector for the first template, T1
+  typedef Templates2<T2, T3> Tail;  // list of the remaining 2 templates
+};
+
+template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3,
+    GTEST_TEMPLATE_ T4>
+struct Templates4 {  // list of 4 class templates
+  typedef TemplateSel<T1> Head;  // selector for the first template, T1
+  typedef Templates3<T2, T3, T4> Tail;  // list of the remaining 3 templates
+};
+
+template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3,
+    GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5>
+struct Templates5 {  // list of 5 class templates
+  typedef TemplateSel<T1> Head;  // selector for the first template, T1
+  typedef Templates4<T2, T3, T4, T5> Tail;  // list of the remaining 4 templates
+};
+
+template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3,
+    GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6>
+struct Templates6 {  // list of 6 class templates
+  typedef TemplateSel<T1> Head;  // selector for the first template, T1
+  typedef Templates5<T2, T3, T4, T5, T6> Tail;  // list of the remaining 5 templates
+};
+
+template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3,
+    GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6,
+    GTEST_TEMPLATE_ T7>
+struct Templates7 {  // list of 7 class templates
+  typedef TemplateSel<T1> Head;  // selector for the first template, T1
+  typedef Templates6<T2, T3, T4, T5, T6, T7> Tail;  // list of the remaining 6 templates
+};
+
+template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3,
+    GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6,
+    GTEST_TEMPLATE_ T7, GTEST_TEMPLATE_ T8>
+struct Templates8 {  // list of 8 class templates
+  typedef TemplateSel<T1> Head;  // selector for the first template, T1
+  typedef Templates7<T2, T3, T4, T5, T6, T7, T8> Tail;  // list of the remaining 7 templates
+};
+
+template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3,
+    GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6,
+    GTEST_TEMPLATE_ T7, GTEST_TEMPLATE_ T8, GTEST_TEMPLATE_ T9>
+struct Templates9 {  // list of 9 class templates
+  typedef TemplateSel<T1> Head;  // selector for the first template, T1
+  typedef Templates8<T2, T3, T4, T5, T6, T7, T8, T9> Tail;  // list of the remaining 8 templates
+};
+
+template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3,
+    GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6,
+    GTEST_TEMPLATE_ T7, GTEST_TEMPLATE_ T8, GTEST_TEMPLATE_ T9,
+    GTEST_TEMPLATE_ T10>
+struct Templates10 {  // list of 10 class templates
+  typedef TemplateSel<T1> Head;  // selector for the first template, T1
+  typedef Templates9<T2, T3, T4, T5, T6, T7, T8, T9, T10> Tail;  // list of the remaining 9 templates
+};
+
+template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3,
+    GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6,
+    GTEST_TEMPLATE_ T7, GTEST_TEMPLATE_ T8, GTEST_TEMPLATE_ T9,
+    GTEST_TEMPLATE_ T10, GTEST_TEMPLATE_ T11>
+struct Templates11 {  // list of 11 class templates
+  typedef TemplateSel<T1> Head;  // selector for the first template, T1
+  typedef Templates10<T2, T3, T4, T5, T6, T7, T8, T9, T10, T11> Tail;  // list of the remaining 10 templates
+};
+
+template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3,
+    GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6,
+    GTEST_TEMPLATE_ T7, GTEST_TEMPLATE_ T8, GTEST_TEMPLATE_ T9,
+    GTEST_TEMPLATE_ T10, GTEST_TEMPLATE_ T11, GTEST_TEMPLATE_ T12>
+struct Templates12 {  // list of 12 class templates
+  typedef TemplateSel<T1> Head;  // selector for the first template, T1
+  typedef Templates11<T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12> Tail;  // list of the remaining 11 templates
+};
+
+template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3,
+    GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6,
+    GTEST_TEMPLATE_ T7, GTEST_TEMPLATE_ T8, GTEST_TEMPLATE_ T9,
+    GTEST_TEMPLATE_ T10, GTEST_TEMPLATE_ T11, GTEST_TEMPLATE_ T12,
+    GTEST_TEMPLATE_ T13>
+struct Templates13 {  // list of 13 class templates
+  typedef TemplateSel<T1> Head;  // selector for the first template, T1
+  typedef Templates12<T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13> Tail;  // list of the remaining 12 templates
+};
+
+template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3,
+    GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6,
+    GTEST_TEMPLATE_ T7, GTEST_TEMPLATE_ T8, GTEST_TEMPLATE_ T9,
+    GTEST_TEMPLATE_ T10, GTEST_TEMPLATE_ T11, GTEST_TEMPLATE_ T12,
+    GTEST_TEMPLATE_ T13, GTEST_TEMPLATE_ T14>
+struct Templates14 {  // list of 14 class templates
+  typedef TemplateSel<T1> Head;  // selector for the first template, T1
+  typedef Templates13<T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13,
+      T14> Tail;  // list of the remaining 13 templates
+};
+
+template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3,
+    GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6,
+    GTEST_TEMPLATE_ T7, GTEST_TEMPLATE_ T8, GTEST_TEMPLATE_ T9,
+    GTEST_TEMPLATE_ T10, GTEST_TEMPLATE_ T11, GTEST_TEMPLATE_ T12,
+    GTEST_TEMPLATE_ T13, GTEST_TEMPLATE_ T14, GTEST_TEMPLATE_ T15>
+struct Templates15 {  // list of 15 class templates
+  typedef TemplateSel<T1> Head;  // selector for the first template, T1
+  typedef Templates14<T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14,
+      T15> Tail;  // list of the remaining 14 templates
+};
+
+template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3,
+    GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6,
+    GTEST_TEMPLATE_ T7, GTEST_TEMPLATE_ T8, GTEST_TEMPLATE_ T9,
+    GTEST_TEMPLATE_ T10, GTEST_TEMPLATE_ T11, GTEST_TEMPLATE_ T12,
+    GTEST_TEMPLATE_ T13, GTEST_TEMPLATE_ T14, GTEST_TEMPLATE_ T15,
+    GTEST_TEMPLATE_ T16>
+struct Templates16 {  // list of 16 class templates
+  typedef TemplateSel<T1> Head;  // selector for the first template, T1
+  typedef Templates15<T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14,
+      T15, T16> Tail;  // list of the remaining 15 templates
+};
+
+template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3,
+    GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6,
+    GTEST_TEMPLATE_ T7, GTEST_TEMPLATE_ T8, GTEST_TEMPLATE_ T9,
+    GTEST_TEMPLATE_ T10, GTEST_TEMPLATE_ T11, GTEST_TEMPLATE_ T12,
+    GTEST_TEMPLATE_ T13, GTEST_TEMPLATE_ T14, GTEST_TEMPLATE_ T15,
+    GTEST_TEMPLATE_ T16, GTEST_TEMPLATE_ T17>
+struct Templates17 {  // list of 17 class templates
+  typedef TemplateSel<T1> Head;  // selector for the first template, T1
+  typedef Templates16<T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14,
+      T15, T16, T17> Tail;  // list of the remaining 16 templates
+};
+
+template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3,
+    GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6,
+    GTEST_TEMPLATE_ T7, GTEST_TEMPLATE_ T8, GTEST_TEMPLATE_ T9,
+    GTEST_TEMPLATE_ T10, GTEST_TEMPLATE_ T11, GTEST_TEMPLATE_ T12,
+    GTEST_TEMPLATE_ T13, GTEST_TEMPLATE_ T14, GTEST_TEMPLATE_ T15,
+    GTEST_TEMPLATE_ T16, GTEST_TEMPLATE_ T17, GTEST_TEMPLATE_ T18>
+struct Templates18 {  // list of 18 class templates
+  typedef TemplateSel<T1> Head;  // selector for the first template, T1
+  typedef Templates17<T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14,
+      T15, T16, T17, T18> Tail;  // list of the remaining 17 templates
+};
+
+template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3,
+    GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6,
+    GTEST_TEMPLATE_ T7, GTEST_TEMPLATE_ T8, GTEST_TEMPLATE_ T9,
+    GTEST_TEMPLATE_ T10, GTEST_TEMPLATE_ T11, GTEST_TEMPLATE_ T12,
+    GTEST_TEMPLATE_ T13, GTEST_TEMPLATE_ T14, GTEST_TEMPLATE_ T15,
+    GTEST_TEMPLATE_ T16, GTEST_TEMPLATE_ T17, GTEST_TEMPLATE_ T18,
+    GTEST_TEMPLATE_ T19>
+struct Templates19 {  // list of 19 class templates
+  typedef TemplateSel<T1> Head;  // selector for the first template, T1
+  typedef Templates18<T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14,
+      T15, T16, T17, T18, T19> Tail;  // list of the remaining 18 templates
+};
+
+template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3,
+    GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6,
+    GTEST_TEMPLATE_ T7, GTEST_TEMPLATE_ T8, GTEST_TEMPLATE_ T9,
+    GTEST_TEMPLATE_ T10, GTEST_TEMPLATE_ T11, GTEST_TEMPLATE_ T12,
+    GTEST_TEMPLATE_ T13, GTEST_TEMPLATE_ T14, GTEST_TEMPLATE_ T15,
+    GTEST_TEMPLATE_ T16, GTEST_TEMPLATE_ T17, GTEST_TEMPLATE_ T18,
+    GTEST_TEMPLATE_ T19, GTEST_TEMPLATE_ T20>
+struct Templates20 {  // list of 20 class templates
+  typedef TemplateSel<T1> Head;  // selector for the first template, T1
+  typedef Templates19<T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14,
+      T15, T16, T17, T18, T19, T20> Tail;  // list of the remaining 19 templates
+};
+
+template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3,
+    GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6,
+    GTEST_TEMPLATE_ T7, GTEST_TEMPLATE_ T8, GTEST_TEMPLATE_ T9,
+    GTEST_TEMPLATE_ T10, GTEST_TEMPLATE_ T11, GTEST_TEMPLATE_ T12,
+    GTEST_TEMPLATE_ T13, GTEST_TEMPLATE_ T14, GTEST_TEMPLATE_ T15,
+    GTEST_TEMPLATE_ T16, GTEST_TEMPLATE_ T17, GTEST_TEMPLATE_ T18,
+    GTEST_TEMPLATE_ T19, GTEST_TEMPLATE_ T20, GTEST_TEMPLATE_ T21>
+struct Templates21 {  // list of 21 class templates
+  typedef TemplateSel<T1> Head;  // selector for the first template, T1
+  typedef Templates20<T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14,
+      T15, T16, T17, T18, T19, T20, T21> Tail;  // list of the remaining 20 templates
+};
+
+template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3,
+    GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6,
+    GTEST_TEMPLATE_ T7, GTEST_TEMPLATE_ T8, GTEST_TEMPLATE_ T9,
+    GTEST_TEMPLATE_ T10, GTEST_TEMPLATE_ T11, GTEST_TEMPLATE_ T12,
+    GTEST_TEMPLATE_ T13, GTEST_TEMPLATE_ T14, GTEST_TEMPLATE_ T15,
+    GTEST_TEMPLATE_ T16, GTEST_TEMPLATE_ T17, GTEST_TEMPLATE_ T18,
+    GTEST_TEMPLATE_ T19, GTEST_TEMPLATE_ T20, GTEST_TEMPLATE_ T21,
+    GTEST_TEMPLATE_ T22>
+struct Templates22 {  // list of 22 class templates
+  typedef TemplateSel<T1> Head;  // selector for the first template, T1
+  typedef Templates21<T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14,
+      T15, T16, T17, T18, T19, T20, T21, T22> Tail;  // list of the remaining 21 templates
+};
+
+template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3,
+    GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6,
+    GTEST_TEMPLATE_ T7, GTEST_TEMPLATE_ T8, GTEST_TEMPLATE_ T9,
+    GTEST_TEMPLATE_ T10, GTEST_TEMPLATE_ T11, GTEST_TEMPLATE_ T12,
+    GTEST_TEMPLATE_ T13, GTEST_TEMPLATE_ T14, GTEST_TEMPLATE_ T15,
+    GTEST_TEMPLATE_ T16, GTEST_TEMPLATE_ T17, GTEST_TEMPLATE_ T18,
+    GTEST_TEMPLATE_ T19, GTEST_TEMPLATE_ T20, GTEST_TEMPLATE_ T21,
+    GTEST_TEMPLATE_ T22, GTEST_TEMPLATE_ T23>
+struct Templates23 {  // list of 23 class templates
+  typedef TemplateSel<T1> Head;  // selector for the first template, T1
+  typedef Templates22<T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14,
+      T15, T16, T17, T18, T19, T20, T21, T22, T23> Tail;  // list of the remaining 22 templates
+};
+
+template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3,
+    GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6,
+    GTEST_TEMPLATE_ T7, GTEST_TEMPLATE_ T8, GTEST_TEMPLATE_ T9,
+    GTEST_TEMPLATE_ T10, GTEST_TEMPLATE_ T11, GTEST_TEMPLATE_ T12,
+    GTEST_TEMPLATE_ T13, GTEST_TEMPLATE_ T14, GTEST_TEMPLATE_ T15,
+    GTEST_TEMPLATE_ T16, GTEST_TEMPLATE_ T17, GTEST_TEMPLATE_ T18,
+    GTEST_TEMPLATE_ T19, GTEST_TEMPLATE_ T20, GTEST_TEMPLATE_ T21,
+    GTEST_TEMPLATE_ T22, GTEST_TEMPLATE_ T23, GTEST_TEMPLATE_ T24>
+struct Templates24 {  // list of 24 class templates
+  typedef TemplateSel<T1> Head;  // selector for the first template, T1
+  typedef Templates23<T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14,
+      T15, T16, T17, T18, T19, T20, T21, T22, T23, T24> Tail;  // list of the remaining 23 templates
+};
+
+template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3,
+    GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6,
+    GTEST_TEMPLATE_ T7, GTEST_TEMPLATE_ T8, GTEST_TEMPLATE_ T9,
+    GTEST_TEMPLATE_ T10, GTEST_TEMPLATE_ T11, GTEST_TEMPLATE_ T12,
+    GTEST_TEMPLATE_ T13, GTEST_TEMPLATE_ T14, GTEST_TEMPLATE_ T15,
+    GTEST_TEMPLATE_ T16, GTEST_TEMPLATE_ T17, GTEST_TEMPLATE_ T18,
+    GTEST_TEMPLATE_ T19, GTEST_TEMPLATE_ T20, GTEST_TEMPLATE_ T21,
+    GTEST_TEMPLATE_ T22, GTEST_TEMPLATE_ T23, GTEST_TEMPLATE_ T24,
+    GTEST_TEMPLATE_ T25>
+struct Templates25 {  // list of 25 class templates
+  typedef TemplateSel<T1> Head;  // selector for the first template, T1
+  typedef Templates24<T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14,
+      T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25> Tail;  // list of the remaining 24 templates
+};
+
+template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3,
+    GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6,
+    GTEST_TEMPLATE_ T7, GTEST_TEMPLATE_ T8, GTEST_TEMPLATE_ T9,
+    GTEST_TEMPLATE_ T10, GTEST_TEMPLATE_ T11, GTEST_TEMPLATE_ T12,
+    GTEST_TEMPLATE_ T13, GTEST_TEMPLATE_ T14, GTEST_TEMPLATE_ T15,
+    GTEST_TEMPLATE_ T16, GTEST_TEMPLATE_ T17, GTEST_TEMPLATE_ T18,
+    GTEST_TEMPLATE_ T19, GTEST_TEMPLATE_ T20, GTEST_TEMPLATE_ T21,
+    GTEST_TEMPLATE_ T22, GTEST_TEMPLATE_ T23, GTEST_TEMPLATE_ T24,
+    GTEST_TEMPLATE_ T25, GTEST_TEMPLATE_ T26>
+struct Templates26 {  // list of 26 class templates
+  typedef TemplateSel<T1> Head;  // selector for the first template, T1
+  typedef Templates25<T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14,
+      T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26> Tail;  // list of the remaining 25 templates
+};
+
+template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3,
+    GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6,
+    GTEST_TEMPLATE_ T7, GTEST_TEMPLATE_ T8, GTEST_TEMPLATE_ T9,
+    GTEST_TEMPLATE_ T10, GTEST_TEMPLATE_ T11, GTEST_TEMPLATE_ T12,
+    GTEST_TEMPLATE_ T13, GTEST_TEMPLATE_ T14, GTEST_TEMPLATE_ T15,
+    GTEST_TEMPLATE_ T16, GTEST_TEMPLATE_ T17, GTEST_TEMPLATE_ T18,
+    GTEST_TEMPLATE_ T19, GTEST_TEMPLATE_ T20, GTEST_TEMPLATE_ T21,
+    GTEST_TEMPLATE_ T22, GTEST_TEMPLATE_ T23, GTEST_TEMPLATE_ T24,
+    GTEST_TEMPLATE_ T25, GTEST_TEMPLATE_ T26, GTEST_TEMPLATE_ T27>
+struct Templates27 {  // list of 27 class templates
+  typedef TemplateSel<T1> Head;  // selector for the first template, T1
+  typedef Templates26<T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14,
+      T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27> Tail;  // list of the remaining 26 templates
+};
+
+template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3,
+    GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6,
+    GTEST_TEMPLATE_ T7, GTEST_TEMPLATE_ T8, GTEST_TEMPLATE_ T9,
+    GTEST_TEMPLATE_ T10, GTEST_TEMPLATE_ T11, GTEST_TEMPLATE_ T12,
+    GTEST_TEMPLATE_ T13, GTEST_TEMPLATE_ T14, GTEST_TEMPLATE_ T15,
+    GTEST_TEMPLATE_ T16, GTEST_TEMPLATE_ T17, GTEST_TEMPLATE_ T18,
+    GTEST_TEMPLATE_ T19, GTEST_TEMPLATE_ T20, GTEST_TEMPLATE_ T21,
+    GTEST_TEMPLATE_ T22, GTEST_TEMPLATE_ T23, GTEST_TEMPLATE_ T24,
+    GTEST_TEMPLATE_ T25, GTEST_TEMPLATE_ T26, GTEST_TEMPLATE_ T27,
+    GTEST_TEMPLATE_ T28>
+struct Templates28 {  // list of 28 class templates
+  typedef TemplateSel<T1> Head;  // selector for the first template, T1
+  typedef Templates27<T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14,
+      T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27,
+      T28> Tail;  // list of the remaining 27 templates
+};
+
+template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3,
+    GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6,
+    GTEST_TEMPLATE_ T7, GTEST_TEMPLATE_ T8, GTEST_TEMPLATE_ T9,
+    GTEST_TEMPLATE_ T10, GTEST_TEMPLATE_ T11, GTEST_TEMPLATE_ T12,
+    GTEST_TEMPLATE_ T13, GTEST_TEMPLATE_ T14, GTEST_TEMPLATE_ T15,
+    GTEST_TEMPLATE_ T16, GTEST_TEMPLATE_ T17, GTEST_TEMPLATE_ T18,
+    GTEST_TEMPLATE_ T19, GTEST_TEMPLATE_ T20, GTEST_TEMPLATE_ T21,
+    GTEST_TEMPLATE_ T22, GTEST_TEMPLATE_ T23, GTEST_TEMPLATE_ T24,
+    GTEST_TEMPLATE_ T25, GTEST_TEMPLATE_ T26, GTEST_TEMPLATE_ T27,
+    GTEST_TEMPLATE_ T28, GTEST_TEMPLATE_ T29>
+struct Templates29 {  // list of 29 class templates
+  typedef TemplateSel<T1> Head;  // selector for the first template, T1
+  typedef Templates28<T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14,
+      T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28,
+      T29> Tail;  // list of the remaining 28 templates
+};
+
+template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3,
+    GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6,
+    GTEST_TEMPLATE_ T7, GTEST_TEMPLATE_ T8, GTEST_TEMPLATE_ T9,
+    GTEST_TEMPLATE_ T10, GTEST_TEMPLATE_ T11, GTEST_TEMPLATE_ T12,
+    GTEST_TEMPLATE_ T13, GTEST_TEMPLATE_ T14, GTEST_TEMPLATE_ T15,
+    GTEST_TEMPLATE_ T16, GTEST_TEMPLATE_ T17, GTEST_TEMPLATE_ T18,
+    GTEST_TEMPLATE_ T19, GTEST_TEMPLATE_ T20, GTEST_TEMPLATE_ T21,
+    GTEST_TEMPLATE_ T22, GTEST_TEMPLATE_ T23, GTEST_TEMPLATE_ T24,
+    GTEST_TEMPLATE_ T25, GTEST_TEMPLATE_ T26, GTEST_TEMPLATE_ T27,
+    GTEST_TEMPLATE_ T28, GTEST_TEMPLATE_ T29, GTEST_TEMPLATE_ T30>
+struct Templates30 {  // list of 30 class templates
+  typedef TemplateSel<T1> Head;  // selector for the first template, T1
+  typedef Templates29<T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14,
+      T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28,
+      T29, T30> Tail;  // list of the remaining 29 templates
+};
+
+template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3,
+    GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6,
+    GTEST_TEMPLATE_ T7, GTEST_TEMPLATE_ T8, GTEST_TEMPLATE_ T9,
+    GTEST_TEMPLATE_ T10, GTEST_TEMPLATE_ T11, GTEST_TEMPLATE_ T12,
+    GTEST_TEMPLATE_ T13, GTEST_TEMPLATE_ T14, GTEST_TEMPLATE_ T15,
+    GTEST_TEMPLATE_ T16, GTEST_TEMPLATE_ T17, GTEST_TEMPLATE_ T18,
+    GTEST_TEMPLATE_ T19, GTEST_TEMPLATE_ T20, GTEST_TEMPLATE_ T21,
+    GTEST_TEMPLATE_ T22, GTEST_TEMPLATE_ T23, GTEST_TEMPLATE_ T24,
+    GTEST_TEMPLATE_ T25, GTEST_TEMPLATE_ T26, GTEST_TEMPLATE_ T27,
+    GTEST_TEMPLATE_ T28, GTEST_TEMPLATE_ T29, GTEST_TEMPLATE_ T30,
+    GTEST_TEMPLATE_ T31>
+struct Templates31 {  // list of 31 class templates
+  typedef TemplateSel<T1> Head;  // selector for the first template, T1
+  typedef Templates30<T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14,
+      T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28,
+      T29, T30, T31> Tail;  // list of the remaining 30 templates
+};
+
+template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3,
+    GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6,
+    GTEST_TEMPLATE_ T7, GTEST_TEMPLATE_ T8, GTEST_TEMPLATE_ T9,
+    GTEST_TEMPLATE_ T10, GTEST_TEMPLATE_ T11, GTEST_TEMPLATE_ T12,
+    GTEST_TEMPLATE_ T13, GTEST_TEMPLATE_ T14, GTEST_TEMPLATE_ T15,
+    GTEST_TEMPLATE_ T16, GTEST_TEMPLATE_ T17, GTEST_TEMPLATE_ T18,
+    GTEST_TEMPLATE_ T19, GTEST_TEMPLATE_ T20, GTEST_TEMPLATE_ T21,
+    GTEST_TEMPLATE_ T22, GTEST_TEMPLATE_ T23, GTEST_TEMPLATE_ T24,
+    GTEST_TEMPLATE_ T25, GTEST_TEMPLATE_ T26, GTEST_TEMPLATE_ T27,
+    GTEST_TEMPLATE_ T28, GTEST_TEMPLATE_ T29, GTEST_TEMPLATE_ T30,
+    GTEST_TEMPLATE_ T31, GTEST_TEMPLATE_ T32>
+struct Templates32 {  // list of 32 class templates
+  typedef TemplateSel<T1> Head;  // selector for the first template, T1
+  typedef Templates31<T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14,
+      T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28,
+      T29, T30, T31, T32> Tail;  // list of the remaining 31 templates
+};
+
+template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3,
+    GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6,
+    GTEST_TEMPLATE_ T7, GTEST_TEMPLATE_ T8, GTEST_TEMPLATE_ T9,
+    GTEST_TEMPLATE_ T10, GTEST_TEMPLATE_ T11, GTEST_TEMPLATE_ T12,
+    GTEST_TEMPLATE_ T13, GTEST_TEMPLATE_ T14, GTEST_TEMPLATE_ T15,
+    GTEST_TEMPLATE_ T16, GTEST_TEMPLATE_ T17, GTEST_TEMPLATE_ T18,
+    GTEST_TEMPLATE_ T19, GTEST_TEMPLATE_ T20, GTEST_TEMPLATE_ T21,
+    GTEST_TEMPLATE_ T22, GTEST_TEMPLATE_ T23, GTEST_TEMPLATE_ T24,
+    GTEST_TEMPLATE_ T25, GTEST_TEMPLATE_ T26, GTEST_TEMPLATE_ T27,
+    GTEST_TEMPLATE_ T28, GTEST_TEMPLATE_ T29, GTEST_TEMPLATE_ T30,
+    GTEST_TEMPLATE_ T31, GTEST_TEMPLATE_ T32, GTEST_TEMPLATE_ T33>
+struct Templates33 {  // list of 33 class templates
+  typedef TemplateSel<T1> Head;  // selector for the first template, T1
+  typedef Templates32<T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14,
+      T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28,
+      T29, T30, T31, T32, T33> Tail;  // list of the remaining 32 templates
+};
+
+template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3,
+    GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6,
+    GTEST_TEMPLATE_ T7, GTEST_TEMPLATE_ T8, GTEST_TEMPLATE_ T9,
+    GTEST_TEMPLATE_ T10, GTEST_TEMPLATE_ T11, GTEST_TEMPLATE_ T12,
+    GTEST_TEMPLATE_ T13, GTEST_TEMPLATE_ T14, GTEST_TEMPLATE_ T15,
+    GTEST_TEMPLATE_ T16, GTEST_TEMPLATE_ T17, GTEST_TEMPLATE_ T18,
+    GTEST_TEMPLATE_ T19, GTEST_TEMPLATE_ T20, GTEST_TEMPLATE_ T21,
+    GTEST_TEMPLATE_ T22, GTEST_TEMPLATE_ T23, GTEST_TEMPLATE_ T24,
+    GTEST_TEMPLATE_ T25, GTEST_TEMPLATE_ T26, GTEST_TEMPLATE_ T27,
+    GTEST_TEMPLATE_ T28, GTEST_TEMPLATE_ T29, GTEST_TEMPLATE_ T30,
+    GTEST_TEMPLATE_ T31, GTEST_TEMPLATE_ T32, GTEST_TEMPLATE_ T33,
+    GTEST_TEMPLATE_ T34>
+struct Templates34 {  // list of 34 class templates
+  typedef TemplateSel<T1> Head;  // selector for the first template, T1
+  typedef Templates33<T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14,
+      T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28,
+      T29, T30, T31, T32, T33, T34> Tail;  // list of the remaining 33 templates
+};
+
+template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3,
+    GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6,
+    GTEST_TEMPLATE_ T7, GTEST_TEMPLATE_ T8, GTEST_TEMPLATE_ T9,
+    GTEST_TEMPLATE_ T10, GTEST_TEMPLATE_ T11, GTEST_TEMPLATE_ T12,
+    GTEST_TEMPLATE_ T13, GTEST_TEMPLATE_ T14, GTEST_TEMPLATE_ T15,
+    GTEST_TEMPLATE_ T16, GTEST_TEMPLATE_ T17, GTEST_TEMPLATE_ T18,
+    GTEST_TEMPLATE_ T19, GTEST_TEMPLATE_ T20, GTEST_TEMPLATE_ T21,
+    GTEST_TEMPLATE_ T22, GTEST_TEMPLATE_ T23, GTEST_TEMPLATE_ T24,
+    GTEST_TEMPLATE_ T25, GTEST_TEMPLATE_ T26, GTEST_TEMPLATE_ T27,
+    GTEST_TEMPLATE_ T28, GTEST_TEMPLATE_ T29, GTEST_TEMPLATE_ T30,
+    GTEST_TEMPLATE_ T31, GTEST_TEMPLATE_ T32, GTEST_TEMPLATE_ T33,
+    GTEST_TEMPLATE_ T34, GTEST_TEMPLATE_ T35>
+struct Templates35 {  // list of 35 class templates
+  typedef TemplateSel<T1> Head;  // selector for the first template, T1
+  typedef Templates34<T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14,
+      T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28,
+      T29, T30, T31, T32, T33, T34, T35> Tail;  // list of the remaining 34 templates
+};
+
+template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3,
+    GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6,
+    GTEST_TEMPLATE_ T7, GTEST_TEMPLATE_ T8, GTEST_TEMPLATE_ T9,
+    GTEST_TEMPLATE_ T10, GTEST_TEMPLATE_ T11, GTEST_TEMPLATE_ T12,
+    GTEST_TEMPLATE_ T13, GTEST_TEMPLATE_ T14, GTEST_TEMPLATE_ T15,
+    GTEST_TEMPLATE_ T16, GTEST_TEMPLATE_ T17, GTEST_TEMPLATE_ T18,
+    GTEST_TEMPLATE_ T19, GTEST_TEMPLATE_ T20, GTEST_TEMPLATE_ T21,
+    GTEST_TEMPLATE_ T22, GTEST_TEMPLATE_ T23, GTEST_TEMPLATE_ T24,
+    GTEST_TEMPLATE_ T25, GTEST_TEMPLATE_ T26, GTEST_TEMPLATE_ T27,
+    GTEST_TEMPLATE_ T28, GTEST_TEMPLATE_ T29, GTEST_TEMPLATE_ T30,
+    GTEST_TEMPLATE_ T31, GTEST_TEMPLATE_ T32, GTEST_TEMPLATE_ T33,
+    GTEST_TEMPLATE_ T34, GTEST_TEMPLATE_ T35, GTEST_TEMPLATE_ T36>
+struct Templates36 {  // 36-template list: Head wraps T1, Tail holds T2..T36.
+  typedef TemplateSel<T1> Head;
+  typedef Templates35<T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14,
+      T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28,
+      T29, T30, T31, T32, T33, T34, T35, T36> Tail;
+};  // Tail is the Templates35 list, i.e. this list minus its Head.
+
+template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3,
+    GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6,
+    GTEST_TEMPLATE_ T7, GTEST_TEMPLATE_ T8, GTEST_TEMPLATE_ T9,
+    GTEST_TEMPLATE_ T10, GTEST_TEMPLATE_ T11, GTEST_TEMPLATE_ T12,
+    GTEST_TEMPLATE_ T13, GTEST_TEMPLATE_ T14, GTEST_TEMPLATE_ T15,
+    GTEST_TEMPLATE_ T16, GTEST_TEMPLATE_ T17, GTEST_TEMPLATE_ T18,
+    GTEST_TEMPLATE_ T19, GTEST_TEMPLATE_ T20, GTEST_TEMPLATE_ T21,
+    GTEST_TEMPLATE_ T22, GTEST_TEMPLATE_ T23, GTEST_TEMPLATE_ T24,
+    GTEST_TEMPLATE_ T25, GTEST_TEMPLATE_ T26, GTEST_TEMPLATE_ T27,
+    GTEST_TEMPLATE_ T28, GTEST_TEMPLATE_ T29, GTEST_TEMPLATE_ T30,
+    GTEST_TEMPLATE_ T31, GTEST_TEMPLATE_ T32, GTEST_TEMPLATE_ T33,
+    GTEST_TEMPLATE_ T34, GTEST_TEMPLATE_ T35, GTEST_TEMPLATE_ T36,
+    GTEST_TEMPLATE_ T37>
+struct Templates37 {  // 37-template list: Head wraps T1, Tail holds T2..T37.
+  typedef TemplateSel<T1> Head;
+  typedef Templates36<T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14,
+      T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28,
+      T29, T30, T31, T32, T33, T34, T35, T36, T37> Tail;
+};  // Tail is the Templates36 list, i.e. this list minus its Head.
+
+template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3,
+    GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6,
+    GTEST_TEMPLATE_ T7, GTEST_TEMPLATE_ T8, GTEST_TEMPLATE_ T9,
+    GTEST_TEMPLATE_ T10, GTEST_TEMPLATE_ T11, GTEST_TEMPLATE_ T12,
+    GTEST_TEMPLATE_ T13, GTEST_TEMPLATE_ T14, GTEST_TEMPLATE_ T15,
+    GTEST_TEMPLATE_ T16, GTEST_TEMPLATE_ T17, GTEST_TEMPLATE_ T18,
+    GTEST_TEMPLATE_ T19, GTEST_TEMPLATE_ T20, GTEST_TEMPLATE_ T21,
+    GTEST_TEMPLATE_ T22, GTEST_TEMPLATE_ T23, GTEST_TEMPLATE_ T24,
+    GTEST_TEMPLATE_ T25, GTEST_TEMPLATE_ T26, GTEST_TEMPLATE_ T27,
+    GTEST_TEMPLATE_ T28, GTEST_TEMPLATE_ T29, GTEST_TEMPLATE_ T30,
+    GTEST_TEMPLATE_ T31, GTEST_TEMPLATE_ T32, GTEST_TEMPLATE_ T33,
+    GTEST_TEMPLATE_ T34, GTEST_TEMPLATE_ T35, GTEST_TEMPLATE_ T36,
+    GTEST_TEMPLATE_ T37, GTEST_TEMPLATE_ T38>
+struct Templates38 {  // 38-template list: Head wraps T1, Tail holds T2..T38.
+  typedef TemplateSel<T1> Head;
+  typedef Templates37<T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14,
+      T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28,
+      T29, T30, T31, T32, T33, T34, T35, T36, T37, T38> Tail;
+};  // Tail is the Templates37 list, i.e. this list minus its Head.
+
+template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3,
+    GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6,
+    GTEST_TEMPLATE_ T7, GTEST_TEMPLATE_ T8, GTEST_TEMPLATE_ T9,
+    GTEST_TEMPLATE_ T10, GTEST_TEMPLATE_ T11, GTEST_TEMPLATE_ T12,
+    GTEST_TEMPLATE_ T13, GTEST_TEMPLATE_ T14, GTEST_TEMPLATE_ T15,
+    GTEST_TEMPLATE_ T16, GTEST_TEMPLATE_ T17, GTEST_TEMPLATE_ T18,
+    GTEST_TEMPLATE_ T19, GTEST_TEMPLATE_ T20, GTEST_TEMPLATE_ T21,
+    GTEST_TEMPLATE_ T22, GTEST_TEMPLATE_ T23, GTEST_TEMPLATE_ T24,
+    GTEST_TEMPLATE_ T25, GTEST_TEMPLATE_ T26, GTEST_TEMPLATE_ T27,
+    GTEST_TEMPLATE_ T28, GTEST_TEMPLATE_ T29, GTEST_TEMPLATE_ T30,
+    GTEST_TEMPLATE_ T31, GTEST_TEMPLATE_ T32, GTEST_TEMPLATE_ T33,
+    GTEST_TEMPLATE_ T34, GTEST_TEMPLATE_ T35, GTEST_TEMPLATE_ T36,
+    GTEST_TEMPLATE_ T37, GTEST_TEMPLATE_ T38, GTEST_TEMPLATE_ T39>
+struct Templates39 {  // 39-template list: Head wraps T1, Tail holds T2..T39.
+  typedef TemplateSel<T1> Head;
+  typedef Templates38<T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14,
+      T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28,
+      T29, T30, T31, T32, T33, T34, T35, T36, T37, T38, T39> Tail;
+};  // Tail is the Templates38 list, i.e. this list minus its Head.
+
+template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3,
+    GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6,
+    GTEST_TEMPLATE_ T7, GTEST_TEMPLATE_ T8, GTEST_TEMPLATE_ T9,
+    GTEST_TEMPLATE_ T10, GTEST_TEMPLATE_ T11, GTEST_TEMPLATE_ T12,
+    GTEST_TEMPLATE_ T13, GTEST_TEMPLATE_ T14, GTEST_TEMPLATE_ T15,
+    GTEST_TEMPLATE_ T16, GTEST_TEMPLATE_ T17, GTEST_TEMPLATE_ T18,
+    GTEST_TEMPLATE_ T19, GTEST_TEMPLATE_ T20, GTEST_TEMPLATE_ T21,
+    GTEST_TEMPLATE_ T22, GTEST_TEMPLATE_ T23, GTEST_TEMPLATE_ T24,
+    GTEST_TEMPLATE_ T25, GTEST_TEMPLATE_ T26, GTEST_TEMPLATE_ T27,
+    GTEST_TEMPLATE_ T28, GTEST_TEMPLATE_ T29, GTEST_TEMPLATE_ T30,
+    GTEST_TEMPLATE_ T31, GTEST_TEMPLATE_ T32, GTEST_TEMPLATE_ T33,
+    GTEST_TEMPLATE_ T34, GTEST_TEMPLATE_ T35, GTEST_TEMPLATE_ T36,
+    GTEST_TEMPLATE_ T37, GTEST_TEMPLATE_ T38, GTEST_TEMPLATE_ T39,
+    GTEST_TEMPLATE_ T40>
+struct Templates40 {  // 40-template list: Head wraps T1, Tail holds T2..T40.
+  typedef TemplateSel<T1> Head;
+  typedef Templates39<T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14,
+      T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28,
+      T29, T30, T31, T32, T33, T34, T35, T36, T37, T38, T39, T40> Tail;
+};  // Tail is the Templates39 list, i.e. this list minus its Head.
+
+template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3,
+    GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6,
+    GTEST_TEMPLATE_ T7, GTEST_TEMPLATE_ T8, GTEST_TEMPLATE_ T9,
+    GTEST_TEMPLATE_ T10, GTEST_TEMPLATE_ T11, GTEST_TEMPLATE_ T12,
+    GTEST_TEMPLATE_ T13, GTEST_TEMPLATE_ T14, GTEST_TEMPLATE_ T15,
+    GTEST_TEMPLATE_ T16, GTEST_TEMPLATE_ T17, GTEST_TEMPLATE_ T18,
+    GTEST_TEMPLATE_ T19, GTEST_TEMPLATE_ T20, GTEST_TEMPLATE_ T21,
+    GTEST_TEMPLATE_ T22, GTEST_TEMPLATE_ T23, GTEST_TEMPLATE_ T24,
+    GTEST_TEMPLATE_ T25, GTEST_TEMPLATE_ T26, GTEST_TEMPLATE_ T27,
+    GTEST_TEMPLATE_ T28, GTEST_TEMPLATE_ T29, GTEST_TEMPLATE_ T30,
+    GTEST_TEMPLATE_ T31, GTEST_TEMPLATE_ T32, GTEST_TEMPLATE_ T33,
+    GTEST_TEMPLATE_ T34, GTEST_TEMPLATE_ T35, GTEST_TEMPLATE_ T36,
+    GTEST_TEMPLATE_ T37, GTEST_TEMPLATE_ T38, GTEST_TEMPLATE_ T39,
+    GTEST_TEMPLATE_ T40, GTEST_TEMPLATE_ T41>
+struct Templates41 {  // 41-template list: Head wraps T1, Tail holds T2..T41.
+  typedef TemplateSel<T1> Head;
+  typedef Templates40<T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14,
+      T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28,
+      T29, T30, T31, T32, T33, T34, T35, T36, T37, T38, T39, T40, T41> Tail;
+};  // Tail is the Templates40 list, i.e. this list minus its Head.
+
+template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3,
+    GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6,
+    GTEST_TEMPLATE_ T7, GTEST_TEMPLATE_ T8, GTEST_TEMPLATE_ T9,
+    GTEST_TEMPLATE_ T10, GTEST_TEMPLATE_ T11, GTEST_TEMPLATE_ T12,
+    GTEST_TEMPLATE_ T13, GTEST_TEMPLATE_ T14, GTEST_TEMPLATE_ T15,
+    GTEST_TEMPLATE_ T16, GTEST_TEMPLATE_ T17, GTEST_TEMPLATE_ T18,
+    GTEST_TEMPLATE_ T19, GTEST_TEMPLATE_ T20, GTEST_TEMPLATE_ T21,
+    GTEST_TEMPLATE_ T22, GTEST_TEMPLATE_ T23, GTEST_TEMPLATE_ T24,
+    GTEST_TEMPLATE_ T25, GTEST_TEMPLATE_ T26, GTEST_TEMPLATE_ T27,
+    GTEST_TEMPLATE_ T28, GTEST_TEMPLATE_ T29, GTEST_TEMPLATE_ T30,
+    GTEST_TEMPLATE_ T31, GTEST_TEMPLATE_ T32, GTEST_TEMPLATE_ T33,
+    GTEST_TEMPLATE_ T34, GTEST_TEMPLATE_ T35, GTEST_TEMPLATE_ T36,
+    GTEST_TEMPLATE_ T37, GTEST_TEMPLATE_ T38, GTEST_TEMPLATE_ T39,
+    GTEST_TEMPLATE_ T40, GTEST_TEMPLATE_ T41, GTEST_TEMPLATE_ T42>
+struct Templates42 {  // 42-template list: Head wraps T1, Tail holds T2..T42.
+  typedef TemplateSel<T1> Head;
+  typedef Templates41<T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14,
+      T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28,
+      T29, T30, T31, T32, T33, T34, T35, T36, T37, T38, T39, T40, T41,
+      T42> Tail;
+};  // Tail is the Templates41 list, i.e. this list minus its Head.
+
+template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3,
+    GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6,
+    GTEST_TEMPLATE_ T7, GTEST_TEMPLATE_ T8, GTEST_TEMPLATE_ T9,
+    GTEST_TEMPLATE_ T10, GTEST_TEMPLATE_ T11, GTEST_TEMPLATE_ T12,
+    GTEST_TEMPLATE_ T13, GTEST_TEMPLATE_ T14, GTEST_TEMPLATE_ T15,
+    GTEST_TEMPLATE_ T16, GTEST_TEMPLATE_ T17, GTEST_TEMPLATE_ T18,
+    GTEST_TEMPLATE_ T19, GTEST_TEMPLATE_ T20, GTEST_TEMPLATE_ T21,
+    GTEST_TEMPLATE_ T22, GTEST_TEMPLATE_ T23, GTEST_TEMPLATE_ T24,
+    GTEST_TEMPLATE_ T25, GTEST_TEMPLATE_ T26, GTEST_TEMPLATE_ T27,
+    GTEST_TEMPLATE_ T28, GTEST_TEMPLATE_ T29, GTEST_TEMPLATE_ T30,
+    GTEST_TEMPLATE_ T31, GTEST_TEMPLATE_ T32, GTEST_TEMPLATE_ T33,
+    GTEST_TEMPLATE_ T34, GTEST_TEMPLATE_ T35, GTEST_TEMPLATE_ T36,
+    GTEST_TEMPLATE_ T37, GTEST_TEMPLATE_ T38, GTEST_TEMPLATE_ T39,
+    GTEST_TEMPLATE_ T40, GTEST_TEMPLATE_ T41, GTEST_TEMPLATE_ T42,
+    GTEST_TEMPLATE_ T43>
+struct Templates43 {  // 43-template list: Head wraps T1, Tail holds T2..T43.
+  typedef TemplateSel<T1> Head;
+  typedef Templates42<T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14,
+      T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28,
+      T29, T30, T31, T32, T33, T34, T35, T36, T37, T38, T39, T40, T41, T42,
+      T43> Tail;
+};  // Tail is the Templates42 list, i.e. this list minus its Head.
+
+template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3,
+    GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6,
+    GTEST_TEMPLATE_ T7, GTEST_TEMPLATE_ T8, GTEST_TEMPLATE_ T9,
+    GTEST_TEMPLATE_ T10, GTEST_TEMPLATE_ T11, GTEST_TEMPLATE_ T12,
+    GTEST_TEMPLATE_ T13, GTEST_TEMPLATE_ T14, GTEST_TEMPLATE_ T15,
+    GTEST_TEMPLATE_ T16, GTEST_TEMPLATE_ T17, GTEST_TEMPLATE_ T18,
+    GTEST_TEMPLATE_ T19, GTEST_TEMPLATE_ T20, GTEST_TEMPLATE_ T21,
+    GTEST_TEMPLATE_ T22, GTEST_TEMPLATE_ T23, GTEST_TEMPLATE_ T24,
+    GTEST_TEMPLATE_ T25, GTEST_TEMPLATE_ T26, GTEST_TEMPLATE_ T27,
+    GTEST_TEMPLATE_ T28, GTEST_TEMPLATE_ T29, GTEST_TEMPLATE_ T30,
+    GTEST_TEMPLATE_ T31, GTEST_TEMPLATE_ T32, GTEST_TEMPLATE_ T33,
+    GTEST_TEMPLATE_ T34, GTEST_TEMPLATE_ T35, GTEST_TEMPLATE_ T36,
+    GTEST_TEMPLATE_ T37, GTEST_TEMPLATE_ T38, GTEST_TEMPLATE_ T39,
+    GTEST_TEMPLATE_ T40, GTEST_TEMPLATE_ T41, GTEST_TEMPLATE_ T42,
+    GTEST_TEMPLATE_ T43, GTEST_TEMPLATE_ T44>
+struct Templates44 {  // 44-template list: Head wraps T1, Tail holds T2..T44.
+  typedef TemplateSel<T1> Head;
+  typedef Templates43<T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14,
+      T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28,
+      T29, T30, T31, T32, T33, T34, T35, T36, T37, T38, T39, T40, T41, T42,
+      T43, T44> Tail;
+};  // Tail is the Templates43 list, i.e. this list minus its Head.
+
+template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3,
+    GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6,
+    GTEST_TEMPLATE_ T7, GTEST_TEMPLATE_ T8, GTEST_TEMPLATE_ T9,
+    GTEST_TEMPLATE_ T10, GTEST_TEMPLATE_ T11, GTEST_TEMPLATE_ T12,
+    GTEST_TEMPLATE_ T13, GTEST_TEMPLATE_ T14, GTEST_TEMPLATE_ T15,
+    GTEST_TEMPLATE_ T16, GTEST_TEMPLATE_ T17, GTEST_TEMPLATE_ T18,
+    GTEST_TEMPLATE_ T19, GTEST_TEMPLATE_ T20, GTEST_TEMPLATE_ T21,
+    GTEST_TEMPLATE_ T22, GTEST_TEMPLATE_ T23, GTEST_TEMPLATE_ T24,
+    GTEST_TEMPLATE_ T25, GTEST_TEMPLATE_ T26, GTEST_TEMPLATE_ T27,
+    GTEST_TEMPLATE_ T28, GTEST_TEMPLATE_ T29, GTEST_TEMPLATE_ T30,
+    GTEST_TEMPLATE_ T31, GTEST_TEMPLATE_ T32, GTEST_TEMPLATE_ T33,
+    GTEST_TEMPLATE_ T34, GTEST_TEMPLATE_ T35, GTEST_TEMPLATE_ T36,
+    GTEST_TEMPLATE_ T37, GTEST_TEMPLATE_ T38, GTEST_TEMPLATE_ T39,
+    GTEST_TEMPLATE_ T40, GTEST_TEMPLATE_ T41, GTEST_TEMPLATE_ T42,
+    GTEST_TEMPLATE_ T43, GTEST_TEMPLATE_ T44, GTEST_TEMPLATE_ T45>
+struct Templates45 {  // 45-template list: Head wraps T1, Tail holds T2..T45.
+  typedef TemplateSel<T1> Head;
+  typedef Templates44<T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14,
+      T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28,
+      T29, T30, T31, T32, T33, T34, T35, T36, T37, T38, T39, T40, T41, T42,
+      T43, T44, T45> Tail;
+};  // Tail is the Templates44 list, i.e. this list minus its Head.
+
+template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3,
+    GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6,
+    GTEST_TEMPLATE_ T7, GTEST_TEMPLATE_ T8, GTEST_TEMPLATE_ T9,
+    GTEST_TEMPLATE_ T10, GTEST_TEMPLATE_ T11, GTEST_TEMPLATE_ T12,
+    GTEST_TEMPLATE_ T13, GTEST_TEMPLATE_ T14, GTEST_TEMPLATE_ T15,
+    GTEST_TEMPLATE_ T16, GTEST_TEMPLATE_ T17, GTEST_TEMPLATE_ T18,
+    GTEST_TEMPLATE_ T19, GTEST_TEMPLATE_ T20, GTEST_TEMPLATE_ T21,
+    GTEST_TEMPLATE_ T22, GTEST_TEMPLATE_ T23, GTEST_TEMPLATE_ T24,
+    GTEST_TEMPLATE_ T25, GTEST_TEMPLATE_ T26, GTEST_TEMPLATE_ T27,
+    GTEST_TEMPLATE_ T28, GTEST_TEMPLATE_ T29, GTEST_TEMPLATE_ T30,
+    GTEST_TEMPLATE_ T31, GTEST_TEMPLATE_ T32, GTEST_TEMPLATE_ T33,
+    GTEST_TEMPLATE_ T34, GTEST_TEMPLATE_ T35, GTEST_TEMPLATE_ T36,
+    GTEST_TEMPLATE_ T37, GTEST_TEMPLATE_ T38, GTEST_TEMPLATE_ T39,
+    GTEST_TEMPLATE_ T40, GTEST_TEMPLATE_ T41, GTEST_TEMPLATE_ T42,
+    GTEST_TEMPLATE_ T43, GTEST_TEMPLATE_ T44, GTEST_TEMPLATE_ T45,
+    GTEST_TEMPLATE_ T46>
+struct Templates46 {  // 46-template list: Head wraps T1, Tail holds T2..T46.
+  typedef TemplateSel<T1> Head;
+  typedef Templates45<T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14,
+      T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28,
+      T29, T30, T31, T32, T33, T34, T35, T36, T37, T38, T39, T40, T41, T42,
+      T43, T44, T45, T46> Tail;
+};  // Tail is the Templates45 list, i.e. this list minus its Head.
+
+template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3,
+    GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6,
+    GTEST_TEMPLATE_ T7, GTEST_TEMPLATE_ T8, GTEST_TEMPLATE_ T9,
+    GTEST_TEMPLATE_ T10, GTEST_TEMPLATE_ T11, GTEST_TEMPLATE_ T12,
+    GTEST_TEMPLATE_ T13, GTEST_TEMPLATE_ T14, GTEST_TEMPLATE_ T15,
+    GTEST_TEMPLATE_ T16, GTEST_TEMPLATE_ T17, GTEST_TEMPLATE_ T18,
+    GTEST_TEMPLATE_ T19, GTEST_TEMPLATE_ T20, GTEST_TEMPLATE_ T21,
+    GTEST_TEMPLATE_ T22, GTEST_TEMPLATE_ T23, GTEST_TEMPLATE_ T24,
+    GTEST_TEMPLATE_ T25, GTEST_TEMPLATE_ T26, GTEST_TEMPLATE_ T27,
+    GTEST_TEMPLATE_ T28, GTEST_TEMPLATE_ T29, GTEST_TEMPLATE_ T30,
+    GTEST_TEMPLATE_ T31, GTEST_TEMPLATE_ T32, GTEST_TEMPLATE_ T33,
+    GTEST_TEMPLATE_ T34, GTEST_TEMPLATE_ T35, GTEST_TEMPLATE_ T36,
+    GTEST_TEMPLATE_ T37, GTEST_TEMPLATE_ T38, GTEST_TEMPLATE_ T39,
+    GTEST_TEMPLATE_ T40, GTEST_TEMPLATE_ T41, GTEST_TEMPLATE_ T42,
+    GTEST_TEMPLATE_ T43, GTEST_TEMPLATE_ T44, GTEST_TEMPLATE_ T45,
+    GTEST_TEMPLATE_ T46, GTEST_TEMPLATE_ T47>
+struct Templates47 {  // 47-template list: Head wraps T1, Tail holds T2..T47.
+  typedef TemplateSel<T1> Head;
+  typedef Templates46<T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14,
+      T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28,
+      T29, T30, T31, T32, T33, T34, T35, T36, T37, T38, T39, T40, T41, T42,
+      T43, T44, T45, T46, T47> Tail;
+};  // Tail is the Templates46 list, i.e. this list minus its Head.
+
+template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3,
+    GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6,
+    GTEST_TEMPLATE_ T7, GTEST_TEMPLATE_ T8, GTEST_TEMPLATE_ T9,
+    GTEST_TEMPLATE_ T10, GTEST_TEMPLATE_ T11, GTEST_TEMPLATE_ T12,
+    GTEST_TEMPLATE_ T13, GTEST_TEMPLATE_ T14, GTEST_TEMPLATE_ T15,
+    GTEST_TEMPLATE_ T16, GTEST_TEMPLATE_ T17, GTEST_TEMPLATE_ T18,
+    GTEST_TEMPLATE_ T19, GTEST_TEMPLATE_ T20, GTEST_TEMPLATE_ T21,
+    GTEST_TEMPLATE_ T22, GTEST_TEMPLATE_ T23, GTEST_TEMPLATE_ T24,
+    GTEST_TEMPLATE_ T25, GTEST_TEMPLATE_ T26, GTEST_TEMPLATE_ T27,
+    GTEST_TEMPLATE_ T28, GTEST_TEMPLATE_ T29, GTEST_TEMPLATE_ T30,
+    GTEST_TEMPLATE_ T31, GTEST_TEMPLATE_ T32, GTEST_TEMPLATE_ T33,
+    GTEST_TEMPLATE_ T34, GTEST_TEMPLATE_ T35, GTEST_TEMPLATE_ T36,
+    GTEST_TEMPLATE_ T37, GTEST_TEMPLATE_ T38, GTEST_TEMPLATE_ T39,
+    GTEST_TEMPLATE_ T40, GTEST_TEMPLATE_ T41, GTEST_TEMPLATE_ T42,
+    GTEST_TEMPLATE_ T43, GTEST_TEMPLATE_ T44, GTEST_TEMPLATE_ T45,
+    GTEST_TEMPLATE_ T46, GTEST_TEMPLATE_ T47, GTEST_TEMPLATE_ T48>
+struct Templates48 {  // 48-template list: Head wraps T1, Tail holds T2..T48.
+  typedef TemplateSel<T1> Head;
+  typedef Templates47<T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14,
+      T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28,
+      T29, T30, T31, T32, T33, T34, T35, T36, T37, T38, T39, T40, T41, T42,
+      T43, T44, T45, T46, T47, T48> Tail;
+};  // Tail is the Templates47 list, i.e. this list minus its Head.
+
+template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3,
+    GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6,
+    GTEST_TEMPLATE_ T7, GTEST_TEMPLATE_ T8, GTEST_TEMPLATE_ T9,
+    GTEST_TEMPLATE_ T10, GTEST_TEMPLATE_ T11, GTEST_TEMPLATE_ T12,
+    GTEST_TEMPLATE_ T13, GTEST_TEMPLATE_ T14, GTEST_TEMPLATE_ T15,
+    GTEST_TEMPLATE_ T16, GTEST_TEMPLATE_ T17, GTEST_TEMPLATE_ T18,
+    GTEST_TEMPLATE_ T19, GTEST_TEMPLATE_ T20, GTEST_TEMPLATE_ T21,
+    GTEST_TEMPLATE_ T22, GTEST_TEMPLATE_ T23, GTEST_TEMPLATE_ T24,
+    GTEST_TEMPLATE_ T25, GTEST_TEMPLATE_ T26, GTEST_TEMPLATE_ T27,
+    GTEST_TEMPLATE_ T28, GTEST_TEMPLATE_ T29, GTEST_TEMPLATE_ T30,
+    GTEST_TEMPLATE_ T31, GTEST_TEMPLATE_ T32, GTEST_TEMPLATE_ T33,
+    GTEST_TEMPLATE_ T34, GTEST_TEMPLATE_ T35, GTEST_TEMPLATE_ T36,
+    GTEST_TEMPLATE_ T37, GTEST_TEMPLATE_ T38, GTEST_TEMPLATE_ T39,
+    GTEST_TEMPLATE_ T40, GTEST_TEMPLATE_ T41, GTEST_TEMPLATE_ T42,
+    GTEST_TEMPLATE_ T43, GTEST_TEMPLATE_ T44, GTEST_TEMPLATE_ T45,
+    GTEST_TEMPLATE_ T46, GTEST_TEMPLATE_ T47, GTEST_TEMPLATE_ T48,
+    GTEST_TEMPLATE_ T49>
+struct Templates49 {  // 49-template list: Head wraps T1, Tail holds T2..T49.
+  typedef TemplateSel<T1> Head;
+  typedef Templates48<T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14,
+      T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28,
+      T29, T30, T31, T32, T33, T34, T35, T36, T37, T38, T39, T40, T41, T42,
+      T43, T44, T45, T46, T47, T48, T49> Tail;
+};  // Tail is the Templates48 list, i.e. this list minus its Head.
+
+template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3,
+    GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6,
+    GTEST_TEMPLATE_ T7, GTEST_TEMPLATE_ T8, GTEST_TEMPLATE_ T9,
+    GTEST_TEMPLATE_ T10, GTEST_TEMPLATE_ T11, GTEST_TEMPLATE_ T12,
+    GTEST_TEMPLATE_ T13, GTEST_TEMPLATE_ T14, GTEST_TEMPLATE_ T15,
+    GTEST_TEMPLATE_ T16, GTEST_TEMPLATE_ T17, GTEST_TEMPLATE_ T18,
+    GTEST_TEMPLATE_ T19, GTEST_TEMPLATE_ T20, GTEST_TEMPLATE_ T21,
+    GTEST_TEMPLATE_ T22, GTEST_TEMPLATE_ T23, GTEST_TEMPLATE_ T24,
+    GTEST_TEMPLATE_ T25, GTEST_TEMPLATE_ T26, GTEST_TEMPLATE_ T27,
+    GTEST_TEMPLATE_ T28, GTEST_TEMPLATE_ T29, GTEST_TEMPLATE_ T30,
+    GTEST_TEMPLATE_ T31, GTEST_TEMPLATE_ T32, GTEST_TEMPLATE_ T33,
+    GTEST_TEMPLATE_ T34, GTEST_TEMPLATE_ T35, GTEST_TEMPLATE_ T36,
+    GTEST_TEMPLATE_ T37, GTEST_TEMPLATE_ T38, GTEST_TEMPLATE_ T39,
+    GTEST_TEMPLATE_ T40, GTEST_TEMPLATE_ T41, GTEST_TEMPLATE_ T42,
+    GTEST_TEMPLATE_ T43, GTEST_TEMPLATE_ T44, GTEST_TEMPLATE_ T45,
+    GTEST_TEMPLATE_ T46, GTEST_TEMPLATE_ T47, GTEST_TEMPLATE_ T48,
+    GTEST_TEMPLATE_ T49, GTEST_TEMPLATE_ T50>
+struct Templates50 {  // 50-template list: Head wraps T1, Tail holds T2..T50.
+  typedef TemplateSel<T1> Head;
+  typedef Templates49<T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14,
+      T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28,
+      T29, T30, T31, T32, T33, T34, T35, T36, T37, T38, T39, T40, T41, T42,
+      T43, T44, T45, T46, T47, T48, T49, T50> Tail;
+};  // Tail is the Templates49 list, i.e. this list minus its Head.
+
+
+// We don't want to require the users to write TemplatesN<...> directly,
+// as that would require them to count the length.  Templates<...> is much
+// easier to write, but generates horrible messages when there is a
+// compiler error, as gcc insists on printing out each template
+// argument, even if it has the default value (this means Templates<list>
+// will appear as Templates<list, NoneT, NoneT, ..., NoneT> in the compiler
+// errors).
+//
+// Our solution is to combine the best part of the two approaches: a
+// user would write Templates<T1, ..., TN>, and Google Test will translate
+// that to TemplatesN<T1, ..., TN> internally to make error messages
+// readable.  The translation is done by the 'type' member of the
+// Templates template.
+template <GTEST_TEMPLATE_ T1 = NoneT, GTEST_TEMPLATE_ T2 = NoneT,
+    GTEST_TEMPLATE_ T3 = NoneT, GTEST_TEMPLATE_ T4 = NoneT,
+    GTEST_TEMPLATE_ T5 = NoneT, GTEST_TEMPLATE_ T6 = NoneT,
+    GTEST_TEMPLATE_ T7 = NoneT, GTEST_TEMPLATE_ T8 = NoneT,
+    GTEST_TEMPLATE_ T9 = NoneT, GTEST_TEMPLATE_ T10 = NoneT,
+    GTEST_TEMPLATE_ T11 = NoneT, GTEST_TEMPLATE_ T12 = NoneT,
+    GTEST_TEMPLATE_ T13 = NoneT, GTEST_TEMPLATE_ T14 = NoneT,
+    GTEST_TEMPLATE_ T15 = NoneT, GTEST_TEMPLATE_ T16 = NoneT,
+    GTEST_TEMPLATE_ T17 = NoneT, GTEST_TEMPLATE_ T18 = NoneT,
+    GTEST_TEMPLATE_ T19 = NoneT, GTEST_TEMPLATE_ T20 = NoneT,
+    GTEST_TEMPLATE_ T21 = NoneT, GTEST_TEMPLATE_ T22 = NoneT,
+    GTEST_TEMPLATE_ T23 = NoneT, GTEST_TEMPLATE_ T24 = NoneT,
+    GTEST_TEMPLATE_ T25 = NoneT, GTEST_TEMPLATE_ T26 = NoneT,
+    GTEST_TEMPLATE_ T27 = NoneT, GTEST_TEMPLATE_ T28 = NoneT,
+    GTEST_TEMPLATE_ T29 = NoneT, GTEST_TEMPLATE_ T30 = NoneT,
+    GTEST_TEMPLATE_ T31 = NoneT, GTEST_TEMPLATE_ T32 = NoneT,
+    GTEST_TEMPLATE_ T33 = NoneT, GTEST_TEMPLATE_ T34 = NoneT,
+    GTEST_TEMPLATE_ T35 = NoneT, GTEST_TEMPLATE_ T36 = NoneT,
+    GTEST_TEMPLATE_ T37 = NoneT, GTEST_TEMPLATE_ T38 = NoneT,
+    GTEST_TEMPLATE_ T39 = NoneT, GTEST_TEMPLATE_ T40 = NoneT,
+    GTEST_TEMPLATE_ T41 = NoneT, GTEST_TEMPLATE_ T42 = NoneT,
+    GTEST_TEMPLATE_ T43 = NoneT, GTEST_TEMPLATE_ T44 = NoneT,
+    GTEST_TEMPLATE_ T45 = NoneT, GTEST_TEMPLATE_ T46 = NoneT,
+    GTEST_TEMPLATE_ T47 = NoneT, GTEST_TEMPLATE_ T48 = NoneT,
+    GTEST_TEMPLATE_ T49 = NoneT, GTEST_TEMPLATE_ T50 = NoneT>
+struct Templates {  // Primary template; every parameter defaults to NoneT.
+  typedef Templates50<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13,
+      T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27,
+      T28, T29, T30, T31, T32, T33, T34, T35, T36, T37, T38, T39, T40, T41,
+      T42, T43, T44, T45, T46, T47, T48, T49, T50> type;
+};  // Here type is Templates50 of T1..T50; specializations handle NoneT tails.
+
+template <>
+struct Templates<NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
+    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
+    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
+    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
+    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
+    NoneT> {
+  typedef Templates0 type;
+};  // Matches when all 50 arguments are NoneT; type is Templates0.
+template <GTEST_TEMPLATE_ T1>
+struct Templates<T1, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
+    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
+    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
+    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
+    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
+    NoneT> {
+  typedef Templates1<T1> type;
+};  // Matches when T2..T50 are NoneT; type is Templates1 of T1.
+template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2>
+struct Templates<T1, T2, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
+    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
+    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
+    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
+    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
+    NoneT> {
+  typedef Templates2<T1, T2> type;
+};  // Matches when T3..T50 are NoneT; type is Templates2 of T1..T2.
+template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3>
+struct Templates<T1, T2, T3, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
+    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
+    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
+    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
+    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT> {
+  typedef Templates3<T1, T2, T3> type;
+};  // Matches when T4..T50 are NoneT; type is Templates3 of T1..T3.
+template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3,
+    GTEST_TEMPLATE_ T4>
+struct Templates<T1, T2, T3, T4, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
+    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
+    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
+    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
+    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT> {
+  typedef Templates4<T1, T2, T3, T4> type;
+};  // Matches when T5..T50 are NoneT; type is Templates4 of T1..T4.
+template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3,
+    GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5>
+struct Templates<T1, T2, T3, T4, T5, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
+    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
+    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
+    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
+    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT> {
+  typedef Templates5<T1, T2, T3, T4, T5> type;
+};  // Matches when T6..T50 are NoneT; type is Templates5 of T1..T5.
+template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3,
+    GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6>
+struct Templates<T1, T2, T3, T4, T5, T6, NoneT, NoneT, NoneT, NoneT, NoneT,
+    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
+    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
+    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
+    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT> {
+  typedef Templates6<T1, T2, T3, T4, T5, T6> type;
+};  // Matches when T7..T50 are NoneT; type is Templates6 of T1..T6.
+template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3,
+    GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6,
+    GTEST_TEMPLATE_ T7>
+struct Templates<T1, T2, T3, T4, T5, T6, T7, NoneT, NoneT, NoneT, NoneT, NoneT,
+    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
+    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
+    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
+    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT> {
+  typedef Templates7<T1, T2, T3, T4, T5, T6, T7> type;
+};  // Matches when T8..T50 are NoneT; type is Templates7 of T1..T7.
+template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3,
+    GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6,
+    GTEST_TEMPLATE_ T7, GTEST_TEMPLATE_ T8>
+struct Templates<T1, T2, T3, T4, T5, T6, T7, T8, NoneT, NoneT, NoneT, NoneT,
+    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
+    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
+    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
+    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT> {
+  typedef Templates8<T1, T2, T3, T4, T5, T6, T7, T8> type;
+};  // Matches when T9..T50 are NoneT; type is Templates8 of T1..T8.
+template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3,
+    GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6,
+    GTEST_TEMPLATE_ T7, GTEST_TEMPLATE_ T8, GTEST_TEMPLATE_ T9>
+struct Templates<T1, T2, T3, T4, T5, T6, T7, T8, T9, NoneT, NoneT, NoneT,
+    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
+    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
+    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
+    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT> {
+  typedef Templates9<T1, T2, T3, T4, T5, T6, T7, T8, T9> type;
+};  // Matches when T10..T50 are NoneT; type is Templates9 of T1..T9.
+template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3,
+    GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6,
+    GTEST_TEMPLATE_ T7, GTEST_TEMPLATE_ T8, GTEST_TEMPLATE_ T9,
+    GTEST_TEMPLATE_ T10>
+struct Templates<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, NoneT, NoneT, NoneT,
+    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
+    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
+    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
+    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT> {
+  typedef Templates10<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10> type;
+};  // Matches when T11..T50 are NoneT; type is Templates10 of T1..T10.
+template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3,
+    GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6,
+    GTEST_TEMPLATE_ T7, GTEST_TEMPLATE_ T8, GTEST_TEMPLATE_ T9,
+    GTEST_TEMPLATE_ T10, GTEST_TEMPLATE_ T11>
+struct Templates<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, NoneT, NoneT,
+    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
+    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
+    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
+    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT> {
+  typedef Templates11<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11> type;
+};  // Matches when T12..T50 are NoneT; type is Templates11 of T1..T11.
+template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3,
+    GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6,
+    GTEST_TEMPLATE_ T7, GTEST_TEMPLATE_ T8, GTEST_TEMPLATE_ T9,
+    GTEST_TEMPLATE_ T10, GTEST_TEMPLATE_ T11, GTEST_TEMPLATE_ T12>
+struct Templates<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, NoneT,
+    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
+    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
+    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
+    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT> {
+  typedef Templates12<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12> type;
+};  // Matches when T13..T50 are NoneT; type is Templates12 of T1..T12.
+template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3,
+    GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6,
+    GTEST_TEMPLATE_ T7, GTEST_TEMPLATE_ T8, GTEST_TEMPLATE_ T9,
+    GTEST_TEMPLATE_ T10, GTEST_TEMPLATE_ T11, GTEST_TEMPLATE_ T12,
+    GTEST_TEMPLATE_ T13>
+struct Templates<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, NoneT,
+    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
+    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
+    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
+    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT> {
+  typedef Templates13<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12,
+      T13> type;
+};  // Matches when T14..T50 are NoneT; type is Templates13 of T1..T13.
+template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3,
+    GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6,
+    GTEST_TEMPLATE_ T7, GTEST_TEMPLATE_ T8, GTEST_TEMPLATE_ T9,
+    GTEST_TEMPLATE_ T10, GTEST_TEMPLATE_ T11, GTEST_TEMPLATE_ T12,
+    GTEST_TEMPLATE_ T13, GTEST_TEMPLATE_ T14>
+struct Templates<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14,
+    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
+    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
+    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
+    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT> {
+  typedef Templates14<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13,
+      T14> type;
+};  // Matches when T15..T50 are NoneT; type is Templates14 of T1..T14.
+template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3,
+    GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6,
+    GTEST_TEMPLATE_ T7, GTEST_TEMPLATE_ T8, GTEST_TEMPLATE_ T9,
+    GTEST_TEMPLATE_ T10, GTEST_TEMPLATE_ T11, GTEST_TEMPLATE_ T12,
+    GTEST_TEMPLATE_ T13, GTEST_TEMPLATE_ T14, GTEST_TEMPLATE_ T15>
+struct Templates<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14,
+    T15, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
+    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
+    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
+    NoneT, NoneT, NoneT, NoneT, NoneT> {
+  typedef Templates15<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13,
+      T14, T15> type;
+};  // Matches when T16..T50 are NoneT; type is Templates15 of T1..T15.
+template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3,
+    GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6,
+    GTEST_TEMPLATE_ T7, GTEST_TEMPLATE_ T8, GTEST_TEMPLATE_ T9,
+    GTEST_TEMPLATE_ T10, GTEST_TEMPLATE_ T11, GTEST_TEMPLATE_ T12,
+    GTEST_TEMPLATE_ T13, GTEST_TEMPLATE_ T14, GTEST_TEMPLATE_ T15,
+    GTEST_TEMPLATE_ T16>
+struct Templates<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14,
+    T15, T16, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
+    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
+    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
+    NoneT, NoneT, NoneT, NoneT, NoneT> {
+  typedef Templates16<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13,
+      T14, T15, T16> type;
+};  // Matches when T17..T50 are NoneT; type is Templates16 of T1..T16.
+template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3,  // Templates<> partial specialization: T1..T17, then only NoneT.
+    GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6,
+    GTEST_TEMPLATE_ T7, GTEST_TEMPLATE_ T8, GTEST_TEMPLATE_ T9,
+    GTEST_TEMPLATE_ T10, GTEST_TEMPLATE_ T11, GTEST_TEMPLATE_ T12,
+    GTEST_TEMPLATE_ T13, GTEST_TEMPLATE_ T14, GTEST_TEMPLATE_ T15,
+    GTEST_TEMPLATE_ T16, GTEST_TEMPLATE_ T17>
+struct Templates<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14,
+    T15, T16, T17, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
+    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
+    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
+    NoneT, NoneT, NoneT, NoneT, NoneT> {
+  typedef Templates17<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13,
+      T14, T15, T16, T17> type;  // type names Templates17 instantiated with T1..T17.
+};
+template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3,  // Templates<> partial specialization: T1..T18, then only NoneT.
+    GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6,
+    GTEST_TEMPLATE_ T7, GTEST_TEMPLATE_ T8, GTEST_TEMPLATE_ T9,
+    GTEST_TEMPLATE_ T10, GTEST_TEMPLATE_ T11, GTEST_TEMPLATE_ T12,
+    GTEST_TEMPLATE_ T13, GTEST_TEMPLATE_ T14, GTEST_TEMPLATE_ T15,
+    GTEST_TEMPLATE_ T16, GTEST_TEMPLATE_ T17, GTEST_TEMPLATE_ T18>
+struct Templates<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14,
+    T15, T16, T17, T18, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
+    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
+    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
+    NoneT, NoneT, NoneT, NoneT> {
+  typedef Templates18<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13,
+      T14, T15, T16, T17, T18> type;  // type names Templates18 instantiated with T1..T18.
+};
+template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3,  // Templates<> partial specialization: T1..T19, then only NoneT.
+    GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6,
+    GTEST_TEMPLATE_ T7, GTEST_TEMPLATE_ T8, GTEST_TEMPLATE_ T9,
+    GTEST_TEMPLATE_ T10, GTEST_TEMPLATE_ T11, GTEST_TEMPLATE_ T12,
+    GTEST_TEMPLATE_ T13, GTEST_TEMPLATE_ T14, GTEST_TEMPLATE_ T15,
+    GTEST_TEMPLATE_ T16, GTEST_TEMPLATE_ T17, GTEST_TEMPLATE_ T18,
+    GTEST_TEMPLATE_ T19>
+struct Templates<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14,
+    T15, T16, T17, T18, T19, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
+    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
+    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
+    NoneT, NoneT, NoneT, NoneT> {
+  typedef Templates19<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13,
+      T14, T15, T16, T17, T18, T19> type;  // type names Templates19 instantiated with T1..T19.
+};
+template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3,  // Templates<> partial specialization: T1..T20, then only NoneT.
+    GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6,
+    GTEST_TEMPLATE_ T7, GTEST_TEMPLATE_ T8, GTEST_TEMPLATE_ T9,
+    GTEST_TEMPLATE_ T10, GTEST_TEMPLATE_ T11, GTEST_TEMPLATE_ T12,
+    GTEST_TEMPLATE_ T13, GTEST_TEMPLATE_ T14, GTEST_TEMPLATE_ T15,
+    GTEST_TEMPLATE_ T16, GTEST_TEMPLATE_ T17, GTEST_TEMPLATE_ T18,
+    GTEST_TEMPLATE_ T19, GTEST_TEMPLATE_ T20>
+struct Templates<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14,
+    T15, T16, T17, T18, T19, T20, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
+    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
+    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
+    NoneT, NoneT, NoneT, NoneT> {
+  typedef Templates20<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13,
+      T14, T15, T16, T17, T18, T19, T20> type;  // type names Templates20 instantiated with T1..T20.
+};
+template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3,  // Templates<> partial specialization: T1..T21, then only NoneT.
+    GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6,
+    GTEST_TEMPLATE_ T7, GTEST_TEMPLATE_ T8, GTEST_TEMPLATE_ T9,
+    GTEST_TEMPLATE_ T10, GTEST_TEMPLATE_ T11, GTEST_TEMPLATE_ T12,
+    GTEST_TEMPLATE_ T13, GTEST_TEMPLATE_ T14, GTEST_TEMPLATE_ T15,
+    GTEST_TEMPLATE_ T16, GTEST_TEMPLATE_ T17, GTEST_TEMPLATE_ T18,
+    GTEST_TEMPLATE_ T19, GTEST_TEMPLATE_ T20, GTEST_TEMPLATE_ T21>
+struct Templates<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14,
+    T15, T16, T17, T18, T19, T20, T21, NoneT, NoneT, NoneT, NoneT, NoneT,
+    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
+    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
+    NoneT, NoneT, NoneT, NoneT> {
+  typedef Templates21<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13,
+      T14, T15, T16, T17, T18, T19, T20, T21> type;  // type names Templates21 instantiated with T1..T21.
+};
+template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3,  // Templates<> partial specialization: T1..T22, then only NoneT.
+    GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6,
+    GTEST_TEMPLATE_ T7, GTEST_TEMPLATE_ T8, GTEST_TEMPLATE_ T9,
+    GTEST_TEMPLATE_ T10, GTEST_TEMPLATE_ T11, GTEST_TEMPLATE_ T12,
+    GTEST_TEMPLATE_ T13, GTEST_TEMPLATE_ T14, GTEST_TEMPLATE_ T15,
+    GTEST_TEMPLATE_ T16, GTEST_TEMPLATE_ T17, GTEST_TEMPLATE_ T18,
+    GTEST_TEMPLATE_ T19, GTEST_TEMPLATE_ T20, GTEST_TEMPLATE_ T21,
+    GTEST_TEMPLATE_ T22>
+struct Templates<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14,
+    T15, T16, T17, T18, T19, T20, T21, T22, NoneT, NoneT, NoneT, NoneT, NoneT,
+    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
+    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
+    NoneT, NoneT, NoneT> {
+  typedef Templates22<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13,
+      T14, T15, T16, T17, T18, T19, T20, T21, T22> type;  // type names Templates22 instantiated with T1..T22.
+};
+template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3,  // Templates<> partial specialization: T1..T23, then only NoneT.
+    GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6,
+    GTEST_TEMPLATE_ T7, GTEST_TEMPLATE_ T8, GTEST_TEMPLATE_ T9,
+    GTEST_TEMPLATE_ T10, GTEST_TEMPLATE_ T11, GTEST_TEMPLATE_ T12,
+    GTEST_TEMPLATE_ T13, GTEST_TEMPLATE_ T14, GTEST_TEMPLATE_ T15,
+    GTEST_TEMPLATE_ T16, GTEST_TEMPLATE_ T17, GTEST_TEMPLATE_ T18,
+    GTEST_TEMPLATE_ T19, GTEST_TEMPLATE_ T20, GTEST_TEMPLATE_ T21,
+    GTEST_TEMPLATE_ T22, GTEST_TEMPLATE_ T23>
+struct Templates<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14,
+    T15, T16, T17, T18, T19, T20, T21, T22, T23, NoneT, NoneT, NoneT, NoneT,
+    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
+    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
+    NoneT, NoneT, NoneT> {
+  typedef Templates23<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13,
+      T14, T15, T16, T17, T18, T19, T20, T21, T22, T23> type;  // type names Templates23 instantiated with T1..T23.
+};
+template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3,  // Templates<> partial specialization: T1..T24, then only NoneT.
+    GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6,
+    GTEST_TEMPLATE_ T7, GTEST_TEMPLATE_ T8, GTEST_TEMPLATE_ T9,
+    GTEST_TEMPLATE_ T10, GTEST_TEMPLATE_ T11, GTEST_TEMPLATE_ T12,
+    GTEST_TEMPLATE_ T13, GTEST_TEMPLATE_ T14, GTEST_TEMPLATE_ T15,
+    GTEST_TEMPLATE_ T16, GTEST_TEMPLATE_ T17, GTEST_TEMPLATE_ T18,
+    GTEST_TEMPLATE_ T19, GTEST_TEMPLATE_ T20, GTEST_TEMPLATE_ T21,
+    GTEST_TEMPLATE_ T22, GTEST_TEMPLATE_ T23, GTEST_TEMPLATE_ T24>
+struct Templates<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14,
+    T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, NoneT, NoneT, NoneT,
+    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
+    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
+    NoneT, NoneT, NoneT> {
+  typedef Templates24<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13,
+      T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24> type;  // type names Templates24 instantiated with T1..T24.
+};
+template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3,  // Templates<> partial specialization: T1..T25, then only NoneT.
+    GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6,
+    GTEST_TEMPLATE_ T7, GTEST_TEMPLATE_ T8, GTEST_TEMPLATE_ T9,
+    GTEST_TEMPLATE_ T10, GTEST_TEMPLATE_ T11, GTEST_TEMPLATE_ T12,
+    GTEST_TEMPLATE_ T13, GTEST_TEMPLATE_ T14, GTEST_TEMPLATE_ T15,
+    GTEST_TEMPLATE_ T16, GTEST_TEMPLATE_ T17, GTEST_TEMPLATE_ T18,
+    GTEST_TEMPLATE_ T19, GTEST_TEMPLATE_ T20, GTEST_TEMPLATE_ T21,
+    GTEST_TEMPLATE_ T22, GTEST_TEMPLATE_ T23, GTEST_TEMPLATE_ T24,
+    GTEST_TEMPLATE_ T25>
+struct Templates<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14,
+    T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, NoneT, NoneT, NoneT,
+    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
+    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
+    NoneT, NoneT> {
+  typedef Templates25<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13,
+      T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25> type;  // type names Templates25 instantiated with T1..T25.
+};
+template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3,  // Templates<> partial specialization: T1..T26, then only NoneT.
+    GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6,
+    GTEST_TEMPLATE_ T7, GTEST_TEMPLATE_ T8, GTEST_TEMPLATE_ T9,
+    GTEST_TEMPLATE_ T10, GTEST_TEMPLATE_ T11, GTEST_TEMPLATE_ T12,
+    GTEST_TEMPLATE_ T13, GTEST_TEMPLATE_ T14, GTEST_TEMPLATE_ T15,
+    GTEST_TEMPLATE_ T16, GTEST_TEMPLATE_ T17, GTEST_TEMPLATE_ T18,
+    GTEST_TEMPLATE_ T19, GTEST_TEMPLATE_ T20, GTEST_TEMPLATE_ T21,
+    GTEST_TEMPLATE_ T22, GTEST_TEMPLATE_ T23, GTEST_TEMPLATE_ T24,
+    GTEST_TEMPLATE_ T25, GTEST_TEMPLATE_ T26>
+struct Templates<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14,
+    T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, NoneT, NoneT,
+    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
+    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
+    NoneT, NoneT> {
+  typedef Templates26<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13,
+      T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26> type;  // type names Templates26 instantiated with T1..T26.
+};
+template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3,  // Templates<> partial specialization: T1..T27, then only NoneT.
+    GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6,
+    GTEST_TEMPLATE_ T7, GTEST_TEMPLATE_ T8, GTEST_TEMPLATE_ T9,
+    GTEST_TEMPLATE_ T10, GTEST_TEMPLATE_ T11, GTEST_TEMPLATE_ T12,
+    GTEST_TEMPLATE_ T13, GTEST_TEMPLATE_ T14, GTEST_TEMPLATE_ T15,
+    GTEST_TEMPLATE_ T16, GTEST_TEMPLATE_ T17, GTEST_TEMPLATE_ T18,
+    GTEST_TEMPLATE_ T19, GTEST_TEMPLATE_ T20, GTEST_TEMPLATE_ T21,
+    GTEST_TEMPLATE_ T22, GTEST_TEMPLATE_ T23, GTEST_TEMPLATE_ T24,
+    GTEST_TEMPLATE_ T25, GTEST_TEMPLATE_ T26, GTEST_TEMPLATE_ T27>
+struct Templates<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14,
+    T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, NoneT,
+    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
+    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
+    NoneT, NoneT> {
+  typedef Templates27<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13,
+      T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26,
+      T27> type;  // type names Templates27 instantiated with T1..T27.
+};
+template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3,  // Templates<> partial specialization: T1..T28, then only NoneT.
+    GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6,
+    GTEST_TEMPLATE_ T7, GTEST_TEMPLATE_ T8, GTEST_TEMPLATE_ T9,
+    GTEST_TEMPLATE_ T10, GTEST_TEMPLATE_ T11, GTEST_TEMPLATE_ T12,
+    GTEST_TEMPLATE_ T13, GTEST_TEMPLATE_ T14, GTEST_TEMPLATE_ T15,
+    GTEST_TEMPLATE_ T16, GTEST_TEMPLATE_ T17, GTEST_TEMPLATE_ T18,
+    GTEST_TEMPLATE_ T19, GTEST_TEMPLATE_ T20, GTEST_TEMPLATE_ T21,
+    GTEST_TEMPLATE_ T22, GTEST_TEMPLATE_ T23, GTEST_TEMPLATE_ T24,
+    GTEST_TEMPLATE_ T25, GTEST_TEMPLATE_ T26, GTEST_TEMPLATE_ T27,
+    GTEST_TEMPLATE_ T28>
+struct Templates<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14,
+    T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28,
+    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
+    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
+    NoneT, NoneT> {
+  typedef Templates28<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13,
+      T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27,
+      T28> type;  // type names Templates28 instantiated with T1..T28.
+};
+template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3,  // Templates<> partial specialization: T1..T29, then only NoneT.
+    GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6,
+    GTEST_TEMPLATE_ T7, GTEST_TEMPLATE_ T8, GTEST_TEMPLATE_ T9,
+    GTEST_TEMPLATE_ T10, GTEST_TEMPLATE_ T11, GTEST_TEMPLATE_ T12,
+    GTEST_TEMPLATE_ T13, GTEST_TEMPLATE_ T14, GTEST_TEMPLATE_ T15,
+    GTEST_TEMPLATE_ T16, GTEST_TEMPLATE_ T17, GTEST_TEMPLATE_ T18,
+    GTEST_TEMPLATE_ T19, GTEST_TEMPLATE_ T20, GTEST_TEMPLATE_ T21,
+    GTEST_TEMPLATE_ T22, GTEST_TEMPLATE_ T23, GTEST_TEMPLATE_ T24,
+    GTEST_TEMPLATE_ T25, GTEST_TEMPLATE_ T26, GTEST_TEMPLATE_ T27,
+    GTEST_TEMPLATE_ T28, GTEST_TEMPLATE_ T29>
+struct Templates<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14,
+    T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28, T29,
+    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
+    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
+    NoneT> {
+  typedef Templates29<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13,
+      T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27,
+      T28, T29> type;  // type names Templates29 instantiated with T1..T29.
+};
+template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3,  // Templates<> partial specialization: T1..T30, then only NoneT.
+    GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6,
+    GTEST_TEMPLATE_ T7, GTEST_TEMPLATE_ T8, GTEST_TEMPLATE_ T9,
+    GTEST_TEMPLATE_ T10, GTEST_TEMPLATE_ T11, GTEST_TEMPLATE_ T12,
+    GTEST_TEMPLATE_ T13, GTEST_TEMPLATE_ T14, GTEST_TEMPLATE_ T15,
+    GTEST_TEMPLATE_ T16, GTEST_TEMPLATE_ T17, GTEST_TEMPLATE_ T18,
+    GTEST_TEMPLATE_ T19, GTEST_TEMPLATE_ T20, GTEST_TEMPLATE_ T21,
+    GTEST_TEMPLATE_ T22, GTEST_TEMPLATE_ T23, GTEST_TEMPLATE_ T24,
+    GTEST_TEMPLATE_ T25, GTEST_TEMPLATE_ T26, GTEST_TEMPLATE_ T27,
+    GTEST_TEMPLATE_ T28, GTEST_TEMPLATE_ T29, GTEST_TEMPLATE_ T30>
+struct Templates<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14,
+    T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28, T29,
+    T30, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
+    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT> {
+  typedef Templates30<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13,
+      T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27,
+      T28, T29, T30> type;  // type names Templates30 instantiated with T1..T30.
+};
+template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3,  // Templates<> partial specialization: T1..T31, then only NoneT.
+    GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6,
+    GTEST_TEMPLATE_ T7, GTEST_TEMPLATE_ T8, GTEST_TEMPLATE_ T9,
+    GTEST_TEMPLATE_ T10, GTEST_TEMPLATE_ T11, GTEST_TEMPLATE_ T12,
+    GTEST_TEMPLATE_ T13, GTEST_TEMPLATE_ T14, GTEST_TEMPLATE_ T15,
+    GTEST_TEMPLATE_ T16, GTEST_TEMPLATE_ T17, GTEST_TEMPLATE_ T18,
+    GTEST_TEMPLATE_ T19, GTEST_TEMPLATE_ T20, GTEST_TEMPLATE_ T21,
+    GTEST_TEMPLATE_ T22, GTEST_TEMPLATE_ T23, GTEST_TEMPLATE_ T24,
+    GTEST_TEMPLATE_ T25, GTEST_TEMPLATE_ T26, GTEST_TEMPLATE_ T27,
+    GTEST_TEMPLATE_ T28, GTEST_TEMPLATE_ T29, GTEST_TEMPLATE_ T30,
+    GTEST_TEMPLATE_ T31>
+struct Templates<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14,
+    T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28, T29,
+    T30, T31, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
+    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT> {
+  typedef Templates31<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13,
+      T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27,
+      T28, T29, T30, T31> type;  // type names Templates31 instantiated with T1..T31.
+};
+template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3,  // Templates<> partial specialization: T1..T32, then only NoneT.
+    GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6,
+    GTEST_TEMPLATE_ T7, GTEST_TEMPLATE_ T8, GTEST_TEMPLATE_ T9,
+    GTEST_TEMPLATE_ T10, GTEST_TEMPLATE_ T11, GTEST_TEMPLATE_ T12,
+    GTEST_TEMPLATE_ T13, GTEST_TEMPLATE_ T14, GTEST_TEMPLATE_ T15,
+    GTEST_TEMPLATE_ T16, GTEST_TEMPLATE_ T17, GTEST_TEMPLATE_ T18,
+    GTEST_TEMPLATE_ T19, GTEST_TEMPLATE_ T20, GTEST_TEMPLATE_ T21,
+    GTEST_TEMPLATE_ T22, GTEST_TEMPLATE_ T23, GTEST_TEMPLATE_ T24,
+    GTEST_TEMPLATE_ T25, GTEST_TEMPLATE_ T26, GTEST_TEMPLATE_ T27,
+    GTEST_TEMPLATE_ T28, GTEST_TEMPLATE_ T29, GTEST_TEMPLATE_ T30,
+    GTEST_TEMPLATE_ T31, GTEST_TEMPLATE_ T32>
+struct Templates<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14,
+    T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28, T29,
+    T30, T31, T32, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
+    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT> {
+  typedef Templates32<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13,
+      T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27,
+      T28, T29, T30, T31, T32> type;  // type names Templates32 instantiated with T1..T32.
+};
+template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3,  // Templates<> partial specialization: T1..T33, then only NoneT.
+    GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6,
+    GTEST_TEMPLATE_ T7, GTEST_TEMPLATE_ T8, GTEST_TEMPLATE_ T9,
+    GTEST_TEMPLATE_ T10, GTEST_TEMPLATE_ T11, GTEST_TEMPLATE_ T12,
+    GTEST_TEMPLATE_ T13, GTEST_TEMPLATE_ T14, GTEST_TEMPLATE_ T15,
+    GTEST_TEMPLATE_ T16, GTEST_TEMPLATE_ T17, GTEST_TEMPLATE_ T18,
+    GTEST_TEMPLATE_ T19, GTEST_TEMPLATE_ T20, GTEST_TEMPLATE_ T21,
+    GTEST_TEMPLATE_ T22, GTEST_TEMPLATE_ T23, GTEST_TEMPLATE_ T24,
+    GTEST_TEMPLATE_ T25, GTEST_TEMPLATE_ T26, GTEST_TEMPLATE_ T27,
+    GTEST_TEMPLATE_ T28, GTEST_TEMPLATE_ T29, GTEST_TEMPLATE_ T30,
+    GTEST_TEMPLATE_ T31, GTEST_TEMPLATE_ T32, GTEST_TEMPLATE_ T33>
+struct Templates<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14,
+    T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28, T29,
+    T30, T31, T32, T33, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
+    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT> {
+  typedef Templates33<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13,
+      T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27,
+      T28, T29, T30, T31, T32, T33> type;  // type names Templates33 instantiated with T1..T33.
+};
+template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3,  // Templates<> partial specialization: T1..T34, then only NoneT.
+    GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6,
+    GTEST_TEMPLATE_ T7, GTEST_TEMPLATE_ T8, GTEST_TEMPLATE_ T9,
+    GTEST_TEMPLATE_ T10, GTEST_TEMPLATE_ T11, GTEST_TEMPLATE_ T12,
+    GTEST_TEMPLATE_ T13, GTEST_TEMPLATE_ T14, GTEST_TEMPLATE_ T15,
+    GTEST_TEMPLATE_ T16, GTEST_TEMPLATE_ T17, GTEST_TEMPLATE_ T18,
+    GTEST_TEMPLATE_ T19, GTEST_TEMPLATE_ T20, GTEST_TEMPLATE_ T21,
+    GTEST_TEMPLATE_ T22, GTEST_TEMPLATE_ T23, GTEST_TEMPLATE_ T24,
+    GTEST_TEMPLATE_ T25, GTEST_TEMPLATE_ T26, GTEST_TEMPLATE_ T27,
+    GTEST_TEMPLATE_ T28, GTEST_TEMPLATE_ T29, GTEST_TEMPLATE_ T30,
+    GTEST_TEMPLATE_ T31, GTEST_TEMPLATE_ T32, GTEST_TEMPLATE_ T33,
+    GTEST_TEMPLATE_ T34>
+struct Templates<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14,
+    T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28, T29,
+    T30, T31, T32, T33, T34, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
+    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT> {
+  typedef Templates34<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13,
+      T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27,
+      T28, T29, T30, T31, T32, T33, T34> type;  // type names Templates34 instantiated with T1..T34.
+};
+template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3,  // Templates<> partial specialization: T1..T35, then only NoneT.
+    GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6,
+    GTEST_TEMPLATE_ T7, GTEST_TEMPLATE_ T8, GTEST_TEMPLATE_ T9,
+    GTEST_TEMPLATE_ T10, GTEST_TEMPLATE_ T11, GTEST_TEMPLATE_ T12,
+    GTEST_TEMPLATE_ T13, GTEST_TEMPLATE_ T14, GTEST_TEMPLATE_ T15,
+    GTEST_TEMPLATE_ T16, GTEST_TEMPLATE_ T17, GTEST_TEMPLATE_ T18,
+    GTEST_TEMPLATE_ T19, GTEST_TEMPLATE_ T20, GTEST_TEMPLATE_ T21,
+    GTEST_TEMPLATE_ T22, GTEST_TEMPLATE_ T23, GTEST_TEMPLATE_ T24,
+    GTEST_TEMPLATE_ T25, GTEST_TEMPLATE_ T26, GTEST_TEMPLATE_ T27,
+    GTEST_TEMPLATE_ T28, GTEST_TEMPLATE_ T29, GTEST_TEMPLATE_ T30,
+    GTEST_TEMPLATE_ T31, GTEST_TEMPLATE_ T32, GTEST_TEMPLATE_ T33,
+    GTEST_TEMPLATE_ T34, GTEST_TEMPLATE_ T35>
+struct Templates<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14,
+    T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28, T29,
+    T30, T31, T32, T33, T34, T35, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
+    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT> {
+  typedef Templates35<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13,
+      T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27,
+      T28, T29, T30, T31, T32, T33, T34, T35> type;  // type names Templates35 instantiated with T1..T35.
+};
+template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3,  // Templates<> partial specialization: T1..T36, then only NoneT.
+    GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6,
+    GTEST_TEMPLATE_ T7, GTEST_TEMPLATE_ T8, GTEST_TEMPLATE_ T9,
+    GTEST_TEMPLATE_ T10, GTEST_TEMPLATE_ T11, GTEST_TEMPLATE_ T12,
+    GTEST_TEMPLATE_ T13, GTEST_TEMPLATE_ T14, GTEST_TEMPLATE_ T15,
+    GTEST_TEMPLATE_ T16, GTEST_TEMPLATE_ T17, GTEST_TEMPLATE_ T18,
+    GTEST_TEMPLATE_ T19, GTEST_TEMPLATE_ T20, GTEST_TEMPLATE_ T21,
+    GTEST_TEMPLATE_ T22, GTEST_TEMPLATE_ T23, GTEST_TEMPLATE_ T24,
+    GTEST_TEMPLATE_ T25, GTEST_TEMPLATE_ T26, GTEST_TEMPLATE_ T27,
+    GTEST_TEMPLATE_ T28, GTEST_TEMPLATE_ T29, GTEST_TEMPLATE_ T30,
+    GTEST_TEMPLATE_ T31, GTEST_TEMPLATE_ T32, GTEST_TEMPLATE_ T33,
+    GTEST_TEMPLATE_ T34, GTEST_TEMPLATE_ T35, GTEST_TEMPLATE_ T36>
+struct Templates<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14,
+    T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28, T29,
+    T30, T31, T32, T33, T34, T35, T36, NoneT, NoneT, NoneT, NoneT, NoneT,
+    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT> {
+  typedef Templates36<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13,
+      T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27,
+      T28, T29, T30, T31, T32, T33, T34, T35, T36> type;  // type names Templates36 instantiated with T1..T36.
+};
+template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3,  // Templates<> partial specialization: T1..T37, then only NoneT.
+    GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6,
+    GTEST_TEMPLATE_ T7, GTEST_TEMPLATE_ T8, GTEST_TEMPLATE_ T9,
+    GTEST_TEMPLATE_ T10, GTEST_TEMPLATE_ T11, GTEST_TEMPLATE_ T12,
+    GTEST_TEMPLATE_ T13, GTEST_TEMPLATE_ T14, GTEST_TEMPLATE_ T15,
+    GTEST_TEMPLATE_ T16, GTEST_TEMPLATE_ T17, GTEST_TEMPLATE_ T18,
+    GTEST_TEMPLATE_ T19, GTEST_TEMPLATE_ T20, GTEST_TEMPLATE_ T21,
+    GTEST_TEMPLATE_ T22, GTEST_TEMPLATE_ T23, GTEST_TEMPLATE_ T24,
+    GTEST_TEMPLATE_ T25, GTEST_TEMPLATE_ T26, GTEST_TEMPLATE_ T27,
+    GTEST_TEMPLATE_ T28, GTEST_TEMPLATE_ T29, GTEST_TEMPLATE_ T30,
+    GTEST_TEMPLATE_ T31, GTEST_TEMPLATE_ T32, GTEST_TEMPLATE_ T33,
+    GTEST_TEMPLATE_ T34, GTEST_TEMPLATE_ T35, GTEST_TEMPLATE_ T36,
+    GTEST_TEMPLATE_ T37>
+struct Templates<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14,
+    T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28, T29,
+    T30, T31, T32, T33, T34, T35, T36, T37, NoneT, NoneT, NoneT, NoneT, NoneT,
+    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT> {
+  typedef Templates37<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13,
+      T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27,
+      T28, T29, T30, T31, T32, T33, T34, T35, T36, T37> type;  // type names Templates37 instantiated with T1..T37.
+};
+template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3,  // Templates<> partial specialization: T1..T38, then only NoneT.
+    GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6,
+    GTEST_TEMPLATE_ T7, GTEST_TEMPLATE_ T8, GTEST_TEMPLATE_ T9,
+    GTEST_TEMPLATE_ T10, GTEST_TEMPLATE_ T11, GTEST_TEMPLATE_ T12,
+    GTEST_TEMPLATE_ T13, GTEST_TEMPLATE_ T14, GTEST_TEMPLATE_ T15,
+    GTEST_TEMPLATE_ T16, GTEST_TEMPLATE_ T17, GTEST_TEMPLATE_ T18,
+    GTEST_TEMPLATE_ T19, GTEST_TEMPLATE_ T20, GTEST_TEMPLATE_ T21,
+    GTEST_TEMPLATE_ T22, GTEST_TEMPLATE_ T23, GTEST_TEMPLATE_ T24,
+    GTEST_TEMPLATE_ T25, GTEST_TEMPLATE_ T26, GTEST_TEMPLATE_ T27,
+    GTEST_TEMPLATE_ T28, GTEST_TEMPLATE_ T29, GTEST_TEMPLATE_ T30,
+    GTEST_TEMPLATE_ T31, GTEST_TEMPLATE_ T32, GTEST_TEMPLATE_ T33,
+    GTEST_TEMPLATE_ T34, GTEST_TEMPLATE_ T35, GTEST_TEMPLATE_ T36,
+    GTEST_TEMPLATE_ T37, GTEST_TEMPLATE_ T38>
+struct Templates<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14,
+    T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28, T29,
+    T30, T31, T32, T33, T34, T35, T36, T37, T38, NoneT, NoneT, NoneT, NoneT,
+    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT> {
+  typedef Templates38<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13,
+      T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27,
+      T28, T29, T30, T31, T32, T33, T34, T35, T36, T37, T38> type;  // type names Templates38 instantiated with T1..T38.
+};
+template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3,  // Templates<> partial specialization: T1..T39, then only NoneT.
+    GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6,
+    GTEST_TEMPLATE_ T7, GTEST_TEMPLATE_ T8, GTEST_TEMPLATE_ T9,
+    GTEST_TEMPLATE_ T10, GTEST_TEMPLATE_ T11, GTEST_TEMPLATE_ T12,
+    GTEST_TEMPLATE_ T13, GTEST_TEMPLATE_ T14, GTEST_TEMPLATE_ T15,
+    GTEST_TEMPLATE_ T16, GTEST_TEMPLATE_ T17, GTEST_TEMPLATE_ T18,
+    GTEST_TEMPLATE_ T19, GTEST_TEMPLATE_ T20, GTEST_TEMPLATE_ T21,
+    GTEST_TEMPLATE_ T22, GTEST_TEMPLATE_ T23, GTEST_TEMPLATE_ T24,
+    GTEST_TEMPLATE_ T25, GTEST_TEMPLATE_ T26, GTEST_TEMPLATE_ T27,
+    GTEST_TEMPLATE_ T28, GTEST_TEMPLATE_ T29, GTEST_TEMPLATE_ T30,
+    GTEST_TEMPLATE_ T31, GTEST_TEMPLATE_ T32, GTEST_TEMPLATE_ T33,
+    GTEST_TEMPLATE_ T34, GTEST_TEMPLATE_ T35, GTEST_TEMPLATE_ T36,
+    GTEST_TEMPLATE_ T37, GTEST_TEMPLATE_ T38, GTEST_TEMPLATE_ T39>
+struct Templates<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14,
+    T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28, T29,
+    T30, T31, T32, T33, T34, T35, T36, T37, T38, T39, NoneT, NoneT, NoneT,
+    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT> {
+  typedef Templates39<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13,
+      T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27,
+      T28, T29, T30, T31, T32, T33, T34, T35, T36, T37, T38, T39> type;  // type names Templates39 instantiated with T1..T39.
+};
+template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3,  // Templates<> partial specialization: T1..T40, then only NoneT.
+    GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6,
+    GTEST_TEMPLATE_ T7, GTEST_TEMPLATE_ T8, GTEST_TEMPLATE_ T9,
+    GTEST_TEMPLATE_ T10, GTEST_TEMPLATE_ T11, GTEST_TEMPLATE_ T12,
+    GTEST_TEMPLATE_ T13, GTEST_TEMPLATE_ T14, GTEST_TEMPLATE_ T15,
+    GTEST_TEMPLATE_ T16, GTEST_TEMPLATE_ T17, GTEST_TEMPLATE_ T18,
+    GTEST_TEMPLATE_ T19, GTEST_TEMPLATE_ T20, GTEST_TEMPLATE_ T21,
+    GTEST_TEMPLATE_ T22, GTEST_TEMPLATE_ T23, GTEST_TEMPLATE_ T24,
+    GTEST_TEMPLATE_ T25, GTEST_TEMPLATE_ T26, GTEST_TEMPLATE_ T27,
+    GTEST_TEMPLATE_ T28, GTEST_TEMPLATE_ T29, GTEST_TEMPLATE_ T30,
+    GTEST_TEMPLATE_ T31, GTEST_TEMPLATE_ T32, GTEST_TEMPLATE_ T33,
+    GTEST_TEMPLATE_ T34, GTEST_TEMPLATE_ T35, GTEST_TEMPLATE_ T36,
+    GTEST_TEMPLATE_ T37, GTEST_TEMPLATE_ T38, GTEST_TEMPLATE_ T39,
+    GTEST_TEMPLATE_ T40>
+struct Templates<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14,
+    T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28, T29,
+    T30, T31, T32, T33, T34, T35, T36, T37, T38, T39, T40, NoneT, NoneT, NoneT,
+    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT> {
+  typedef Templates40<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13,
+      T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27,
+      T28, T29, T30, T31, T32, T33, T34, T35, T36, T37, T38, T39, T40> type;  // type names Templates40 instantiated with T1..T40.
+};
+template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3,  // Templates<> partial specialization: T1..T41, then only NoneT.
+    GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6,
+    GTEST_TEMPLATE_ T7, GTEST_TEMPLATE_ T8, GTEST_TEMPLATE_ T9,
+    GTEST_TEMPLATE_ T10, GTEST_TEMPLATE_ T11, GTEST_TEMPLATE_ T12,
+    GTEST_TEMPLATE_ T13, GTEST_TEMPLATE_ T14, GTEST_TEMPLATE_ T15,
+    GTEST_TEMPLATE_ T16, GTEST_TEMPLATE_ T17, GTEST_TEMPLATE_ T18,
+    GTEST_TEMPLATE_ T19, GTEST_TEMPLATE_ T20, GTEST_TEMPLATE_ T21,
+    GTEST_TEMPLATE_ T22, GTEST_TEMPLATE_ T23, GTEST_TEMPLATE_ T24,
+    GTEST_TEMPLATE_ T25, GTEST_TEMPLATE_ T26, GTEST_TEMPLATE_ T27,
+    GTEST_TEMPLATE_ T28, GTEST_TEMPLATE_ T29, GTEST_TEMPLATE_ T30,
+    GTEST_TEMPLATE_ T31, GTEST_TEMPLATE_ T32, GTEST_TEMPLATE_ T33,
+    GTEST_TEMPLATE_ T34, GTEST_TEMPLATE_ T35, GTEST_TEMPLATE_ T36,
+    GTEST_TEMPLATE_ T37, GTEST_TEMPLATE_ T38, GTEST_TEMPLATE_ T39,
+    GTEST_TEMPLATE_ T40, GTEST_TEMPLATE_ T41>
+struct Templates<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14,
+    T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28, T29,
+    T30, T31, T32, T33, T34, T35, T36, T37, T38, T39, T40, T41, NoneT, NoneT,
+    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT> {
+  typedef Templates41<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13,
+      T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27,
+      T28, T29, T30, T31, T32, T33, T34, T35, T36, T37, T38, T39, T40,
+      T41> type;  // type names Templates41 instantiated with T1..T41.
+};
+template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3,  // Templates<> partial specialization: T1..T42, then only NoneT.
+    GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6,
+    GTEST_TEMPLATE_ T7, GTEST_TEMPLATE_ T8, GTEST_TEMPLATE_ T9,
+    GTEST_TEMPLATE_ T10, GTEST_TEMPLATE_ T11, GTEST_TEMPLATE_ T12,
+    GTEST_TEMPLATE_ T13, GTEST_TEMPLATE_ T14, GTEST_TEMPLATE_ T15,
+    GTEST_TEMPLATE_ T16, GTEST_TEMPLATE_ T17, GTEST_TEMPLATE_ T18,
+    GTEST_TEMPLATE_ T19, GTEST_TEMPLATE_ T20, GTEST_TEMPLATE_ T21,
+    GTEST_TEMPLATE_ T22, GTEST_TEMPLATE_ T23, GTEST_TEMPLATE_ T24,
+    GTEST_TEMPLATE_ T25, GTEST_TEMPLATE_ T26, GTEST_TEMPLATE_ T27,
+    GTEST_TEMPLATE_ T28, GTEST_TEMPLATE_ T29, GTEST_TEMPLATE_ T30,
+    GTEST_TEMPLATE_ T31, GTEST_TEMPLATE_ T32, GTEST_TEMPLATE_ T33,
+    GTEST_TEMPLATE_ T34, GTEST_TEMPLATE_ T35, GTEST_TEMPLATE_ T36,
+    GTEST_TEMPLATE_ T37, GTEST_TEMPLATE_ T38, GTEST_TEMPLATE_ T39,
+    GTEST_TEMPLATE_ T40, GTEST_TEMPLATE_ T41, GTEST_TEMPLATE_ T42>
+struct Templates<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14,
+    T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28, T29,
+    T30, T31, T32, T33, T34, T35, T36, T37, T38, T39, T40, T41, T42, NoneT,
+    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT> {
+  typedef Templates42<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13,
+      T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27,
+      T28, T29, T30, T31, T32, T33, T34, T35, T36, T37, T38, T39, T40, T41,
+      T42> type;  // type names Templates42 instantiated with T1..T42.
+};
+template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3,  // Templates<> partial specialization: T1..T43, then only NoneT.
+    GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6,
+    GTEST_TEMPLATE_ T7, GTEST_TEMPLATE_ T8, GTEST_TEMPLATE_ T9,
+    GTEST_TEMPLATE_ T10, GTEST_TEMPLATE_ T11, GTEST_TEMPLATE_ T12,
+    GTEST_TEMPLATE_ T13, GTEST_TEMPLATE_ T14, GTEST_TEMPLATE_ T15,
+    GTEST_TEMPLATE_ T16, GTEST_TEMPLATE_ T17, GTEST_TEMPLATE_ T18,
+    GTEST_TEMPLATE_ T19, GTEST_TEMPLATE_ T20, GTEST_TEMPLATE_ T21,
+    GTEST_TEMPLATE_ T22, GTEST_TEMPLATE_ T23, GTEST_TEMPLATE_ T24,
+    GTEST_TEMPLATE_ T25, GTEST_TEMPLATE_ T26, GTEST_TEMPLATE_ T27,
+    GTEST_TEMPLATE_ T28, GTEST_TEMPLATE_ T29, GTEST_TEMPLATE_ T30,
+    GTEST_TEMPLATE_ T31, GTEST_TEMPLATE_ T32, GTEST_TEMPLATE_ T33,
+    GTEST_TEMPLATE_ T34, GTEST_TEMPLATE_ T35, GTEST_TEMPLATE_ T36,
+    GTEST_TEMPLATE_ T37, GTEST_TEMPLATE_ T38, GTEST_TEMPLATE_ T39,
+    GTEST_TEMPLATE_ T40, GTEST_TEMPLATE_ T41, GTEST_TEMPLATE_ T42,
+    GTEST_TEMPLATE_ T43>
+struct Templates<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14,
+    T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28, T29,
+    T30, T31, T32, T33, T34, T35, T36, T37, T38, T39, T40, T41, T42, T43,
+    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT> {
+  typedef Templates43<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13,
+      T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27,
+      T28, T29, T30, T31, T32, T33, T34, T35, T36, T37, T38, T39, T40, T41,
+      T42, T43> type;  // type names Templates43 instantiated with T1..T43.
+};
+template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3,
+    GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6,
+    GTEST_TEMPLATE_ T7, GTEST_TEMPLATE_ T8, GTEST_TEMPLATE_ T9,
+    GTEST_TEMPLATE_ T10, GTEST_TEMPLATE_ T11, GTEST_TEMPLATE_ T12,
+    GTEST_TEMPLATE_ T13, GTEST_TEMPLATE_ T14, GTEST_TEMPLATE_ T15,
+    GTEST_TEMPLATE_ T16, GTEST_TEMPLATE_ T17, GTEST_TEMPLATE_ T18,
+    GTEST_TEMPLATE_ T19, GTEST_TEMPLATE_ T20, GTEST_TEMPLATE_ T21,
+    GTEST_TEMPLATE_ T22, GTEST_TEMPLATE_ T23, GTEST_TEMPLATE_ T24,
+    GTEST_TEMPLATE_ T25, GTEST_TEMPLATE_ T26, GTEST_TEMPLATE_ T27,
+    GTEST_TEMPLATE_ T28, GTEST_TEMPLATE_ T29, GTEST_TEMPLATE_ T30,
+    GTEST_TEMPLATE_ T31, GTEST_TEMPLATE_ T32, GTEST_TEMPLATE_ T33,
+    GTEST_TEMPLATE_ T34, GTEST_TEMPLATE_ T35, GTEST_TEMPLATE_ T36,
+    GTEST_TEMPLATE_ T37, GTEST_TEMPLATE_ T38, GTEST_TEMPLATE_ T39,
+    GTEST_TEMPLATE_ T40, GTEST_TEMPLATE_ T41, GTEST_TEMPLATE_ T42,
+    GTEST_TEMPLATE_ T43, GTEST_TEMPLATE_ T44>
+struct Templates<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14,
+    T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28, T29,
+    T30, T31, T32, T33, T34, T35, T36, T37, T38, T39, T40, T41, T42, T43, T44,
+    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT> {
+  typedef Templates44<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13,
+      T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27,
+      T28, T29, T30, T31, T32, T33, T34, T35, T36, T37, T38, T39, T40, T41,
+      T42, T43, T44> type;
+};  // 44 real arguments (the other 6 slots are NoneT) select Templates44.
+template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3,
+    GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6,
+    GTEST_TEMPLATE_ T7, GTEST_TEMPLATE_ T8, GTEST_TEMPLATE_ T9,
+    GTEST_TEMPLATE_ T10, GTEST_TEMPLATE_ T11, GTEST_TEMPLATE_ T12,
+    GTEST_TEMPLATE_ T13, GTEST_TEMPLATE_ T14, GTEST_TEMPLATE_ T15,
+    GTEST_TEMPLATE_ T16, GTEST_TEMPLATE_ T17, GTEST_TEMPLATE_ T18,
+    GTEST_TEMPLATE_ T19, GTEST_TEMPLATE_ T20, GTEST_TEMPLATE_ T21,
+    GTEST_TEMPLATE_ T22, GTEST_TEMPLATE_ T23, GTEST_TEMPLATE_ T24,
+    GTEST_TEMPLATE_ T25, GTEST_TEMPLATE_ T26, GTEST_TEMPLATE_ T27,
+    GTEST_TEMPLATE_ T28, GTEST_TEMPLATE_ T29, GTEST_TEMPLATE_ T30,
+    GTEST_TEMPLATE_ T31, GTEST_TEMPLATE_ T32, GTEST_TEMPLATE_ T33,
+    GTEST_TEMPLATE_ T34, GTEST_TEMPLATE_ T35, GTEST_TEMPLATE_ T36,
+    GTEST_TEMPLATE_ T37, GTEST_TEMPLATE_ T38, GTEST_TEMPLATE_ T39,
+    GTEST_TEMPLATE_ T40, GTEST_TEMPLATE_ T41, GTEST_TEMPLATE_ T42,
+    GTEST_TEMPLATE_ T43, GTEST_TEMPLATE_ T44, GTEST_TEMPLATE_ T45>
+struct Templates<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14,
+    T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28, T29,
+    T30, T31, T32, T33, T34, T35, T36, T37, T38, T39, T40, T41, T42, T43, T44,
+    T45, NoneT, NoneT, NoneT, NoneT, NoneT> {
+  typedef Templates45<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13,
+      T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27,
+      T28, T29, T30, T31, T32, T33, T34, T35, T36, T37, T38, T39, T40, T41,
+      T42, T43, T44, T45> type;
+};  // 45 real arguments (the other 5 slots are NoneT) select Templates45.
+template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3,
+    GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6,
+    GTEST_TEMPLATE_ T7, GTEST_TEMPLATE_ T8, GTEST_TEMPLATE_ T9,
+    GTEST_TEMPLATE_ T10, GTEST_TEMPLATE_ T11, GTEST_TEMPLATE_ T12,
+    GTEST_TEMPLATE_ T13, GTEST_TEMPLATE_ T14, GTEST_TEMPLATE_ T15,
+    GTEST_TEMPLATE_ T16, GTEST_TEMPLATE_ T17, GTEST_TEMPLATE_ T18,
+    GTEST_TEMPLATE_ T19, GTEST_TEMPLATE_ T20, GTEST_TEMPLATE_ T21,
+    GTEST_TEMPLATE_ T22, GTEST_TEMPLATE_ T23, GTEST_TEMPLATE_ T24,
+    GTEST_TEMPLATE_ T25, GTEST_TEMPLATE_ T26, GTEST_TEMPLATE_ T27,
+    GTEST_TEMPLATE_ T28, GTEST_TEMPLATE_ T29, GTEST_TEMPLATE_ T30,
+    GTEST_TEMPLATE_ T31, GTEST_TEMPLATE_ T32, GTEST_TEMPLATE_ T33,
+    GTEST_TEMPLATE_ T34, GTEST_TEMPLATE_ T35, GTEST_TEMPLATE_ T36,
+    GTEST_TEMPLATE_ T37, GTEST_TEMPLATE_ T38, GTEST_TEMPLATE_ T39,
+    GTEST_TEMPLATE_ T40, GTEST_TEMPLATE_ T41, GTEST_TEMPLATE_ T42,
+    GTEST_TEMPLATE_ T43, GTEST_TEMPLATE_ T44, GTEST_TEMPLATE_ T45,
+    GTEST_TEMPLATE_ T46>
+struct Templates<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14,
+    T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28, T29,
+    T30, T31, T32, T33, T34, T35, T36, T37, T38, T39, T40, T41, T42, T43, T44,
+    T45, T46, NoneT, NoneT, NoneT, NoneT> {
+  typedef Templates46<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13,
+      T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27,
+      T28, T29, T30, T31, T32, T33, T34, T35, T36, T37, T38, T39, T40, T41,
+      T42, T43, T44, T45, T46> type;
+};  // 46 real arguments (the other 4 slots are NoneT) select Templates46.
+template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3,
+    GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6,
+    GTEST_TEMPLATE_ T7, GTEST_TEMPLATE_ T8, GTEST_TEMPLATE_ T9,
+    GTEST_TEMPLATE_ T10, GTEST_TEMPLATE_ T11, GTEST_TEMPLATE_ T12,
+    GTEST_TEMPLATE_ T13, GTEST_TEMPLATE_ T14, GTEST_TEMPLATE_ T15,
+    GTEST_TEMPLATE_ T16, GTEST_TEMPLATE_ T17, GTEST_TEMPLATE_ T18,
+    GTEST_TEMPLATE_ T19, GTEST_TEMPLATE_ T20, GTEST_TEMPLATE_ T21,
+    GTEST_TEMPLATE_ T22, GTEST_TEMPLATE_ T23, GTEST_TEMPLATE_ T24,
+    GTEST_TEMPLATE_ T25, GTEST_TEMPLATE_ T26, GTEST_TEMPLATE_ T27,
+    GTEST_TEMPLATE_ T28, GTEST_TEMPLATE_ T29, GTEST_TEMPLATE_ T30,
+    GTEST_TEMPLATE_ T31, GTEST_TEMPLATE_ T32, GTEST_TEMPLATE_ T33,
+    GTEST_TEMPLATE_ T34, GTEST_TEMPLATE_ T35, GTEST_TEMPLATE_ T36,
+    GTEST_TEMPLATE_ T37, GTEST_TEMPLATE_ T38, GTEST_TEMPLATE_ T39,
+    GTEST_TEMPLATE_ T40, GTEST_TEMPLATE_ T41, GTEST_TEMPLATE_ T42,
+    GTEST_TEMPLATE_ T43, GTEST_TEMPLATE_ T44, GTEST_TEMPLATE_ T45,
+    GTEST_TEMPLATE_ T46, GTEST_TEMPLATE_ T47>
+struct Templates<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14,
+    T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28, T29,
+    T30, T31, T32, T33, T34, T35, T36, T37, T38, T39, T40, T41, T42, T43, T44,
+    T45, T46, T47, NoneT, NoneT, NoneT> {
+  typedef Templates47<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13,
+      T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27,
+      T28, T29, T30, T31, T32, T33, T34, T35, T36, T37, T38, T39, T40, T41,
+      T42, T43, T44, T45, T46, T47> type;
+};  // 47 real arguments (the other 3 slots are NoneT) select Templates47.
+template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3,
+    GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6,
+    GTEST_TEMPLATE_ T7, GTEST_TEMPLATE_ T8, GTEST_TEMPLATE_ T9,
+    GTEST_TEMPLATE_ T10, GTEST_TEMPLATE_ T11, GTEST_TEMPLATE_ T12,
+    GTEST_TEMPLATE_ T13, GTEST_TEMPLATE_ T14, GTEST_TEMPLATE_ T15,
+    GTEST_TEMPLATE_ T16, GTEST_TEMPLATE_ T17, GTEST_TEMPLATE_ T18,
+    GTEST_TEMPLATE_ T19, GTEST_TEMPLATE_ T20, GTEST_TEMPLATE_ T21,
+    GTEST_TEMPLATE_ T22, GTEST_TEMPLATE_ T23, GTEST_TEMPLATE_ T24,
+    GTEST_TEMPLATE_ T25, GTEST_TEMPLATE_ T26, GTEST_TEMPLATE_ T27,
+    GTEST_TEMPLATE_ T28, GTEST_TEMPLATE_ T29, GTEST_TEMPLATE_ T30,
+    GTEST_TEMPLATE_ T31, GTEST_TEMPLATE_ T32, GTEST_TEMPLATE_ T33,
+    GTEST_TEMPLATE_ T34, GTEST_TEMPLATE_ T35, GTEST_TEMPLATE_ T36,
+    GTEST_TEMPLATE_ T37, GTEST_TEMPLATE_ T38, GTEST_TEMPLATE_ T39,
+    GTEST_TEMPLATE_ T40, GTEST_TEMPLATE_ T41, GTEST_TEMPLATE_ T42,
+    GTEST_TEMPLATE_ T43, GTEST_TEMPLATE_ T44, GTEST_TEMPLATE_ T45,
+    GTEST_TEMPLATE_ T46, GTEST_TEMPLATE_ T47, GTEST_TEMPLATE_ T48>
+struct Templates<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14,
+    T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28, T29,
+    T30, T31, T32, T33, T34, T35, T36, T37, T38, T39, T40, T41, T42, T43, T44,
+    T45, T46, T47, T48, NoneT, NoneT> {
+  typedef Templates48<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13,
+      T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27,
+      T28, T29, T30, T31, T32, T33, T34, T35, T36, T37, T38, T39, T40, T41,
+      T42, T43, T44, T45, T46, T47, T48> type;
+};  // 48 real arguments (the other 2 slots are NoneT) select Templates48.
+template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3,
+    GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6,
+    GTEST_TEMPLATE_ T7, GTEST_TEMPLATE_ T8, GTEST_TEMPLATE_ T9,
+    GTEST_TEMPLATE_ T10, GTEST_TEMPLATE_ T11, GTEST_TEMPLATE_ T12,
+    GTEST_TEMPLATE_ T13, GTEST_TEMPLATE_ T14, GTEST_TEMPLATE_ T15,
+    GTEST_TEMPLATE_ T16, GTEST_TEMPLATE_ T17, GTEST_TEMPLATE_ T18,
+    GTEST_TEMPLATE_ T19, GTEST_TEMPLATE_ T20, GTEST_TEMPLATE_ T21,
+    GTEST_TEMPLATE_ T22, GTEST_TEMPLATE_ T23, GTEST_TEMPLATE_ T24,
+    GTEST_TEMPLATE_ T25, GTEST_TEMPLATE_ T26, GTEST_TEMPLATE_ T27,
+    GTEST_TEMPLATE_ T28, GTEST_TEMPLATE_ T29, GTEST_TEMPLATE_ T30,
+    GTEST_TEMPLATE_ T31, GTEST_TEMPLATE_ T32, GTEST_TEMPLATE_ T33,
+    GTEST_TEMPLATE_ T34, GTEST_TEMPLATE_ T35, GTEST_TEMPLATE_ T36,
+    GTEST_TEMPLATE_ T37, GTEST_TEMPLATE_ T38, GTEST_TEMPLATE_ T39,
+    GTEST_TEMPLATE_ T40, GTEST_TEMPLATE_ T41, GTEST_TEMPLATE_ T42,
+    GTEST_TEMPLATE_ T43, GTEST_TEMPLATE_ T44, GTEST_TEMPLATE_ T45,
+    GTEST_TEMPLATE_ T46, GTEST_TEMPLATE_ T47, GTEST_TEMPLATE_ T48,
+    GTEST_TEMPLATE_ T49>
+struct Templates<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14,
+    T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28, T29,
+    T30, T31, T32, T33, T34, T35, T36, T37, T38, T39, T40, T41, T42, T43, T44,
+    T45, T46, T47, T48, T49, NoneT> {
+  typedef Templates49<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13,
+      T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27,
+      T28, T29, T30, T31, T32, T33, T34, T35, T36, T37, T38, T39, T40, T41,
+      T42, T43, T44, T45, T46, T47, T48, T49> type;
+};  // 49 real arguments (the last slot is NoneT) select Templates49.
+
+// The TypeList template makes it possible to use either a single type
+// or a Types<...> list in TYPED_TEST_CASE() and
+// INSTANTIATE_TYPED_TEST_CASE_P().
+
+template <typename T>
+struct TypeList {
+  typedef Types1<T> type;  // Primary template: a lone type T becomes Types1<T>.
+};
+
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+    typename T6, typename T7, typename T8, typename T9, typename T10,
+    typename T11, typename T12, typename T13, typename T14, typename T15,
+    typename T16, typename T17, typename T18, typename T19, typename T20,
+    typename T21, typename T22, typename T23, typename T24, typename T25,
+    typename T26, typename T27, typename T28, typename T29, typename T30,
+    typename T31, typename T32, typename T33, typename T34, typename T35,
+    typename T36, typename T37, typename T38, typename T39, typename T40,
+    typename T41, typename T42, typename T43, typename T44, typename T45,
+    typename T46, typename T47, typename T48, typename T49, typename T50>
+struct TypeList<Types<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13,
+    T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28,
+    T29, T30, T31, T32, T33, T34, T35, T36, T37, T38, T39, T40, T41, T42, T43,
+    T44, T45, T46, T47, T48, T49, T50> > {
+  typedef typename Types<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12,
+      T13, T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26,
+      T27, T28, T29, T30, T31, T32, T33, T34, T35, T36, T37, T38, T39, T40,
+      T41, T42, T43, T44, T45, T46, T47, T48, T49, T50>::type type;
+};  // Specialization: a Types<...> argument resolves to its nested ::type.
+
+#endif  // GTEST_HAS_TYPED_TEST || GTEST_HAS_TYPED_TEST_P
+
+}  // namespace internal
+}  // namespace testing
+
+#endif  // GTEST_INCLUDE_GTEST_INTERNAL_GTEST_TYPE_UTIL_H_
+
+// Due to C++ preprocessor weirdness, we need double indirection to
+// concatenate two tokens when one of them is __LINE__.  Writing
+//
+//   foo ## __LINE__
+//
+// will result in the token foo__LINE__, instead of foo followed by
+// the current line number.  For more details, see
+// http://www.parashift.com/c++-faq-lite/misc-technical-issues.html#faq-39.6
+#define GTEST_CONCAT_TOKEN_(foo, bar) GTEST_CONCAT_TOKEN_IMPL_(foo, bar)  // Outer layer: lets foo/bar macro-expand first.
+#define GTEST_CONCAT_TOKEN_IMPL_(foo, bar) foo ## bar  // Inner layer: pastes the already-expanded tokens.
+
+class ProtocolMessage;
+namespace proto2 { class Message; }
+
+namespace testing {
+
+// Forward declarations.
+
+class AssertionResult;                 // Result of an assertion.
+class Message;                         // Represents a failure message.
+class Test;                            // Represents a test.
+class TestInfo;                        // Information about a test.
+class TestPartResult;                  // Result of a test part.
+class UnitTest;                        // A collection of test cases.
+
+template <typename T>
+::std::string PrintToString(const T& value);
+
+namespace internal {
+
+struct TraceInfo;                      // Information about a trace point.
+class ScopedTrace;                     // Implements scoped trace.
+class TestInfoImpl;                    // Opaque implementation of TestInfo
+class UnitTestImpl;                    // Opaque implementation of UnitTest
+
+// How many times InitGoogleTest() has been called.
+GTEST_API_ extern int g_init_gtest_count;
+
+// The text used in failure messages to indicate the start of the
+// stack trace.
+GTEST_API_ extern const char kStackTraceMarker[];
+
+// Two overloaded helpers for checking at compile time whether an
+// expression is a null pointer literal (i.e. NULL or any 0-valued
+// compile-time integral constant).  Their return values have
+// different sizes, so we can use sizeof() to test which version is
+// picked by the compiler.  These helpers have no implementations, as
+// we only need their signatures.
+//
+// Given IsNullLiteralHelper(x), the compiler will pick the first
+// version if x can be implicitly converted to Secret*, and pick the
+// second version otherwise.  Since Secret is a secret and incomplete
+// type, the only expression a user can write that has type Secret* is
+// a null pointer literal.  Therefore, we know that x is a null
+// pointer literal if and only if the first version is picked by the
+// compiler.
+char IsNullLiteralHelper(Secret* p);  // Result type char, so sizeof(call) == 1.
+char (&IsNullLiteralHelper(...))[2];  // NOLINT
+
+// A compile-time bool constant that is true if and only if x is a
+// null pointer literal (i.e. NULL or any 0-valued compile-time
+// integral constant).
+#ifdef GTEST_ELLIPSIS_NEEDS_POD_
+// We lose support for NULL detection where the compiler doesn't like
+// passing non-POD classes through ellipsis (...).
+# define GTEST_IS_NULL_LITERAL_(x) false
+#else  // True exactly when the char-returning (Secret*) overload is selected.
+# define GTEST_IS_NULL_LITERAL_(x) \
+    (sizeof(::testing::internal::IsNullLiteralHelper(x)) == 1)
+#endif  // GTEST_ELLIPSIS_NEEDS_POD_
+
+// Appends the user-supplied message to the Google-Test-generated message.
+GTEST_API_ std::string AppendUserMessage(
+    const std::string& gtest_msg, const Message& user_msg);
+
+#if GTEST_HAS_EXCEPTIONS
+
+// This exception is thrown by (and only by) a failed Google Test
+// assertion when GTEST_FLAG(throw_on_failure) is true (if exceptions
+// are enabled).  We derive it from std::runtime_error, which is for
+// errors presumably detectable only at run time.  Since
+// std::runtime_error inherits from std::exception, many testing
+// frameworks know how to extract and print the message inside it.
+class GTEST_API_ GoogleTestFailureException : public ::std::runtime_error {
+ public:
+  explicit GoogleTestFailureException(const TestPartResult& failure);  // explicit: no implicit conversion from TestPartResult.
+};
+
+#endif  // GTEST_HAS_EXCEPTIONS
+
+// A helper class for creating scoped traces in user programs.
+class GTEST_API_ ScopedTrace {
+ public:
+  // The c'tor pushes the given source file location and message onto
+  // a trace stack maintained by Google Test.
+  ScopedTrace(const char* file, int line, const Message& message);
+
+  // The d'tor pops the info pushed by the c'tor.
+  //
+  // Note that the d'tor is not virtual in order to be efficient.
+  // Don't inherit from ScopedTrace!
+  ~ScopedTrace();
+
+ private:
+  GTEST_DISALLOW_COPY_AND_ASSIGN_(ScopedTrace);  // Per the macro's name, forbids copying; confirm at its definition.
+} GTEST_ATTRIBUTE_UNUSED_;  // A ScopedTrace object does its job in its
+                            // c'tor and d'tor.  Therefore it doesn't
+                            // need to be used otherwise.
+
+// Constructs and returns the message for an equality assertion
+// (e.g. ASSERT_EQ, EXPECT_STREQ, etc) failure.
+//
+// The first four parameters are the expressions used in the assertion
+// and their values, as strings.  For example, for ASSERT_EQ(foo, bar)
+// where foo is 5 and bar is 6, we have:
+//
+//   expected_expression: "foo"
+//   actual_expression:   "bar"
+//   expected_value:      "5"
+//   actual_value:        "6"
+//
+// The ignoring_case parameter is true iff the assertion is a
+// *_STRCASEEQ*.  When it's true, the string " (ignoring case)" will
+// be inserted into the message.
+GTEST_API_ AssertionResult EqFailure(const char* expected_expression,
+                                     const char* actual_expression,
+                                     const std::string& expected_value,
+                                     const std::string& actual_value,
+                                     bool ignoring_case);
+
+// Constructs a failure message for Boolean assertions such as EXPECT_TRUE.
+GTEST_API_ std::string GetBoolAssertionFailureMessage(
+    const AssertionResult& assertion_result,
+    const char* expression_text,
+    const char* actual_predicate_value,
+    const char* expected_predicate_value);
+
+// This template class represents an IEEE floating-point number
+// (either single-precision or double-precision, depending on the
+// template parameters).
+//
+// The purpose of this class is to do more sophisticated number
+// comparison.  (Due to round-off error, etc, it's very unlikely that
+// two floating-points will be equal exactly.  Hence a naive
+// comparison by the == operation often doesn't work.)
+//
+// Format of IEEE floating-point:
+//
+//   The most-significant bit being the leftmost, an IEEE
+//   floating-point looks like
+//
+//     sign_bit exponent_bits fraction_bits
+//
+//   Here, sign_bit is a single bit that designates the sign of the
+//   number.
+//
+//   For float, there are 8 exponent bits and 23 fraction bits.
+//
+//   For double, there are 11 exponent bits and 52 fraction bits.
+//
+//   More details can be found at
+//   http://en.wikipedia.org/wiki/IEEE_floating-point_standard.
+//
+// Template parameter:
+//
+//   RawType: the raw floating-point type (either float or double)
+template <typename RawType>
+class FloatingPoint {
+ public:
+  // Defines the unsigned integer type that has the same size as the
+  // floating point number.
+  typedef typename TypeWithSize<sizeof(RawType)>::UInt Bits;
+
+  // Constants.
+
+  // # of bits in a number.
+  static const size_t kBitCount = 8*sizeof(RawType);
+
+  // # of fraction bits in a number.
+  static const size_t kFractionBitCount =
+    std::numeric_limits<RawType>::digits - 1;
+
+  // # of exponent bits in a number.
+  static const size_t kExponentBitCount = kBitCount - 1 - kFractionBitCount;
+
+  // The mask for the sign bit.
+  static const Bits kSignBitMask = static_cast<Bits>(1) << (kBitCount - 1);
+
+  // The mask for the fraction bits.
+  static const Bits kFractionBitMask =
+    ~static_cast<Bits>(0) >> (kExponentBitCount + 1);
+
+  // The mask for the exponent bits.
+  static const Bits kExponentBitMask = ~(kSignBitMask | kFractionBitMask);
+
+  // How many ULP's (Units in the Last Place) we want to tolerate when
+  // comparing two numbers.  The larger the value, the more error we
+  // allow.  A 0 value means that two numbers must be exactly the same
+  // to be considered equal.
+  //
+  // The maximum error of a single floating-point operation is 0.5
+  // units in the last place.  On Intel CPU's, all floating-point
+  // calculations are done with 80-bit precision, while double has 64
+  // bits.  Therefore, 4 should be enough for ordinary use.
+  //
+  // See the following article for more details on ULP:
+  // http://randomascii.wordpress.com/2012/02/25/comparing-floating-point-numbers-2012-edition/
+  static const size_t kMaxUlps = 4;
+
+  // Constructs a FloatingPoint from a raw floating-point number.
+  //
+  // On an Intel CPU, passing a non-normalized NAN (Not a Number)
+  // around may change its bits, although the new value is guaranteed
+  // to be also a NAN.  Therefore, don't expect this constructor to
+  // preserve the bits in x when x is a NAN.
+  explicit FloatingPoint(const RawType& x) { u_.value_ = x; }
+
+  // Static methods
+
+  // Reinterprets a bit pattern as a floating-point number.
+  //
+  // This function is needed to test the AlmostEquals() method.
+  static RawType ReinterpretBits(const Bits bits) {
+    FloatingPoint fp(0);
+    fp.u_.bits_ = bits;
+    return fp.u_.value_;
+  }
+
+  // Returns the floating-point number that represents positive infinity.
+  static RawType Infinity() {
+    return ReinterpretBits(kExponentBitMask);
+  }
+
+  // Returns the maximum representable finite floating-point number.
+  static RawType Max();
+
+  // Non-static methods
+
+  // Returns the bits that represent this number.
+  const Bits &bits() const { return u_.bits_; }
+
+  // Returns the exponent bits of this number.
+  Bits exponent_bits() const { return kExponentBitMask & u_.bits_; }
+
+  // Returns the fraction bits of this number.
+  Bits fraction_bits() const { return kFractionBitMask & u_.bits_; }
+
+  // Returns the sign bit of this number.
+  Bits sign_bit() const { return kSignBitMask & u_.bits_; }
+
+  // Returns true iff this is NAN (not a number).
+  bool is_nan() const {
+    // It's a NAN if the exponent bits are all ones and the fraction
+    // bits are not entirely zeros.
+    return (exponent_bits() == kExponentBitMask) && (fraction_bits() != 0);
+  }
+
+  // Returns true iff this number is at most kMaxUlps ULP's away from
+  // rhs.  In particular, this function:
+  //
+  //   - returns false if either number is (or both are) NAN.
+  //   - treats really large numbers as almost equal to infinity.
+  //   - thinks +0.0 and -0.0 are 0 ULP's apart.
+  bool AlmostEquals(const FloatingPoint& rhs) const {
+    // The IEEE standard says that any comparison operation involving
+    // a NAN must return false.
+    if (is_nan() || rhs.is_nan()) return false;
+
+    return DistanceBetweenSignAndMagnitudeNumbers(u_.bits_, rhs.u_.bits_)
+        <= kMaxUlps;
+  }
+
+ private:
+  // Stores the number; value_ and bits_ are two views of the same storage.
+  union FloatingPointUnion {
+    RawType value_;  // The raw floating-point number.
+    Bits bits_;      // The bits that represent the number.
+  };
+
+  // Converts an integer from the sign-and-magnitude representation to
+  // the biased representation.  More precisely, let N be 2 to the
+  // power of (kBitCount - 1), an integer x is represented by the
+  // unsigned number x + N.
+  //
+  // For instance,
+  //
+  //   -N + 1 (the most negative number representable using
+  //          sign-and-magnitude) is represented by 1;
+  //   0      is represented by N; and
+  //   N - 1  (the biggest number representable using
+  //          sign-and-magnitude) is represented by 2N - 1.
+  //
+  // Read http://en.wikipedia.org/wiki/Signed_number_representations
+  // for more details on signed number representations.
+  static Bits SignAndMagnitudeToBiased(const Bits &sam) {
+    if (kSignBitMask & sam) {
+      // sam represents a negative number.
+      return ~sam + 1;
+    } else {
+      // sam represents a positive number.
+      return kSignBitMask | sam;
+    }
+  }
+
+  // Given two numbers in the sign-and-magnitude representation,
+  // returns the distance between them as an unsigned number.
+  static Bits DistanceBetweenSignAndMagnitudeNumbers(const Bits &sam1,
+                                                     const Bits &sam2) {
+    const Bits biased1 = SignAndMagnitudeToBiased(sam1);
+    const Bits biased2 = SignAndMagnitudeToBiased(sam2);
+    return (biased1 >= biased2) ? (biased1 - biased2) : (biased2 - biased1);
+  }
+
+  FloatingPointUnion u_;
+};
+
+// We cannot use std::numeric_limits<T>::max() as it clashes with the max()
+// macro defined by <windows.h>.
+template <>
+inline float FloatingPoint<float>::Max() { return FLT_MAX; }  // Largest finite float, via the FLT_MAX macro.
+template <>
+inline double FloatingPoint<double>::Max() { return DBL_MAX; }  // Largest finite double, via the DBL_MAX macro.
+
+// Typedefs the instances of the FloatingPoint template class that we
+// care to use.
+typedef FloatingPoint<float> Float;
+typedef FloatingPoint<double> Double;
+
+// In order to catch the mistake of putting tests that use different
+// test fixture classes in the same test case, we need to assign
+// unique IDs to fixture classes and compare them.  The TypeId type is
+// used to hold such IDs.  The user should treat TypeId as an opaque
+// type: the only operation allowed on TypeId values is to compare
+// them for equality using the == operator.
+typedef const void* TypeId;
+
+template <typename T>
+class TypeIdHelper {
+ public:
+  // dummy_ must not have a const type.  Otherwise an overly eager
+  // compiler (e.g. MSVC 7.1 & 8.0) may try to merge
+  // TypeIdHelper<T>::dummy_ for different Ts as an "optimization".
+  static bool dummy_;
+};
+
+template <typename T>
+bool TypeIdHelper<T>::dummy_ = false;  // Out-of-class definition of the static member, one per T.
+
+// GetTypeId<T>() returns the ID of type T.  Different values will be
+// returned for different types.  Calling the function twice with the
+// same type argument is guaranteed to return the same ID.
+template <typename T>
+TypeId GetTypeId() {
+  // The compiler is required to allocate a different
+  // TypeIdHelper<T>::dummy_ variable for each T used to instantiate
+  // the template.  Therefore, the address of dummy_ is guaranteed to
+  // be unique.
+  return &(TypeIdHelper<T>::dummy_);  // bool* converts implicitly to TypeId (const void*).
+}
+
+// Returns the type ID of ::testing::Test.  Always call this instead
+// of GetTypeId< ::testing::Test>() to get the type ID of
+// ::testing::Test, as the latter may give the wrong result due to a
+// suspected linker bug when compiling Google Test as a Mac OS X
+// framework.
+GTEST_API_ TypeId GetTestTypeId();
+
+// Defines the abstract factory interface that creates instances
+// of a Test object.
+class TestFactoryBase {
+ public:
+  virtual ~TestFactoryBase() {}
+
+  // Creates a test instance to run. The instance is both created and destroyed
+  // within TestInfoImpl::Run().
+  virtual Test* CreateTest() = 0;
+
+ protected:
+  TestFactoryBase() {}  // Protected: only derived factories can be constructed.
+
+ private:
+  GTEST_DISALLOW_COPY_AND_ASSIGN_(TestFactoryBase);
+};
+
+// This class provides an implementation of the TestFactoryBase interface.
+// It is used in TEST and TEST_F macros.
+template <class TestClass>
+class TestFactoryImpl : public TestFactoryBase {
+ public:
+  virtual Test* CreateTest() { return new TestClass; }  // Heap-allocates a default-constructed TestClass.
+};
+
+#if GTEST_OS_WINDOWS
+
+// Predicate-formatters for implementing the HRESULT checking macros
+// {ASSERT|EXPECT}_HRESULT_{SUCCEEDED|FAILED}
+// We pass a long instead of HRESULT to avoid causing an
+// include dependency for the HRESULT type.
+GTEST_API_ AssertionResult IsHRESULTSuccess(const char* expr,
+                                            long hr);  // NOLINT
+GTEST_API_ AssertionResult IsHRESULTFailure(const char* expr,
+                                            long hr);  // NOLINT
+
+#endif  // GTEST_OS_WINDOWS
+
+// Types of SetUpTestCase() and TearDownTestCase() functions.
+typedef void (*SetUpTestCaseFunc)();
+typedef void (*TearDownTestCaseFunc)();
+
+// Creates a new TestInfo object and registers it with Google Test;
+// returns the created object.
+//
+// Arguments:
+//
+//   test_case_name:   name of the test case
+//   name:             name of the test
+//   type_param:       the name of the test's type parameter, or NULL if
+//                     this is not a typed or a type-parameterized test.
+//   value_param:      text representation of the test's value parameter,
+//                     or NULL if this is not a value-parameterized test.
+//   fixture_class_id: ID of the test fixture class
+//   set_up_tc:        pointer to the function that sets up the test case
+//   tear_down_tc:     pointer to the function that tears down the test case
+//   factory:          pointer to the factory that creates a test object.
+//                     The newly created TestInfo instance will assume
+//                     ownership of the factory object.
+GTEST_API_ TestInfo* MakeAndRegisterTestInfo(
+    const char* test_case_name,
+    const char* name,
+    const char* type_param,
+    const char* value_param,
+    TypeId fixture_class_id,
+    SetUpTestCaseFunc set_up_tc,
+    TearDownTestCaseFunc tear_down_tc,
+    TestFactoryBase* factory);
+
+// If *pstr starts with the given prefix, modifies *pstr to be right
+// past the prefix and returns true; otherwise leaves *pstr unchanged
+// and returns false.  None of pstr, *pstr, and prefix can be NULL.
+GTEST_API_ bool SkipPrefix(const char* prefix, const char** pstr);
+
+#if GTEST_HAS_TYPED_TEST || GTEST_HAS_TYPED_TEST_P
+
+// State of the definition of a type-parameterized test case.
+class GTEST_API_ TypedTestCasePState {
+ public:
+  TypedTestCasePState() : registered_(false) {}
+
+  // Adds the given test name to defined_test_names_ and returns true
+  // if the test case hasn't been registered; otherwise aborts the
+  // program.
+  bool AddTestName(const char* file, int line, const char* case_name,
+                   const char* test_name) {
+    if (registered_) {
+      fprintf(stderr, "%s Test %s must be defined before "
+              "REGISTER_TYPED_TEST_CASE_P(%s, ...).\n",
+              FormatFileLocation(file, line).c_str(), test_name, case_name);
+      fflush(stderr);
+      posix::Abort();
+    }
+    defined_test_names_.insert(test_name);
+    return true;
+  }
+
+  // Verifies that registered_tests match the test names in
+  // defined_test_names_; returns registered_tests if successful, or
+  // aborts the program otherwise.
+  const char* VerifyRegisteredTestNames(
+      const char* file, int line, const char* registered_tests);
+
+ private:
+  bool registered_;
+  ::std::set<const char*> defined_test_names_;  // Ordered by pointer value, not by string contents.
+};
+
+// Skips to the first non-space char after the first comma in 'str';
+// returns NULL if no comma is found in 'str'.
+inline const char* SkipComma(const char* str) {
+  const char* comma = strchr(str, ',');
+  if (comma == NULL) {
+    return NULL;
+  }
+  while (IsSpace(*(++comma))) {}  // Pre-increment steps past the comma before each IsSpace() check.
+  return comma;
+}
+
+// Returns the prefix of 'str' before the first comma in it; returns
+// the entire string if it contains no comma.
+inline std::string GetPrefixUntilComma(const char* str) {
+  const char* comma = strchr(str, ',');
+  return comma == NULL ? str : std::string(str, comma);  // Range constructor copies [str, comma).
+}
+
+// TypeParameterizedTest<Fixture, TestSel, Types>::Register()
+// registers a list of type-parameterized tests with Google Test.  The
+// return value is insignificant - we just need to return something
+// such that we can call this function in a namespace scope.
+//
+// Implementation note: The GTEST_TEMPLATE_ macro declares a template
+// template parameter.  It's defined in gtest-type-util.h.
+template <GTEST_TEMPLATE_ Fixture, class TestSel, typename Types>
+class TypeParameterizedTest {
+ public:
+  // 'index' is the index of the test in the type list 'Types'
+  // specified in INSTANTIATE_TYPED_TEST_CASE_P(Prefix, TestCase,
+  // Types).  Valid values for 'index' are [0, N - 1] where N is the
+  // length of Types.
+  static bool Register(const char* prefix, const char* case_name,
+                       const char* test_names, int index) {
+    typedef typename Types::Head Type;
+    typedef Fixture<Type> FixtureClass;
+    typedef typename GTEST_BIND_(TestSel, Type) TestClass;
+
+    // First, registers the first type-parameterized test in the type
+    // list.
+    MakeAndRegisterTestInfo(
+        (std::string(prefix) + (prefix[0] == '\0' ? "" : "/") + case_name + "/"
+         + StreamableToString(index)).c_str(),  // "[prefix/]case_name/index"
+        GetPrefixUntilComma(test_names).c_str(),
+        GetTypeName<Type>().c_str(),
+        NULL,  // No value parameter.
+        GetTypeId<FixtureClass>(),
+        TestClass::SetUpTestCase,
+        TestClass::TearDownTestCase,
+        new TestFactoryImpl<TestClass>);
+
+    // Next, recurses (at compile time) with the tail of the type list.
+    return TypeParameterizedTest<Fixture, TestSel, typename Types::Tail>
+        ::Register(prefix, case_name, test_names, index + 1);
+  }
+};
+
+// Base case of the compile time recursion: registers nothing for Types0.
+template <GTEST_TEMPLATE_ Fixture, class TestSel>
+class TypeParameterizedTest<Fixture, TestSel, Types0> {
+ public:
+  static bool Register(const char* /*prefix*/, const char* /*case_name*/,
+                       const char* /*test_names*/, int /*index*/) {
+    return true;
+  }
+};
+
+// TypeParameterizedTestCase<Fixture, Tests, Types>::Register()
+// registers *all combinations* of 'Tests' and 'Types' with Google
+// Test.  The return value is insignificant - we just need to return
+// something such that we can call this function in a namespace scope.
+template <GTEST_TEMPLATE_ Fixture, typename Tests, typename Types>
+class TypeParameterizedTestCase {
+ public:
+  static bool Register(const char* prefix, const char* case_name,
+                       const char* test_names) {
+    typedef typename Tests::Head Head;
+
+    // First, registers the first test in 'Tests' for each type in 'Types'.
+    TypeParameterizedTest<Fixture, Head, Types>::Register(
+        prefix, case_name, test_names, 0);
+
+    // Next, recurses (at compile time) with the tail of the test list.
+    return TypeParameterizedTestCase<Fixture, typename Tests::Tail, Types>
+        ::Register(prefix, case_name, SkipComma(test_names));
+  }
+};
+
+// Base case of the compile time recursion: registers nothing for Templates0.
+template <GTEST_TEMPLATE_ Fixture, typename Types>
+class TypeParameterizedTestCase<Fixture, Templates0, Types> {
+ public:
+  static bool Register(const char* /*prefix*/, const char* /*case_name*/,
+                       const char* /*test_names*/) {
+    return true;
+  }
+};
+
+#endif  // GTEST_HAS_TYPED_TEST || GTEST_HAS_TYPED_TEST_P
+
+// Returns the current OS stack trace as an std::string.
+//
+// The maximum number of stack frames to be included is specified by
+// the gtest_stack_trace_depth flag.  The skip_count parameter
+// specifies the number of top frames to be skipped, which doesn't
+// count against the number of frames to be included.
+//
+// For example, if Foo() calls Bar(), which in turn calls
+// GetCurrentOsStackTraceExceptTop(..., 1), Foo() will be included in
+// the trace but Bar() and GetCurrentOsStackTraceExceptTop() won't.
+GTEST_API_ std::string GetCurrentOsStackTraceExceptTop(
+    UnitTest* unit_test, int skip_count);
+
+// Helpers for suppressing warnings on unreachable code or constant
+// condition.
+
+// Always returns true.
+GTEST_API_ bool AlwaysTrue();
+
+// Always returns false.
+inline bool AlwaysFalse() { return !AlwaysTrue(); }
+
+// Helper for suppressing a false warning from Clang about a const char*
+// variable declared in a conditional expression always being NULL in
+// the else branch.
+struct GTEST_API_ ConstCharPtr {
+  ConstCharPtr(const char* str) : value(str) {}
+  operator bool() const { return true; }  // True regardless of 'value'.
+  const char* value;
+};
+
+// A simple Linear Congruential Generator for generating random
+// numbers with a uniform distribution.  Unlike rand() and srand(), it
+// doesn't use global state (and therefore can't interfere with user
+// code).  Unlike rand_r(), it's portable.  An LCG isn't very random,
+// but it's good enough for our purposes.
+class GTEST_API_ Random {
+ public:
+  static const UInt32 kMaxRange = 1u << 31;  // Largest 'range' Generate() takes.
+
+  explicit Random(UInt32 seed) : state_(seed) {}
+
+  void Reseed(UInt32 seed) { state_ = seed; }  // Overwrites the state with 'seed'.
+
+  // Generates a random number from [0, range).  Crashes if 'range' is
+  // 0 or greater than kMaxRange.
+  UInt32 Generate(UInt32 range);
+
+ private:
+  UInt32 state_;
+  GTEST_DISALLOW_COPY_AND_ASSIGN_(Random);
+};
+
+// Defining a variable of type CompileAssertTypesEqual<T1, T2> will cause a
+// compiler error iff T1 and T2 are different types.
+template <typename T1, typename T2>
+struct CompileAssertTypesEqual;  // Primary template: declared, not defined.
+
+template <typename T>
+struct CompileAssertTypesEqual<T, T> {  // Defined only for identical types.
+};
+
+// Removes the reference from a type if it is a reference type,
+// otherwise leaves it unchanged.  This is the same as
+// tr1::remove_reference, which is not widely available yet.
+template <typename T>
+struct RemoveReference { typedef T type; };  // NOLINT
+template <typename T>
+struct RemoveReference<T&> { typedef T type; };  // NOLINT
+
+// A handy wrapper around RemoveReference that works when the argument
+// T depends on template parameters.
+#define GTEST_REMOVE_REFERENCE_(T) \
+    typename ::testing::internal::RemoveReference<T>::type
+
+// Removes const from a type if it is a const type, otherwise leaves
+// it unchanged.  This is the same as tr1::remove_const, which is not
+// widely available yet.
+template <typename T>
+struct RemoveConst { typedef T type; };  // NOLINT
+template <typename T>
+struct RemoveConst<const T> { typedef T type; };  // NOLINT
+
+// MSVC 8.0, Sun C++, and IBM XL C++ have a bug which causes the above
+// definition to fail to remove the const in 'const int[3]' and 'const
+// char[3][4]'.  The following specialization works around the bug.
+template <typename T, size_t N>
+struct RemoveConst<const T[N]> {
+  typedef typename RemoveConst<T>::type type[N];
+};
+
+#if defined(_MSC_VER) && _MSC_VER < 1400
+// This is the only specialization that allows VC++ 7.1 to remove const in
+// 'const int[3]' and 'const int[3][4]'.  However, it causes trouble with GCC
+// and thus needs to be conditionally compiled.
+template <typename T, size_t N>
+struct RemoveConst<T[N]> {
+  typedef typename RemoveConst<T>::type type[N];
+};
+#endif
+
+// A handy wrapper around RemoveConst that works when the argument
+// T depends on template parameters.
+#define GTEST_REMOVE_CONST_(T) \
+    typename ::testing::internal::RemoveConst<T>::type
+
+// Turns const U&, U&, const U, and U all into U.
+#define GTEST_REMOVE_REFERENCE_AND_CONST_(T) \
+    GTEST_REMOVE_CONST_(GTEST_REMOVE_REFERENCE_(T))
+
+// Adds reference to a type if it is not a reference type,
+// otherwise leaves it unchanged.  This is the same as
+// tr1::add_reference, which is not widely available yet.
+template <typename T>
+struct AddReference { typedef T& type; };  // NOLINT
+template <typename T>
+struct AddReference<T&> { typedef T& type; };  // NOLINT
+
+// A handy wrapper around AddReference that works when the argument T
+// depends on template parameters.
+#define GTEST_ADD_REFERENCE_(T) \
+    typename ::testing::internal::AddReference<T>::type
+
+// Adds a reference to const on top of T as necessary.  For example,
+// it transforms
+//
+//   char         ==> const char&
+//   const char   ==> const char&
+//   char&        ==> const char&
+//   const char&  ==> const char&
+//
+// The argument T must depend on some template parameters.
+#define GTEST_REFERENCE_TO_CONST_(T) \
+    GTEST_ADD_REFERENCE_(const GTEST_REMOVE_REFERENCE_(T))
+
+// ImplicitlyConvertible<From, To>::value is a compile-time bool
+// constant that's true iff type From can be implicitly converted to
+// type To.
+template <typename From, typename To>
+class ImplicitlyConvertible {
+ private:
+  // We need the following helper functions only for their types.
+  // They have no implementations.
+
+  // MakeFrom() is an expression whose type is From.  We cannot simply
+  // use From(), as the type From may not have a public default
+  // constructor.
+  static From MakeFrom();
+
+  // These two functions are overloaded.  Given an expression
+  // Helper(x), the compiler will pick the first version if x can be
+  // implicitly converted to type To; otherwise it will pick the
+  // second version.
+  //
+  // The first version returns a value of size 1, and the second
+  // version returns a value of size 2.  Therefore, by checking the
+  // size of Helper(x), which can be done at compile time, we can tell
+  // which version of Helper() is used, and hence whether x can be
+  // implicitly converted to type To.
+  static char Helper(To);
+  static char (&Helper(...))[2];  // NOLINT
+
+  // We have to put the 'public' section after the 'private' section,
+  // or MSVC refuses to compile the code.
+ public:
+  // MSVC warns about implicitly converting from double to int for
+  // possible loss of data, so we need to temporarily disable the
+  // warning.
+#ifdef _MSC_VER
+# pragma warning(push)          // Saves the current warning state.
+# pragma warning(disable:4244)  // Temporarily disables warning 4244.
+
+  static const bool value =
+      sizeof(Helper(ImplicitlyConvertible::MakeFrom())) == 1;
+# pragma warning(pop)           // Restores the warning state.
+#elif defined(__BORLANDC__)
+  // C++Builder cannot use member overload resolution during template
+  // instantiation.  The simplest workaround is to use its C++0x type traits
+  // functions (C++Builder 2009 and above only).
+  static const bool value = __is_convertible(From, To);
+#else
+  static const bool value =
+      sizeof(Helper(ImplicitlyConvertible::MakeFrom())) == 1;
+#endif  // _MSC_VER
+};
+template <typename From, typename To>
+const bool ImplicitlyConvertible<From, To>::value;
+
+// IsAProtocolMessage<T>::value is a compile-time bool constant that's
+// true iff T is type ProtocolMessage, proto2::Message, or a subclass
+// of those.
+template <typename T>
+struct IsAProtocolMessage
+    : public bool_constant<
+  ImplicitlyConvertible<const T*, const ::ProtocolMessage*>::value ||
+  ImplicitlyConvertible<const T*, const ::proto2::Message*>::value> {
+};
+
+// When the compiler sees expression IsContainerTest<C>(0), if C is an
+// STL-style container class, the first overload of IsContainerTest
+// will be viable (since both C::iterator* and C::const_iterator* are
+// valid types and NULL can be implicitly converted to them).  It will
+// be picked over the second overload as 'int' is a perfect match for
+// the type of argument 0.  If C::iterator or C::const_iterator is not
+// a valid type, the first overload is not viable, and the second
+// overload will be picked.  Therefore, we can determine whether C is
+// a container class by checking the type of IsContainerTest<C>(0).
+// The value of the expression is insignificant.
+//
+// Note that we look for both C::iterator and C::const_iterator.  The
+// reason is that C++ injects the name of a class as a member of the
+// class itself (e.g. you can refer to class iterator as either
+// 'iterator' or 'iterator::iterator').  If we look for C::iterator
+// only, for example, we would mistakenly think that a class named
+// iterator is an STL container.
+//
+// Also note that the simpler approach of overloading
+// IsContainerTest(typename C::const_iterator*) and
+// IsContainerTest(...) doesn't work with Visual Age C++ and Sun C++.
+typedef int IsContainer;  // Returned by the overload viable only for containers.
+template <class C>
+IsContainer IsContainerTest(int /* dummy */,
+                            typename C::iterator* /* it */ = NULL,
+                            typename C::const_iterator* /* const_it */ = NULL) {
+  return 0;
+}
+
+typedef char IsNotContainer;  // Returned by the fallback overload.
+template <class C>
+IsNotContainer IsContainerTest(long /* dummy */) { return '\0'; }
+
+// EnableIf<condition>::type is void when 'condition' is true, and
+// undefined when 'condition' is false.  To use SFINAE to make a function
+// overload only apply when a particular expression is true, add
+// "typename EnableIf<expression>::type* = 0" as the last parameter.
+template<bool> struct EnableIf;
+template<> struct EnableIf<true> { typedef void type; };  // NOLINT
+
+// Utilities for native arrays.
+
+// ArrayEq() compares two k-dimensional native arrays using the
+// elements' operator==, where k can be any integer >= 0.  When k is
+// 0, ArrayEq() degenerates into comparing a single pair of values.
+
+template <typename T, typename U>
+bool ArrayEq(const T* lhs, size_t size, const U* rhs);  // Defined below.
+
+// This generic version is used when k is 0.
+template <typename T, typename U>
+inline bool ArrayEq(const T& lhs, const U& rhs) { return lhs == rhs; }
+
+// This overload is used when k >= 1.
+template <typename T, typename U, size_t N>
+inline bool ArrayEq(const T(&lhs)[N], const U(&rhs)[N]) {
+  return internal::ArrayEq(lhs, N, rhs);
+}
+
+// This helper reduces code bloat.  If we instead put its logic inside
+// the previous ArrayEq() function, arrays with different sizes would
+// lead to different copies of the template code.
+template <typename T, typename U>
+bool ArrayEq(const T* lhs, size_t size, const U* rhs) {
+  for (size_t i = 0; i != size; i++) {
+    if (!internal::ArrayEq(lhs[i], rhs[i]))  // Elements may be arrays too.
+      return false;
+  }
+  return true;
+}
+
+// Finds the first element in [begin, end) that equals elem, returning
+// 'end' if there is none.  Element may be a native array type itself.
+template <typename Iter, typename Element>
+Iter ArrayAwareFind(Iter begin, Iter end, const Element& elem) {
+  for (Iter it = begin; it != end; ++it) {
+    if (internal::ArrayEq(*it, elem))
+      return it;
+  }
+  return end;
+}
+
+// CopyArray() copies a k-dimensional native array using the elements'
+// operator=, where k can be any integer >= 0.  When k is 0,
+// CopyArray() degenerates into copying a single value.
+
+template <typename T, typename U>
+void CopyArray(const T* from, size_t size, U* to);  // Defined below.
+
+// This generic version is used when k is 0.
+template <typename T, typename U>
+inline void CopyArray(const T& from, U* to) { *to = from; }
+
+// This overload is used when k >= 1.
+template <typename T, typename U, size_t N>
+inline void CopyArray(const T(&from)[N], U(*to)[N]) {
+  internal::CopyArray(from, N, *to);
+}
+
+// This helper reduces code bloat.  If we instead put its logic inside
+// the previous CopyArray() function, arrays with different sizes
+// would lead to different copies of the template code.
+template <typename T, typename U>
+void CopyArray(const T* from, size_t size, U* to) {
+  for (size_t i = 0; i != size; i++) {
+    internal::CopyArray(from[i], to + i);  // Elements may be arrays too.
+  }
+}
+
+// The relation between a NativeArray object (see below) and the
+// native array it represents.
+enum RelationToSource {
+  kReference,  // The NativeArray references the native array.
+  kCopy        // The NativeArray makes a copy of the native array and
+               // owns the copy.
+};
+
+// Adapts a native array to a read-only STL-style container.  Instead
+// of the complete STL container concept, this adaptor only implements
+// members useful for Google Mock's container matchers.  New members
+// should be added as needed.  To simplify the implementation, we only
+// support Element being a raw type (i.e. having no top-level const or
+// reference modifier).  It's the client's responsibility to satisfy
+// this requirement.  Element can be an array type itself (hence
+// multi-dimensional arrays are supported).
+template <typename Element>
+class NativeArray {
+ public:
+  // STL-style container typedefs.
+  typedef Element value_type;
+  typedef Element* iterator;
+  typedef const Element* const_iterator;
+
+  // Constructs from a native array.
+  NativeArray(const Element* array, size_t count, RelationToSource relation) {
+    Init(array, count, relation);
+  }
+
+  // Copy constructor; re-runs Init() with rhs's array, size and relation.
+  NativeArray(const NativeArray& rhs) {
+    Init(rhs.array_, rhs.size_, rhs.relation_to_source_);
+  }
+
+  ~NativeArray() {
+    // Ensures that the user doesn't instantiate NativeArray with a
+    // const or reference type.
+    static_cast<void>(StaticAssertTypeEqHelper<Element,
+        GTEST_REMOVE_REFERENCE_AND_CONST_(Element)>());
+    if (relation_to_source_ == kCopy)  // Only a copy is owned by this object.
+      delete[] array_;
+  }
+
+  // STL-style container methods.
+  size_t size() const { return size_; }
+  const_iterator begin() const { return array_; }
+  const_iterator end() const { return array_ + size_; }
+  bool operator==(const NativeArray& rhs) const {
+    return size() == rhs.size() &&
+        ArrayEq(begin(), size(), rhs.begin());
+  }
+
+ private:
+  // Initializes this object; makes a copy of the input array if
+  // 'relation' is kCopy.
+  void Init(const Element* array, size_t a_size, RelationToSource relation) {
+    if (relation == kReference) {
+      array_ = array;
+    } else {
+      Element* const copy = new Element[a_size];
+      CopyArray(array, a_size, copy);
+      array_ = copy;
+    }
+    size_ = a_size;
+    relation_to_source_ = relation;
+  }
+
+  const Element* array_;
+  size_t size_;
+  RelationToSource relation_to_source_;
+
+  GTEST_DISALLOW_ASSIGN_(NativeArray);
+};
+
+}  // namespace internal
+}  // namespace testing
+
+#define GTEST_MESSAGE_AT_(file, line, message, result_type) \
+  ::testing::internal::AssertHelper(result_type, file, line, message) \
+    = ::testing::Message()
+
+#define GTEST_MESSAGE_(message, result_type) \
+  GTEST_MESSAGE_AT_(__FILE__, __LINE__, message, result_type)
+
+#define GTEST_FATAL_FAILURE_(message) \
+  return GTEST_MESSAGE_(message, ::testing::TestPartResult::kFatalFailure)
+
+#define GTEST_NONFATAL_FAILURE_(message) \
+  GTEST_MESSAGE_(message, ::testing::TestPartResult::kNonFatalFailure)
+
+#define GTEST_SUCCESS_(message) \
+  GTEST_MESSAGE_(message, ::testing::TestPartResult::kSuccess)
+
+// Suppresses MSVC warning 4702 (unreachable code) for the code following
+// statement if it returns or throws (or doesn't return or throw in some
+// situations).
+#define GTEST_SUPPRESS_UNREACHABLE_CODE_WARNING_BELOW_(statement) \
+  if (::testing::internal::AlwaysTrue()) { statement; }
+
+#define GTEST_TEST_THROW_(statement, expected_exception, fail) \
+  GTEST_AMBIGUOUS_ELSE_BLOCKER_ \
+  if (::testing::internal::ConstCharPtr gtest_msg = "") { \
+    bool gtest_caught_expected = false; \
+    try { \
+      GTEST_SUPPRESS_UNREACHABLE_CODE_WARNING_BELOW_(statement); \
+    } \
+    catch (expected_exception const&) { \
+      gtest_caught_expected = true; \
+    } \
+    catch (...) { \
+      gtest_msg.value = \
+          "Expected: " #statement " throws an exception of type " \
+          #expected_exception ".\n  Actual: it throws a different type."; \
+      goto GTEST_CONCAT_TOKEN_(gtest_label_testthrow_, __LINE__); \
+    } \
+    if (!gtest_caught_expected) { \
+      gtest_msg.value = \
+          "Expected: " #statement " throws an exception of type " \
+          #expected_exception ".\n  Actual: it throws nothing."; \
+      goto GTEST_CONCAT_TOKEN_(gtest_label_testthrow_, __LINE__); \
+    } \
+  } else \
+    GTEST_CONCAT_TOKEN_(gtest_label_testthrow_, __LINE__): \
+      fail(gtest_msg.value)
+
+#define GTEST_TEST_NO_THROW_(statement, fail) \
+  GTEST_AMBIGUOUS_ELSE_BLOCKER_ \
+  if (::testing::internal::AlwaysTrue()) { \
+    try { \
+      GTEST_SUPPRESS_UNREACHABLE_CODE_WARNING_BELOW_(statement); \
+    } \
+    catch (...) { \
+      goto GTEST_CONCAT_TOKEN_(gtest_label_testnothrow_, __LINE__); \
+    } \
+  } else \
+    GTEST_CONCAT_TOKEN_(gtest_label_testnothrow_, __LINE__): \
+      fail("Expected: " #statement " doesn't throw an exception.\n" \
+           "  Actual: it throws.")
+
+#define GTEST_TEST_ANY_THROW_(statement, fail) \
+  GTEST_AMBIGUOUS_ELSE_BLOCKER_ \
+  if (::testing::internal::AlwaysTrue()) { \
+    bool gtest_caught_any = false; \
+    try { \
+      GTEST_SUPPRESS_UNREACHABLE_CODE_WARNING_BELOW_(statement); \
+    } \
+    catch (...) { \
+      gtest_caught_any = true; \
+    } \
+    if (!gtest_caught_any) { \
+      goto GTEST_CONCAT_TOKEN_(gtest_label_testanythrow_, __LINE__); \
+    } \
+  } else \
+    GTEST_CONCAT_TOKEN_(gtest_label_testanythrow_, __LINE__): \
+      fail("Expected: " #statement " throws an exception.\n" \
+           "  Actual: it doesn't.")
+
+
+// Implements Boolean test assertions such as EXPECT_TRUE. expression can be
+// either a boolean expression or an AssertionResult. text is a textual
+// representation of expression as it was passed into the EXPECT_TRUE.
+#define GTEST_TEST_BOOLEAN_(expression, text, actual, expected, fail) \
+  GTEST_AMBIGUOUS_ELSE_BLOCKER_ \
+  if (const ::testing::AssertionResult gtest_ar_ = \
+      ::testing::AssertionResult(expression)) \
+    ; \
+  else \
+    fail(::testing::internal::GetBoolAssertionFailureMessage(\
+        gtest_ar_, text, #actual, #expected).c_str())
+
+#define GTEST_TEST_NO_FATAL_FAILURE_(statement, fail) \
+  GTEST_AMBIGUOUS_ELSE_BLOCKER_ \
+  if (::testing::internal::AlwaysTrue()) { \
+    ::testing::internal::HasNewFatalFailureHelper gtest_fatal_failure_checker; \
+    GTEST_SUPPRESS_UNREACHABLE_CODE_WARNING_BELOW_(statement); \
+    if (gtest_fatal_failure_checker.has_new_fatal_failure()) { \
+      goto GTEST_CONCAT_TOKEN_(gtest_label_testnofatal_, __LINE__); \
+    } \
+  } else \
+    GTEST_CONCAT_TOKEN_(gtest_label_testnofatal_, __LINE__): \
+      fail("Expected: " #statement " doesn't generate new fatal " \
+           "failures in the current thread.\n" \
+           "  Actual: it does.")
+
+// Expands to the name of the class that implements the given test.
+#define GTEST_TEST_CLASS_NAME_(test_case_name, test_name) \
+  test_case_name##_##test_name##_Test
+
+// Helper macro for defining tests.
+#define GTEST_TEST_(test_case_name, test_name, parent_class, parent_id)\
+class GTEST_TEST_CLASS_NAME_(test_case_name, test_name) : public parent_class {\
+ public:\
+  GTEST_TEST_CLASS_NAME_(test_case_name, test_name)() {}\
+ private:\
+  virtual void TestBody();\
+  static ::testing::TestInfo* const test_info_ GTEST_ATTRIBUTE_UNUSED_;\
+  GTEST_DISALLOW_COPY_AND_ASSIGN_(\
+      GTEST_TEST_CLASS_NAME_(test_case_name, test_name));\
+};\
+\
+::testing::TestInfo* const GTEST_TEST_CLASS_NAME_(test_case_name, test_name)\
+  ::test_info_ =\
+    ::testing::internal::MakeAndRegisterTestInfo(\
+        #test_case_name, #test_name, NULL, NULL, \
+        (parent_id), \
+        parent_class::SetUpTestCase, \
+        parent_class::TearDownTestCase, \
+        new ::testing::internal::TestFactoryImpl<\
+            GTEST_TEST_CLASS_NAME_(test_case_name, test_name)>);\
+void GTEST_TEST_CLASS_NAME_(test_case_name, test_name)::TestBody()
+
+#endif  // GTEST_INCLUDE_GTEST_INTERNAL_GTEST_INTERNAL_H_
+// Copyright 2005, Google Inc.
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+//     * Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//     * Redistributions in binary form must reproduce the above
+// copyright notice, this list of conditions and the following disclaimer
+// in the documentation and/or other materials provided with the
+// distribution.
+//     * Neither the name of Google Inc. nor the names of its
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Author: wan@google.com (Zhanyong Wan)
+//
+// The Google C++ Testing Framework (Google Test)
+//
+// This header file defines the public API for death tests.  It is
+// #included by gtest.h so a user doesn't need to include this
+// directly.
+
+#ifndef GTEST_INCLUDE_GTEST_GTEST_DEATH_TEST_H_
+#define GTEST_INCLUDE_GTEST_GTEST_DEATH_TEST_H_
+
+// Copyright 2005, Google Inc.
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+//     * Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//     * Redistributions in binary form must reproduce the above
+// copyright notice, this list of conditions and the following disclaimer
+// in the documentation and/or other materials provided with the
+// distribution.
+//     * Neither the name of Google Inc. nor the names of its
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Authors: wan@google.com (Zhanyong Wan), eefacm@gmail.com (Sean Mcafee)
+//
+// The Google C++ Testing Framework (Google Test)
+//
+// This header file defines internal utilities needed for implementing
+// death tests.  They are subject to change without notice.
+
+#ifndef GTEST_INCLUDE_GTEST_INTERNAL_GTEST_DEATH_TEST_INTERNAL_H_
+#define GTEST_INCLUDE_GTEST_INTERNAL_GTEST_DEATH_TEST_INTERNAL_H_
+
+
+#include <stdio.h>
+
+namespace testing {
+namespace internal {
+
+GTEST_DECLARE_string_(internal_run_death_test);
+
+// Names of the flags (needed for parsing Google Test flags).
+const char kDeathTestStyleFlag[] = "death_test_style";
+const char kDeathTestUseFork[] = "death_test_use_fork";
+const char kInternalRunDeathTestFlag[] = "internal_run_death_test";
+
+#if GTEST_HAS_DEATH_TEST
+
+// DeathTest is a class that hides much of the complexity of the
+// GTEST_DEATH_TEST_ macro.  It is abstract; its static Create method
+// returns a concrete class that depends on the prevailing death test
+// style, as defined by the --gtest_death_test_style and/or
+// --gtest_internal_run_death_test flags.
+
+// In describing the results of death tests, these terms are used with
+// the corresponding definitions:
+//
+// exit status:  The integer exit information in the format specified
+//               by wait(2)
+// exit code:    The integer code passed to exit(3), _exit(2), or
+//               returned from main()
+class GTEST_API_ DeathTest {
+ public:
+  // Create returns false if there was an error determining the
+  // appropriate action to take for the current death test; for example,
+  // if the gtest_death_test_style flag is set to an invalid value.
+  // The LastMessage method will return a more detailed message in that
+  // case.  Otherwise, the DeathTest pointer pointed to by the "test"
+  // argument is set.  If the death test should be skipped, the pointer
+  // is set to NULL; otherwise, it is set to the address of a new concrete
+  // DeathTest object that controls the execution of the current test.
+  static bool Create(const char* statement, const RE* regex,
+                     const char* file, int line, DeathTest** test);
+  DeathTest();
+  virtual ~DeathTest() { }
+
+  // A helper class whose destructor aborts the death test it was given.
+  class ReturnSentinel {
+   public:
+    explicit ReturnSentinel(DeathTest* test) : test_(test) { }
+    ~ReturnSentinel() { test_->Abort(TEST_ENCOUNTERED_RETURN_STATEMENT); }
+   private:
+    DeathTest* const test_;
+    GTEST_DISALLOW_COPY_AND_ASSIGN_(ReturnSentinel);
+  } GTEST_ATTRIBUTE_UNUSED_;
+
+  // An enumeration of possible roles that may be taken when a death
+  // test is encountered.  EXECUTE_TEST means that the death test logic
+  // should be executed immediately.  OVERSEE_TEST means that the program
+  // should prepare the appropriate environment for a child process to
+  // execute the death test, then wait for it to complete.
+  enum TestRole { OVERSEE_TEST, EXECUTE_TEST };
+
+  // An enumeration of the three reasons that a test might be aborted.
+  enum AbortReason {
+    TEST_ENCOUNTERED_RETURN_STATEMENT,
+    TEST_THREW_EXCEPTION,
+    TEST_DID_NOT_DIE
+  };
+
+  // Assumes one of the above roles.
+  virtual TestRole AssumeRole() = 0;
+
+  // Waits for the death test to finish and returns its status.
+  virtual int Wait() = 0;
+
+  // Returns true if the death test passed; that is, the test process
+  // exited during the test, its exit status matches a user-supplied
+  // predicate, and its stderr output matches a user-supplied regular
+  // expression.
+  // The user-supplied predicate may be a macro expression rather
+  // than a function pointer or functor, or else Wait and Passed could
+  // be combined.
+  virtual bool Passed(bool exit_status_ok) = 0;
+
+  // Signals that the death test did not die as expected.
+  virtual void Abort(AbortReason reason) = 0;
+
+  // Returns a human-readable outcome message regarding the outcome of
+  // the last death test.
+  static const char* LastMessage();
+
+  static void set_last_death_test_message(const std::string& message);
+
+ private:
+  // A string containing a description of the outcome of the last death test.
+  static std::string last_death_test_message_;
+
+  GTEST_DISALLOW_COPY_AND_ASSIGN_(DeathTest);
+};
+
+// Factory interface for death tests.  May be mocked out for testing.
+class DeathTestFactory {
+ public:
+  virtual ~DeathTestFactory() { }
+  virtual bool Create(const char* statement, const RE* regex,
+                      const char* file, int line, DeathTest** test) = 0;
+};
+
+// A concrete DeathTestFactory implementation for normal use.
+class DefaultDeathTestFactory : public DeathTestFactory {
+ public:
+  virtual bool Create(const char* statement, const RE* regex,
+                      const char* file, int line, DeathTest** test);
+};
+
+// Returns true if exit_status describes a process that was terminated
+// by a signal, or exited normally with a nonzero exit code.
+GTEST_API_ bool ExitedUnsuccessfully(int exit_status);
+
+// Traps C++ exceptions escaping statement and reports them as test
+// failures. Note that trapping SEH exceptions is not implemented here.
+//
+// When GTEST_HAS_EXCEPTIONS is set, statement runs inside a try block:
+//   - an exception derived from std::exception has its what() text printed
+//     to stderr, prefixed with the formatted file/line location; stderr is
+//     flushed and death_test->Abort(TEST_THREW_EXCEPTION) is called;
+//   - any other exception goes straight to Abort(TEST_THREW_EXCEPTION).
+// Otherwise the macro expands to the bare statement.  In both variants the
+// statement is wrapped in GTEST_SUPPRESS_UNREACHABLE_CODE_WARNING_BELOW_.
+//
+// death_test is used with operator->, so it must be a pointer-like
+// expression; the non-exception variant ignores it.
+# if GTEST_HAS_EXCEPTIONS
+#  define GTEST_EXECUTE_DEATH_TEST_STATEMENT_(statement, death_test) \
+  try { \
+    GTEST_SUPPRESS_UNREACHABLE_CODE_WARNING_BELOW_(statement); \
+  } catch (const ::std::exception& gtest_exception) { \
+    fprintf(\
+        stderr, \
+        "\n%s: Caught std::exception-derived exception escaping the " \
+        "death test statement. Exception message: %s\n", \
+        ::testing::internal::FormatFileLocation(__FILE__, __LINE__).c_str(), \
+        gtest_exception.what()); \
+    fflush(stderr); \
+    death_test->Abort(::testing::internal::DeathTest::TEST_THREW_EXCEPTION); \
+  } catch (...) { \
+    death_test->Abort(::testing::internal::DeathTest::TEST_THREW_EXCEPTION); \
+  }
+
+# else
+#  define GTEST_EXECUTE_DEATH_TEST_STATEMENT_(statement, death_test) \
+  GTEST_SUPPRESS_UNREACHABLE_CODE_WARNING_BELOW_(statement)
+
+# endif
+
+// This macro is for implementing ASSERT_DEATH*, EXPECT_DEATH*,
+// ASSERT_EXIT*, and EXPECT_EXIT*.
+//
+// Parameters:
+//   statement - the code that is expected to terminate the process
+//   predicate - callable applied to the value returned by gtest_dt->Wait()
+//   regex     - expression bound to a const ::testing::internal::RE&
+//   fail      - failure macro, invoked with DeathTest::LastMessage()
+//
+// Flow of the expansion:
+//   1. DeathTest::Create() is called.  A false result jumps to the failure
+//      label.  If it returns true but leaves gtest_dt NULL, nothing further
+//      happens.
+//   2. Otherwise gtest_dt is handed to a scoped_ptr for the rest of the
+//      block.
+//   3. AssumeRole() selects the branch:
+//        OVERSEE_TEST - predicate is applied to Wait() and the result is
+//                       passed to Passed(); a false result jumps to the
+//                       failure label.
+//        EXECUTE_TEST - a ReturnSentinel is constructed for gtest_dt, then
+//                       statement is run through
+//                       GTEST_EXECUTE_DEATH_TEST_STATEMENT_; if control gets
+//                       past it, Abort(TEST_DID_NOT_DIE) is called.
+//   4. The failure label lives in the else branch of an AlwaysTrue() test,
+//      so fail(...) and anything streamed into it are reached only through
+//      the gotos above.  The label name is built from __LINE__.
+# define GTEST_DEATH_TEST_(statement, predicate, regex, fail) \
+  GTEST_AMBIGUOUS_ELSE_BLOCKER_ \
+  if (::testing::internal::AlwaysTrue()) { \
+    const ::testing::internal::RE& gtest_regex = (regex); \
+    ::testing::internal::DeathTest* gtest_dt; \
+    if (!::testing::internal::DeathTest::Create(#statement, &gtest_regex, \
+        __FILE__, __LINE__, &gtest_dt)) { \
+      goto GTEST_CONCAT_TOKEN_(gtest_label_, __LINE__); \
+    } \
+    if (gtest_dt != NULL) { \
+      ::testing::internal::scoped_ptr< ::testing::internal::DeathTest> \
+          gtest_dt_ptr(gtest_dt); \
+      switch (gtest_dt->AssumeRole()) { \
+        case ::testing::internal::DeathTest::OVERSEE_TEST: \
+          if (!gtest_dt->Passed(predicate(gtest_dt->Wait()))) { \
+            goto GTEST_CONCAT_TOKEN_(gtest_label_, __LINE__); \
+          } \
+          break; \
+        case ::testing::internal::DeathTest::EXECUTE_TEST: { \
+          ::testing::internal::DeathTest::ReturnSentinel \
+              gtest_sentinel(gtest_dt); \
+          GTEST_EXECUTE_DEATH_TEST_STATEMENT_(statement, gtest_dt); \
+          gtest_dt->Abort(::testing::internal::DeathTest::TEST_DID_NOT_DIE); \
+          break; \
+        } \
+        default: \
+          break; \
+      } \
+    } \
+  } else \
+    GTEST_CONCAT_TOKEN_(gtest_label_, __LINE__): \
+      fail(::testing::internal::DeathTest::LastMessage())
+// The symbol "fail" here expands to something into which a message
+// can be streamed.
+
+// This macro is for implementing ASSERT/EXPECT_DEBUG_DEATH when compiled in
+// NDEBUG mode. In this case we need the statements to be executed, the regex is
+// ignored, and the macro must accept a streamed message even though the message
+// is never printed.
+//
+// The expansion never references regex.  statement runs in the
+// AlwaysTrue() branch; the else branch consists solely of a
+// ::testing::Message() expression, which gives a trailing `<< ...` something
+// to stream into.
+# define GTEST_EXECUTE_STATEMENT_(statement, regex) \
+  GTEST_AMBIGUOUS_ELSE_BLOCKER_ \
+  if (::testing::internal::AlwaysTrue()) { \
+     GTEST_SUPPRESS_UNREACHABLE_CODE_WARNING_BELOW_(statement); \
+  } else \
+    ::testing::Message()
+
+// Holds the parsed pieces of the --gtest_internal_run_death_test flag,
+// as the flag existed when RUN_ALL_TESTS was called.
+class InternalRunDeathTestFlag {
+ public:
+  // Stores the four parsed values exactly as given.
+  InternalRunDeathTestFlag(const std::string& a_file,
+                           int a_line,
+                           int an_index,
+                           int a_write_fd)
+      : file_(a_file),
+        line_(a_line),
+        index_(an_index),
+        write_fd_(a_write_fd) {}
+
+  // Closes the write descriptor unless it is negative.
+  ~InternalRunDeathTestFlag() {
+    if (write_fd_ < 0) {
+      return;
+    }
+    posix::Close(write_fd_);
+  }
+
+  // Read-only accessors for the stored values.
+  const std::string& file() const {
+    return file_;
+  }
+  int line() const {
+    return line_;
+  }
+  int index() const {
+    return index_;
+  }
+  int write_fd() const {
+    return write_fd_;
+  }
+
+ private:
+  std::string file_;
+  int line_;
+  int index_;
+  int write_fd_;
+
+  GTEST_DISALLOW_COPY_AND_ASSIGN_(InternalRunDeathTestFlag);
+};
+
+// Returns a newly created InternalRunDeathTestFlag object with fields
+// initialized from the GTEST_FLAG(internal_run_death_test) flag if
+// the flag is specified; otherwise returns NULL.
+InternalRunDeathTestFlag* ParseInternalRunDeathTestFlag();
+
+#else  // GTEST_HAS_DEATH_TEST
+
+// This macro is used for implementing macros such as
+// EXPECT_DEATH_IF_SUPPORTED and ASSERT_DEATH_IF_SUPPORTED on systems where
+// death tests are not supported. Those macros must compile on such systems
+// iff EXPECT_DEATH and ASSERT_DEATH compile with the same parameters on
+// systems that support death tests. This allows one to write such a macro
+// on a system that does not support death tests and be sure that it will
+// compile on a death-test supporting system.
+//
+// Parameters:
+//   statement -  A statement that a macro such as EXPECT_DEATH would test
+//                for program termination. This macro has to make sure this
+//                statement is compiled but not executed, to ensure that
+//                EXPECT_DEATH_IF_SUPPORTED compiles with a certain
+//                parameter iff EXPECT_DEATH compiles with it.
+//   regex     -  A regex that a macro such as EXPECT_DEATH would use to test
+//                the output of statement.  This parameter has to be
+//                compiled but not evaluated by this macro, to ensure that
+//                this macro only accepts expressions that a macro such as
+//                EXPECT_DEATH would accept.
+//   terminator - Must be an empty statement for EXPECT_DEATH_IF_SUPPORTED
+//                and a return statement for ASSERT_DEATH_IF_SUPPORTED.
+//                This ensures that ASSERT_DEATH_IF_SUPPORTED will not
+//                compile inside functions where ASSERT_DEATH doesn't
+//                compile.
+//
+//  The first branch, whose condition is always true, only logs a warning
+//  that names the statement which cannot be verified.
+//  The branch that has an always false condition is used to ensure that
+//  statement and regex are compiled (and thus syntactically correct) but
+//  never executed. The unreachable code macro protects the terminator
+//  statement from generating an 'unreachable code' warning in case
+//  statement unconditionally returns or throws. The Message constructor at
+//  the end allows the syntax of streaming additional messages into the
+//  macro, for compile-time compatibility with EXPECT_DEATH/ASSERT_DEATH.
+# define GTEST_UNSUPPORTED_DEATH_TEST_(statement, regex, terminator) \
+    GTEST_AMBIGUOUS_ELSE_BLOCKER_ \
+    if (::testing::internal::AlwaysTrue()) { \
+      GTEST_LOG_(WARNING) \
+          << "Death tests are not supported on this platform.\n" \
+          << "Statement '" #statement "' cannot be verified."; \
+    } else if (::testing::internal::AlwaysFalse()) { \
+      ::testing::internal::RE::PartialMatch(".*", (regex)); \
+      GTEST_SUPPRESS_UNREACHABLE_CODE_WARNING_BELOW_(statement); \
+      terminator; \
+    } else \
+      ::testing::Message()
+
+#endif  // GTEST_HAS_DEATH_TEST
+
+}  // namespace internal
+}  // namespace testing
+
+#endif  // GTEST_INCLUDE_GTEST_INTERNAL_GTEST_DEATH_TEST_INTERNAL_H_
+
+namespace testing {
+
+// This flag controls the style of death tests.  Valid values are "threadsafe",
+// meaning that the death test child process will re-execute the test binary
+// from the start, running only a single death test, or "fast",
+// meaning that the child process will execute the test logic immediately
+// after forking.
+GTEST_DECLARE_string_(death_test_style);
+
+#if GTEST_HAS_DEATH_TEST
+
+namespace internal {
+
+// Returns a Boolean value indicating whether the caller is currently
+// executing in the context of the death test child process.  Tools such as
+// Valgrind heap checkers may need this to modify their behavior in death
+// tests.  IMPORTANT: This is an internal utility.  Using it may break the
+// implementation of death tests.  User code MUST NOT use it.
+GTEST_API_ bool InDeathTestChild();
+
+}  // namespace internal
+
+// The following macros are useful for writing death tests.
+
+// Here's what happens when an ASSERT_DEATH* or EXPECT_DEATH* is
+// executed:
+//
+//   1. It generates a warning if there is more than one active
+//   thread.  This is because it's safe to fork() or clone() only
+//   when there is a single thread.
+//
+//   2. The parent process clone()s a sub-process and runs the death
+//   test in it; the sub-process exits with code 0 at the end of the
+//   death test, if it hasn't exited already.
+//
+//   3. The parent process waits for the sub-process to terminate.
+//
+//   4. The parent process checks the exit code and error message of
+//   the sub-process.
+//
+// Examples:
+//
+//   ASSERT_DEATH(server.SendMessage(56, "Hello"), "Invalid port number");
+//   for (int i = 0; i < 5; i++) {
+//     EXPECT_DEATH(server.ProcessRequest(i),
+//                  "Invalid request .* in ProcessRequest()")
+//                  << "Failed to die on request " << i;
+//   }
+//
+//   ASSERT_EXIT(server.ExitNow(), ::testing::ExitedWithCode(0), "Exiting");
+//
+//   bool KilledBySIGHUP(int exit_code) {
+//     return WIFSIGNALED(exit_code) && WTERMSIG(exit_code) == SIGHUP;
+//   }
+//
+//   ASSERT_EXIT(client.HangUpServer(), KilledBySIGHUP, "Hanging up!");
+//
+// On the regular expressions used in death tests:
+//
+//   On POSIX-compliant systems (*nix), we use the <regex.h> library,
+//   which uses the POSIX extended regex syntax.
+//
+//   On other platforms (e.g. Windows), we only support a simple regex
+//   syntax implemented as part of Google Test.  This limited
+//   implementation should be enough most of the time when writing
+//   death tests; though it lacks many features you can find in PCRE
+//   or POSIX extended regex syntax.  For example, we don't support
+//   union ("x|y"), grouping ("(xy)"), brackets ("[xy]"), and
+//   repetition count ("x{5,7}"), among others.
+//
+//   Below is the syntax that we do support.  We chose it to be a
+//   subset of both PCRE and POSIX extended regex, so it's easy to
+//   learn wherever you come from.  In the following: 'A' denotes a
+//   literal character, period (.), or a single \\ escape sequence;
+//   'x' and 'y' denote regular expressions; 'm' and 'n' are for
+//   natural numbers.
+//
+//     c     matches any literal character c
+//     \\d   matches any decimal digit
+//     \\D   matches any character that's not a decimal digit
+//     \\f   matches \f
+//     \\n   matches \n
+//     \\r   matches \r
+//     \\s   matches any ASCII whitespace, including \n
+//     \\S   matches any character that's not a whitespace
+//     \\t   matches \t
+//     \\v   matches \v
+//     \\w   matches any letter, _, or decimal digit
+//     \\W   matches any character that \\w doesn't match
+//     \\c   matches any literal character c, which must be a punctuation
+//     .     matches any single character except \n
+//     A?    matches 0 or 1 occurrences of A
+//     A*    matches 0 or many occurrences of A
+//     A+    matches 1 or many occurrences of A
+//     ^     matches the beginning of a string (not that of each line)
+//     $     matches the end of a string (not that of each line)
+//     xy    matches x followed by y
+//
+//   If you accidentally use PCRE or POSIX extended regex features
+//   not implemented by us, you will get a run-time failure.  In that
+//   case, please try to rewrite your regular expression within the
+//   above syntax.
+//
+//   This implementation is *not* meant to be as highly tuned or robust
+//   as a compiled regex library, but should perform well enough for a
+//   death test, which already incurs significant overhead by launching
+//   a child process.
+//
+// Known caveats:
+//
+//   A "threadsafe" style death test obtains the path to the test
+//   program from argv[0] and re-executes it in the sub-process.  For
+//   simplicity, the current implementation doesn't search the PATH
+//   when launching the sub-process.  This means that the user must
+//   invoke the test program via a path that contains at least one
+//   path separator (e.g. path/to/foo_test and
+//   /absolute/path/to/bar_test are fine, but foo_test is not).  This
+//   is rarely a problem as people usually don't put the test binary
+//   directory in PATH.
+//
+// TODO(wan@google.com): make thread-safe death tests search the PATH.
+
+// Asserts that a given statement causes the program to exit, with an
+// integer exit status that satisfies predicate, and emitting error output
+// that matches regex.  A failure is reported through GTEST_FATAL_FAILURE_.
+# define ASSERT_EXIT(statement, predicate, regex) \
+    GTEST_DEATH_TEST_(statement, predicate, regex, GTEST_FATAL_FAILURE_)
+
+// Like ASSERT_EXIT, but continues on to successive tests in the
+// test case, if any: a failure is reported through
+// GTEST_NONFATAL_FAILURE_, so it is recorded without aborting the current
+// test function.
+# define EXPECT_EXIT(statement, predicate, regex) \
+    GTEST_DEATH_TEST_(statement, predicate, regex, GTEST_NONFATAL_FAILURE_)
+
+// Asserts that a given statement causes the program to exit, either by
+// explicitly exiting with a nonzero exit code or being killed by a
+// signal, and emitting error output that matches regex.
+// Shorthand for ASSERT_EXIT with ::testing::internal::ExitedUnsuccessfully
+// as the predicate.
+# define ASSERT_DEATH(statement, regex) \
+    ASSERT_EXIT(statement, ::testing::internal::ExitedUnsuccessfully, regex)
+
+// Like ASSERT_DEATH, but continues on to successive tests in the
+// test case, if any:
+// Shorthand for EXPECT_EXIT with ::testing::internal::ExitedUnsuccessfully
+// as the predicate.
+# define EXPECT_DEATH(statement, regex) \
+    EXPECT_EXIT(statement, ::testing::internal::ExitedUnsuccessfully, regex)
+
+// Two predicate classes that can be used in {ASSERT,EXPECT}_EXIT*:
+
+// Tests that an exit code describes a normal exit with a given exit code.
+// Meant to be passed as the predicate of {ASSERT,EXPECT}_EXIT, e.g.
+// ::testing::ExitedWithCode(0).
+class GTEST_API_ ExitedWithCode {
+ public:
+  // exit_code: the exit code to test for.
+  explicit ExitedWithCode(int exit_code);
+  // exit_status: the status value the death test macro hands to its
+  // predicate (the result of DeathTest::Wait() in GTEST_DEATH_TEST_).
+  bool operator()(int exit_status) const;
+ private:
+  // No implementation - assignment is unsupported.
+  void operator=(const ExitedWithCode& other);
+
+  // NOTE(review): presumably the exit_code given to the constructor, which
+  // is defined elsewhere -- confirm.  Const, so it cannot be reassigned.
+  const int exit_code_;
+};
+
+# if !GTEST_OS_WINDOWS
+// Tests that an exit code describes an exit due to termination by a
+// given signal.
+// Only declared when GTEST_OS_WINDOWS is false.
+class GTEST_API_ KilledBySignal {
+ public:
+  // signum: the signal number to test for.
+  explicit KilledBySignal(int signum);
+  // exit_status: the status value the death test macro hands to its
+  // predicate (the result of DeathTest::Wait() in GTEST_DEATH_TEST_).
+  bool operator()(int exit_status) const;
+ private:
+  // NOTE(review): presumably the signum given to the constructor, which is
+  // defined elsewhere -- confirm.  Const, so it cannot be reassigned.
+  const int signum_;
+};
+# endif  // !GTEST_OS_WINDOWS
+
+// EXPECT_DEBUG_DEATH asserts that the given statements die in debug mode.
+// The death testing framework causes this to have interesting semantics,
+// since the side effects of the call are only visible in opt mode, and not
+// in debug mode.
+//
+// In practice, this can be used to test functions that utilize the
+// LOG(DFATAL) macro using the following style:
+//
+// int DieInDebugOr12(int* sideeffect) {
+//   if (sideeffect) {
+//     *sideeffect = 12;
+//   }
+//   LOG(DFATAL) << "death";
+//   return 12;
+// }
+//
+// TEST(TestCase, TestDieOr12WorksInDgbAndOpt) {
+//   int sideeffect = 0;
+//   // Only asserts in dbg.
+//   EXPECT_DEBUG_DEATH(DieInDebugOr12(&sideeffect), "death");
+//
+// #ifdef NDEBUG
+//   // opt-mode has sideeffect visible.
+//   EXPECT_EQ(12, sideeffect);
+// #else
+//   // dbg-mode no visible sideeffect.
+//   EXPECT_EQ(0, sideeffect);
+// #endif
+// }
+//
+// This will assert that DieInDebugOr12() crashes in debug
+// mode, usually due to a DCHECK or LOG(DFATAL), but returns the
+// appropriate fallback value (12 in this case) in opt mode. If you
+// need to test that a function has appropriate side-effects in opt
+// mode, include assertions against the side-effects.  A general
+// pattern for this is:
+//
+// EXPECT_DEBUG_DEATH({
+//   // Side-effects here will have an effect after this statement in
+//   // opt mode, but none in debug mode.
+//   EXPECT_EQ(12, DieInDebugOr12(&sideeffect));
+// }, "death");
+//
+# ifdef NDEBUG
+
+// NDEBUG build: both macros merely execute the statement through
+// GTEST_EXECUTE_STATEMENT_, which does not use regex.
+#  define EXPECT_DEBUG_DEATH(statement, regex) \
+  GTEST_EXECUTE_STATEMENT_(statement, regex)
+
+#  define ASSERT_DEBUG_DEATH(statement, regex) \
+  GTEST_EXECUTE_STATEMENT_(statement, regex)
+
+# else
+
+// Build without NDEBUG: both macros are plain aliases for the
+// corresponding death test macro.
+#  define EXPECT_DEBUG_DEATH(statement, regex) \
+  EXPECT_DEATH(statement, regex)
+
+#  define ASSERT_DEBUG_DEATH(statement, regex) \
+  ASSERT_DEATH(statement, regex)
+
+# endif  // NDEBUG for EXPECT_DEBUG_DEATH
+#endif  // GTEST_HAS_DEATH_TEST
+
+// EXPECT_DEATH_IF_SUPPORTED(statement, regex) and
+// ASSERT_DEATH_IF_SUPPORTED(statement, regex) expand to real death tests if
+// death tests are supported; otherwise they just issue a warning.  This is
+// useful when you are combining death test assertions with normal test
+// assertions in one test.
+#if GTEST_HAS_DEATH_TEST
+// Supported: forward unchanged to EXPECT_DEATH / ASSERT_DEATH.
+# define EXPECT_DEATH_IF_SUPPORTED(statement, regex) \
+    EXPECT_DEATH(statement, regex)
+# define ASSERT_DEATH_IF_SUPPORTED(statement, regex) \
+    ASSERT_DEATH(statement, regex)
+#else
+// Unsupported: expand to GTEST_UNSUPPORTED_DEATH_TEST_.  The third argument
+// is the terminator, which that macro compiles but never executes: empty
+// for EXPECT_*, `return` for ASSERT_*.
+# define EXPECT_DEATH_IF_SUPPORTED(statement, regex) \
+    GTEST_UNSUPPORTED_DEATH_TEST_(statement, regex, )
+# define ASSERT_DEATH_IF_SUPPORTED(statement, regex) \
+    GTEST_UNSUPPORTED_DEATH_TEST_(statement, regex, return)
+#endif
+
+}  // namespace testing
+
+#endif  // GTEST_INCLUDE_GTEST_GTEST_DEATH_TEST_H_
+// This file was GENERATED by command:
+//     pump.py gtest-param-test.h.pump
+// DO NOT EDIT BY HAND!!!
+
+// Copyright 2008, Google Inc.
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+//     * Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//     * Redistributions in binary form must reproduce the above
+// copyright notice, this list of conditions and the following disclaimer
+// in the documentation and/or other materials provided with the
+// distribution.
+//     * Neither the name of Google Inc. nor the names of its
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Authors: vladl@google.com (Vlad Losev)
+//
+// Macros and functions for implementing parameterized tests
+// in Google C++ Testing Framework (Google Test)
+//
+// This file is generated by a SCRIPT.  DO NOT EDIT BY HAND!
+//
+#ifndef GTEST_INCLUDE_GTEST_GTEST_PARAM_TEST_H_
+#define GTEST_INCLUDE_GTEST_GTEST_PARAM_TEST_H_
+
+
+// Value-parameterized tests allow you to test your code with different
+// parameters without writing multiple copies of the same test.
+//
+// Here is how you use value-parameterized tests:
+
+#if 0
+
+// To write value-parameterized tests, first you should define a fixture
+// class. It is usually derived from testing::TestWithParam<T> (see below for
+// another inheritance scheme that's sometimes useful in more complicated
+// class hierarchies), where T is the type of your parameter values.
+// TestWithParam<T> is itself derived from testing::Test. T can be any
+// copyable type. If it's a raw pointer, you are responsible for managing the
+// lifespan of the pointed values.
+
+class FooTest : public ::testing::TestWithParam<const char*> {
+  // You can implement all the usual class fixture members here.
+};
+
+// Then, use the TEST_P macro to define as many parameterized tests
+// for this fixture as you want. The _P suffix is for "parameterized"
+// or "pattern", whichever you prefer to think.
+
+TEST_P(FooTest, DoesBlah) {
+  // Inside a test, access the test parameter with the GetParam() method
+  // of the TestWithParam<T> class:
+  EXPECT_TRUE(foo.Blah(GetParam()));
+  ...
+}
+
+TEST_P(FooTest, HasBlahBlah) {
+  ...
+}
+
+// Finally, you can use INSTANTIATE_TEST_CASE_P to instantiate the test
+// case with any set of parameters you want. Google Test defines a number
+// of functions for generating test parameters. They return what we call
+// (surprise!) parameter generators. Here is a  summary of them, which
+// are all in the testing namespace:
+//
+//
+//  Range(begin, end [, step]) - Yields values {begin, begin+step,
+//                               begin+step+step, ...}. The values do not
+//                               include end. step defaults to 1.
+//  Values(v1, v2, ..., vN)    - Yields values {v1, v2, ..., vN}.
+//  ValuesIn(container)        - Yields values from a C-style array, an STL
+//  ValuesIn(begin,end)          container, or an iterator range [begin, end).
+//  Bool()                     - Yields sequence {false, true}.
+//  Combine(g1, g2, ..., gN)   - Yields all combinations (the Cartesian product
+//                               for the math savvy) of the values generated
+//                               by the N generators.
+//
+// For more details, see comments at the definitions of these functions below
+// in this file.
+//
+// The following statement will instantiate tests from the FooTest test case
+// each with parameter values "meeny", "miny", and "moe".
+
+INSTANTIATE_TEST_CASE_P(InstantiationName,
+                        FooTest,
+                        Values("meeny", "miny", "moe"));
+
+// To distinguish different instances of the pattern, (yes, you
+// can instantiate it more than once) the first argument to the
+// INSTANTIATE_TEST_CASE_P macro is a prefix that will be added to the
+// actual test case name. Remember to pick unique prefixes for different
+// instantiations. The tests from the instantiation above will have
+// these names:
+//
+//    * InstantiationName/FooTest.DoesBlah/0 for "meeny"
+//    * InstantiationName/FooTest.DoesBlah/1 for "miny"
+//    * InstantiationName/FooTest.DoesBlah/2 for "moe"
+//    * InstantiationName/FooTest.HasBlahBlah/0 for "meeny"
+//    * InstantiationName/FooTest.HasBlahBlah/1 for "miny"
+//    * InstantiationName/FooTest.HasBlahBlah/2 for "moe"
+//
+// You can use these names in --gtest_filter.
+//
+// This statement will instantiate all tests from FooTest again, each
+// with parameter values "cat" and "dog":
+
+const char* pets[] = {"cat", "dog"};
+INSTANTIATE_TEST_CASE_P(AnotherInstantiationName, FooTest, ValuesIn(pets));
+
+// The tests from the instantiation above will have these names:
+//
+//    * AnotherInstantiationName/FooTest.DoesBlah/0 for "cat"
+//    * AnotherInstantiationName/FooTest.DoesBlah/1 for "dog"
+//    * AnotherInstantiationName/FooTest.HasBlahBlah/0 for "cat"
+//    * AnotherInstantiationName/FooTest.HasBlahBlah/1 for "dog"
+//
+// Please note that INSTANTIATE_TEST_CASE_P will instantiate all tests
+// in the given test case, whether their definitions come before or
+// AFTER the INSTANTIATE_TEST_CASE_P statement.
+//
+// Please also note that generator expressions (including parameters to the
+// generators) are evaluated in InitGoogleTest(), after main() has started.
+// This allows the user on one hand, to adjust generator parameters in order
+// to dynamically determine a set of tests to run and on the other hand,
+// give the user a chance to inspect the generated tests with Google Test
+// reflection API before RUN_ALL_TESTS() is executed.
+//
+// You can see samples/sample7_unittest.cc and samples/sample8_unittest.cc
+// for more examples.
+//
+// In the future, we plan to publish the API for defining new parameter
+// generators. But for now this interface remains part of the internal
+// implementation and is subject to change.
+//
+//
+// A parameterized test fixture must be derived from testing::Test and from
+// testing::WithParamInterface<T>, where T is the type of the parameter
+// values. Inheriting from TestWithParam<T> satisfies that requirement because
+// TestWithParam<T> inherits from both Test and WithParamInterface. In more
+// complicated hierarchies, however, it is occasionally useful to inherit
+// separately from Test and WithParamInterface. For example:
+
+class BaseTest : public ::testing::Test {
+  // You can inherit all the usual members for a non-parameterized test
+  // fixture here.
+};
+
+class DerivedTest : public BaseTest, public ::testing::WithParamInterface<int> {
+  // The usual test fixture members go here too.
+};
+
+TEST_F(BaseTest, HasFoo) {
+  // This is an ordinary non-parameterized test.
+}
+
+TEST_P(DerivedTest, DoesBlah) {
+  // GetParam works just the same here as if you inherit from TestWithParam.
+  EXPECT_TRUE(foo.Blah(GetParam()));
+}
+
+#endif  // 0
+
+
+#if !GTEST_OS_SYMBIAN
+# include <utility>
+#endif
+
+// scripts/fuse_gtest.py depends on gtest's own header being #included
+// *unconditionally*.  Therefore these #includes cannot be moved
+// inside #if GTEST_HAS_PARAM_TEST.
+// Copyright 2008 Google Inc.
+// All Rights Reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+//     * Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//     * Redistributions in binary form must reproduce the above
+// copyright notice, this list of conditions and the following disclaimer
+// in the documentation and/or other materials provided with the
+// distribution.
+//     * Neither the name of Google Inc. nor the names of its
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Author: vladl@google.com (Vlad Losev)
+
+// Type and function utilities for implementing parameterized tests.
+
+#ifndef GTEST_INCLUDE_GTEST_INTERNAL_GTEST_PARAM_UTIL_H_
+#define GTEST_INCLUDE_GTEST_INTERNAL_GTEST_PARAM_UTIL_H_
+
+#include <iterator>
+#include <utility>
+#include <vector>
+
+// scripts/fuse_gtest.py depends on gtest's own header being #included
+// *unconditionally*.  Therefore these #includes cannot be moved
+// inside #if GTEST_HAS_PARAM_TEST.
+// Copyright 2003 Google Inc.
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+//     * Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//     * Redistributions in binary form must reproduce the above
+// copyright notice, this list of conditions and the following disclaimer
+// in the documentation and/or other materials provided with the
+// distribution.
+//     * Neither the name of Google Inc. nor the names of its
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Authors: Dan Egnor (egnor@google.com)
+//
+// A "smart" pointer type with reference tracking.  Every pointer to a
+// particular object is kept on a circular linked list.  When the last pointer
+// to an object is destroyed or reassigned, the object is deleted.
+//
+// Used properly, this deletes the object when the last reference goes away.
+// There are several caveats:
+// - Like all reference counting schemes, cycles lead to leaks.
+// - Each smart pointer is actually two pointers (8 bytes instead of 4).
+// - Every time a pointer is assigned, the entire list of pointers to that
+//   object is traversed.  This class is therefore NOT SUITABLE when there
+//   will often be more than two or three pointers to a particular object.
+// - References are only tracked as long as linked_ptr<> objects are copied.
+//   If a linked_ptr<> is converted to a raw pointer and back, BAD THINGS
+//   will happen (double deletion).
+//
+// A good use of this class is storing object references in STL containers.
+// You can safely put linked_ptr<> in a vector<>.
+// Other uses may not be as good.
+//
+// Note: If you use an incomplete type with linked_ptr<>, the class
+// *containing* linked_ptr<> must have a constructor and destructor (even
+// if they do nothing!).
+//
+// Bill Gibbons suggested we use something like this.
+//
+// Thread Safety:
+//   Unlike other linked_ptr implementations, in this implementation
+//   a linked_ptr object is thread-safe in the sense that:
+//     - it's safe to copy linked_ptr objects concurrently,
+//     - it's safe to copy *from* a linked_ptr and read its underlying
+//       raw pointer (e.g. via get()) concurrently, and
+//     - it's safe to write to two linked_ptrs that point to the same
+//       shared object concurrently.
+// TODO(wan@google.com): rename this to safe_linked_ptr to avoid
+// confusion with normal linked_ptr.
+
+#ifndef GTEST_INCLUDE_GTEST_INTERNAL_GTEST_LINKED_PTR_H_
+#define GTEST_INCLUDE_GTEST_INTERNAL_GTEST_LINKED_PTR_H_
+
+#include <stdlib.h>
+#include <assert.h>
+
+
+namespace testing {
+namespace internal {
+
+// Protects copying of all linked_ptr objects.
+GTEST_API_ GTEST_DECLARE_STATIC_MUTEX_(g_linked_ptr_mutex);
+
+// This is used internally by all instances of linked_ptr<>.  It needs to be
+// a non-template class because different types of linked_ptr<> can refer to
+// the same object (linked_ptr<Superclass>(obj) vs linked_ptr<Subclass>(obj)).
+// So, it needs to be possible for different types of linked_ptr to participate
+// in the same circular linked list, so we need a single class type here.
+//
+// DO NOT USE THIS CLASS DIRECTLY YOURSELF.  Use linked_ptr<T>.
+class linked_ptr_internal {
+ public:
+  // Starts a fresh circle whose sole member is this object.
+  void join_new() { next_ = this; }
+
+  // join() and depart() rewrite the next_ link of *another* member of the
+  // circle, so two such operations must never run at the same time.
+  //
+  // linked_ptr objects of different types (e.g. linked_ptr<Base>,
+  // linked_ptr<Derived1>, and linked_ptr<Derived2>) can share one circle,
+  // which is why a single mutex guards every linked_ptr object.  That can
+  // create serious contention in production code, but is acceptable in a
+  // testing framework.
+
+  // Inserts this object into the circle that ptr belongs to.
+  void join(linked_ptr_internal const* ptr)
+      GTEST_LOCK_EXCLUDED_(g_linked_ptr_mutex) {
+    MutexLock lock(&g_linked_ptr_mutex);
+
+    // Walk the circle to the member whose successor is ptr, then splice
+    // this object in between the two.
+    const linked_ptr_internal* last = ptr;
+    while (last->next_ != ptr) {
+      last = last->next_;
+    }
+    last->next_ = this;
+    next_ = ptr;
+  }
+
+  // Removes this object from its circle.  Returns true if it was the only
+  // member left.  Afterwards the object may join() another circle.
+  bool depart()
+      GTEST_LOCK_EXCLUDED_(g_linked_ptr_mutex) {
+    MutexLock lock(&g_linked_ptr_mutex);
+
+    const bool was_last_member = (next_ == this);
+    if (!was_last_member) {
+      // Find the predecessor and make it skip over this object.
+      const linked_ptr_internal* before = next_;
+      while (before->next_ != this) {
+        before = before->next_;
+      }
+      before->next_ = next_;
+    }
+    return was_last_member;
+  }
+
+ private:
+  // Successor in the circle.  Mutable so that it can be updated through
+  // the const pointers that join() and depart() traverse.
+  mutable linked_ptr_internal const* next_;
+};
+
+template <typename T>
+class linked_ptr {
+ public:
+  typedef T element_type;
+
+  // Assumes ownership of a raw pointer.  Do this as early as possible
+  // after the pointee has been created.
+  explicit linked_ptr(T* ptr = NULL) { capture(ptr); }
+
+  // Leaves the circle; the pointee is deleted if nobody else refers to it.
+  ~linked_ptr() { depart(); }
+
+  // Copying adds the new object to the source's circle of references.
+  template <typename U> linked_ptr(linked_ptr<U> const& ptr) { copy(&ptr); }
+  linked_ptr(linked_ptr const& ptr) {  // NOLINT
+    assert(&ptr != this);
+    copy(&ptr);
+  }
+
+  // Assignment first gives up the current pointee, then shares the new one.
+  template <typename U> linked_ptr& operator=(linked_ptr<U> const& ptr) {
+    depart();
+    copy(&ptr);
+    return *this;
+  }
+
+  linked_ptr& operator=(linked_ptr const& ptr) {
+    // Self-assignment is a no-op.
+    if (&ptr == this) return *this;
+    depart();
+    copy(&ptr);
+    return *this;
+  }
+
+  // Smart pointer interface.
+  void reset(T* ptr = NULL) {
+    depart();
+    capture(ptr);
+  }
+  T* get() const { return value_; }
+  T* operator->() const { return value_; }
+  T& operator*() const { return *value_; }
+
+  // Comparisons look only at the stored raw pointer.
+  bool operator==(T* p) const { return get() == p; }
+  bool operator!=(T* p) const { return get() != p; }
+  template <typename U>
+  bool operator==(linked_ptr<U> const& ptr) const {
+    return get() == ptr.get();
+  }
+  template <typename U>
+  bool operator!=(linked_ptr<U> const& ptr) const {
+    return get() != ptr.get();
+  }
+
+ private:
+  // Other instantiations need access to link_ when copying across types.
+  template <typename U>
+  friend class linked_ptr;
+
+  T* value_;
+  linked_ptr_internal link_;
+
+  // Leaves the current circle and destroys the pointee if we were the
+  // last reference to it.
+  void depart() {
+    const bool last_reference = link_.depart();
+    if (last_reference) delete value_;
+  }
+
+  // Becomes the sole owner of 'ptr'.
+  void capture(T* ptr) {
+    value_ = ptr;
+    link_.join_new();
+  }
+
+  // Shares the pointee of '*ptr'.  A NULL pointee is never shared: each
+  // empty linked_ptr sits in its own one-element circle.
+  template <typename U> void copy(linked_ptr<U> const* ptr) {
+    value_ = ptr->get();
+    if (value_ == NULL) {
+      link_.join_new();
+      return;
+    }
+    link_.join(&ptr->link_);
+  }
+};
+
+// Allows 'raw_pointer == linked_ptr' in addition to the member form.
+template<typename T> inline
+bool operator==(T* ptr, const linked_ptr<T>& x) {
+  T* const held = x.get();
+  return ptr == held;
+}
+
+// Allows 'raw_pointer != linked_ptr' in addition to the member form.
+template<typename T> inline
+bool operator!=(T* ptr, const linked_ptr<T>& x) {
+  T* const held = x.get();
+  return ptr != held;
+}
+
+// A function to convert T* into linked_ptr<T>
+// Doing e.g. make_linked_ptr(new FooBarBaz<type>(arg)) is a shorter notation
+// for linked_ptr<FooBarBaz<type> >(new FooBarBaz<type>(arg))
+// Wraps 'ptr' in a linked_ptr<T>, letting the compiler deduce T from the
+// argument.
+template <typename T>
+linked_ptr<T> make_linked_ptr(T* ptr) {
+  return linked_ptr<T>(ptr);
+}
+
+}  // namespace internal
+}  // namespace testing
+
+#endif  // GTEST_INCLUDE_GTEST_INTERNAL_GTEST_LINKED_PTR_H_
+// Copyright 2007, Google Inc.
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+//     * Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//     * Redistributions in binary form must reproduce the above
+// copyright notice, this list of conditions and the following disclaimer
+// in the documentation and/or other materials provided with the
+// distribution.
+//     * Neither the name of Google Inc. nor the names of its
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Author: wan@google.com (Zhanyong Wan)
+
+// Google Test - The Google C++ Testing Framework
+//
+// This file implements a universal value printer that can print a
+// value of any type T:
+//
+//   void ::testing::internal::UniversalPrinter<T>::Print(value, ostream_ptr);
+//
+// A user can teach this function how to print a class type T by
+// defining either operator<<() or PrintTo() in the namespace that
+// defines T.  More specifically, the FIRST defined function in the
+// following list will be used (assuming T is defined in namespace
+// foo):
+//
+//   1. foo::PrintTo(const T&, ostream*)
+//   2. operator<<(ostream&, const T&) defined in either foo or the
+//      global namespace.
+//
+// If none of the above is defined, it will print the debug string of
+// the value if it is a protocol buffer, or print the raw bytes in the
+// value otherwise.
+//
+// To aid debugging: when T is a reference type, the address of the
+// value is also printed; when T is a (const) char pointer, both the
+// pointer value and the NUL-terminated string it points to are
+// printed.
+//
+// We also provide some convenient wrappers:
+//
+//   // Prints a value to a string.  For a (const or not) char
+//   // pointer, the NUL-terminated string (but not the pointer) is
+//   // printed.
+//   std::string ::testing::PrintToString(const T& value);
+//
+//   // Prints a value tersely: for a reference type, the referenced
+//   // value (but not the address) is printed; for a (const or not) char
+//   // pointer, the NUL-terminated string (but not the pointer) is
+//   // printed.
+//   void ::testing::internal::UniversalTersePrint(const T& value, ostream*);
+//
+//   // Prints value using the type inferred by the compiler.  The difference
+//   // from UniversalTersePrint() is that this function prints both the
+//   // pointer and the NUL-terminated string for a (const or not) char pointer.
+//   void ::testing::internal::UniversalPrint(const T& value, ostream*);
+//
+//   // Prints the fields of a tuple tersely to a string vector, one
+//   // element for each field. Tuple support must be enabled in
+//   // gtest-port.h.
+//   std::vector<string> UniversalTersePrintTupleFieldsToStrings(
+//       const Tuple& value);
+//
+// Known limitation:
+//
+// The print primitives print the elements of an STL-style container
+// using the compiler-inferred type of *iter where iter is a
+// const_iterator of the container.  When const_iterator is an input
+// iterator but not a forward iterator, this inferred type may not
+// match value_type, and the print output may be incorrect.  In
+// practice, this is rarely a problem as for most containers
+// const_iterator is a forward iterator.  We'll fix this if there's an
+// actual need for it.  Note that this fix cannot rely on value_type
+// being defined as many user-defined container types don't have
+// value_type.
+
+#ifndef GTEST_INCLUDE_GTEST_GTEST_PRINTERS_H_
+#define GTEST_INCLUDE_GTEST_GTEST_PRINTERS_H_
+
+#include <ostream>  // NOLINT
+#include <sstream>
+#include <string>
+#include <utility>
+#include <vector>
+
+namespace testing {
+
+// Definitions in the 'internal' and 'internal2' name spaces are
+// subject to change without notice.  DO NOT USE THEM IN USER CODE!
+namespace internal2 {
+
+// Prints the given number of bytes in the given object to the given
+// ostream.
+GTEST_API_ void PrintBytesInObjectTo(const unsigned char* obj_bytes,
+                                     size_t count,
+                                     ::std::ostream* os);
+
+// For selecting which printer to use when a given type has neither <<
+// nor PrintTo().
+enum TypeKind {
+  kProtobuf,              // a protobuf type; printed via its debug string
+  kConvertibleToInteger,  // a type implicitly convertible to BiggestInt
+                          // (e.g. a named or unnamed enum type); printed
+                          // as an integer
+  kOtherType              // anything else; printed as raw bytes
+};
+
+// TypeWithoutFormatter<T, kTypeKind>::PrintValue(value, os) is called
+// by the universal printer to print a value of type T when neither
+// operator<< nor PrintTo() is defined for T, where kTypeKind is the
+// "kind" of T as defined by enum TypeKind.
+template <typename T, TypeKind kTypeKind>
+class TypeWithoutFormatter {
+ public:
+  // Fallback used for kOtherType: dumps the object representation of
+  // 'value', all sizeof(value) bytes of it, to *os.
+  static void PrintValue(const T& value, ::std::ostream* os) {
+    const unsigned char* const raw_bytes =
+        reinterpret_cast<const unsigned char*>(&value);
+    PrintBytesInObjectTo(raw_bytes, sizeof(value), os);
+  }
+};
+
+// We print a protobuf using its ShortDebugString() when the string
+// doesn't exceed this many characters; otherwise we print it using
+// DebugString() for better readability.
+const size_t kProtobufOneLinerMaxLength = 50;
+
+template <typename T>
+class TypeWithoutFormatter<T, kProtobuf> {
+ public:
+  // Prints a protocol message wrapped in angle brackets.  The one-line
+  // form is used when it is short enough; otherwise the multi-line
+  // DebugString() is printed starting on a new line.
+  static void PrintValue(const T& value, ::std::ostream* os) {
+    typedef ::testing::internal::string String;
+    const String one_liner = value.ShortDebugString();
+    const bool fits_on_one_line =
+        one_liner.length() <= kProtobufOneLinerMaxLength;
+    const String pretty_str =
+        fits_on_one_line ? one_liner : ("\n" + value.DebugString());
+    *os << ("<" + pretty_str + ">");
+  }
+};
+
+template <typename T>
+class TypeWithoutFormatter<T, kConvertibleToInteger> {
+ public:
+  // T offers neither operator<< nor PrintTo(), yet it converts implicitly
+  // to BiggestInt, so the converted integer is what gets printed.
+  //
+  // T is most likely an enum (named or unnamed), for which an integer is
+  // the desired output.  For any other T, an integer is still the best
+  // available rendering given that no user-defined printer exists.
+  static void PrintValue(const T& value, ::std::ostream* os) {
+    // Copy-initialization keeps the conversion implicit, as selected by
+    // the ImplicitlyConvertible check made by the caller.
+    const internal::BiggestInt as_integer = value;
+    (*os) << as_integer;
+  }
+};
+
+// Prints the given value to the given ostream.  If the value is a
+// protocol message, its debug string is printed; if it's an enum or
+// of a type implicitly convertible to BiggestInt, it's printed as an
+// integer; otherwise the bytes in the value are printed.  This is
+// what UniversalPrinter<T>::Print() does when it knows nothing about
+// type T and T has neither << operator nor PrintTo().
+//
+// A user can override this behavior for a class type Foo by defining
+// a << operator in the namespace where Foo is defined.
+//
+// We put this operator in namespace 'internal2' instead of 'internal'
+// to simplify the implementation, as much code in 'internal' needs to
+// use << in STL, which would conflict with our own << were it defined
+// in 'internal'.
+//
+// Note that this operator<< takes a generic std::basic_ostream<Char,
+// CharTraits> type instead of the more restricted std::ostream.  If
+// we define it to take an std::ostream instead, we'll get an
+// "ambiguous overloads" compiler error when trying to print a type
+// Foo that supports streaming to std::basic_ostream<Char,
+// CharTraits>, as the compiler cannot tell whether
+// operator<<(std::ostream&, const T&) or
+// operator<<(std::basic_ostream<Char, CharTraits>, const Foo&) is more
+// specific.
+template <typename Char, typename CharTraits, typename T>
+::std::basic_ostream<Char, CharTraits>& operator<<(
+    ::std::basic_ostream<Char, CharTraits>& os, const T& x) {
+  // The printer is chosen at compile time, in this order: protocol
+  // messages, then types implicitly convertible to BiggestInt, then
+  // everything else (kOtherType).
+  TypeWithoutFormatter<T,
+      (internal::IsAProtocolMessage<T>::value ? kProtobuf :
+       internal::ImplicitlyConvertible<const T&, internal::BiggestInt>::value ?
+       kConvertibleToInteger : kOtherType)>::PrintValue(x, &os);
+  // Return the stream so the call can be chained like any other <<.
+  return os;
+}
+
+}  // namespace internal2
+}  // namespace testing
+
+// This namespace MUST NOT BE NESTED IN ::testing, or the name look-up
+// magic needed for implementing UniversalPrinter won't work.
+namespace testing_internal {
+
+// Used to print a value that is not an STL-style container when the
+// user doesn't define PrintTo() for it.
+// Streams 'value' to *os with an unqualified <<, falling back to
+// testing::internal2::operator<< when T has no << of its own.
+template <typename T>
+void DefaultPrintNonContainerTo(const T& value, ::std::ostream* os) {
+  // With the following statement, during unqualified name lookup,
+  // testing::internal2::operator<< appears as if it was declared in
+  // the nearest enclosing namespace that contains both
+  // ::testing_internal and ::testing::internal2, i.e. the global
+  // namespace.  For more details, refer to the C++ Standard section
+  // 7.3.4-1 [namespace.udir].  This allows us to fall back onto
+  // testing::internal2::operator<< in case T doesn't come with a <<
+  // operator.
+  //
+  // We cannot write 'using ::testing::internal2::operator<<;', which
+  // gcc 3.3 fails to compile due to a compiler bug.
+  using namespace ::testing::internal2;  // NOLINT
+
+  // Assuming T is defined in namespace foo, in the next statement,
+  // the compiler will consider all of:
+  //
+  //   1. foo::operator<< (thanks to Koenig look-up),
+  //   2. ::operator<< (as the current namespace is enclosed in ::),
+  //   3. testing::internal2::operator<< (thanks to the using statement above).
+  //
+  // The operator<< whose type matches T best will be picked.
+  //
+  // We deliberately allow #2 to be a candidate, as sometimes it's
+  // impossible to define #1 (e.g. when foo is ::std, defining
+  // anything in it is undefined behavior unless you are a compiler
+  // vendor.).
+  *os << value;
+}
+
+}  // namespace testing_internal
+
+namespace testing {
+namespace internal {
+
+// UniversalPrinter<T>::Print(value, ostream_ptr) prints the given
+// value to the given ostream.  The caller must ensure that
+// 'ostream_ptr' is not NULL, or the behavior is undefined.
+//
+// We define UniversalPrinter as a class template (as opposed to a
+// function template), as we need to partially specialize it for
+// reference types, which cannot be done with function templates.
+template <typename T>
+class UniversalPrinter;
+
+template <typename T>
+void UniversalPrint(const T& value, ::std::ostream* os);
+
+// Used to print an STL-style container when the user doesn't define
+// a PrintTo() for it.
+template <typename C>
+void DefaultPrintTo(IsContainer /* dummy */,
+                    false_type /* is not a pointer */,
+                    const C& container, ::std::ostream* os) {
+  // At most this many elements are printed; the rest become " ...".
+  const size_t kMaxCount = 32;
+
+  *os << '{';
+  size_t printed = 0;
+  typename C::const_iterator it = container.begin();
+  while (it != container.end()) {
+    if (printed != 0) {
+      *os << ',';
+      if (printed == kMaxCount) {
+        // Enough has been printed; abbreviate whatever is left.
+        *os << " ...";
+        break;
+      }
+    }
+    *os << ' ';
+    // PrintTo(*it, os) cannot be called here because PrintTo() doesn't
+    // handle *it being a native array.
+    internal::UniversalPrint(*it, os);
+    ++it;
+    ++printed;
+  }
+
+  // A non-empty listing gets a space before the closing brace.
+  if (printed != 0) {
+    *os << ' ';
+  }
+  *os << '}';
+}
+
+// Used to print a pointer that is neither a char pointer nor a member
+// pointer, when the user doesn't define PrintTo() for it.  (A member
+// variable pointer or member function pointer doesn't really point to
+// a location in the address space.  Their representation is
+// implementation-defined.  Therefore they will be printed as raw
+// bytes.)
+template <typename T>
+void DefaultPrintTo(IsNotContainer /* dummy */,
+                    true_type /* is a pointer */,
+                    T* p, ::std::ostream* os) {
+  if (p == NULL) {
+    *os << "NULL";
+    return;
+  }
+
+  // C++ doesn't allow casting from a function pointer to any object
+  // pointer, so the two kinds of pointee are handled separately.
+  //
+  // IsTrue() silences warnings: "Condition is always true",
+  // "unreachable code".
+  if (IsTrue(ImplicitlyConvertible<T*, const void*>::value)) {
+    // T is not a function type.  Plain << prints p, and ADL picks up a
+    // user-defined << for the pointer type if one exists.
+    *os << p;
+    return;
+  }
+
+  // T is a function type, so '*os << p' would merely print p as bool.
+  // We want to print p as a const void*, but earlier versions of gcc
+  // (e.g. 3.4.5) cannot compile a direct cast of a function pointer, not
+  // even with reinterpret_cast.  Going through UInt64 first works.
+  const internal::UInt64 address = reinterpret_cast<internal::UInt64>(p);
+  *os << reinterpret_cast<const void*>(address);
+}
+
+// Used to print a non-container, non-pointer value when the user
+// doesn't define PrintTo() for it.
+template <typename T>
+void DefaultPrintTo(IsNotContainer /* dummy */,
+                    false_type /* is not a pointer */,
+                    const T& value, ::std::ostream* os) {
+  // Delegate to the helper that lives outside ::testing; it is the one
+  // that performs the actual '*os << value'.
+  ::testing_internal::DefaultPrintNonContainerTo(value, os);
+}
+
+// Prints the given value using the << operator if it has one;
+// otherwise prints the bytes in it.  This is what
+// UniversalPrinter<T>::Print() does when PrintTo() is not specialized
+// or overloaded for type T.
+//
+// A user can override this behavior for a class type Foo by defining
+// an overload of PrintTo() in the namespace where Foo is defined.  We
+// give the user this option as sometimes defining a << operator for
+// Foo is not desirable (e.g. the coding style may prevent doing it,
+// or there is already a << operator but it doesn't do what the user
+// wants).
+template <typename T>
+void PrintTo(const T& value, ::std::ostream* os) {
+  // DefaultPrintTo() is overloaded.  The type of its first two
+  // arguments determine which version will be picked.  If T is an
+  // STL-style container, the version for container will be called; if
+  // T is a pointer, the pointer version will be called; otherwise the
+  // generic version will be called.
+  //
+  // Note that we check for container types here, before we check
+  // for protocol message types in our operator<<.  The rationale is:
+  //
+  // For protocol messages, we want to give people a chance to
+  // override Google Mock's format by defining a PrintTo() or
+  // operator<<.  For STL containers, other formats can be
+  // incompatible with Google Mock's format for the container
+  // elements; therefore we check for container types here to ensure
+  // that our format is used.
+  //
+  // The second argument of DefaultPrintTo() is needed to bypass a bug
+  // in Symbian's C++ compiler that prevents it from picking the right
+  // overload between:
+  //
+  //   PrintTo(const T& x, ...);
+  //   PrintTo(T* x, ...);
+  DefaultPrintTo(IsContainerTest<T>(0), is_pointer<T>(), value, os);
+}
+
+// The following list of PrintTo() overloads tells
+// UniversalPrinter<T>::Print() how to print standard types (built-in
+// types, strings, plain arrays, and pointers).
+
+// Overloads for various char types.
+GTEST_API_ void PrintTo(unsigned char c, ::std::ostream* os);
+GTEST_API_ void PrintTo(signed char c, ::std::ostream* os);
+inline void PrintTo(char c, ::std::ostream* os) {
+  // A plain char is always printed as if it were unsigned, so the output
+  // does not depend on whether the compiler treats char as signed.
+  const unsigned char as_unsigned = static_cast<unsigned char>(c);
+  PrintTo(as_unsigned, os);
+}
+
+// Overloads for other simple built-in types.
+// Prints a bool as the word "true" or "false".
+inline void PrintTo(bool x, ::std::ostream* os) {
+  if (x) {
+    *os << "true";
+  } else {
+    *os << "false";
+  }
+}
+
+// Overload for wchar_t type.
+// Prints a wchar_t as a symbol if it is printable or as its internal
+// code otherwise and also as its decimal code (except for L'\0').
+// The L'\0' char is printed as "L'\\0'". The decimal code is printed
+// as signed integer when wchar_t is implemented by the compiler
+// as a signed type and is printed as an unsigned integer when wchar_t
+// is implemented as an unsigned type.
+GTEST_API_ void PrintTo(wchar_t wc, ::std::ostream* os);
+
+// Overloads for C strings.
+GTEST_API_ void PrintTo(const char* s, ::std::ostream* os);
+// A mutable C string is printed exactly like a const one.
+inline void PrintTo(char* s, ::std::ostream* os) {
+  const char* const as_const = s;
+  PrintTo(as_const, os);
+}
+
+// signed/unsigned char is often used for representing binary data, so
+// we print pointers to it as void* to be safe.
+// Each overload below converts its argument to const void* and prints
+// that, so the pointee is never read as a character string.
+inline void PrintTo(const signed char* s, ::std::ostream* os) {
+  const void* const address = s;
+  PrintTo(address, os);
+}
+inline void PrintTo(signed char* s, ::std::ostream* os) {
+  const void* const address = s;
+  PrintTo(address, os);
+}
+inline void PrintTo(const unsigned char* s, ::std::ostream* os) {
+  const void* const address = s;
+  PrintTo(address, os);
+}
+inline void PrintTo(unsigned char* s, ::std::ostream* os) {
+  const void* const address = s;
+  PrintTo(address, os);
+}
+
+// MSVC can be configured to define wchar_t as a typedef of unsigned
+// short.  It defines _NATIVE_WCHAR_T_DEFINED when wchar_t is a native
+// type.  When wchar_t is a typedef, defining an overload for const
+// wchar_t* would cause unsigned short* be printed as a wide string,
+// possibly causing invalid memory accesses.
+#if !defined(_MSC_VER) || defined(_NATIVE_WCHAR_T_DEFINED)
+// Overloads for wide C strings
+GTEST_API_ void PrintTo(const wchar_t* s, ::std::ostream* os);
+// A mutable wide C string is printed exactly like a const one.
+inline void PrintTo(wchar_t* s, ::std::ostream* os) {
+  const wchar_t* const as_const = s;
+  PrintTo(as_const, os);
+}
+#endif
+
+// Overload for C arrays.  Multi-dimensional arrays are printed
+// properly.
+
+// Prints the given number of elements in an array, without printing
+// the curly braces.
+// Prints the first 'count' elements of 'a' to *os, separated by ", ",
+// without surrounding braces.  Prints nothing when 'count' is 0.
+template <typename T>
+void PrintRawArrayTo(const T a[], size_t count, ::std::ostream* os) {
+  // Guard the empty case: without it a[0] would be read out of bounds
+  // and a loop starting at index 1 would never meet its end condition.
+  if (count == 0) {
+    return;
+  }
+  UniversalPrint(a[0], os);
+  for (size_t i = 1; i < count; i++) {
+    *os << ", ";
+    UniversalPrint(a[i], os);
+  }
+}
+
+// Overloads for ::string and ::std::string.
+// Every PrintTo() overload in this group is a thin inline wrapper around
+// the matching PrintStringTo()/PrintWideStringTo() function, which is
+// only declared here.
+#if GTEST_HAS_GLOBAL_STRING
+GTEST_API_ void PrintStringTo(const ::string&s, ::std::ostream* os);
+inline void PrintTo(const ::string& s, ::std::ostream* os) {
+  PrintStringTo(s, os);
+}
+#endif  // GTEST_HAS_GLOBAL_STRING
+
+// ::std::string is always supported, hence no preprocessor guard.
+GTEST_API_ void PrintStringTo(const ::std::string&s, ::std::ostream* os);
+inline void PrintTo(const ::std::string& s, ::std::ostream* os) {
+  PrintStringTo(s, os);
+}
+
+// Overloads for ::wstring and ::std::wstring.
+#if GTEST_HAS_GLOBAL_WSTRING
+GTEST_API_ void PrintWideStringTo(const ::wstring&s, ::std::ostream* os);
+inline void PrintTo(const ::wstring& s, ::std::ostream* os) {
+  PrintWideStringTo(s, os);
+}
+#endif  // GTEST_HAS_GLOBAL_WSTRING
+
+#if GTEST_HAS_STD_WSTRING
+GTEST_API_ void PrintWideStringTo(const ::std::wstring&s, ::std::ostream* os);
+inline void PrintTo(const ::std::wstring& s, ::std::ostream* os) {
+  PrintWideStringTo(s, os);
+}
+#endif  // GTEST_HAS_STD_WSTRING
+
+#if GTEST_HAS_TR1_TUPLE
+// Overload for ::std::tr1::tuple.  Needed for printing function arguments,
+// which are packed as tuples.
+
+// Helper function for printing a tuple.  T must be instantiated with
+// a tuple type.
+template <typename T>
+void PrintTupleTo(const T& t, ::std::ostream* os);
+
+// Overloaded PrintTo() for tuples of various arities.  We support
+// tuples of up-to 10 fields.  The following implementation works
+// regardless of whether tr1::tuple is implemented using the
+// non-standard variadic template feature or not.
+
+// One overload per arity, from 0 fields up to 10.  Each one simply
+// forwards to PrintTupleTo(); they are spelled out individually so the
+// code does not depend on variadic templates.
+inline void PrintTo(const ::std::tr1::tuple<>& t, ::std::ostream* os) {
+  PrintTupleTo(t, os);
+}
+
+template <typename T1>
+void PrintTo(const ::std::tr1::tuple<T1>& t, ::std::ostream* os) {
+  PrintTupleTo(t, os);
+}
+
+template <typename T1, typename T2>
+void PrintTo(const ::std::tr1::tuple<T1, T2>& t, ::std::ostream* os) {
+  PrintTupleTo(t, os);
+}
+
+template <typename T1, typename T2, typename T3>
+void PrintTo(const ::std::tr1::tuple<T1, T2, T3>& t, ::std::ostream* os) {
+  PrintTupleTo(t, os);
+}
+
+template <typename T1, typename T2, typename T3, typename T4>
+void PrintTo(const ::std::tr1::tuple<T1, T2, T3, T4>& t, ::std::ostream* os) {
+  PrintTupleTo(t, os);
+}
+
+template <typename T1, typename T2, typename T3, typename T4, typename T5>
+void PrintTo(const ::std::tr1::tuple<T1, T2, T3, T4, T5>& t,
+             ::std::ostream* os) {
+  PrintTupleTo(t, os);
+}
+
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+          typename T6>
+void PrintTo(const ::std::tr1::tuple<T1, T2, T3, T4, T5, T6>& t,
+             ::std::ostream* os) {
+  PrintTupleTo(t, os);
+}
+
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+          typename T6, typename T7>
+void PrintTo(const ::std::tr1::tuple<T1, T2, T3, T4, T5, T6, T7>& t,
+             ::std::ostream* os) {
+  PrintTupleTo(t, os);
+}
+
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+          typename T6, typename T7, typename T8>
+void PrintTo(const ::std::tr1::tuple<T1, T2, T3, T4, T5, T6, T7, T8>& t,
+             ::std::ostream* os) {
+  PrintTupleTo(t, os);
+}
+
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+          typename T6, typename T7, typename T8, typename T9>
+void PrintTo(const ::std::tr1::tuple<T1, T2, T3, T4, T5, T6, T7, T8, T9>& t,
+             ::std::ostream* os) {
+  PrintTupleTo(t, os);
+}
+
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+          typename T6, typename T7, typename T8, typename T9, typename T10>
+void PrintTo(
+    const ::std::tr1::tuple<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10>& t,
+    ::std::ostream* os) {
+  PrintTupleTo(t, os);
+}
+#endif  // GTEST_HAS_TR1_TUPLE
+
+// Overload for std::pair.
+// Prints a pair as "(first, second)".
+template <typename T1, typename T2>
+void PrintTo(const ::std::pair<T1, T2>& value, ::std::ostream* os) {
+  // UniversalPrint() cannot be used for the members because T1 or T2 may
+  // be a reference type; name the printer for each declared type instead.
+  typedef UniversalPrinter<T1> FirstPrinter;
+  typedef UniversalPrinter<T2> SecondPrinter;
+
+  *os << '(';
+  FirstPrinter::Print(value.first, os);
+  *os << ", ";
+  SecondPrinter::Print(value.second, os);
+  *os << ')';
+}
+
+// Implements printing a non-reference type T by letting the compiler
+// pick the right overload of PrintTo() for T.
+template <typename T>
+class UniversalPrinter {
+ public:
+  // MSVC warns about adding const to a function type, so we want to
+  // disable the warning.
+#ifdef _MSC_VER
+# pragma warning(push)          // Saves the current warning state.
+# pragma warning(disable:4180)  // Temporarily disables warning 4180.
+#endif  // _MSC_VER
+
+  // Note: we deliberately don't call this PrintTo(), as that name
+  // conflicts with ::testing::internal::PrintTo in the body of the
+  // function.
+  static void Print(const T& value, ::std::ostream* os) {
+    // By default, ::testing::internal::PrintTo() is used for printing
+    // the value.
+    //
+    // Thanks to Koenig look-up, if T is a class and has its own
+    // PrintTo() function defined in its namespace, that function will
+    // be visible here.  Since it is more specific than the generic ones
+    // in ::testing::internal, it will be picked by the compiler in the
+    // following statement - exactly what we want.
+    //
+    // The call must therefore stay unqualified.
+    PrintTo(value, os);
+  }
+
+#ifdef _MSC_VER
+# pragma warning(pop)           // Restores the warning state.
+#endif  // _MSC_VER
+};
+
+// UniversalPrintArray(begin, len, os) prints an array of 'len'
+// elements, starting at address 'begin'.
+template <typename T>
+void UniversalPrintArray(const T* begin, size_t len, ::std::ostream* os) {
+  // An empty array is printed as a bare pair of braces.
+  if (len == 0) {
+    *os << "{}";
+    return;
+  }
+
+  // Arrays longer than kThreshold elements are abbreviated: only the
+  // first and the last kChunkSize elements are shown.
+  // TODO(wan@google.com): let the user control the threshold using a flag.
+  const size_t kThreshold = 18;
+  const size_t kChunkSize = 8;
+
+  *os << "{ ";
+  if (len > kThreshold) {
+    PrintRawArrayTo(begin, kChunkSize, os);
+    *os << ", ..., ";
+    PrintRawArrayTo(begin + len - kChunkSize, kChunkSize, os);
+  } else {
+    PrintRawArrayTo(begin, len, os);
+  }
+  *os << " }";
+}
+// This overload prints a (const) char array compactly.
+GTEST_API_ void UniversalPrintArray(
+    const char* begin, size_t len, ::std::ostream* os);
+
+// This overload prints a (const) wchar_t array compactly.
+GTEST_API_ void UniversalPrintArray(
+    const wchar_t* begin, size_t len, ::std::ostream* os);
+
+// Implements printing an array type T[N].
+template <typename T, size_t N>
+class UniversalPrinter<T[N]> {
+ public:
+  // Prints the given array, omitting some elements when there are too
+  // many.
+  static void Print(const T (&a)[N], ::std::ostream* os) {
+    // The array decays to a pointer to its first element; its length N
+    // is passed along explicitly.
+    UniversalPrintArray(a, N, os);
+  }
+};
+
+// Implements printing a reference type T&.
+template <typename T>
+class UniversalPrinter<T&> {
+ public:
+  // MSVC warns about adding const to a function type, so that warning
+  // is switched off around Print().
+#ifdef _MSC_VER
+# pragma warning(push)          // Saves the current warning state.
+# pragma warning(disable:4180)  // Temporarily disables warning 4180.
+#endif  // _MSC_VER
+
+  // Prints "@<address> " followed by the referenced value itself.
+  static void Print(const T& value, ::std::ostream* os) {
+    // reinterpret_cast is needed because static_cast doesn't compile
+    // when T is a function type.
+    const void* const address = reinterpret_cast<const void*>(&value);
+    *os << "@" << address << " ";
+
+    // The value follows its address.
+    UniversalPrint(value, os);
+  }
+
+#ifdef _MSC_VER
+# pragma warning(pop)           // Restores the warning state.
+#endif  // _MSC_VER
+};
+
+// Prints a value tersely: for a reference type, the referenced value
+// (but not the address) is printed; for a (const) char pointer, the
+// NUL-terminated string (but not the pointer) is printed.
+
+// Primary template: no terse form exists for a general T, so the value
+// is printed exactly as UniversalPrint() prints it.
+template <typename T>
+class UniversalTersePrinter {
+ public:
+  static void Print(const T& value, ::std::ostream* os) {
+    UniversalPrint(value, os);
+  }
+};
+// Reference types: forwards to UniversalPrint() with T deduced from the
+// argument rather than instantiating UniversalPrinter<T&>, so the
+// address prefix that UniversalPrinter<T&> emits is left out.
+template <typename T>
+class UniversalTersePrinter<T&> {
+ public:
+  static void Print(const T& value, ::std::ostream* os) {
+    UniversalPrint(value, os);
+  }
+};
+// Native arrays: delegates to the array specialization of
+// UniversalPrinter.
+template <typename T, size_t N>
+class UniversalTersePrinter<T[N]> {
+ public:
+  static void Print(const T (&value)[N], ::std::ostream* os) {
+    UniversalPrinter<T[N]>::Print(value, os);
+  }
+};
+// C strings: prints "NULL" for a null pointer; otherwise prints the
+// pointed-to text as a string object.
+template <>
+class UniversalTersePrinter<const char*> {
+ public:
+  static void Print(const char* str, ::std::ostream* os) {
+    if (str == NULL) {
+      *os << "NULL";
+      return;
+    }
+    UniversalPrint(string(str), os);
+  }
+};
+// Mutable C strings are printed by the const char* specialization.
+template <>
+class UniversalTersePrinter<char*> {
+ public:
+  static void Print(char* str, ::std::ostream* os) {
+    UniversalTersePrinter<const char*>::Print(str, os);
+  }
+};
+
+#if GTEST_HAS_STD_WSTRING
+// Wide C strings: prints "NULL" for a null pointer; otherwise prints
+// the pointed-to text as a ::std::wstring.
+template <>
+class UniversalTersePrinter<const wchar_t*> {
+ public:
+  static void Print(const wchar_t* str, ::std::ostream* os) {
+    if (str == NULL) {
+      *os << "NULL";
+      return;
+    }
+    UniversalPrint(::std::wstring(str), os);
+  }
+};
+#endif
+
+// Mutable wide C strings are printed like const ones.
+// NOTE(review): this specialization sits outside the
+// GTEST_HAS_STD_WSTRING guard, while the const wchar_t* specialization is
+// inside it.  When that macro is 0 the call below resolves to the primary
+// UniversalTersePrinter template instead - confirm this is intended.
+template <>
+class UniversalTersePrinter<wchar_t*> {
+ public:
+  static void Print(wchar_t* str, ::std::ostream* os) {
+    UniversalTersePrinter<const wchar_t*>::Print(str, os);
+  }
+};
+
+// Function-style entry point: deduces T from 'value' and dispatches to
+// the matching UniversalTersePrinter specialization.
+template <typename T>
+void UniversalTersePrint(const T& value, ::std::ostream* os) {
+  UniversalTersePrinter<T>::Print(value, os);
+}
+
+// Prints a value using the type inferred by the compiler.  The
+// difference between this and UniversalTersePrint() is that for a
+// (const) char pointer, this prints both the pointer and the
+// NUL-terminated string.
+template <typename T>
+void UniversalPrint(const T& value, ::std::ostream* os) {
+  // A workaround for the bug in VC++ 7.1 that prevents us from instantiating
+  // UniversalPrinter with T directly.
+  typedef T T1;
+  UniversalPrinter<T1>::Print(value, os);
+}
+
+#if GTEST_HAS_TR1_TUPLE
+typedef ::std::vector<string> Strings;
+
+// This helper template allows PrintTo() for tuples and
+// UniversalTersePrintTupleFieldsToStrings() to be defined by
+// induction on the number of tuple fields.  The idea is that
+// TuplePrefixPrinter<N>::PrintPrefixTo(t, os) prints the first N
+// fields in tuple t, and can be defined in terms of
+// TuplePrefixPrinter<N - 1>.
+
+// The inductive case.
+template <size_t N>
+struct TuplePrefixPrinter {
+  // Writes fields 0..N-1 of t to os, separated by ", ".
+  template <typename Tuple>
+  static void PrintPrefixTo(const Tuple& t, ::std::ostream* os) {
+    typedef typename ::std::tr1::tuple_element<N - 1, Tuple>::type LastField;
+    TuplePrefixPrinter<N - 1>::PrintPrefixTo(t, os);
+    *os << ", ";
+    UniversalPrinter<LastField>::Print(::std::tr1::get<N - 1>(t), os);
+  }
+
+  // Appends the terse form of fields 0..N-1 of t to *strings, in field
+  // order, one vector element per field.
+  template <typename Tuple>
+  static void TersePrintPrefixToStrings(const Tuple& t, Strings* strings) {
+    TuplePrefixPrinter<N - 1>::TersePrintPrefixToStrings(t, strings);
+    ::std::stringstream field_text;
+    UniversalTersePrint(::std::tr1::get<N - 1>(t), &field_text);
+    strings->push_back(field_text.str());
+  }
+};
+
+// Base case N == 0: an empty prefix produces no output at all.
+template <>
+struct TuplePrefixPrinter<0> {
+  template <typename Tuple>
+  static void TersePrintPrefixToStrings(const Tuple& /* t */,
+                                        Strings* /* strings */) {}
+  template <typename Tuple>
+  static void PrintPrefixTo(const Tuple& /* t */, ::std::ostream* /* os */) {}
+};
+// We have to specialize the entire TuplePrefixPrinter<> class
+// template here, even though the definition of
+// TersePrintPrefixToStrings() is the same as the generic version, as
+// Embarcadero (formerly CodeGear, formerly Borland) C++ doesn't
+// support specializing a method template of a class template.
+template <>
+struct TuplePrefixPrinter<1> {
+  template <typename Tuple>  // Prints field 0 with no leading ", ".
+  static void PrintPrefixTo(const Tuple& t, ::std::ostream* os) {
+    typedef typename ::std::tr1::tuple_element<0, Tuple>::type FirstField;
+    UniversalPrinter<FirstField>::Print(::std::tr1::get<0>(t), os);
+  }
+
+  template <typename Tuple>  // Appends the terse form of field 0.
+  static void TersePrintPrefixToStrings(const Tuple& t, Strings* strings) {
+    ::std::stringstream first_text;
+    UniversalTersePrint(::std::tr1::get<0>(t), &first_text);
+    strings->push_back(first_text.str());
+  }
+};
+
+// Helper function for printing a tuple.  T must be instantiated with
+// a tuple type.
+template <typename T>
+void PrintTupleTo(const T& t, ::std::ostream* os) {
+  typedef TuplePrefixPrinter< ::std::tr1::tuple_size<T>::value> AllFields;
+  *os << "(";
+  AllFields::PrintPrefixTo(t, os);  // Every field, comma-separated.
+  *os << ")";
+}
+
+// Prints the fields of a tuple tersely to a string vector, one
+// element for each field.  See the comment before
+// UniversalTersePrint() for how we define "tersely".
+template <typename Tuple>
+Strings UniversalTersePrintTupleFieldsToStrings(const Tuple& value) {
+  typedef TuplePrefixPrinter< ::std::tr1::tuple_size<Tuple>::value> AllFields;
+  Strings fields;
+  AllFields::TersePrintPrefixToStrings(value, &fields);
+  return fields;
+}
+#endif  // GTEST_HAS_TR1_TUPLE
+
+}  // namespace internal
+
+template <typename T>
+::std::string PrintToString(const T& value) {
+  ::std::stringstream printed;  // Receives the terse form of value.
+  internal::UniversalTersePrinter<T>::Print(value, &printed);
+  return printed.str();
+}
+
+}  // namespace testing
+
+#endif  // GTEST_INCLUDE_GTEST_GTEST_PRINTERS_H_
+
+#if GTEST_HAS_PARAM_TEST
+
+namespace testing {
+namespace internal {
+
+// INTERNAL IMPLEMENTATION - DO NOT USE IN USER CODE.
+//
+// Outputs a message explaining invalid registration of different
+// fixture class for the same test case. This may happen when
+// TEST_P macro is used to define two tests with the same name
+// but in different namespaces.
+GTEST_API_ void ReportInvalidTestCaseType(const char* test_case_name,
+                                          const char* file, int line);
+
+template <typename> class ParamGeneratorInterface;
+template <typename> class ParamGenerator;
+
+// Interface for iterating over elements provided by an implementation
+// of ParamGeneratorInterface<T>.
+template <typename T>
+class ParamIteratorInterface {
+ public:
+  virtual ~ParamIteratorInterface() {}
+  // A pointer to the base generator instance.
+  // Used only for the purposes of iterator comparison
+  // to make sure that two iterators belong to the same generator.
+  virtual const ParamGeneratorInterface<T>* BaseGenerator() const = 0;
+  // Advances iterator to point to the next element
+  // provided by the generator. The caller is responsible
+  // for not calling Advance() on an iterator equal to
+  // BaseGenerator()->End().
+  virtual void Advance() = 0;
+  // Clones the iterator object; the caller owns the clone. Used for
+  // implementing copy semantics of ParamIterator<T>.
+  virtual ParamIteratorInterface* Clone() const = 0;
+  // Dereferences the current iterator and provides (read-only) access
+  // to the pointed value. It is the caller's responsibility not to call
+  // Current() on an iterator equal to BaseGenerator()->End().
+  // Used for implementing ParamIterator<T>::operator*() and operator->().
+  virtual const T* Current() const = 0;
+  // Determines whether the given iterator and other point to the same
+  // element in the sequence generated by the generator.
+  // Used for implementing ParamIterator<T>::operator==().
+  virtual bool Equals(const ParamIteratorInterface& other) const = 0;
+};
+
+// Class iterating over elements provided by an implementation of
+// ParamGeneratorInterface<T>. It wraps ParamIteratorInterface<T>
+// and implements the const forward iterator concept.
+template <typename T>
+class ParamIterator {
+ public:
+  typedef T value_type;
+  typedef const T& reference;
+  typedef ptrdiff_t difference_type;
+
+  // Each ParamIterator owns its impl_; copying clones the implementation.
+  ParamIterator(const ParamIterator& other) : impl_(other.impl_->Clone()) {}
+  ParamIterator& operator=(const ParamIterator& other) {
+    if (this != &other)
+      impl_.reset(other.impl_->Clone());
+    return *this;
+  }
+
+  const T& operator*() const { return *impl_->Current(); }
+  const T* operator->() const { return impl_->Current(); }
+  // Prefix version of operator++.
+  ParamIterator& operator++() {
+    impl_->Advance();
+    return *this;
+  }
+  // Postfix version of operator++.
+  ParamIterator operator++(int /*unused*/) {
+    ParamIterator previous(*this);  // Owns its clone even if Advance() throws.
+    impl_->Advance();
+    return previous;
+  }
+  bool operator==(const ParamIterator& other) const {
+    return impl_.get() == other.impl_.get() || impl_->Equals(*other.impl_);
+  }
+  bool operator!=(const ParamIterator& other) const {
+    return !(*this == other);
+  }
+
+ private:
+  friend class ParamGenerator<T>;
+  explicit ParamIterator(ParamIteratorInterface<T>* impl) : impl_(impl) {}
+  scoped_ptr<ParamIteratorInterface<T> > impl_;
+};
+
+// ParamGeneratorInterface<T> is the binary interface to access generators
+// defined in other translation units.
+template <typename T>
+class ParamGeneratorInterface {
+ public:
+  typedef T ParamType;
+
+  virtual ~ParamGeneratorInterface() {}
+  // Begin() and End() return iterators to the first and past-the-end values.
+  // ParamGenerator<T> wraps each result in an owning ParamIterator<T>.
+  virtual ParamIteratorInterface<T>* Begin() const = 0;
+  virtual ParamIteratorInterface<T>* End() const = 0;
+};
+
+// Wraps ParamGeneratorInterface<T> and provides general generator syntax
+// compatible with the STL Container concept.
+// This class implements copy initialization semantics and the contained
+// ParamGeneratorInterface<T> instance is shared among all copies
+// of the original object. This is possible because that instance is immutable.
+template<typename T>
+class ParamGenerator {
+ public:
+  typedef ParamIterator<T> iterator;
+
+  // impl is stored in a linked_ptr, so all copies refer to one instance.
+  explicit ParamGenerator(ParamGeneratorInterface<T>* impl) : impl_(impl) {}
+  ParamGenerator(const ParamGenerator& source) : impl_(source.impl_) {}
+  ParamGenerator& operator=(const ParamGenerator& source) {
+    impl_ = source.impl_;
+    return *this;
+  }
+  // Each call asks the implementation for a new iterator and wraps it.
+  iterator begin() const { return iterator(impl_->Begin()); }
+  iterator end() const { return iterator(impl_->End()); }
+
+ private:
+  linked_ptr<const ParamGeneratorInterface<T> > impl_;
+};
+
+// Generates values from a range of two comparable values. Can be used to
+// generate sequences of user-defined types that implement operator+() and
+// operator<().
+// This class is used in the Range() function.
+template <typename T, typename IncrementT>
+class RangeGenerator : public ParamGeneratorInterface<T> {
+ public:
+  RangeGenerator(T begin, T end, IncrementT step)
+      : begin_(begin), end_(end),
+        step_(step), end_index_(CalculateEndIndex(begin, end, step)) {}
+  virtual ~RangeGenerator() {}
+  // Begin() starts at begin_ with index 0; End() carries index end_index_.
+  virtual ParamIteratorInterface<T>* Begin() const {
+    return new Iterator(this, begin_, 0, step_);
+  }
+  virtual ParamIteratorInterface<T>* End() const {
+    return new Iterator(this, end_, end_index_, step_);
+  }
+
+ private:
+  class Iterator : public ParamIteratorInterface<T> {
+   public:
+    Iterator(const ParamGeneratorInterface<T>* base, T value, int index,
+             IncrementT step)
+        : base_(base), value_(value), index_(index), step_(step) {}
+    virtual ~Iterator() {}
+
+    virtual const ParamGeneratorInterface<T>* BaseGenerator() const {
+      return base_;
+    }
+    virtual void Advance() {
+      value_ = value_ + step_;  // Uses T + IncrementT, then assignment.
+      index_++;
+    }
+    virtual ParamIteratorInterface<T>* Clone() const {
+      return new Iterator(*this);
+    }
+    virtual const T* Current() const { return &value_; }
+    virtual bool Equals(const ParamIteratorInterface<T>& other) const {
+      // Having the same base generator guarantees that the other
+      // iterator is of the same type and we can downcast.
+      GTEST_CHECK_(BaseGenerator() == other.BaseGenerator())
+          << "The program attempted to compare iterators "
+          << "from different generators." << std::endl;
+      const int other_index =
+          CheckedDowncastToActualType<const Iterator>(&other)->index_;
+      return index_ == other_index;  // Positions are compared, not values.
+    }
+
+   private:
+    Iterator(const Iterator& other)
+        : ParamIteratorInterface<T>(),
+          base_(other.base_), value_(other.value_), index_(other.index_),
+          step_(other.step_) {}
+
+    // No implementation - assignment is unsupported.
+    void operator=(const Iterator& other);
+
+    const ParamGeneratorInterface<T>* const base_;
+    T value_;
+    int index_;
+    const IncrementT step_;
+  };  // class RangeGenerator::Iterator
+  // Counts how many steps from begin keep "i < end" true.
+  static int CalculateEndIndex(const T& begin,
+                               const T& end,
+                               const IncrementT& step) {
+    int end_index = 0;
+    for (T i = begin; i < end; i = i + step)
+      end_index++;
+    return end_index;
+  }
+
+  // No implementation - assignment is unsupported.
+  void operator=(const RangeGenerator& other);
+
+  const T begin_;
+  const T end_;
+  const IncrementT step_;
+  // The index for the end() iterator. All the elements in the generated
+  // sequence are indexed (0-based) to aid iterator comparison.
+  const int end_index_;
+};  // class RangeGenerator
+
+
+// Generates values from a pair of STL-style iterators. Used in the
+// ValuesIn() function. The elements are copied from the source range
+// since the source can be located on the stack, and the generator
+// is likely to persist beyond that stack frame.
+template <typename T>
+class ValuesInIteratorRangeGenerator : public ParamGeneratorInterface<T> {
+ public:
+  template <typename ForwardIterator>
+  ValuesInIteratorRangeGenerator(ForwardIterator begin, ForwardIterator end)
+      : container_(begin, end) {}
+  virtual ~ValuesInIteratorRangeGenerator() {}
+  // Begin()/End() wrap container_.begin()/container_.end() respectively.
+  virtual ParamIteratorInterface<T>* Begin() const {
+    return new Iterator(this, container_.begin());
+  }
+  virtual ParamIteratorInterface<T>* End() const {
+    return new Iterator(this, container_.end());
+  }
+
+ private:
+  typedef typename ::std::vector<T> ContainerType;
+
+  class Iterator : public ParamIteratorInterface<T> {
+   public:
+    Iterator(const ParamGeneratorInterface<T>* base,
+             typename ContainerType::const_iterator iterator)
+        : base_(base), iterator_(iterator) {}
+    virtual ~Iterator() {}
+
+    virtual const ParamGeneratorInterface<T>* BaseGenerator() const {
+      return base_;
+    }
+    virtual void Advance() {
+      ++iterator_;
+      value_.reset();  // Discard the cached copy of the previous element.
+    }
+    virtual ParamIteratorInterface<T>* Clone() const {
+      return new Iterator(*this);
+    }
+    // We need to use cached value referenced by iterator_ because *iterator_
+    // can return a temporary object (and of type other than T), so just
+    // having "return &*iterator_;" doesn't work.
+    // value_ is updated here and not in Advance() because Advance()
+    // can advance iterator_ beyond the end of the range, and we cannot
+    // detect that fact. The client code, on the other hand, is
+    // responsible for not calling Current() on an out-of-range iterator.
+    virtual const T* Current() const {
+      if (value_.get() == NULL)
+        value_.reset(new T(*iterator_));
+      return value_.get();
+    }
+    virtual bool Equals(const ParamIteratorInterface<T>& other) const {
+      // Having the same base generator guarantees that the other
+      // iterator is of the same type and we can downcast.
+      GTEST_CHECK_(BaseGenerator() == other.BaseGenerator())
+          << "The program attempted to compare iterators "
+          << "from different generators." << std::endl;
+      return iterator_ ==
+          CheckedDowncastToActualType<const Iterator>(&other)->iterator_;
+    }
+
+   private:
+    Iterator(const Iterator& other)
+          // The explicit constructor call suppresses a false warning
+          // emitted by gcc when supplied with the -Wextra option.
+        : ParamIteratorInterface<T>(),
+          base_(other.base_),
+          iterator_(other.iterator_) {}  // value_ is not copied here.
+
+    const ParamGeneratorInterface<T>* const base_;
+    typename ContainerType::const_iterator iterator_;
+    // A cached value of *iterator_. We keep it here to allow access by
+    // pointer in the wrapping iterator's operator->().
+    // value_ needs to be mutable to be accessed in Current().
+    // Use of scoped_ptr helps manage cached value's lifetime,
+    // which is bound by the lifespan of the iterator itself.
+    mutable scoped_ptr<const T> value_;
+  };  // class ValuesInIteratorRangeGenerator::Iterator
+
+  // No implementation - assignment is unsupported.
+  void operator=(const ValuesInIteratorRangeGenerator& other);
+
+  const ContainerType container_;
+};  // class ValuesInIteratorRangeGenerator
+
+// INTERNAL IMPLEMENTATION - DO NOT USE IN USER CODE.
+//
+// Stores a parameter value and later creates tests parameterized with that
+// value.
+template <class TestClass>
+class ParameterizedTestFactory : public TestFactoryBase {
+ public:
+  typedef typename TestClass::ParamType ParamType;
+  explicit ParameterizedTestFactory(ParamType value)
+      : parameter_(value) {}
+  virtual Test* CreateTest() {
+    TestClass::SetParam(&parameter_);  // Set before the test is constructed.
+    return new TestClass();
+  }
+
+ private:
+  const ParamType parameter_;  // The value handed to SetParam() above.
+
+  GTEST_DISALLOW_COPY_AND_ASSIGN_(ParameterizedTestFactory);
+};
+
+// INTERNAL IMPLEMENTATION - DO NOT USE IN USER CODE.
+//
+// TestMetaFactoryBase is a base class for meta-factories that create
+// test factories for passing into MakeAndRegisterTestInfo function.
+template <class ParamType>
+class TestMetaFactoryBase {
+ public:
+  virtual ~TestMetaFactoryBase() {}
+  // Returns a test factory for the given parameter value.
+  virtual TestFactoryBase* CreateTestFactory(ParamType parameter) = 0;
+};
+
+// INTERNAL IMPLEMENTATION - DO NOT USE IN USER CODE.
+//
+// TestMetaFactory creates test factories for passing into
+// MakeAndRegisterTestInfo function. Since MakeAndRegisterTestInfo receives
+// ownership of test factory pointer, same factory object cannot be passed
+// into that method twice. But ParameterizedTestCaseInfo is going to call
+// it for each Test/Parameter value combination. Thus it needs meta factory
+// creator class.
+template <class TestCase>
+class TestMetaFactory
+    : public TestMetaFactoryBase<typename TestCase::ParamType> {
+ public:
+  typedef typename TestCase::ParamType ParamType;
+  TestMetaFactory() {}
+  // Allocates a new factory holding a copy of value; the caller owns it.
+  virtual TestFactoryBase* CreateTestFactory(ParamType value) {
+    typedef ParameterizedTestFactory<TestCase> Factory;
+    return new Factory(value);
+  }
+
+ private:
+  GTEST_DISALLOW_COPY_AND_ASSIGN_(TestMetaFactory);
+};
+
+// INTERNAL IMPLEMENTATION - DO NOT USE IN USER CODE.
+//
+// ParameterizedTestCaseInfoBase is a generic interface
+// to ParameterizedTestCaseInfo classes. ParameterizedTestCaseInfoBase
+// accumulates test information provided by TEST_P macro invocations
+// and generators provided by INSTANTIATE_TEST_CASE_P macro invocations
+// and uses that information to register all resulting test instances
+// in RegisterTests method. The ParameterizedTestCaseRegistry class holds
+// a collection of pointers to the ParameterizedTestCaseInfo objects
+// and calls RegisterTests() on each of them when asked.
+class ParameterizedTestCaseInfoBase {
+ public:
+  virtual ~ParameterizedTestCaseInfoBase() {}
+
+  // Base part of test case name for display purposes.
+  virtual const string& GetTestCaseName() const = 0;
+  // Test case id to verify identity.
+  virtual TypeId GetTestCaseTypeId() const = 0;
+  // UnitTest class invokes this method to register tests in this
+  // test case right before running them in RUN_ALL_TESTS macro.
+  // This method should not be called more than once on any single
+  // instance of a ParameterizedTestCaseInfoBase derived class.
+  virtual void RegisterTests() = 0;
+
+ protected:
+  ParameterizedTestCaseInfoBase() {}  // Constructible by subclasses only.
+
+ private:
+  GTEST_DISALLOW_COPY_AND_ASSIGN_(ParameterizedTestCaseInfoBase);
+};
+
+// INTERNAL IMPLEMENTATION - DO NOT USE IN USER CODE.
+//
+// ParameterizedTestCaseInfo accumulates tests obtained from TEST_P
+// macro invocations for a particular test case and generators
+// obtained from INSTANTIATE_TEST_CASE_P macro invocations for that
+// test case. It registers tests with all values generated by all
+// generators when asked.
+template <class TestCase>
+class ParameterizedTestCaseInfo : public ParameterizedTestCaseInfoBase {
+ public:
+  // ParamType and GeneratorCreationFunc are private types but are required
+  // for declarations of public methods AddTestPattern() and
+  // AddTestCaseInstantiation().
+  typedef typename TestCase::ParamType ParamType;
+  // A function that returns an instance of appropriate generator type.
+  typedef ParamGenerator<ParamType>(GeneratorCreationFunc)();
+
+  explicit ParameterizedTestCaseInfo(const char* name)
+      : test_case_name_(name) {}
+
+  // Test case base name for display purposes.
+  virtual const string& GetTestCaseName() const { return test_case_name_; }
+  // Test case id to verify identity.
+  virtual TypeId GetTestCaseTypeId() const { return GetTypeId<TestCase>(); }
+  // TEST_P macro uses AddTestPattern() to record information
+  // about a single test in a TestInfo structure.
+  // test_case_name is the base name of the test case (without invocation
+  // prefix). test_base_name is the name of an individual test without
+  // parameter index. For the test SequenceA/FooTest.DoBar/1 FooTest is
+  // test case base name and DoBar is test base name.
+  void AddTestPattern(const char* test_case_name,
+                      const char* test_base_name,
+                      TestMetaFactoryBase<ParamType>* meta_factory) {
+    tests_.push_back(linked_ptr<TestInfo>(new TestInfo(test_case_name,
+                                                       test_base_name,
+                                                       meta_factory)));
+  }
+  // INSTANTIATE_TEST_CASE_P macro uses AddTestCaseInstantiation() to record
+  // information about a generator.
+  int AddTestCaseInstantiation(const string& instantiation_name,
+                               GeneratorCreationFunc* func,
+                               const char* /* file */,
+                               int /* line */) {
+    instantiations_.push_back(::std::make_pair(instantiation_name, func));
+    return 0;  // Return value used only to run this method in namespace scope.
+  }
+  // UnitTest class invokes this method to register the tests of this test
+  // case right before running tests in RUN_ALL_TESTS macro.
+  // This method should not be called more than once on any single
+  // instance of a ParameterizedTestCaseInfoBase derived class.
+  // UnitTest has a guard to prevent from calling this method more than once.
+  virtual void RegisterTests() {
+    for (typename TestInfoContainer::iterator test_it = tests_.begin();
+         test_it != tests_.end(); ++test_it) {
+      linked_ptr<TestInfo> test_info = *test_it;
+      for (typename InstantiationContainer::iterator gen_it =
+               instantiations_.begin(); gen_it != instantiations_.end();
+               ++gen_it) {
+        const string& instantiation_name = gen_it->first;
+        ParamGenerator<ParamType> generator((*gen_it->second)());
+        string test_case_name;
+        if ( !instantiation_name.empty() )
+          test_case_name = instantiation_name + "/";
+        test_case_name += test_info->test_case_base_name;
+
+        typedef typename ParamGenerator<ParamType>::iterator ParamIt;
+        const ParamIt params_end = generator.end();  // Fetched once.
+        int i = 0;
+        for (ParamIt param_it = generator.begin(); param_it != params_end;
+             ++param_it, ++i) {
+          Message test_name_stream;
+          test_name_stream << test_info->test_base_name << "/" << i;
+          MakeAndRegisterTestInfo(
+              test_case_name.c_str(),
+              test_name_stream.GetString().c_str(),
+              NULL,  // No type parameter.
+              PrintToString(*param_it).c_str(),
+              GetTestCaseTypeId(),
+              TestCase::SetUpTestCase,
+              TestCase::TearDownTestCase,
+              test_info->test_meta_factory->CreateTestFactory(*param_it));
+        }  // for param_it
+      }  // for gen_it
+    }  // for test_it
+  }  // RegisterTests
+
+ private:
+  // TestInfo structure keeps information about a single test registered
+  // with TEST_P macro.
+  struct TestInfo {
+    TestInfo(const char* a_test_case_base_name,
+             const char* a_test_base_name,
+             TestMetaFactoryBase<ParamType>* a_test_meta_factory) :
+        test_case_base_name(a_test_case_base_name),
+        test_base_name(a_test_base_name),
+        test_meta_factory(a_test_meta_factory) {}
+
+    const string test_case_base_name;
+    const string test_base_name;
+    const scoped_ptr<TestMetaFactoryBase<ParamType> > test_meta_factory;
+  };
+  typedef ::std::vector<linked_ptr<TestInfo> > TestInfoContainer;
+  // Keeps pairs of <Instantiation name, Sequence generator creation function>
+  // received from INSTANTIATE_TEST_CASE_P macros.
+  typedef ::std::vector<std::pair<string, GeneratorCreationFunc*> >
+      InstantiationContainer;
+
+  const string test_case_name_;
+  TestInfoContainer tests_;
+  InstantiationContainer instantiations_;
+
+  GTEST_DISALLOW_COPY_AND_ASSIGN_(ParameterizedTestCaseInfo);
+};  // class ParameterizedTestCaseInfo
+
+// INTERNAL IMPLEMENTATION - DO NOT USE IN USER CODE.
+//
+// ParameterizedTestCaseRegistry contains a map of ParameterizedTestCaseInfoBase
+// classes accessed by test case names. TEST_P and INSTANTIATE_TEST_CASE_P
+// macros use it to locate their corresponding ParameterizedTestCaseInfo
+// descriptors.
+class ParameterizedTestCaseRegistry {
+ public:
+  ParameterizedTestCaseRegistry() {}
+  ~ParameterizedTestCaseRegistry() {
+    // The registry deletes every info object it has stored.
+    for (TestCaseInfoContainer::iterator info = test_case_infos_.begin();
+         info != test_case_infos_.end(); ++info)
+      delete *info;
+  }
+
+  // Returns the holder of tests and instantiations for test_case_name,
+  // creating and storing a new one when none has been registered yet.
+  template <class TestCase>
+  ParameterizedTestCaseInfo<TestCase>* GetTestCasePatternHolder(
+      const char* test_case_name,
+      const char* file,
+      int line) {
+    typedef ParameterizedTestCaseInfo<TestCase> TypedInfo;
+
+    for (TestCaseInfoContainer::iterator it = test_case_infos_.begin();
+         it != test_case_infos_.end(); ++it) {
+      ParameterizedTestCaseInfoBase* const candidate = *it;
+      if (candidate->GetTestCaseName() != test_case_name)
+        continue;
+
+      if (candidate->GetTestCaseTypeId() != GetTypeId<TestCase>()) {
+        // Complain about incorrect usage of Google Test facilities
+        // and terminate the program since we cannot guarantee correct
+        // test case setup and tear-down in this case.
+        ReportInvalidTestCaseType(test_case_name, file, line);
+        posix::Abort();
+        break;  // Only reached if Abort() returns; a new holder is made below.
+      }
+      // The name and the type id both match, so the entry is known to be
+      // of the requested type and can be downcast to it.
+      return CheckedDowncastToActualType<TypedInfo>(candidate);
+    }
+
+    // Nothing usable was found: create a holder and store it in the registry.
+    TypedInfo* const created = new TypedInfo(test_case_name);
+    test_case_infos_.push_back(created);
+    return created;
+  }
+  void RegisterTests() {
+    // Forwards the request to every stored test case, in stored order.
+    for (TestCaseInfoContainer::iterator info = test_case_infos_.begin();
+         info != test_case_infos_.end(); ++info)
+      (*info)->RegisterTests();
+  }
+
+ private:
+  typedef ::std::vector<ParameterizedTestCaseInfoBase*> TestCaseInfoContainer;
+
+  TestCaseInfoContainer test_case_infos_;
+
+  GTEST_DISALLOW_COPY_AND_ASSIGN_(ParameterizedTestCaseRegistry);
+};
+
+}  // namespace internal
+}  // namespace testing
+
+#endif  //  GTEST_HAS_PARAM_TEST
+
+#endif  // GTEST_INCLUDE_GTEST_INTERNAL_GTEST_PARAM_UTIL_H_
+// This file was GENERATED by command:
+//     pump.py gtest-param-util-generated.h.pump
+// DO NOT EDIT BY HAND!!!
+
+// Copyright 2008 Google Inc.
+// All Rights Reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+//     * Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//     * Redistributions in binary form must reproduce the above
+// copyright notice, this list of conditions and the following disclaimer
+// in the documentation and/or other materials provided with the
+// distribution.
+//     * Neither the name of Google Inc. nor the names of its
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Author: vladl@google.com (Vlad Losev)
+
+// Type and function utilities for implementing parameterized tests.
+// This file is generated by a SCRIPT.  DO NOT EDIT BY HAND!
+//
+// Currently Google Test supports at most 50 arguments in Values,
+// and at most 10 arguments in Combine. Please contact
+// googletestframework@googlegroups.com if you need more.
+// Please note that the number of arguments to Combine is limited
+// by the maximum arity of the implementation of tr1::tuple which is
+// currently set at 10.
+
+#ifndef GTEST_INCLUDE_GTEST_INTERNAL_GTEST_PARAM_UTIL_GENERATED_H_
+#define GTEST_INCLUDE_GTEST_INTERNAL_GTEST_PARAM_UTIL_GENERATED_H_
+
+// scripts/fuse_gtest.py depends on gtest's own header being #included
+// *unconditionally*.  Therefore these #includes cannot be moved
+// inside #if GTEST_HAS_PARAM_TEST.
+
+#if GTEST_HAS_PARAM_TEST
+
+namespace testing {
+
+// Forward declarations of ValuesIn(), which is implemented in
+// include/gtest/gtest-param-test.h.
+template <typename ForwardIterator>
+internal::ParamGenerator<
+  typename ::testing::internal::IteratorTraits<ForwardIterator>::value_type>
+ValuesIn(ForwardIterator begin, ForwardIterator end);
+
+template <typename T, size_t N>
+internal::ParamGenerator<T> ValuesIn(const T (&array)[N]);
+
+template <class Container>
+internal::ParamGenerator<typename Container::value_type> ValuesIn(
+    const Container& container);
+
+namespace internal {
+
+// Used in the Values() function to provide polymorphic capabilities.
+template <typename T1>
+class ValueArray1 {
+ public:
+  explicit ValueArray1(T1 v1) : v1_(v1) {}
+  template <typename T>  // Converts with static_cast, as ValueArray2..N do.
+  operator ParamGenerator<T>() const {
+    const T array[] = {static_cast<T>(v1_)};
+    return ValuesIn(array);
+  }
+
+ private:
+  void operator=(const ValueArray1& other);  // Unimplemented: no assignment.
+  const T1 v1_;
+};
+
+template <typename T1, typename T2>
+class ValueArray2 {
+ public:
+  ValueArray2(T1 v1, T2 v2) : v1_(v1), v2_(v2) {}
+
+  // Casts each stored value to T and hands the result to ValuesIn().
+  template <typename T> operator ParamGenerator<T>() const {
+    const T values[] = {static_cast<T>(v1_), static_cast<T>(v2_)};
+    return ValuesIn(values);
+  }
+
+ private:
+  // Declared but never defined, so assignment cannot be used.
+  void operator=(const ValueArray2&);
+
+  const T1 v1_;
+  const T2 v2_;
+};
+
+template <typename T1, typename T2, typename T3>
+class ValueArray3 {
+ public:
+  ValueArray3(T1 v1, T2 v2, T3 v3) : v1_(v1), v2_(v2), v3_(v3) {}
+
+  // Casts each stored value to T and hands the result to ValuesIn().
+  template <typename T> operator ParamGenerator<T>() const {
+    const T values[] = {static_cast<T>(v1_), static_cast<T>(v2_),
+        static_cast<T>(v3_)};
+    return ValuesIn(values);
+  }
+
+ private:
+  // Declared but never defined, so assignment cannot be used.
+  void operator=(const ValueArray3&);
+
+  const T1 v1_;
+  const T2 v2_;
+  const T3 v3_;
+};
+
+template <typename T1, typename T2, typename T3, typename T4>
+class ValueArray4 {
+ public:
+  ValueArray4(T1 v1, T2 v2, T3 v3, T4 v4)
+      : v1_(v1), v2_(v2), v3_(v3), v4_(v4) {}
+
+  // Casts each stored value to T and hands the result to ValuesIn().
+  template <typename T> operator ParamGenerator<T>() const {
+    const T values[] = {static_cast<T>(v1_), static_cast<T>(v2_),
+        static_cast<T>(v3_), static_cast<T>(v4_)};
+    return ValuesIn(values);
+  }
+
+ private:
+  // Declared but never defined, so assignment cannot be used.
+  void operator=(const ValueArray4&);
+
+  const T1 v1_;
+  const T2 v2_;
+  const T3 v3_;
+  const T4 v4_;
+};
+
+template <typename T1, typename T2, typename T3, typename T4, typename T5>
+class ValueArray5 {
+ public:
+  ValueArray5(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5)
+      : v1_(v1), v2_(v2), v3_(v3), v4_(v4), v5_(v5) {}
+
+  // Casts each stored value to T and hands the result to ValuesIn().
+  template <typename T> operator ParamGenerator<T>() const {
+    const T values[] = {static_cast<T>(v1_), static_cast<T>(v2_),
+        static_cast<T>(v3_), static_cast<T>(v4_), static_cast<T>(v5_)};
+    return ValuesIn(values);
+  }
+
+ private:
+  // Declared but never defined, so assignment cannot be used.
+  void operator=(const ValueArray5&);
+
+  const T1 v1_;
+  const T2 v2_;
+  const T3 v3_;
+  const T4 v4_;
+  const T5 v5_;
+};
+
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+    typename T6>
+class ValueArray6 {
+ public:
+  ValueArray6(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6)
+      : v1_(v1), v2_(v2), v3_(v3), v4_(v4), v5_(v5), v6_(v6) {}
+
+  // Casts each stored value to T and hands the result to ValuesIn().
+  template <typename T> operator ParamGenerator<T>() const {
+    const T values[] = {static_cast<T>(v1_), static_cast<T>(v2_),
+        static_cast<T>(v3_), static_cast<T>(v4_), static_cast<T>(v5_),
+        static_cast<T>(v6_)};
+    return ValuesIn(values);
+  }
+
+ private:
+  // Declared but never defined, so assignment cannot be used.
+  void operator=(const ValueArray6&);
+
+  const T1 v1_;
+  const T2 v2_;
+  const T3 v3_;
+  const T4 v4_;
+  const T5 v5_;
+  const T6 v6_;
+};
+
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+    typename T6, typename T7>
+class ValueArray7 {  // Holds 7 values, each stored under its own type.
+ public:  // The constructor copies each argument into its const member.
+  ValueArray7(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7) : v1_(v1),
+      v2_(v2), v3_(v3), v4_(v4), v5_(v5), v6_(v6), v7_(v7) {}
+  // Casts v1_..v7_ to T in order into an array and returns ValuesIn(array).
+  template <typename T>  // T: target type; each vi_ must static_cast to T.
+  operator ParamGenerator<T>() const {
+    const T array[] = {static_cast<T>(v1_), static_cast<T>(v2_),
+        static_cast<T>(v3_), static_cast<T>(v4_), static_cast<T>(v5_),
+        static_cast<T>(v6_), static_cast<T>(v7_)};
+    return ValuesIn(array);
+  }
+
+ private:
+  // Private and declared without a body: assignment is unsupported.
+  void operator=(const ValueArray7& other);
+  // Stored values: const copies of the constructor arguments.
+  const T1 v1_;
+  const T2 v2_;
+  const T3 v3_;
+  const T4 v4_;
+  const T5 v5_;
+  const T6 v6_;
+  const T7 v7_;
+};
+
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+    typename T6, typename T7, typename T8>
+class ValueArray8 {  // Holds 8 values, each stored under its own type.
+ public:  // The constructor copies each argument into its const member.
+  ValueArray8(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7,
+      T8 v8) : v1_(v1), v2_(v2), v3_(v3), v4_(v4), v5_(v5), v6_(v6), v7_(v7),
+      v8_(v8) {}
+  // Casts v1_..v8_ to T in order into an array and returns ValuesIn(array).
+  template <typename T>  // T: target type; each vi_ must static_cast to T.
+  operator ParamGenerator<T>() const {
+    const T array[] = {static_cast<T>(v1_), static_cast<T>(v2_),
+        static_cast<T>(v3_), static_cast<T>(v4_), static_cast<T>(v5_),
+        static_cast<T>(v6_), static_cast<T>(v7_), static_cast<T>(v8_)};
+    return ValuesIn(array);
+  }
+
+ private:
+  // Private and declared without a body: assignment is unsupported.
+  void operator=(const ValueArray8& other);
+  // Stored values: const copies of the constructor arguments.
+  const T1 v1_;
+  const T2 v2_;
+  const T3 v3_;
+  const T4 v4_;
+  const T5 v5_;
+  const T6 v6_;
+  const T7 v7_;
+  const T8 v8_;
+};
+
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+    typename T6, typename T7, typename T8, typename T9>
+class ValueArray9 {  // Holds 9 values, each stored under its own type.
+ public:  // The constructor copies each argument into its const member.
+  ValueArray9(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8,
+      T9 v9) : v1_(v1), v2_(v2), v3_(v3), v4_(v4), v5_(v5), v6_(v6), v7_(v7),
+      v8_(v8), v9_(v9) {}
+  // Casts v1_..v9_ to T in order into an array and returns ValuesIn(array).
+  template <typename T>  // T: target type; each vi_ must static_cast to T.
+  operator ParamGenerator<T>() const {
+    const T array[] = {static_cast<T>(v1_), static_cast<T>(v2_),
+        static_cast<T>(v3_), static_cast<T>(v4_), static_cast<T>(v5_),
+        static_cast<T>(v6_), static_cast<T>(v7_), static_cast<T>(v8_),
+        static_cast<T>(v9_)};
+    return ValuesIn(array);
+  }
+
+ private:
+  // Private and declared without a body: assignment is unsupported.
+  void operator=(const ValueArray9& other);
+  // Stored values: const copies of the constructor arguments.
+  const T1 v1_;
+  const T2 v2_;
+  const T3 v3_;
+  const T4 v4_;
+  const T5 v5_;
+  const T6 v6_;
+  const T7 v7_;
+  const T8 v8_;
+  const T9 v9_;
+};
+
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+    typename T6, typename T7, typename T8, typename T9, typename T10>
+class ValueArray10 {  // Holds 10 values, each stored under its own type.
+ public:  // The constructor copies each argument into its const member.
+  ValueArray10(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9,
+      T10 v10) : v1_(v1), v2_(v2), v3_(v3), v4_(v4), v5_(v5), v6_(v6), v7_(v7),
+      v8_(v8), v9_(v9), v10_(v10) {}
+  // Casts v1_..v10_ to T in order into an array and returns ValuesIn(array).
+  template <typename T>  // T: target type; each vi_ must static_cast to T.
+  operator ParamGenerator<T>() const {
+    const T array[] = {static_cast<T>(v1_), static_cast<T>(v2_),
+        static_cast<T>(v3_), static_cast<T>(v4_), static_cast<T>(v5_),
+        static_cast<T>(v6_), static_cast<T>(v7_), static_cast<T>(v8_),
+        static_cast<T>(v9_), static_cast<T>(v10_)};
+    return ValuesIn(array);
+  }
+
+ private:
+  // Private and declared without a body: assignment is unsupported.
+  void operator=(const ValueArray10& other);
+  // Stored values: const copies of the constructor arguments.
+  const T1 v1_;
+  const T2 v2_;
+  const T3 v3_;
+  const T4 v4_;
+  const T5 v5_;
+  const T6 v6_;
+  const T7 v7_;
+  const T8 v8_;
+  const T9 v9_;
+  const T10 v10_;
+};
+
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+    typename T6, typename T7, typename T8, typename T9, typename T10,
+    typename T11>
+class ValueArray11 {  // Holds 11 values, each stored under its own type.
+ public:  // The constructor copies each argument into its const member.
+  ValueArray11(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9,
+      T10 v10, T11 v11) : v1_(v1), v2_(v2), v3_(v3), v4_(v4), v5_(v5), v6_(v6),
+      v7_(v7), v8_(v8), v9_(v9), v10_(v10), v11_(v11) {}
+  // Casts v1_..v11_ to T in order into an array and returns ValuesIn(array).
+  template <typename T>  // T: target type; each vi_ must static_cast to T.
+  operator ParamGenerator<T>() const {
+    const T array[] = {static_cast<T>(v1_), static_cast<T>(v2_),
+        static_cast<T>(v3_), static_cast<T>(v4_), static_cast<T>(v5_),
+        static_cast<T>(v6_), static_cast<T>(v7_), static_cast<T>(v8_),
+        static_cast<T>(v9_), static_cast<T>(v10_), static_cast<T>(v11_)};
+    return ValuesIn(array);
+  }
+
+ private:
+  // Private and declared without a body: assignment is unsupported.
+  void operator=(const ValueArray11& other);
+  // Stored values: const copies of the constructor arguments.
+  const T1 v1_;
+  const T2 v2_;
+  const T3 v3_;
+  const T4 v4_;
+  const T5 v5_;
+  const T6 v6_;
+  const T7 v7_;
+  const T8 v8_;
+  const T9 v9_;
+  const T10 v10_;
+  const T11 v11_;
+};
+
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+    typename T6, typename T7, typename T8, typename T9, typename T10,
+    typename T11, typename T12>
+class ValueArray12 {  // Holds 12 values, each stored under its own type.
+ public:  // The constructor copies each argument into its const member.
+  ValueArray12(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9,
+      T10 v10, T11 v11, T12 v12) : v1_(v1), v2_(v2), v3_(v3), v4_(v4), v5_(v5),
+      v6_(v6), v7_(v7), v8_(v8), v9_(v9), v10_(v10), v11_(v11), v12_(v12) {}
+  // Casts v1_..v12_ to T in order into an array and returns ValuesIn(array).
+  template <typename T>  // T: target type; each vi_ must static_cast to T.
+  operator ParamGenerator<T>() const {
+    const T array[] = {static_cast<T>(v1_), static_cast<T>(v2_),
+        static_cast<T>(v3_), static_cast<T>(v4_), static_cast<T>(v5_),
+        static_cast<T>(v6_), static_cast<T>(v7_), static_cast<T>(v8_),
+        static_cast<T>(v9_), static_cast<T>(v10_), static_cast<T>(v11_),
+        static_cast<T>(v12_)};
+    return ValuesIn(array);
+  }
+
+ private:
+  // Private and declared without a body: assignment is unsupported.
+  void operator=(const ValueArray12& other);
+  // Stored values: const copies of the constructor arguments.
+  const T1 v1_;
+  const T2 v2_;
+  const T3 v3_;
+  const T4 v4_;
+  const T5 v5_;
+  const T6 v6_;
+  const T7 v7_;
+  const T8 v8_;
+  const T9 v9_;
+  const T10 v10_;
+  const T11 v11_;
+  const T12 v12_;
+};
+
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+    typename T6, typename T7, typename T8, typename T9, typename T10,
+    typename T11, typename T12, typename T13>
+class ValueArray13 {  // Holds 13 values, each stored under its own type.
+ public:  // The constructor copies each argument into its const member.
+  ValueArray13(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9,
+      T10 v10, T11 v11, T12 v12, T13 v13) : v1_(v1), v2_(v2), v3_(v3), v4_(v4),
+      v5_(v5), v6_(v6), v7_(v7), v8_(v8), v9_(v9), v10_(v10), v11_(v11),
+      v12_(v12), v13_(v13) {}
+  // Casts v1_..v13_ to T in order into an array and returns ValuesIn(array).
+  template <typename T>  // T: target type; each vi_ must static_cast to T.
+  operator ParamGenerator<T>() const {
+    const T array[] = {static_cast<T>(v1_), static_cast<T>(v2_),
+        static_cast<T>(v3_), static_cast<T>(v4_), static_cast<T>(v5_),
+        static_cast<T>(v6_), static_cast<T>(v7_), static_cast<T>(v8_),
+        static_cast<T>(v9_), static_cast<T>(v10_), static_cast<T>(v11_),
+        static_cast<T>(v12_), static_cast<T>(v13_)};
+    return ValuesIn(array);
+  }
+
+ private:
+  // Private and declared without a body: assignment is unsupported.
+  void operator=(const ValueArray13& other);
+  // Stored values: const copies of the constructor arguments.
+  const T1 v1_;
+  const T2 v2_;
+  const T3 v3_;
+  const T4 v4_;
+  const T5 v5_;
+  const T6 v6_;
+  const T7 v7_;
+  const T8 v8_;
+  const T9 v9_;
+  const T10 v10_;
+  const T11 v11_;
+  const T12 v12_;
+  const T13 v13_;
+};
+
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+    typename T6, typename T7, typename T8, typename T9, typename T10,
+    typename T11, typename T12, typename T13, typename T14>
+class ValueArray14 {  // Holds 14 values, each stored under its own type.
+ public:  // The constructor copies each argument into its const member.
+  ValueArray14(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9,
+      T10 v10, T11 v11, T12 v12, T13 v13, T14 v14) : v1_(v1), v2_(v2), v3_(v3),
+      v4_(v4), v5_(v5), v6_(v6), v7_(v7), v8_(v8), v9_(v9), v10_(v10),
+      v11_(v11), v12_(v12), v13_(v13), v14_(v14) {}
+  // Casts v1_..v14_ to T in order into an array and returns ValuesIn(array).
+  template <typename T>  // T: target type; each vi_ must static_cast to T.
+  operator ParamGenerator<T>() const {
+    const T array[] = {static_cast<T>(v1_), static_cast<T>(v2_),
+        static_cast<T>(v3_), static_cast<T>(v4_), static_cast<T>(v5_),
+        static_cast<T>(v6_), static_cast<T>(v7_), static_cast<T>(v8_),
+        static_cast<T>(v9_), static_cast<T>(v10_), static_cast<T>(v11_),
+        static_cast<T>(v12_), static_cast<T>(v13_), static_cast<T>(v14_)};
+    return ValuesIn(array);
+  }
+
+ private:
+  // Private and declared without a body: assignment is unsupported.
+  void operator=(const ValueArray14& other);
+  // Stored values: const copies of the constructor arguments.
+  const T1 v1_;
+  const T2 v2_;
+  const T3 v3_;
+  const T4 v4_;
+  const T5 v5_;
+  const T6 v6_;
+  const T7 v7_;
+  const T8 v8_;
+  const T9 v9_;
+  const T10 v10_;
+  const T11 v11_;
+  const T12 v12_;
+  const T13 v13_;
+  const T14 v14_;
+};
+
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+    typename T6, typename T7, typename T8, typename T9, typename T10,
+    typename T11, typename T12, typename T13, typename T14, typename T15>
+class ValueArray15 {  // Holds 15 values, each stored under its own type.
+ public:  // The constructor copies each argument into its const member.
+  ValueArray15(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9,
+      T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15) : v1_(v1), v2_(v2),
+      v3_(v3), v4_(v4), v5_(v5), v6_(v6), v7_(v7), v8_(v8), v9_(v9), v10_(v10),
+      v11_(v11), v12_(v12), v13_(v13), v14_(v14), v15_(v15) {}
+  // Casts v1_..v15_ to T in order into an array and returns ValuesIn(array).
+  template <typename T>  // T: target type; each vi_ must static_cast to T.
+  operator ParamGenerator<T>() const {
+    const T array[] = {static_cast<T>(v1_), static_cast<T>(v2_),
+        static_cast<T>(v3_), static_cast<T>(v4_), static_cast<T>(v5_),
+        static_cast<T>(v6_), static_cast<T>(v7_), static_cast<T>(v8_),
+        static_cast<T>(v9_), static_cast<T>(v10_), static_cast<T>(v11_),
+        static_cast<T>(v12_), static_cast<T>(v13_), static_cast<T>(v14_),
+        static_cast<T>(v15_)};
+    return ValuesIn(array);
+  }
+
+ private:
+  // Private and declared without a body: assignment is unsupported.
+  void operator=(const ValueArray15& other);
+  // Stored values: const copies of the constructor arguments.
+  const T1 v1_;
+  const T2 v2_;
+  const T3 v3_;
+  const T4 v4_;
+  const T5 v5_;
+  const T6 v6_;
+  const T7 v7_;
+  const T8 v8_;
+  const T9 v9_;
+  const T10 v10_;
+  const T11 v11_;
+  const T12 v12_;
+  const T13 v13_;
+  const T14 v14_;
+  const T15 v15_;
+};
+
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+    typename T6, typename T7, typename T8, typename T9, typename T10,
+    typename T11, typename T12, typename T13, typename T14, typename T15,
+    typename T16>
+class ValueArray16 {  // Holds 16 values, each stored under its own type.
+ public:  // The constructor copies each argument into its const member.
+  ValueArray16(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9,
+      T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15, T16 v16) : v1_(v1),
+      v2_(v2), v3_(v3), v4_(v4), v5_(v5), v6_(v6), v7_(v7), v8_(v8), v9_(v9),
+      v10_(v10), v11_(v11), v12_(v12), v13_(v13), v14_(v14), v15_(v15),
+      v16_(v16) {}
+  // Casts v1_..v16_ to T in order into an array and returns ValuesIn(array).
+  template <typename T>  // T: target type; each vi_ must static_cast to T.
+  operator ParamGenerator<T>() const {
+    const T array[] = {static_cast<T>(v1_), static_cast<T>(v2_),
+        static_cast<T>(v3_), static_cast<T>(v4_), static_cast<T>(v5_),
+        static_cast<T>(v6_), static_cast<T>(v7_), static_cast<T>(v8_),
+        static_cast<T>(v9_), static_cast<T>(v10_), static_cast<T>(v11_),
+        static_cast<T>(v12_), static_cast<T>(v13_), static_cast<T>(v14_),
+        static_cast<T>(v15_), static_cast<T>(v16_)};
+    return ValuesIn(array);
+  }
+
+ private:
+  // Private and declared without a body: assignment is unsupported.
+  void operator=(const ValueArray16& other);
+  // Stored values: const copies of the constructor arguments.
+  const T1 v1_;
+  const T2 v2_;
+  const T3 v3_;
+  const T4 v4_;
+  const T5 v5_;
+  const T6 v6_;
+  const T7 v7_;
+  const T8 v8_;
+  const T9 v9_;
+  const T10 v10_;
+  const T11 v11_;
+  const T12 v12_;
+  const T13 v13_;
+  const T14 v14_;
+  const T15 v15_;
+  const T16 v16_;
+};
+
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+    typename T6, typename T7, typename T8, typename T9, typename T10,
+    typename T11, typename T12, typename T13, typename T14, typename T15,
+    typename T16, typename T17>
+class ValueArray17 {  // Holds 17 values, each stored under its own type.
+ public:  // The constructor copies each argument into its const member.
+  ValueArray17(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9,
+      T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15, T16 v16,
+      T17 v17) : v1_(v1), v2_(v2), v3_(v3), v4_(v4), v5_(v5), v6_(v6), v7_(v7),
+      v8_(v8), v9_(v9), v10_(v10), v11_(v11), v12_(v12), v13_(v13), v14_(v14),
+      v15_(v15), v16_(v16), v17_(v17) {}
+  // Casts v1_..v17_ to T in order into an array and returns ValuesIn(array).
+  template <typename T>  // T: target type; each vi_ must static_cast to T.
+  operator ParamGenerator<T>() const {
+    const T array[] = {static_cast<T>(v1_), static_cast<T>(v2_),
+        static_cast<T>(v3_), static_cast<T>(v4_), static_cast<T>(v5_),
+        static_cast<T>(v6_), static_cast<T>(v7_), static_cast<T>(v8_),
+        static_cast<T>(v9_), static_cast<T>(v10_), static_cast<T>(v11_),
+        static_cast<T>(v12_), static_cast<T>(v13_), static_cast<T>(v14_),
+        static_cast<T>(v15_), static_cast<T>(v16_), static_cast<T>(v17_)};
+    return ValuesIn(array);
+  }
+
+ private:
+  // Private and declared without a body: assignment is unsupported.
+  void operator=(const ValueArray17& other);
+  // Stored values: const copies of the constructor arguments.
+  const T1 v1_;
+  const T2 v2_;
+  const T3 v3_;
+  const T4 v4_;
+  const T5 v5_;
+  const T6 v6_;
+  const T7 v7_;
+  const T8 v8_;
+  const T9 v9_;
+  const T10 v10_;
+  const T11 v11_;
+  const T12 v12_;
+  const T13 v13_;
+  const T14 v14_;
+  const T15 v15_;
+  const T16 v16_;
+  const T17 v17_;
+};
+
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+    typename T6, typename T7, typename T8, typename T9, typename T10,
+    typename T11, typename T12, typename T13, typename T14, typename T15,
+    typename T16, typename T17, typename T18>
+class ValueArray18 {  // Holds 18 values, each stored under its own type.
+ public:  // The constructor copies each argument into its const member.
+  ValueArray18(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9,
+      T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15, T16 v16, T17 v17,
+      T18 v18) : v1_(v1), v2_(v2), v3_(v3), v4_(v4), v5_(v5), v6_(v6), v7_(v7),
+      v8_(v8), v9_(v9), v10_(v10), v11_(v11), v12_(v12), v13_(v13), v14_(v14),
+      v15_(v15), v16_(v16), v17_(v17), v18_(v18) {}
+  // Casts v1_..v18_ to T in order into an array and returns ValuesIn(array).
+  template <typename T>  // T: target type; each vi_ must static_cast to T.
+  operator ParamGenerator<T>() const {
+    const T array[] = {static_cast<T>(v1_), static_cast<T>(v2_),
+        static_cast<T>(v3_), static_cast<T>(v4_), static_cast<T>(v5_),
+        static_cast<T>(v6_), static_cast<T>(v7_), static_cast<T>(v8_),
+        static_cast<T>(v9_), static_cast<T>(v10_), static_cast<T>(v11_),
+        static_cast<T>(v12_), static_cast<T>(v13_), static_cast<T>(v14_),
+        static_cast<T>(v15_), static_cast<T>(v16_), static_cast<T>(v17_),
+        static_cast<T>(v18_)};
+    return ValuesIn(array);
+  }
+
+ private:
+  // Private and declared without a body: assignment is unsupported.
+  void operator=(const ValueArray18& other);
+  // Stored values: const copies of the constructor arguments.
+  const T1 v1_;
+  const T2 v2_;
+  const T3 v3_;
+  const T4 v4_;
+  const T5 v5_;
+  const T6 v6_;
+  const T7 v7_;
+  const T8 v8_;
+  const T9 v9_;
+  const T10 v10_;
+  const T11 v11_;
+  const T12 v12_;
+  const T13 v13_;
+  const T14 v14_;
+  const T15 v15_;
+  const T16 v16_;
+  const T17 v17_;
+  const T18 v18_;
+};
+
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+    typename T6, typename T7, typename T8, typename T9, typename T10,
+    typename T11, typename T12, typename T13, typename T14, typename T15,
+    typename T16, typename T17, typename T18, typename T19>
+class ValueArray19 {  // Holds 19 values, each stored under its own type.
+ public:  // The constructor copies each argument into its const member.
+  ValueArray19(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9,
+      T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15, T16 v16, T17 v17,
+      T18 v18, T19 v19) : v1_(v1), v2_(v2), v3_(v3), v4_(v4), v5_(v5), v6_(v6),
+      v7_(v7), v8_(v8), v9_(v9), v10_(v10), v11_(v11), v12_(v12), v13_(v13),
+      v14_(v14), v15_(v15), v16_(v16), v17_(v17), v18_(v18), v19_(v19) {}
+  // Casts v1_..v19_ to T in order into an array and returns ValuesIn(array).
+  template <typename T>  // T: target type; each vi_ must static_cast to T.
+  operator ParamGenerator<T>() const {
+    const T array[] = {static_cast<T>(v1_), static_cast<T>(v2_),
+        static_cast<T>(v3_), static_cast<T>(v4_), static_cast<T>(v5_),
+        static_cast<T>(v6_), static_cast<T>(v7_), static_cast<T>(v8_),
+        static_cast<T>(v9_), static_cast<T>(v10_), static_cast<T>(v11_),
+        static_cast<T>(v12_), static_cast<T>(v13_), static_cast<T>(v14_),
+        static_cast<T>(v15_), static_cast<T>(v16_), static_cast<T>(v17_),
+        static_cast<T>(v18_), static_cast<T>(v19_)};
+    return ValuesIn(array);
+  }
+
+ private:
+  // Private and declared without a body: assignment is unsupported.
+  void operator=(const ValueArray19& other);
+  // Stored values: const copies of the constructor arguments.
+  const T1 v1_;
+  const T2 v2_;
+  const T3 v3_;
+  const T4 v4_;
+  const T5 v5_;
+  const T6 v6_;
+  const T7 v7_;
+  const T8 v8_;
+  const T9 v9_;
+  const T10 v10_;
+  const T11 v11_;
+  const T12 v12_;
+  const T13 v13_;
+  const T14 v14_;
+  const T15 v15_;
+  const T16 v16_;
+  const T17 v17_;
+  const T18 v18_;
+  const T19 v19_;
+};
+
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+    typename T6, typename T7, typename T8, typename T9, typename T10,
+    typename T11, typename T12, typename T13, typename T14, typename T15,
+    typename T16, typename T17, typename T18, typename T19, typename T20>
+class ValueArray20 {  // Holds 20 values, each stored under its own type.
+ public:  // The constructor copies each argument into its const member.
+  ValueArray20(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9,
+      T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15, T16 v16, T17 v17,
+      T18 v18, T19 v19, T20 v20) : v1_(v1), v2_(v2), v3_(v3), v4_(v4), v5_(v5),
+      v6_(v6), v7_(v7), v8_(v8), v9_(v9), v10_(v10), v11_(v11), v12_(v12),
+      v13_(v13), v14_(v14), v15_(v15), v16_(v16), v17_(v17), v18_(v18),
+      v19_(v19), v20_(v20) {}
+  // Casts v1_..v20_ to T in order into an array and returns ValuesIn(array).
+  template <typename T>  // T: target type; each vi_ must static_cast to T.
+  operator ParamGenerator<T>() const {
+    const T array[] = {static_cast<T>(v1_), static_cast<T>(v2_),
+        static_cast<T>(v3_), static_cast<T>(v4_), static_cast<T>(v5_),
+        static_cast<T>(v6_), static_cast<T>(v7_), static_cast<T>(v8_),
+        static_cast<T>(v9_), static_cast<T>(v10_), static_cast<T>(v11_),
+        static_cast<T>(v12_), static_cast<T>(v13_), static_cast<T>(v14_),
+        static_cast<T>(v15_), static_cast<T>(v16_), static_cast<T>(v17_),
+        static_cast<T>(v18_), static_cast<T>(v19_), static_cast<T>(v20_)};
+    return ValuesIn(array);
+  }
+
+ private:
+  // Private and declared without a body: assignment is unsupported.
+  void operator=(const ValueArray20& other);
+  // Stored values: const copies of the constructor arguments.
+  const T1 v1_;
+  const T2 v2_;
+  const T3 v3_;
+  const T4 v4_;
+  const T5 v5_;
+  const T6 v6_;
+  const T7 v7_;
+  const T8 v8_;
+  const T9 v9_;
+  const T10 v10_;
+  const T11 v11_;
+  const T12 v12_;
+  const T13 v13_;
+  const T14 v14_;
+  const T15 v15_;
+  const T16 v16_;
+  const T17 v17_;
+  const T18 v18_;
+  const T19 v19_;
+  const T20 v20_;
+};
+
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+    typename T6, typename T7, typename T8, typename T9, typename T10,
+    typename T11, typename T12, typename T13, typename T14, typename T15,
+    typename T16, typename T17, typename T18, typename T19, typename T20,
+    typename T21>
+class ValueArray21 {  // Holds 21 values, each stored under its own type.
+ public:  // The constructor copies each argument into its const member.
+  ValueArray21(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9,
+      T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15, T16 v16, T17 v17,
+      T18 v18, T19 v19, T20 v20, T21 v21) : v1_(v1), v2_(v2), v3_(v3), v4_(v4),
+      v5_(v5), v6_(v6), v7_(v7), v8_(v8), v9_(v9), v10_(v10), v11_(v11),
+      v12_(v12), v13_(v13), v14_(v14), v15_(v15), v16_(v16), v17_(v17),
+      v18_(v18), v19_(v19), v20_(v20), v21_(v21) {}
+  // Casts v1_..v21_ to T in order into an array and returns ValuesIn(array).
+  template <typename T>  // T: target type; each vi_ must static_cast to T.
+  operator ParamGenerator<T>() const {
+    const T array[] = {static_cast<T>(v1_), static_cast<T>(v2_),
+        static_cast<T>(v3_), static_cast<T>(v4_), static_cast<T>(v5_),
+        static_cast<T>(v6_), static_cast<T>(v7_), static_cast<T>(v8_),
+        static_cast<T>(v9_), static_cast<T>(v10_), static_cast<T>(v11_),
+        static_cast<T>(v12_), static_cast<T>(v13_), static_cast<T>(v14_),
+        static_cast<T>(v15_), static_cast<T>(v16_), static_cast<T>(v17_),
+        static_cast<T>(v18_), static_cast<T>(v19_), static_cast<T>(v20_),
+        static_cast<T>(v21_)};
+    return ValuesIn(array);
+  }
+
+ private:
+  // Private and declared without a body: assignment is unsupported.
+  void operator=(const ValueArray21& other);
+  // Stored values: const copies of the constructor arguments.
+  const T1 v1_;
+  const T2 v2_;
+  const T3 v3_;
+  const T4 v4_;
+  const T5 v5_;
+  const T6 v6_;
+  const T7 v7_;
+  const T8 v8_;
+  const T9 v9_;
+  const T10 v10_;
+  const T11 v11_;
+  const T12 v12_;
+  const T13 v13_;
+  const T14 v14_;
+  const T15 v15_;
+  const T16 v16_;
+  const T17 v17_;
+  const T18 v18_;
+  const T19 v19_;
+  const T20 v20_;
+  const T21 v21_;
+};
+
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+    typename T6, typename T7, typename T8, typename T9, typename T10,
+    typename T11, typename T12, typename T13, typename T14, typename T15,
+    typename T16, typename T17, typename T18, typename T19, typename T20,
+    typename T21, typename T22>
+class ValueArray22 {  // Holds 22 values, each stored under its own type.
+ public:  // The constructor copies each argument into its const member.
+  ValueArray22(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9,
+      T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15, T16 v16, T17 v17,
+      T18 v18, T19 v19, T20 v20, T21 v21, T22 v22) : v1_(v1), v2_(v2), v3_(v3),
+      v4_(v4), v5_(v5), v6_(v6), v7_(v7), v8_(v8), v9_(v9), v10_(v10),
+      v11_(v11), v12_(v12), v13_(v13), v14_(v14), v15_(v15), v16_(v16),
+      v17_(v17), v18_(v18), v19_(v19), v20_(v20), v21_(v21), v22_(v22) {}
+  // Casts v1_..v22_ to T in order into an array and returns ValuesIn(array).
+  template <typename T>  // T: target type; each vi_ must static_cast to T.
+  operator ParamGenerator<T>() const {
+    const T array[] = {static_cast<T>(v1_), static_cast<T>(v2_),
+        static_cast<T>(v3_), static_cast<T>(v4_), static_cast<T>(v5_),
+        static_cast<T>(v6_), static_cast<T>(v7_), static_cast<T>(v8_),
+        static_cast<T>(v9_), static_cast<T>(v10_), static_cast<T>(v11_),
+        static_cast<T>(v12_), static_cast<T>(v13_), static_cast<T>(v14_),
+        static_cast<T>(v15_), static_cast<T>(v16_), static_cast<T>(v17_),
+        static_cast<T>(v18_), static_cast<T>(v19_), static_cast<T>(v20_),
+        static_cast<T>(v21_), static_cast<T>(v22_)};
+    return ValuesIn(array);
+  }
+
+ private:
+  // Private and declared without a body: assignment is unsupported.
+  void operator=(const ValueArray22& other);
+  // Stored values: const copies of the constructor arguments.
+  const T1 v1_;
+  const T2 v2_;
+  const T3 v3_;
+  const T4 v4_;
+  const T5 v5_;
+  const T6 v6_;
+  const T7 v7_;
+  const T8 v8_;
+  const T9 v9_;
+  const T10 v10_;
+  const T11 v11_;
+  const T12 v12_;
+  const T13 v13_;
+  const T14 v14_;
+  const T15 v15_;
+  const T16 v16_;
+  const T17 v17_;
+  const T18 v18_;
+  const T19 v19_;
+  const T20 v20_;
+  const T21 v21_;
+  const T22 v22_;
+};
+
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+    typename T6, typename T7, typename T8, typename T9, typename T10,
+    typename T11, typename T12, typename T13, typename T14, typename T15,
+    typename T16, typename T17, typename T18, typename T19, typename T20,
+    typename T21, typename T22, typename T23>
+class ValueArray23 {  // Holds 23 values, each stored under its own type.
+ public:  // The constructor copies each argument into its const member.
+  ValueArray23(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9,
+      T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15, T16 v16, T17 v17,
+      T18 v18, T19 v19, T20 v20, T21 v21, T22 v22, T23 v23) : v1_(v1), v2_(v2),
+      v3_(v3), v4_(v4), v5_(v5), v6_(v6), v7_(v7), v8_(v8), v9_(v9), v10_(v10),
+      v11_(v11), v12_(v12), v13_(v13), v14_(v14), v15_(v15), v16_(v16),
+      v17_(v17), v18_(v18), v19_(v19), v20_(v20), v21_(v21), v22_(v22),
+      v23_(v23) {}
+  // Casts v1_..v23_ to T in order into an array and returns ValuesIn(array).
+  template <typename T>  // T: target type; each vi_ must static_cast to T.
+  operator ParamGenerator<T>() const {
+    const T array[] = {static_cast<T>(v1_), static_cast<T>(v2_),
+        static_cast<T>(v3_), static_cast<T>(v4_), static_cast<T>(v5_),
+        static_cast<T>(v6_), static_cast<T>(v7_), static_cast<T>(v8_),
+        static_cast<T>(v9_), static_cast<T>(v10_), static_cast<T>(v11_),
+        static_cast<T>(v12_), static_cast<T>(v13_), static_cast<T>(v14_),
+        static_cast<T>(v15_), static_cast<T>(v16_), static_cast<T>(v17_),
+        static_cast<T>(v18_), static_cast<T>(v19_), static_cast<T>(v20_),
+        static_cast<T>(v21_), static_cast<T>(v22_), static_cast<T>(v23_)};
+    return ValuesIn(array);
+  }
+
+ private:
+  // Private and declared without a body: assignment is unsupported.
+  void operator=(const ValueArray23& other);
+  // Stored values: const copies of the constructor arguments.
+  const T1 v1_;
+  const T2 v2_;
+  const T3 v3_;
+  const T4 v4_;
+  const T5 v5_;
+  const T6 v6_;
+  const T7 v7_;
+  const T8 v8_;
+  const T9 v9_;
+  const T10 v10_;
+  const T11 v11_;
+  const T12 v12_;
+  const T13 v13_;
+  const T14 v14_;
+  const T15 v15_;
+  const T16 v16_;
+  const T17 v17_;
+  const T18 v18_;
+  const T19 v19_;
+  const T20 v20_;
+  const T21 v21_;
+  const T22 v22_;
+  const T23 v23_;
+};
+
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+    typename T6, typename T7, typename T8, typename T9, typename T10,
+    typename T11, typename T12, typename T13, typename T14, typename T15,
+    typename T16, typename T17, typename T18, typename T19, typename T20,
+    typename T21, typename T22, typename T23, typename T24>
+class ValueArray24 {  // Holds 24 values, each stored under its own type.
+ public:  // The constructor copies each argument into its const member.
+  ValueArray24(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9,
+      T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15, T16 v16, T17 v17,
+      T18 v18, T19 v19, T20 v20, T21 v21, T22 v22, T23 v23, T24 v24) : v1_(v1),
+      v2_(v2), v3_(v3), v4_(v4), v5_(v5), v6_(v6), v7_(v7), v8_(v8), v9_(v9),
+      v10_(v10), v11_(v11), v12_(v12), v13_(v13), v14_(v14), v15_(v15),
+      v16_(v16), v17_(v17), v18_(v18), v19_(v19), v20_(v20), v21_(v21),
+      v22_(v22), v23_(v23), v24_(v24) {}
+  // Casts v1_..v24_ to T in order into an array and returns ValuesIn(array).
+  template <typename T>  // T: target type; each vi_ must static_cast to T.
+  operator ParamGenerator<T>() const {
+    const T array[] = {static_cast<T>(v1_), static_cast<T>(v2_),
+        static_cast<T>(v3_), static_cast<T>(v4_), static_cast<T>(v5_),
+        static_cast<T>(v6_), static_cast<T>(v7_), static_cast<T>(v8_),
+        static_cast<T>(v9_), static_cast<T>(v10_), static_cast<T>(v11_),
+        static_cast<T>(v12_), static_cast<T>(v13_), static_cast<T>(v14_),
+        static_cast<T>(v15_), static_cast<T>(v16_), static_cast<T>(v17_),
+        static_cast<T>(v18_), static_cast<T>(v19_), static_cast<T>(v20_),
+        static_cast<T>(v21_), static_cast<T>(v22_), static_cast<T>(v23_),
+        static_cast<T>(v24_)};
+    return ValuesIn(array);
+  }
+
+ private:
+  // Private and declared without a body: assignment is unsupported.
+  void operator=(const ValueArray24& other);
+  // Stored values: const copies of the constructor arguments.
+  const T1 v1_;
+  const T2 v2_;
+  const T3 v3_;
+  const T4 v4_;
+  const T5 v5_;
+  const T6 v6_;
+  const T7 v7_;
+  const T8 v8_;
+  const T9 v9_;
+  const T10 v10_;
+  const T11 v11_;
+  const T12 v12_;
+  const T13 v13_;
+  const T14 v14_;
+  const T15 v15_;
+  const T16 v16_;
+  const T17 v17_;
+  const T18 v18_;
+  const T19 v19_;
+  const T20 v20_;
+  const T21 v21_;
+  const T22 v22_;
+  const T23 v23_;
+  const T24 v24_;
+};
+
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+    typename T6, typename T7, typename T8, typename T9, typename T10,
+    typename T11, typename T12, typename T13, typename T14, typename T15,
+    typename T16, typename T17, typename T18, typename T19, typename T20,
+    typename T21, typename T22, typename T23, typename T24, typename T25>
+class ValueArray25 {  // Holds 25 values, each stored under its own type.
+ public:  // The constructor copies each argument into its const member.
+  ValueArray25(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9,
+      T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15, T16 v16, T17 v17,
+      T18 v18, T19 v19, T20 v20, T21 v21, T22 v22, T23 v23, T24 v24,
+      T25 v25) : v1_(v1), v2_(v2), v3_(v3), v4_(v4), v5_(v5), v6_(v6), v7_(v7),
+      v8_(v8), v9_(v9), v10_(v10), v11_(v11), v12_(v12), v13_(v13), v14_(v14),
+      v15_(v15), v16_(v16), v17_(v17), v18_(v18), v19_(v19), v20_(v20),
+      v21_(v21), v22_(v22), v23_(v23), v24_(v24), v25_(v25) {}
+  // Casts v1_..v25_ to T in order into an array and returns ValuesIn(array).
+  template <typename T>  // T: target type; each vi_ must static_cast to T.
+  operator ParamGenerator<T>() const {
+    const T array[] = {static_cast<T>(v1_), static_cast<T>(v2_),
+        static_cast<T>(v3_), static_cast<T>(v4_), static_cast<T>(v5_),
+        static_cast<T>(v6_), static_cast<T>(v7_), static_cast<T>(v8_),
+        static_cast<T>(v9_), static_cast<T>(v10_), static_cast<T>(v11_),
+        static_cast<T>(v12_), static_cast<T>(v13_), static_cast<T>(v14_),
+        static_cast<T>(v15_), static_cast<T>(v16_), static_cast<T>(v17_),
+        static_cast<T>(v18_), static_cast<T>(v19_), static_cast<T>(v20_),
+        static_cast<T>(v21_), static_cast<T>(v22_), static_cast<T>(v23_),
+        static_cast<T>(v24_), static_cast<T>(v25_)};
+    return ValuesIn(array);
+  }
+
+ private:
+  // Private and declared without a body: assignment is unsupported.
+  void operator=(const ValueArray25& other);
+  // Stored values: const copies of the constructor arguments.
+  const T1 v1_;
+  const T2 v2_;
+  const T3 v3_;
+  const T4 v4_;
+  const T5 v5_;
+  const T6 v6_;
+  const T7 v7_;
+  const T8 v8_;
+  const T9 v9_;
+  const T10 v10_;
+  const T11 v11_;
+  const T12 v12_;
+  const T13 v13_;
+  const T14 v14_;
+  const T15 v15_;
+  const T16 v16_;
+  const T17 v17_;
+  const T18 v18_;
+  const T19 v19_;
+  const T20 v20_;
+  const T21 v21_;
+  const T22 v22_;
+  const T23 v23_;
+  const T24 v24_;
+  const T25 v25_;
+};
+
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+    typename T6, typename T7, typename T8, typename T9, typename T10,
+    typename T11, typename T12, typename T13, typename T14, typename T15,
+    typename T16, typename T17, typename T18, typename T19, typename T20,
+    typename T21, typename T22, typename T23, typename T24, typename T25,
+    typename T26>
+class ValueArray26 {  // Holds 26 values, each stored under its own type.
+ public:  // The constructor copies each argument into its const member.
+  ValueArray26(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9,
+      T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15, T16 v16, T17 v17,
+      T18 v18, T19 v19, T20 v20, T21 v21, T22 v22, T23 v23, T24 v24, T25 v25,
+      T26 v26) : v1_(v1), v2_(v2), v3_(v3), v4_(v4), v5_(v5), v6_(v6), v7_(v7),
+      v8_(v8), v9_(v9), v10_(v10), v11_(v11), v12_(v12), v13_(v13), v14_(v14),
+      v15_(v15), v16_(v16), v17_(v17), v18_(v18), v19_(v19), v20_(v20),
+      v21_(v21), v22_(v22), v23_(v23), v24_(v24), v25_(v25), v26_(v26) {}
+  // Casts v1_..v26_ to T in order into an array and returns ValuesIn(array).
+  template <typename T>  // T: target type; each vi_ must static_cast to T.
+  operator ParamGenerator<T>() const {
+    const T array[] = {static_cast<T>(v1_), static_cast<T>(v2_),
+        static_cast<T>(v3_), static_cast<T>(v4_), static_cast<T>(v5_),
+        static_cast<T>(v6_), static_cast<T>(v7_), static_cast<T>(v8_),
+        static_cast<T>(v9_), static_cast<T>(v10_), static_cast<T>(v11_),
+        static_cast<T>(v12_), static_cast<T>(v13_), static_cast<T>(v14_),
+        static_cast<T>(v15_), static_cast<T>(v16_), static_cast<T>(v17_),
+        static_cast<T>(v18_), static_cast<T>(v19_), static_cast<T>(v20_),
+        static_cast<T>(v21_), static_cast<T>(v22_), static_cast<T>(v23_),
+        static_cast<T>(v24_), static_cast<T>(v25_), static_cast<T>(v26_)};
+    return ValuesIn(array);
+  }
+
+ private:
+  // Private and declared without a body: assignment is unsupported.
+  void operator=(const ValueArray26& other);
+  // Stored values: const copies of the constructor arguments.
+  const T1 v1_;
+  const T2 v2_;
+  const T3 v3_;
+  const T4 v4_;
+  const T5 v5_;
+  const T6 v6_;
+  const T7 v7_;
+  const T8 v8_;
+  const T9 v9_;
+  const T10 v10_;
+  const T11 v11_;
+  const T12 v12_;
+  const T13 v13_;
+  const T14 v14_;
+  const T15 v15_;
+  const T16 v16_;
+  const T17 v17_;
+  const T18 v18_;
+  const T19 v19_;
+  const T20 v20_;
+  const T21 v21_;
+  const T22 v22_;
+  const T23 v23_;
+  const T24 v24_;
+  const T25 v25_;
+  const T26 v26_;
+};
+
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+    typename T6, typename T7, typename T8, typename T9, typename T10,
+    typename T11, typename T12, typename T13, typename T14, typename T15,
+    typename T16, typename T17, typename T18, typename T19, typename T20,
+    typename T21, typename T22, typename T23, typename T24, typename T25,
+    typename T26, typename T27>
+class ValueArray27 {  // Holds 27 values, each stored under its own type.
+ public:  // The constructor copies each argument into its const member.
+  ValueArray27(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9,
+      T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15, T16 v16, T17 v17,
+      T18 v18, T19 v19, T20 v20, T21 v21, T22 v22, T23 v23, T24 v24, T25 v25,
+      T26 v26, T27 v27) : v1_(v1), v2_(v2), v3_(v3), v4_(v4), v5_(v5), v6_(v6),
+      v7_(v7), v8_(v8), v9_(v9), v10_(v10), v11_(v11), v12_(v12), v13_(v13),
+      v14_(v14), v15_(v15), v16_(v16), v17_(v17), v18_(v18), v19_(v19),
+      v20_(v20), v21_(v21), v22_(v22), v23_(v23), v24_(v24), v25_(v25),
+      v26_(v26), v27_(v27) {}
+  // Casts v1_..v27_ to T in order into an array and returns ValuesIn(array).
+  template <typename T>  // T: target type; each vi_ must static_cast to T.
+  operator ParamGenerator<T>() const {
+    const T array[] = {static_cast<T>(v1_), static_cast<T>(v2_),
+        static_cast<T>(v3_), static_cast<T>(v4_), static_cast<T>(v5_),
+        static_cast<T>(v6_), static_cast<T>(v7_), static_cast<T>(v8_),
+        static_cast<T>(v9_), static_cast<T>(v10_), static_cast<T>(v11_),
+        static_cast<T>(v12_), static_cast<T>(v13_), static_cast<T>(v14_),
+        static_cast<T>(v15_), static_cast<T>(v16_), static_cast<T>(v17_),
+        static_cast<T>(v18_), static_cast<T>(v19_), static_cast<T>(v20_),
+        static_cast<T>(v21_), static_cast<T>(v22_), static_cast<T>(v23_),
+        static_cast<T>(v24_), static_cast<T>(v25_), static_cast<T>(v26_),
+        static_cast<T>(v27_)};
+    return ValuesIn(array);
+  }
+
+ private:
+  // Private and declared without a body: assignment is unsupported.
+  void operator=(const ValueArray27& other);
+  // Stored values: const copies of the constructor arguments.
+  const T1 v1_;
+  const T2 v2_;
+  const T3 v3_;
+  const T4 v4_;
+  const T5 v5_;
+  const T6 v6_;
+  const T7 v7_;
+  const T8 v8_;
+  const T9 v9_;
+  const T10 v10_;
+  const T11 v11_;
+  const T12 v12_;
+  const T13 v13_;
+  const T14 v14_;
+  const T15 v15_;
+  const T16 v16_;
+  const T17 v17_;
+  const T18 v18_;
+  const T19 v19_;
+  const T20 v20_;
+  const T21 v21_;
+  const T22 v22_;
+  const T23 v23_;
+  const T24 v24_;
+  const T25 v25_;
+  const T26 v26_;
+  const T27 v27_;
+};
+
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+    typename T6, typename T7, typename T8, typename T9, typename T10,
+    typename T11, typename T12, typename T13, typename T14, typename T15,
+    typename T16, typename T17, typename T18, typename T19, typename T20,
+    typename T21, typename T22, typename T23, typename T24, typename T25,
+    typename T26, typename T27, typename T28>
+class ValueArray28 {  // Holds 28 const values (types T1..T28); convertible to ParamGenerator<T>.
+ public:
+  ValueArray28(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9,
+      T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15, T16 v16, T17 v17,
+      T18 v18, T19 v19, T20 v20, T21 v21, T22 v22, T23 v23, T24 v24, T25 v25,
+      T26 v26, T27 v27, T28 v28) : v1_(v1), v2_(v2), v3_(v3), v4_(v4), v5_(v5),
+      v6_(v6), v7_(v7), v8_(v8), v9_(v9), v10_(v10), v11_(v11), v12_(v12),
+      v13_(v13), v14_(v14), v15_(v15), v16_(v16), v17_(v17), v18_(v18),
+      v19_(v19), v20_(v20), v21_(v21), v22_(v22), v23_(v23), v24_(v24),
+      v25_(v25), v26_(v26), v27_(v27), v28_(v28) {}
+
+  template <typename T>
+  operator ParamGenerator<T>() const {  // static_casts each stored value to T and returns ValuesIn() of that array.
+    const T array[] = {static_cast<T>(v1_), static_cast<T>(v2_),
+        static_cast<T>(v3_), static_cast<T>(v4_), static_cast<T>(v5_),
+        static_cast<T>(v6_), static_cast<T>(v7_), static_cast<T>(v8_),
+        static_cast<T>(v9_), static_cast<T>(v10_), static_cast<T>(v11_),
+        static_cast<T>(v12_), static_cast<T>(v13_), static_cast<T>(v14_),
+        static_cast<T>(v15_), static_cast<T>(v16_), static_cast<T>(v17_),
+        static_cast<T>(v18_), static_cast<T>(v19_), static_cast<T>(v20_),
+        static_cast<T>(v21_), static_cast<T>(v22_), static_cast<T>(v23_),
+        static_cast<T>(v24_), static_cast<T>(v25_), static_cast<T>(v26_),
+        static_cast<T>(v27_), static_cast<T>(v28_)};
+    return ValuesIn(array);
+  }
+
+ private:
+  // Declared private with no implementation - assignment is unsupported (all members are const).
+  void operator=(const ValueArray28& other);
+
+  const T1 v1_;  // v1_..v28_ are initialized from constructor arguments v1..v28, in order.
+  const T2 v2_;
+  const T3 v3_;
+  const T4 v4_;
+  const T5 v5_;
+  const T6 v6_;
+  const T7 v7_;
+  const T8 v8_;
+  const T9 v9_;
+  const T10 v10_;
+  const T11 v11_;
+  const T12 v12_;
+  const T13 v13_;
+  const T14 v14_;
+  const T15 v15_;
+  const T16 v16_;
+  const T17 v17_;
+  const T18 v18_;
+  const T19 v19_;
+  const T20 v20_;
+  const T21 v21_;
+  const T22 v22_;
+  const T23 v23_;
+  const T24 v24_;
+  const T25 v25_;
+  const T26 v26_;
+  const T27 v27_;
+  const T28 v28_;
+};
+
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+    typename T6, typename T7, typename T8, typename T9, typename T10,
+    typename T11, typename T12, typename T13, typename T14, typename T15,
+    typename T16, typename T17, typename T18, typename T19, typename T20,
+    typename T21, typename T22, typename T23, typename T24, typename T25,
+    typename T26, typename T27, typename T28, typename T29>
+class ValueArray29 {  // Holds 29 const values (types T1..T29); convertible to ParamGenerator<T>.
+ public:
+  ValueArray29(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9,
+      T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15, T16 v16, T17 v17,
+      T18 v18, T19 v19, T20 v20, T21 v21, T22 v22, T23 v23, T24 v24, T25 v25,
+      T26 v26, T27 v27, T28 v28, T29 v29) : v1_(v1), v2_(v2), v3_(v3), v4_(v4),
+      v5_(v5), v6_(v6), v7_(v7), v8_(v8), v9_(v9), v10_(v10), v11_(v11),
+      v12_(v12), v13_(v13), v14_(v14), v15_(v15), v16_(v16), v17_(v17),
+      v18_(v18), v19_(v19), v20_(v20), v21_(v21), v22_(v22), v23_(v23),
+      v24_(v24), v25_(v25), v26_(v26), v27_(v27), v28_(v28), v29_(v29) {}
+
+  template <typename T>
+  operator ParamGenerator<T>() const {  // static_casts each stored value to T and returns ValuesIn() of that array.
+    const T array[] = {static_cast<T>(v1_), static_cast<T>(v2_),
+        static_cast<T>(v3_), static_cast<T>(v4_), static_cast<T>(v5_),
+        static_cast<T>(v6_), static_cast<T>(v7_), static_cast<T>(v8_),
+        static_cast<T>(v9_), static_cast<T>(v10_), static_cast<T>(v11_),
+        static_cast<T>(v12_), static_cast<T>(v13_), static_cast<T>(v14_),
+        static_cast<T>(v15_), static_cast<T>(v16_), static_cast<T>(v17_),
+        static_cast<T>(v18_), static_cast<T>(v19_), static_cast<T>(v20_),
+        static_cast<T>(v21_), static_cast<T>(v22_), static_cast<T>(v23_),
+        static_cast<T>(v24_), static_cast<T>(v25_), static_cast<T>(v26_),
+        static_cast<T>(v27_), static_cast<T>(v28_), static_cast<T>(v29_)};
+    return ValuesIn(array);
+  }
+
+ private:
+  // Declared private with no implementation - assignment is unsupported (all members are const).
+  void operator=(const ValueArray29& other);
+
+  const T1 v1_;  // v1_..v29_ are initialized from constructor arguments v1..v29, in order.
+  const T2 v2_;
+  const T3 v3_;
+  const T4 v4_;
+  const T5 v5_;
+  const T6 v6_;
+  const T7 v7_;
+  const T8 v8_;
+  const T9 v9_;
+  const T10 v10_;
+  const T11 v11_;
+  const T12 v12_;
+  const T13 v13_;
+  const T14 v14_;
+  const T15 v15_;
+  const T16 v16_;
+  const T17 v17_;
+  const T18 v18_;
+  const T19 v19_;
+  const T20 v20_;
+  const T21 v21_;
+  const T22 v22_;
+  const T23 v23_;
+  const T24 v24_;
+  const T25 v25_;
+  const T26 v26_;
+  const T27 v27_;
+  const T28 v28_;
+  const T29 v29_;
+};
+
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+    typename T6, typename T7, typename T8, typename T9, typename T10,
+    typename T11, typename T12, typename T13, typename T14, typename T15,
+    typename T16, typename T17, typename T18, typename T19, typename T20,
+    typename T21, typename T22, typename T23, typename T24, typename T25,
+    typename T26, typename T27, typename T28, typename T29, typename T30>
+class ValueArray30 {  // Holds 30 const values (types T1..T30); convertible to ParamGenerator<T>.
+ public:
+  ValueArray30(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9,
+      T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15, T16 v16, T17 v17,
+      T18 v18, T19 v19, T20 v20, T21 v21, T22 v22, T23 v23, T24 v24, T25 v25,
+      T26 v26, T27 v27, T28 v28, T29 v29, T30 v30) : v1_(v1), v2_(v2), v3_(v3),
+      v4_(v4), v5_(v5), v6_(v6), v7_(v7), v8_(v8), v9_(v9), v10_(v10),
+      v11_(v11), v12_(v12), v13_(v13), v14_(v14), v15_(v15), v16_(v16),
+      v17_(v17), v18_(v18), v19_(v19), v20_(v20), v21_(v21), v22_(v22),
+      v23_(v23), v24_(v24), v25_(v25), v26_(v26), v27_(v27), v28_(v28),
+      v29_(v29), v30_(v30) {}
+
+  template <typename T>
+  operator ParamGenerator<T>() const {  // static_casts each stored value to T and returns ValuesIn() of that array.
+    const T array[] = {static_cast<T>(v1_), static_cast<T>(v2_),
+        static_cast<T>(v3_), static_cast<T>(v4_), static_cast<T>(v5_),
+        static_cast<T>(v6_), static_cast<T>(v7_), static_cast<T>(v8_),
+        static_cast<T>(v9_), static_cast<T>(v10_), static_cast<T>(v11_),
+        static_cast<T>(v12_), static_cast<T>(v13_), static_cast<T>(v14_),
+        static_cast<T>(v15_), static_cast<T>(v16_), static_cast<T>(v17_),
+        static_cast<T>(v18_), static_cast<T>(v19_), static_cast<T>(v20_),
+        static_cast<T>(v21_), static_cast<T>(v22_), static_cast<T>(v23_),
+        static_cast<T>(v24_), static_cast<T>(v25_), static_cast<T>(v26_),
+        static_cast<T>(v27_), static_cast<T>(v28_), static_cast<T>(v29_),
+        static_cast<T>(v30_)};
+    return ValuesIn(array);
+  }
+
+ private:
+  // Declared private with no implementation - assignment is unsupported (all members are const).
+  void operator=(const ValueArray30& other);
+
+  const T1 v1_;  // v1_..v30_ are initialized from constructor arguments v1..v30, in order.
+  const T2 v2_;
+  const T3 v3_;
+  const T4 v4_;
+  const T5 v5_;
+  const T6 v6_;
+  const T7 v7_;
+  const T8 v8_;
+  const T9 v9_;
+  const T10 v10_;
+  const T11 v11_;
+  const T12 v12_;
+  const T13 v13_;
+  const T14 v14_;
+  const T15 v15_;
+  const T16 v16_;
+  const T17 v17_;
+  const T18 v18_;
+  const T19 v19_;
+  const T20 v20_;
+  const T21 v21_;
+  const T22 v22_;
+  const T23 v23_;
+  const T24 v24_;
+  const T25 v25_;
+  const T26 v26_;
+  const T27 v27_;
+  const T28 v28_;
+  const T29 v29_;
+  const T30 v30_;
+};
+
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+    typename T6, typename T7, typename T8, typename T9, typename T10,
+    typename T11, typename T12, typename T13, typename T14, typename T15,
+    typename T16, typename T17, typename T18, typename T19, typename T20,
+    typename T21, typename T22, typename T23, typename T24, typename T25,
+    typename T26, typename T27, typename T28, typename T29, typename T30,
+    typename T31>
+class ValueArray31 {  // Holds 31 const values (types T1..T31); convertible to ParamGenerator<T>.
+ public:
+  ValueArray31(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9,
+      T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15, T16 v16, T17 v17,
+      T18 v18, T19 v19, T20 v20, T21 v21, T22 v22, T23 v23, T24 v24, T25 v25,
+      T26 v26, T27 v27, T28 v28, T29 v29, T30 v30, T31 v31) : v1_(v1), v2_(v2),
+      v3_(v3), v4_(v4), v5_(v5), v6_(v6), v7_(v7), v8_(v8), v9_(v9), v10_(v10),
+      v11_(v11), v12_(v12), v13_(v13), v14_(v14), v15_(v15), v16_(v16),
+      v17_(v17), v18_(v18), v19_(v19), v20_(v20), v21_(v21), v22_(v22),
+      v23_(v23), v24_(v24), v25_(v25), v26_(v26), v27_(v27), v28_(v28),
+      v29_(v29), v30_(v30), v31_(v31) {}
+
+  template <typename T>
+  operator ParamGenerator<T>() const {  // static_casts each stored value to T and returns ValuesIn() of that array.
+    const T array[] = {static_cast<T>(v1_), static_cast<T>(v2_),
+        static_cast<T>(v3_), static_cast<T>(v4_), static_cast<T>(v5_),
+        static_cast<T>(v6_), static_cast<T>(v7_), static_cast<T>(v8_),
+        static_cast<T>(v9_), static_cast<T>(v10_), static_cast<T>(v11_),
+        static_cast<T>(v12_), static_cast<T>(v13_), static_cast<T>(v14_),
+        static_cast<T>(v15_), static_cast<T>(v16_), static_cast<T>(v17_),
+        static_cast<T>(v18_), static_cast<T>(v19_), static_cast<T>(v20_),
+        static_cast<T>(v21_), static_cast<T>(v22_), static_cast<T>(v23_),
+        static_cast<T>(v24_), static_cast<T>(v25_), static_cast<T>(v26_),
+        static_cast<T>(v27_), static_cast<T>(v28_), static_cast<T>(v29_),
+        static_cast<T>(v30_), static_cast<T>(v31_)};
+    return ValuesIn(array);
+  }
+
+ private:
+  // Declared private with no implementation - assignment is unsupported (all members are const).
+  void operator=(const ValueArray31& other);
+
+  const T1 v1_;  // v1_..v31_ are initialized from constructor arguments v1..v31, in order.
+  const T2 v2_;
+  const T3 v3_;
+  const T4 v4_;
+  const T5 v5_;
+  const T6 v6_;
+  const T7 v7_;
+  const T8 v8_;
+  const T9 v9_;
+  const T10 v10_;
+  const T11 v11_;
+  const T12 v12_;
+  const T13 v13_;
+  const T14 v14_;
+  const T15 v15_;
+  const T16 v16_;
+  const T17 v17_;
+  const T18 v18_;
+  const T19 v19_;
+  const T20 v20_;
+  const T21 v21_;
+  const T22 v22_;
+  const T23 v23_;
+  const T24 v24_;
+  const T25 v25_;
+  const T26 v26_;
+  const T27 v27_;
+  const T28 v28_;
+  const T29 v29_;
+  const T30 v30_;
+  const T31 v31_;
+};
+
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+    typename T6, typename T7, typename T8, typename T9, typename T10,
+    typename T11, typename T12, typename T13, typename T14, typename T15,
+    typename T16, typename T17, typename T18, typename T19, typename T20,
+    typename T21, typename T22, typename T23, typename T24, typename T25,
+    typename T26, typename T27, typename T28, typename T29, typename T30,
+    typename T31, typename T32>
+class ValueArray32 {  // Holds 32 const values (types T1..T32); convertible to ParamGenerator<T>.
+ public:
+  ValueArray32(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9,
+      T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15, T16 v16, T17 v17,
+      T18 v18, T19 v19, T20 v20, T21 v21, T22 v22, T23 v23, T24 v24, T25 v25,
+      T26 v26, T27 v27, T28 v28, T29 v29, T30 v30, T31 v31, T32 v32) : v1_(v1),
+      v2_(v2), v3_(v3), v4_(v4), v5_(v5), v6_(v6), v7_(v7), v8_(v8), v9_(v9),
+      v10_(v10), v11_(v11), v12_(v12), v13_(v13), v14_(v14), v15_(v15),
+      v16_(v16), v17_(v17), v18_(v18), v19_(v19), v20_(v20), v21_(v21),
+      v22_(v22), v23_(v23), v24_(v24), v25_(v25), v26_(v26), v27_(v27),
+      v28_(v28), v29_(v29), v30_(v30), v31_(v31), v32_(v32) {}
+
+  template <typename T>
+  operator ParamGenerator<T>() const {  // static_casts each stored value to T and returns ValuesIn() of that array.
+    const T array[] = {static_cast<T>(v1_), static_cast<T>(v2_),
+        static_cast<T>(v3_), static_cast<T>(v4_), static_cast<T>(v5_),
+        static_cast<T>(v6_), static_cast<T>(v7_), static_cast<T>(v8_),
+        static_cast<T>(v9_), static_cast<T>(v10_), static_cast<T>(v11_),
+        static_cast<T>(v12_), static_cast<T>(v13_), static_cast<T>(v14_),
+        static_cast<T>(v15_), static_cast<T>(v16_), static_cast<T>(v17_),
+        static_cast<T>(v18_), static_cast<T>(v19_), static_cast<T>(v20_),
+        static_cast<T>(v21_), static_cast<T>(v22_), static_cast<T>(v23_),
+        static_cast<T>(v24_), static_cast<T>(v25_), static_cast<T>(v26_),
+        static_cast<T>(v27_), static_cast<T>(v28_), static_cast<T>(v29_),
+        static_cast<T>(v30_), static_cast<T>(v31_), static_cast<T>(v32_)};
+    return ValuesIn(array);
+  }
+
+ private:
+  // Declared private with no implementation - assignment is unsupported (all members are const).
+  void operator=(const ValueArray32& other);
+
+  const T1 v1_;  // v1_..v32_ are initialized from constructor arguments v1..v32, in order.
+  const T2 v2_;
+  const T3 v3_;
+  const T4 v4_;
+  const T5 v5_;
+  const T6 v6_;
+  const T7 v7_;
+  const T8 v8_;
+  const T9 v9_;
+  const T10 v10_;
+  const T11 v11_;
+  const T12 v12_;
+  const T13 v13_;
+  const T14 v14_;
+  const T15 v15_;
+  const T16 v16_;
+  const T17 v17_;
+  const T18 v18_;
+  const T19 v19_;
+  const T20 v20_;
+  const T21 v21_;
+  const T22 v22_;
+  const T23 v23_;
+  const T24 v24_;
+  const T25 v25_;
+  const T26 v26_;
+  const T27 v27_;
+  const T28 v28_;
+  const T29 v29_;
+  const T30 v30_;
+  const T31 v31_;
+  const T32 v32_;
+};
+
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+    typename T6, typename T7, typename T8, typename T9, typename T10,
+    typename T11, typename T12, typename T13, typename T14, typename T15,
+    typename T16, typename T17, typename T18, typename T19, typename T20,
+    typename T21, typename T22, typename T23, typename T24, typename T25,
+    typename T26, typename T27, typename T28, typename T29, typename T30,
+    typename T31, typename T32, typename T33>
+class ValueArray33 {  // Holds 33 const values (types T1..T33); convertible to ParamGenerator<T>.
+ public:
+  ValueArray33(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9,
+      T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15, T16 v16, T17 v17,
+      T18 v18, T19 v19, T20 v20, T21 v21, T22 v22, T23 v23, T24 v24, T25 v25,
+      T26 v26, T27 v27, T28 v28, T29 v29, T30 v30, T31 v31, T32 v32,
+      T33 v33) : v1_(v1), v2_(v2), v3_(v3), v4_(v4), v5_(v5), v6_(v6), v7_(v7),
+      v8_(v8), v9_(v9), v10_(v10), v11_(v11), v12_(v12), v13_(v13), v14_(v14),
+      v15_(v15), v16_(v16), v17_(v17), v18_(v18), v19_(v19), v20_(v20),
+      v21_(v21), v22_(v22), v23_(v23), v24_(v24), v25_(v25), v26_(v26),
+      v27_(v27), v28_(v28), v29_(v29), v30_(v30), v31_(v31), v32_(v32),
+      v33_(v33) {}
+
+  template <typename T>
+  operator ParamGenerator<T>() const {  // static_casts each stored value to T and returns ValuesIn() of that array.
+    const T array[] = {static_cast<T>(v1_), static_cast<T>(v2_),
+        static_cast<T>(v3_), static_cast<T>(v4_), static_cast<T>(v5_),
+        static_cast<T>(v6_), static_cast<T>(v7_), static_cast<T>(v8_),
+        static_cast<T>(v9_), static_cast<T>(v10_), static_cast<T>(v11_),
+        static_cast<T>(v12_), static_cast<T>(v13_), static_cast<T>(v14_),
+        static_cast<T>(v15_), static_cast<T>(v16_), static_cast<T>(v17_),
+        static_cast<T>(v18_), static_cast<T>(v19_), static_cast<T>(v20_),
+        static_cast<T>(v21_), static_cast<T>(v22_), static_cast<T>(v23_),
+        static_cast<T>(v24_), static_cast<T>(v25_), static_cast<T>(v26_),
+        static_cast<T>(v27_), static_cast<T>(v28_), static_cast<T>(v29_),
+        static_cast<T>(v30_), static_cast<T>(v31_), static_cast<T>(v32_),
+        static_cast<T>(v33_)};
+    return ValuesIn(array);
+  }
+
+ private:
+  // Declared private with no implementation - assignment is unsupported (all members are const).
+  void operator=(const ValueArray33& other);
+
+  const T1 v1_;  // v1_..v33_ are initialized from constructor arguments v1..v33, in order.
+  const T2 v2_;
+  const T3 v3_;
+  const T4 v4_;
+  const T5 v5_;
+  const T6 v6_;
+  const T7 v7_;
+  const T8 v8_;
+  const T9 v9_;
+  const T10 v10_;
+  const T11 v11_;
+  const T12 v12_;
+  const T13 v13_;
+  const T14 v14_;
+  const T15 v15_;
+  const T16 v16_;
+  const T17 v17_;
+  const T18 v18_;
+  const T19 v19_;
+  const T20 v20_;
+  const T21 v21_;
+  const T22 v22_;
+  const T23 v23_;
+  const T24 v24_;
+  const T25 v25_;
+  const T26 v26_;
+  const T27 v27_;
+  const T28 v28_;
+  const T29 v29_;
+  const T30 v30_;
+  const T31 v31_;
+  const T32 v32_;
+  const T33 v33_;
+};
+
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+    typename T6, typename T7, typename T8, typename T9, typename T10,
+    typename T11, typename T12, typename T13, typename T14, typename T15,
+    typename T16, typename T17, typename T18, typename T19, typename T20,
+    typename T21, typename T22, typename T23, typename T24, typename T25,
+    typename T26, typename T27, typename T28, typename T29, typename T30,
+    typename T31, typename T32, typename T33, typename T34>
+class ValueArray34 {  // Holds 34 const values (types T1..T34); convertible to ParamGenerator<T>.
+ public:
+  ValueArray34(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9,
+      T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15, T16 v16, T17 v17,
+      T18 v18, T19 v19, T20 v20, T21 v21, T22 v22, T23 v23, T24 v24, T25 v25,
+      T26 v26, T27 v27, T28 v28, T29 v29, T30 v30, T31 v31, T32 v32, T33 v33,
+      T34 v34) : v1_(v1), v2_(v2), v3_(v3), v4_(v4), v5_(v5), v6_(v6), v7_(v7),
+      v8_(v8), v9_(v9), v10_(v10), v11_(v11), v12_(v12), v13_(v13), v14_(v14),
+      v15_(v15), v16_(v16), v17_(v17), v18_(v18), v19_(v19), v20_(v20),
+      v21_(v21), v22_(v22), v23_(v23), v24_(v24), v25_(v25), v26_(v26),
+      v27_(v27), v28_(v28), v29_(v29), v30_(v30), v31_(v31), v32_(v32),
+      v33_(v33), v34_(v34) {}
+
+  template <typename T>
+  operator ParamGenerator<T>() const {  // static_casts each stored value to T and returns ValuesIn() of that array.
+    const T array[] = {static_cast<T>(v1_), static_cast<T>(v2_),
+        static_cast<T>(v3_), static_cast<T>(v4_), static_cast<T>(v5_),
+        static_cast<T>(v6_), static_cast<T>(v7_), static_cast<T>(v8_),
+        static_cast<T>(v9_), static_cast<T>(v10_), static_cast<T>(v11_),
+        static_cast<T>(v12_), static_cast<T>(v13_), static_cast<T>(v14_),
+        static_cast<T>(v15_), static_cast<T>(v16_), static_cast<T>(v17_),
+        static_cast<T>(v18_), static_cast<T>(v19_), static_cast<T>(v20_),
+        static_cast<T>(v21_), static_cast<T>(v22_), static_cast<T>(v23_),
+        static_cast<T>(v24_), static_cast<T>(v25_), static_cast<T>(v26_),
+        static_cast<T>(v27_), static_cast<T>(v28_), static_cast<T>(v29_),
+        static_cast<T>(v30_), static_cast<T>(v31_), static_cast<T>(v32_),
+        static_cast<T>(v33_), static_cast<T>(v34_)};
+    return ValuesIn(array);
+  }
+
+ private:
+  // Declared private with no implementation - assignment is unsupported (all members are const).
+  void operator=(const ValueArray34& other);
+
+  const T1 v1_;  // v1_..v34_ are initialized from constructor arguments v1..v34, in order.
+  const T2 v2_;
+  const T3 v3_;
+  const T4 v4_;
+  const T5 v5_;
+  const T6 v6_;
+  const T7 v7_;
+  const T8 v8_;
+  const T9 v9_;
+  const T10 v10_;
+  const T11 v11_;
+  const T12 v12_;
+  const T13 v13_;
+  const T14 v14_;
+  const T15 v15_;
+  const T16 v16_;
+  const T17 v17_;
+  const T18 v18_;
+  const T19 v19_;
+  const T20 v20_;
+  const T21 v21_;
+  const T22 v22_;
+  const T23 v23_;
+  const T24 v24_;
+  const T25 v25_;
+  const T26 v26_;
+  const T27 v27_;
+  const T28 v28_;
+  const T29 v29_;
+  const T30 v30_;
+  const T31 v31_;
+  const T32 v32_;
+  const T33 v33_;
+  const T34 v34_;
+};
+
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+    typename T6, typename T7, typename T8, typename T9, typename T10,
+    typename T11, typename T12, typename T13, typename T14, typename T15,
+    typename T16, typename T17, typename T18, typename T19, typename T20,
+    typename T21, typename T22, typename T23, typename T24, typename T25,
+    typename T26, typename T27, typename T28, typename T29, typename T30,
+    typename T31, typename T32, typename T33, typename T34, typename T35>
+class ValueArray35 {  // Holds 35 const values (types T1..T35); convertible to ParamGenerator<T>.
+ public:
+  ValueArray35(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9,
+      T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15, T16 v16, T17 v17,
+      T18 v18, T19 v19, T20 v20, T21 v21, T22 v22, T23 v23, T24 v24, T25 v25,
+      T26 v26, T27 v27, T28 v28, T29 v29, T30 v30, T31 v31, T32 v32, T33 v33,
+      T34 v34, T35 v35) : v1_(v1), v2_(v2), v3_(v3), v4_(v4), v5_(v5), v6_(v6),
+      v7_(v7), v8_(v8), v9_(v9), v10_(v10), v11_(v11), v12_(v12), v13_(v13),
+      v14_(v14), v15_(v15), v16_(v16), v17_(v17), v18_(v18), v19_(v19),
+      v20_(v20), v21_(v21), v22_(v22), v23_(v23), v24_(v24), v25_(v25),
+      v26_(v26), v27_(v27), v28_(v28), v29_(v29), v30_(v30), v31_(v31),
+      v32_(v32), v33_(v33), v34_(v34), v35_(v35) {}
+
+  template <typename T>
+  operator ParamGenerator<T>() const {  // static_casts each stored value to T and returns ValuesIn() of that array.
+    const T array[] = {static_cast<T>(v1_), static_cast<T>(v2_),
+        static_cast<T>(v3_), static_cast<T>(v4_), static_cast<T>(v5_),
+        static_cast<T>(v6_), static_cast<T>(v7_), static_cast<T>(v8_),
+        static_cast<T>(v9_), static_cast<T>(v10_), static_cast<T>(v11_),
+        static_cast<T>(v12_), static_cast<T>(v13_), static_cast<T>(v14_),
+        static_cast<T>(v15_), static_cast<T>(v16_), static_cast<T>(v17_),
+        static_cast<T>(v18_), static_cast<T>(v19_), static_cast<T>(v20_),
+        static_cast<T>(v21_), static_cast<T>(v22_), static_cast<T>(v23_),
+        static_cast<T>(v24_), static_cast<T>(v25_), static_cast<T>(v26_),
+        static_cast<T>(v27_), static_cast<T>(v28_), static_cast<T>(v29_),
+        static_cast<T>(v30_), static_cast<T>(v31_), static_cast<T>(v32_),
+        static_cast<T>(v33_), static_cast<T>(v34_), static_cast<T>(v35_)};
+    return ValuesIn(array);
+  }
+
+ private:
+  // Declared private with no implementation - assignment is unsupported (all members are const).
+  void operator=(const ValueArray35& other);
+
+  const T1 v1_;  // v1_..v35_ are initialized from constructor arguments v1..v35, in order.
+  const T2 v2_;
+  const T3 v3_;
+  const T4 v4_;
+  const T5 v5_;
+  const T6 v6_;
+  const T7 v7_;
+  const T8 v8_;
+  const T9 v9_;
+  const T10 v10_;
+  const T11 v11_;
+  const T12 v12_;
+  const T13 v13_;
+  const T14 v14_;
+  const T15 v15_;
+  const T16 v16_;
+  const T17 v17_;
+  const T18 v18_;
+  const T19 v19_;
+  const T20 v20_;
+  const T21 v21_;
+  const T22 v22_;
+  const T23 v23_;
+  const T24 v24_;
+  const T25 v25_;
+  const T26 v26_;
+  const T27 v27_;
+  const T28 v28_;
+  const T29 v29_;
+  const T30 v30_;
+  const T31 v31_;
+  const T32 v32_;
+  const T33 v33_;
+  const T34 v34_;
+  const T35 v35_;
+};
+
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+    typename T6, typename T7, typename T8, typename T9, typename T10,
+    typename T11, typename T12, typename T13, typename T14, typename T15,
+    typename T16, typename T17, typename T18, typename T19, typename T20,
+    typename T21, typename T22, typename T23, typename T24, typename T25,
+    typename T26, typename T27, typename T28, typename T29, typename T30,
+    typename T31, typename T32, typename T33, typename T34, typename T35,
+    typename T36>
+class ValueArray36 {  // Holds 36 const values (types T1..T36); convertible to ParamGenerator<T>.
+ public:
+  ValueArray36(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9,
+      T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15, T16 v16, T17 v17,
+      T18 v18, T19 v19, T20 v20, T21 v21, T22 v22, T23 v23, T24 v24, T25 v25,
+      T26 v26, T27 v27, T28 v28, T29 v29, T30 v30, T31 v31, T32 v32, T33 v33,
+      T34 v34, T35 v35, T36 v36) : v1_(v1), v2_(v2), v3_(v3), v4_(v4), v5_(v5),
+      v6_(v6), v7_(v7), v8_(v8), v9_(v9), v10_(v10), v11_(v11), v12_(v12),
+      v13_(v13), v14_(v14), v15_(v15), v16_(v16), v17_(v17), v18_(v18),
+      v19_(v19), v20_(v20), v21_(v21), v22_(v22), v23_(v23), v24_(v24),
+      v25_(v25), v26_(v26), v27_(v27), v28_(v28), v29_(v29), v30_(v30),
+      v31_(v31), v32_(v32), v33_(v33), v34_(v34), v35_(v35), v36_(v36) {}
+
+  template <typename T>
+  operator ParamGenerator<T>() const {  // static_casts each stored value to T and returns ValuesIn() of that array.
+    const T array[] = {static_cast<T>(v1_), static_cast<T>(v2_),
+        static_cast<T>(v3_), static_cast<T>(v4_), static_cast<T>(v5_),
+        static_cast<T>(v6_), static_cast<T>(v7_), static_cast<T>(v8_),
+        static_cast<T>(v9_), static_cast<T>(v10_), static_cast<T>(v11_),
+        static_cast<T>(v12_), static_cast<T>(v13_), static_cast<T>(v14_),
+        static_cast<T>(v15_), static_cast<T>(v16_), static_cast<T>(v17_),
+        static_cast<T>(v18_), static_cast<T>(v19_), static_cast<T>(v20_),
+        static_cast<T>(v21_), static_cast<T>(v22_), static_cast<T>(v23_),
+        static_cast<T>(v24_), static_cast<T>(v25_), static_cast<T>(v26_),
+        static_cast<T>(v27_), static_cast<T>(v28_), static_cast<T>(v29_),
+        static_cast<T>(v30_), static_cast<T>(v31_), static_cast<T>(v32_),
+        static_cast<T>(v33_), static_cast<T>(v34_), static_cast<T>(v35_),
+        static_cast<T>(v36_)};
+    return ValuesIn(array);
+  }
+
+ private:
+  // Declared private with no implementation - assignment is unsupported (all members are const).
+  void operator=(const ValueArray36& other);
+
+  const T1 v1_;  // v1_..v36_ are initialized from constructor arguments v1..v36, in order.
+  const T2 v2_;
+  const T3 v3_;
+  const T4 v4_;
+  const T5 v5_;
+  const T6 v6_;
+  const T7 v7_;
+  const T8 v8_;
+  const T9 v9_;
+  const T10 v10_;
+  const T11 v11_;
+  const T12 v12_;
+  const T13 v13_;
+  const T14 v14_;
+  const T15 v15_;
+  const T16 v16_;
+  const T17 v17_;
+  const T18 v18_;
+  const T19 v19_;
+  const T20 v20_;
+  const T21 v21_;
+  const T22 v22_;
+  const T23 v23_;
+  const T24 v24_;
+  const T25 v25_;
+  const T26 v26_;
+  const T27 v27_;
+  const T28 v28_;
+  const T29 v29_;
+  const T30 v30_;
+  const T31 v31_;
+  const T32 v32_;
+  const T33 v33_;
+  const T34 v34_;
+  const T35 v35_;
+  const T36 v36_;
+};
+
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+    typename T6, typename T7, typename T8, typename T9, typename T10,
+    typename T11, typename T12, typename T13, typename T14, typename T15,
+    typename T16, typename T17, typename T18, typename T19, typename T20,
+    typename T21, typename T22, typename T23, typename T24, typename T25,
+    typename T26, typename T27, typename T28, typename T29, typename T30,
+    typename T31, typename T32, typename T33, typename T34, typename T35,
+    typename T36, typename T37>
+class ValueArray37 {  // Holds 37 const values (types T1..T37); convertible to ParamGenerator<T>.
+ public:
+  ValueArray37(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9,
+      T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15, T16 v16, T17 v17,
+      T18 v18, T19 v19, T20 v20, T21 v21, T22 v22, T23 v23, T24 v24, T25 v25,
+      T26 v26, T27 v27, T28 v28, T29 v29, T30 v30, T31 v31, T32 v32, T33 v33,
+      T34 v34, T35 v35, T36 v36, T37 v37) : v1_(v1), v2_(v2), v3_(v3), v4_(v4),
+      v5_(v5), v6_(v6), v7_(v7), v8_(v8), v9_(v9), v10_(v10), v11_(v11),
+      v12_(v12), v13_(v13), v14_(v14), v15_(v15), v16_(v16), v17_(v17),
+      v18_(v18), v19_(v19), v20_(v20), v21_(v21), v22_(v22), v23_(v23),
+      v24_(v24), v25_(v25), v26_(v26), v27_(v27), v28_(v28), v29_(v29),
+      v30_(v30), v31_(v31), v32_(v32), v33_(v33), v34_(v34), v35_(v35),
+      v36_(v36), v37_(v37) {}
+
+  template <typename T>
+  operator ParamGenerator<T>() const {  // static_casts each stored value to T and returns ValuesIn() of that array.
+    const T array[] = {static_cast<T>(v1_), static_cast<T>(v2_),
+        static_cast<T>(v3_), static_cast<T>(v4_), static_cast<T>(v5_),
+        static_cast<T>(v6_), static_cast<T>(v7_), static_cast<T>(v8_),
+        static_cast<T>(v9_), static_cast<T>(v10_), static_cast<T>(v11_),
+        static_cast<T>(v12_), static_cast<T>(v13_), static_cast<T>(v14_),
+        static_cast<T>(v15_), static_cast<T>(v16_), static_cast<T>(v17_),
+        static_cast<T>(v18_), static_cast<T>(v19_), static_cast<T>(v20_),
+        static_cast<T>(v21_), static_cast<T>(v22_), static_cast<T>(v23_),
+        static_cast<T>(v24_), static_cast<T>(v25_), static_cast<T>(v26_),
+        static_cast<T>(v27_), static_cast<T>(v28_), static_cast<T>(v29_),
+        static_cast<T>(v30_), static_cast<T>(v31_), static_cast<T>(v32_),
+        static_cast<T>(v33_), static_cast<T>(v34_), static_cast<T>(v35_),
+        static_cast<T>(v36_), static_cast<T>(v37_)};
+    return ValuesIn(array);
+  }
+
+ private:
+  // Declared private with no implementation - assignment is unsupported (all members are const).
+  void operator=(const ValueArray37& other);
+
+  const T1 v1_;  // v1_..v37_ are initialized from constructor arguments v1..v37, in order.
+  const T2 v2_;
+  const T3 v3_;
+  const T4 v4_;
+  const T5 v5_;
+  const T6 v6_;
+  const T7 v7_;
+  const T8 v8_;
+  const T9 v9_;
+  const T10 v10_;
+  const T11 v11_;
+  const T12 v12_;
+  const T13 v13_;
+  const T14 v14_;
+  const T15 v15_;
+  const T16 v16_;
+  const T17 v17_;
+  const T18 v18_;
+  const T19 v19_;
+  const T20 v20_;
+  const T21 v21_;
+  const T22 v22_;
+  const T23 v23_;
+  const T24 v24_;
+  const T25 v25_;
+  const T26 v26_;
+  const T27 v27_;
+  const T28 v28_;
+  const T29 v29_;
+  const T30 v30_;
+  const T31 v31_;
+  const T32 v32_;
+  const T33 v33_;
+  const T34 v34_;
+  const T35 v35_;
+  const T36 v36_;
+  const T37 v37_;
+};
+
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+    typename T6, typename T7, typename T8, typename T9, typename T10,
+    typename T11, typename T12, typename T13, typename T14, typename T15,
+    typename T16, typename T17, typename T18, typename T19, typename T20,
+    typename T21, typename T22, typename T23, typename T24, typename T25,
+    typename T26, typename T27, typename T28, typename T29, typename T30,
+    typename T31, typename T32, typename T33, typename T34, typename T35,
+    typename T36, typename T37, typename T38>
+class ValueArray38 {  // Holds 38 const values (types T1..T38); convertible to ParamGenerator<T>.
+ public:
+  ValueArray38(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9,
+      T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15, T16 v16, T17 v17,
+      T18 v18, T19 v19, T20 v20, T21 v21, T22 v22, T23 v23, T24 v24, T25 v25,
+      T26 v26, T27 v27, T28 v28, T29 v29, T30 v30, T31 v31, T32 v32, T33 v33,
+      T34 v34, T35 v35, T36 v36, T37 v37, T38 v38) : v1_(v1), v2_(v2), v3_(v3),
+      v4_(v4), v5_(v5), v6_(v6), v7_(v7), v8_(v8), v9_(v9), v10_(v10),
+      v11_(v11), v12_(v12), v13_(v13), v14_(v14), v15_(v15), v16_(v16),
+      v17_(v17), v18_(v18), v19_(v19), v20_(v20), v21_(v21), v22_(v22),
+      v23_(v23), v24_(v24), v25_(v25), v26_(v26), v27_(v27), v28_(v28),
+      v29_(v29), v30_(v30), v31_(v31), v32_(v32), v33_(v33), v34_(v34),
+      v35_(v35), v36_(v36), v37_(v37), v38_(v38) {}
+
+  template <typename T>
+  operator ParamGenerator<T>() const {  // static_casts each stored value to T and returns ValuesIn() of that array.
+    const T array[] = {static_cast<T>(v1_), static_cast<T>(v2_),
+        static_cast<T>(v3_), static_cast<T>(v4_), static_cast<T>(v5_),
+        static_cast<T>(v6_), static_cast<T>(v7_), static_cast<T>(v8_),
+        static_cast<T>(v9_), static_cast<T>(v10_), static_cast<T>(v11_),
+        static_cast<T>(v12_), static_cast<T>(v13_), static_cast<T>(v14_),
+        static_cast<T>(v15_), static_cast<T>(v16_), static_cast<T>(v17_),
+        static_cast<T>(v18_), static_cast<T>(v19_), static_cast<T>(v20_),
+        static_cast<T>(v21_), static_cast<T>(v22_), static_cast<T>(v23_),
+        static_cast<T>(v24_), static_cast<T>(v25_), static_cast<T>(v26_),
+        static_cast<T>(v27_), static_cast<T>(v28_), static_cast<T>(v29_),
+        static_cast<T>(v30_), static_cast<T>(v31_), static_cast<T>(v32_),
+        static_cast<T>(v33_), static_cast<T>(v34_), static_cast<T>(v35_),
+        static_cast<T>(v36_), static_cast<T>(v37_), static_cast<T>(v38_)};
+    return ValuesIn(array);
+  }
+
+ private:
+  // Declared private with no implementation - assignment is unsupported (all members are const).
+  void operator=(const ValueArray38& other);
+
+  const T1 v1_;  // v1_..v38_ are initialized from constructor arguments v1..v38, in order.
+  const T2 v2_;
+  const T3 v3_;
+  const T4 v4_;
+  const T5 v5_;
+  const T6 v6_;
+  const T7 v7_;
+  const T8 v8_;
+  const T9 v9_;
+  const T10 v10_;
+  const T11 v11_;
+  const T12 v12_;
+  const T13 v13_;
+  const T14 v14_;
+  const T15 v15_;
+  const T16 v16_;
+  const T17 v17_;
+  const T18 v18_;
+  const T19 v19_;
+  const T20 v20_;
+  const T21 v21_;
+  const T22 v22_;
+  const T23 v23_;
+  const T24 v24_;
+  const T25 v25_;
+  const T26 v26_;
+  const T27 v27_;
+  const T28 v28_;
+  const T29 v29_;
+  const T30 v30_;
+  const T31 v31_;
+  const T32 v32_;
+  const T33 v33_;
+  const T34 v34_;
+  const T35 v35_;
+  const T36 v36_;
+  const T37 v37_;
+  const T38 v38_;
+};
+
+// ValueArray39 stores 39 values and converts them into a ParamGenerator<T>.
+template <class T1, class T2, class T3, class T4, class T5,
+    class T6, class T7, class T8, class T9, class T10,
+    class T11, class T12, class T13, class T14, class T15,
+    class T16, class T17, class T18, class T19, class T20,
+    class T21, class T22, class T23, class T24, class T25,
+    class T26, class T27, class T28, class T29, class T30,
+    class T31, class T32, class T33, class T34, class T35,
+    class T36, class T37, class T38, class T39>
+class ValueArray39 {
+ public:
+  ValueArray39(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9,
+      T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15, T16 v16, T17 v17,
+      T18 v18, T19 v19, T20 v20, T21 v21, T22 v22, T23 v23, T24 v24, T25 v25,
+      T26 v26, T27 v27, T28 v28, T29 v29, T30 v30, T31 v31, T32 v32, T33 v33,
+      T34 v34, T35 v35, T36 v36, T37 v37, T38 v38, T39 v39) : v1_(v1), v2_(v2),
+      v3_(v3), v4_(v4), v5_(v5), v6_(v6), v7_(v7), v8_(v8), v9_(v9), v10_(v10),
+      v11_(v11), v12_(v12), v13_(v13), v14_(v14), v15_(v15), v16_(v16),
+      v17_(v17), v18_(v18), v19_(v19), v20_(v20), v21_(v21), v22_(v22),
+      v23_(v23), v24_(v24), v25_(v25), v26_(v26), v27_(v27), v28_(v28),
+      v29_(v29), v30_(v30), v31_(v31), v32_(v32), v33_(v33), v34_(v34),
+      v35_(v35), v36_(v36), v37_(v37), v38_(v38), v39_(v39) {}
+  // Casts each stored value to T and feeds the array to ValuesIn().
+  template <class T> operator ParamGenerator<T>() const {
+    const T values[] = {static_cast<T>(v1_), static_cast<T>(v2_),
+        static_cast<T>(v3_), static_cast<T>(v4_), static_cast<T>(v5_),
+        static_cast<T>(v6_), static_cast<T>(v7_), static_cast<T>(v8_),
+        static_cast<T>(v9_), static_cast<T>(v10_), static_cast<T>(v11_),
+        static_cast<T>(v12_), static_cast<T>(v13_), static_cast<T>(v14_),
+        static_cast<T>(v15_), static_cast<T>(v16_), static_cast<T>(v17_),
+        static_cast<T>(v18_), static_cast<T>(v19_), static_cast<T>(v20_),
+        static_cast<T>(v21_), static_cast<T>(v22_), static_cast<T>(v23_),
+        static_cast<T>(v24_), static_cast<T>(v25_), static_cast<T>(v26_),
+        static_cast<T>(v27_), static_cast<T>(v28_), static_cast<T>(v29_),
+        static_cast<T>(v30_), static_cast<T>(v31_), static_cast<T>(v32_),
+        static_cast<T>(v33_), static_cast<T>(v34_), static_cast<T>(v35_),
+        static_cast<T>(v36_), static_cast<T>(v37_), static_cast<T>(v38_),
+        static_cast<T>(v39_)};
+    return ValuesIn(values);
+  }
+
+ private:
+  // Declared only (no implementation): assignment is unsupported.
+  void operator=(const ValueArray39&);
+  // Values captured by the constructor, one member per argument.
+  T1 const v1_;
+  T2 const v2_;
+  T3 const v3_;
+  T4 const v4_;
+  T5 const v5_;
+  T6 const v6_;
+  T7 const v7_;
+  T8 const v8_;
+  T9 const v9_;
+  T10 const v10_;
+  T11 const v11_;
+  T12 const v12_;
+  T13 const v13_;
+  T14 const v14_;
+  T15 const v15_;
+  T16 const v16_;
+  T17 const v17_;
+  T18 const v18_;
+  T19 const v19_;
+  T20 const v20_;
+  T21 const v21_;
+  T22 const v22_;
+  T23 const v23_;
+  T24 const v24_;
+  T25 const v25_;
+  T26 const v26_;
+  T27 const v27_;
+  T28 const v28_;
+  T29 const v29_;
+  T30 const v30_;
+  T31 const v31_;
+  T32 const v32_;
+  T33 const v33_;
+  T34 const v34_;
+  T35 const v35_;
+  T36 const v36_;
+  T37 const v37_;
+  T38 const v38_;
+  T39 const v39_;
+};
+
+// ValueArray40 stores 40 values and converts them into a ParamGenerator<T>.
+template <class T1, class T2, class T3, class T4, class T5,
+    class T6, class T7, class T8, class T9, class T10,
+    class T11, class T12, class T13, class T14, class T15,
+    class T16, class T17, class T18, class T19, class T20,
+    class T21, class T22, class T23, class T24, class T25,
+    class T26, class T27, class T28, class T29, class T30,
+    class T31, class T32, class T33, class T34, class T35,
+    class T36, class T37, class T38, class T39, class T40>
+class ValueArray40 {
+ public:
+  ValueArray40(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9,
+      T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15, T16 v16, T17 v17,
+      T18 v18, T19 v19, T20 v20, T21 v21, T22 v22, T23 v23, T24 v24, T25 v25,
+      T26 v26, T27 v27, T28 v28, T29 v29, T30 v30, T31 v31, T32 v32, T33 v33,
+      T34 v34, T35 v35, T36 v36, T37 v37, T38 v38, T39 v39, T40 v40) : v1_(v1),
+      v2_(v2), v3_(v3), v4_(v4), v5_(v5), v6_(v6), v7_(v7), v8_(v8), v9_(v9),
+      v10_(v10), v11_(v11), v12_(v12), v13_(v13), v14_(v14), v15_(v15),
+      v16_(v16), v17_(v17), v18_(v18), v19_(v19), v20_(v20), v21_(v21),
+      v22_(v22), v23_(v23), v24_(v24), v25_(v25), v26_(v26), v27_(v27),
+      v28_(v28), v29_(v29), v30_(v30), v31_(v31), v32_(v32), v33_(v33),
+      v34_(v34), v35_(v35), v36_(v36), v37_(v37), v38_(v38), v39_(v39),
+      v40_(v40) {}
+  // Casts each stored value to T and feeds the array to ValuesIn().
+  template <class T> operator ParamGenerator<T>() const {
+    const T values[] = {static_cast<T>(v1_), static_cast<T>(v2_),
+        static_cast<T>(v3_), static_cast<T>(v4_), static_cast<T>(v5_),
+        static_cast<T>(v6_), static_cast<T>(v7_), static_cast<T>(v8_),
+        static_cast<T>(v9_), static_cast<T>(v10_), static_cast<T>(v11_),
+        static_cast<T>(v12_), static_cast<T>(v13_), static_cast<T>(v14_),
+        static_cast<T>(v15_), static_cast<T>(v16_), static_cast<T>(v17_),
+        static_cast<T>(v18_), static_cast<T>(v19_), static_cast<T>(v20_),
+        static_cast<T>(v21_), static_cast<T>(v22_), static_cast<T>(v23_),
+        static_cast<T>(v24_), static_cast<T>(v25_), static_cast<T>(v26_),
+        static_cast<T>(v27_), static_cast<T>(v28_), static_cast<T>(v29_),
+        static_cast<T>(v30_), static_cast<T>(v31_), static_cast<T>(v32_),
+        static_cast<T>(v33_), static_cast<T>(v34_), static_cast<T>(v35_),
+        static_cast<T>(v36_), static_cast<T>(v37_), static_cast<T>(v38_),
+        static_cast<T>(v39_), static_cast<T>(v40_)};
+    return ValuesIn(values);
+  }
+
+ private:
+  // Declared only (no implementation): assignment is unsupported.
+  void operator=(const ValueArray40&);
+  // Values captured by the constructor, one member per argument.
+  T1 const v1_;
+  T2 const v2_;
+  T3 const v3_;
+  T4 const v4_;
+  T5 const v5_;
+  T6 const v6_;
+  T7 const v7_;
+  T8 const v8_;
+  T9 const v9_;
+  T10 const v10_;
+  T11 const v11_;
+  T12 const v12_;
+  T13 const v13_;
+  T14 const v14_;
+  T15 const v15_;
+  T16 const v16_;
+  T17 const v17_;
+  T18 const v18_;
+  T19 const v19_;
+  T20 const v20_;
+  T21 const v21_;
+  T22 const v22_;
+  T23 const v23_;
+  T24 const v24_;
+  T25 const v25_;
+  T26 const v26_;
+  T27 const v27_;
+  T28 const v28_;
+  T29 const v29_;
+  T30 const v30_;
+  T31 const v31_;
+  T32 const v32_;
+  T33 const v33_;
+  T34 const v34_;
+  T35 const v35_;
+  T36 const v36_;
+  T37 const v37_;
+  T38 const v38_;
+  T39 const v39_;
+  T40 const v40_;
+};
+
+// ValueArray41 stores 41 values and converts them into a ParamGenerator<T>.
+template <class T1, class T2, class T3, class T4, class T5,
+    class T6, class T7, class T8, class T9, class T10,
+    class T11, class T12, class T13, class T14, class T15,
+    class T16, class T17, class T18, class T19, class T20,
+    class T21, class T22, class T23, class T24, class T25,
+    class T26, class T27, class T28, class T29, class T30,
+    class T31, class T32, class T33, class T34, class T35,
+    class T36, class T37, class T38, class T39, class T40,
+    class T41>
+class ValueArray41 {
+ public:
+  ValueArray41(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9,
+      T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15, T16 v16, T17 v17,
+      T18 v18, T19 v19, T20 v20, T21 v21, T22 v22, T23 v23, T24 v24, T25 v25,
+      T26 v26, T27 v27, T28 v28, T29 v29, T30 v30, T31 v31, T32 v32, T33 v33,
+      T34 v34, T35 v35, T36 v36, T37 v37, T38 v38, T39 v39, T40 v40,
+      T41 v41) : v1_(v1), v2_(v2), v3_(v3), v4_(v4), v5_(v5), v6_(v6), v7_(v7),
+      v8_(v8), v9_(v9), v10_(v10), v11_(v11), v12_(v12), v13_(v13), v14_(v14),
+      v15_(v15), v16_(v16), v17_(v17), v18_(v18), v19_(v19), v20_(v20),
+      v21_(v21), v22_(v22), v23_(v23), v24_(v24), v25_(v25), v26_(v26),
+      v27_(v27), v28_(v28), v29_(v29), v30_(v30), v31_(v31), v32_(v32),
+      v33_(v33), v34_(v34), v35_(v35), v36_(v36), v37_(v37), v38_(v38),
+      v39_(v39), v40_(v40), v41_(v41) {}
+  // Casts each stored value to T and feeds the array to ValuesIn().
+  template <class T> operator ParamGenerator<T>() const {
+    const T values[] = {static_cast<T>(v1_), static_cast<T>(v2_),
+        static_cast<T>(v3_), static_cast<T>(v4_), static_cast<T>(v5_),
+        static_cast<T>(v6_), static_cast<T>(v7_), static_cast<T>(v8_),
+        static_cast<T>(v9_), static_cast<T>(v10_), static_cast<T>(v11_),
+        static_cast<T>(v12_), static_cast<T>(v13_), static_cast<T>(v14_),
+        static_cast<T>(v15_), static_cast<T>(v16_), static_cast<T>(v17_),
+        static_cast<T>(v18_), static_cast<T>(v19_), static_cast<T>(v20_),
+        static_cast<T>(v21_), static_cast<T>(v22_), static_cast<T>(v23_),
+        static_cast<T>(v24_), static_cast<T>(v25_), static_cast<T>(v26_),
+        static_cast<T>(v27_), static_cast<T>(v28_), static_cast<T>(v29_),
+        static_cast<T>(v30_), static_cast<T>(v31_), static_cast<T>(v32_),
+        static_cast<T>(v33_), static_cast<T>(v34_), static_cast<T>(v35_),
+        static_cast<T>(v36_), static_cast<T>(v37_), static_cast<T>(v38_),
+        static_cast<T>(v39_), static_cast<T>(v40_), static_cast<T>(v41_)};
+    return ValuesIn(values);
+  }
+
+ private:
+  // Declared only (no implementation): assignment is unsupported.
+  void operator=(const ValueArray41&);
+  // Values captured by the constructor, one member per argument.
+  T1 const v1_;
+  T2 const v2_;
+  T3 const v3_;
+  T4 const v4_;
+  T5 const v5_;
+  T6 const v6_;
+  T7 const v7_;
+  T8 const v8_;
+  T9 const v9_;
+  T10 const v10_;
+  T11 const v11_;
+  T12 const v12_;
+  T13 const v13_;
+  T14 const v14_;
+  T15 const v15_;
+  T16 const v16_;
+  T17 const v17_;
+  T18 const v18_;
+  T19 const v19_;
+  T20 const v20_;
+  T21 const v21_;
+  T22 const v22_;
+  T23 const v23_;
+  T24 const v24_;
+  T25 const v25_;
+  T26 const v26_;
+  T27 const v27_;
+  T28 const v28_;
+  T29 const v29_;
+  T30 const v30_;
+  T31 const v31_;
+  T32 const v32_;
+  T33 const v33_;
+  T34 const v34_;
+  T35 const v35_;
+  T36 const v36_;
+  T37 const v37_;
+  T38 const v38_;
+  T39 const v39_;
+  T40 const v40_;
+  T41 const v41_;
+};
+
+// ValueArray42 stores 42 values and converts them into a ParamGenerator<T>.
+template <class T1, class T2, class T3, class T4, class T5,
+    class T6, class T7, class T8, class T9, class T10,
+    class T11, class T12, class T13, class T14, class T15,
+    class T16, class T17, class T18, class T19, class T20,
+    class T21, class T22, class T23, class T24, class T25,
+    class T26, class T27, class T28, class T29, class T30,
+    class T31, class T32, class T33, class T34, class T35,
+    class T36, class T37, class T38, class T39, class T40,
+    class T41, class T42>
+class ValueArray42 {
+ public:
+  ValueArray42(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9,
+      T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15, T16 v16, T17 v17,
+      T18 v18, T19 v19, T20 v20, T21 v21, T22 v22, T23 v23, T24 v24, T25 v25,
+      T26 v26, T27 v27, T28 v28, T29 v29, T30 v30, T31 v31, T32 v32, T33 v33,
+      T34 v34, T35 v35, T36 v36, T37 v37, T38 v38, T39 v39, T40 v40, T41 v41,
+      T42 v42) : v1_(v1), v2_(v2), v3_(v3), v4_(v4), v5_(v5), v6_(v6), v7_(v7),
+      v8_(v8), v9_(v9), v10_(v10), v11_(v11), v12_(v12), v13_(v13), v14_(v14),
+      v15_(v15), v16_(v16), v17_(v17), v18_(v18), v19_(v19), v20_(v20),
+      v21_(v21), v22_(v22), v23_(v23), v24_(v24), v25_(v25), v26_(v26),
+      v27_(v27), v28_(v28), v29_(v29), v30_(v30), v31_(v31), v32_(v32),
+      v33_(v33), v34_(v34), v35_(v35), v36_(v36), v37_(v37), v38_(v38),
+      v39_(v39), v40_(v40), v41_(v41), v42_(v42) {}
+  // Casts each stored value to T and feeds the array to ValuesIn().
+  template <class T> operator ParamGenerator<T>() const {
+    const T values[] = {static_cast<T>(v1_), static_cast<T>(v2_),
+        static_cast<T>(v3_), static_cast<T>(v4_), static_cast<T>(v5_),
+        static_cast<T>(v6_), static_cast<T>(v7_), static_cast<T>(v8_),
+        static_cast<T>(v9_), static_cast<T>(v10_), static_cast<T>(v11_),
+        static_cast<T>(v12_), static_cast<T>(v13_), static_cast<T>(v14_),
+        static_cast<T>(v15_), static_cast<T>(v16_), static_cast<T>(v17_),
+        static_cast<T>(v18_), static_cast<T>(v19_), static_cast<T>(v20_),
+        static_cast<T>(v21_), static_cast<T>(v22_), static_cast<T>(v23_),
+        static_cast<T>(v24_), static_cast<T>(v25_), static_cast<T>(v26_),
+        static_cast<T>(v27_), static_cast<T>(v28_), static_cast<T>(v29_),
+        static_cast<T>(v30_), static_cast<T>(v31_), static_cast<T>(v32_),
+        static_cast<T>(v33_), static_cast<T>(v34_), static_cast<T>(v35_),
+        static_cast<T>(v36_), static_cast<T>(v37_), static_cast<T>(v38_),
+        static_cast<T>(v39_), static_cast<T>(v40_), static_cast<T>(v41_),
+        static_cast<T>(v42_)};
+    return ValuesIn(values);
+  }
+
+ private:
+  // Declared only (no implementation): assignment is unsupported.
+  void operator=(const ValueArray42&);
+  // Values captured by the constructor, one member per argument.
+  T1 const v1_;
+  T2 const v2_;
+  T3 const v3_;
+  T4 const v4_;
+  T5 const v5_;
+  T6 const v6_;
+  T7 const v7_;
+  T8 const v8_;
+  T9 const v9_;
+  T10 const v10_;
+  T11 const v11_;
+  T12 const v12_;
+  T13 const v13_;
+  T14 const v14_;
+  T15 const v15_;
+  T16 const v16_;
+  T17 const v17_;
+  T18 const v18_;
+  T19 const v19_;
+  T20 const v20_;
+  T21 const v21_;
+  T22 const v22_;
+  T23 const v23_;
+  T24 const v24_;
+  T25 const v25_;
+  T26 const v26_;
+  T27 const v27_;
+  T28 const v28_;
+  T29 const v29_;
+  T30 const v30_;
+  T31 const v31_;
+  T32 const v32_;
+  T33 const v33_;
+  T34 const v34_;
+  T35 const v35_;
+  T36 const v36_;
+  T37 const v37_;
+  T38 const v38_;
+  T39 const v39_;
+  T40 const v40_;
+  T41 const v41_;
+  T42 const v42_;
+};
+
+// ValueArray43 stores 43 values and converts them into a ParamGenerator<T>.
+template <class T1, class T2, class T3, class T4, class T5,
+    class T6, class T7, class T8, class T9, class T10,
+    class T11, class T12, class T13, class T14, class T15,
+    class T16, class T17, class T18, class T19, class T20,
+    class T21, class T22, class T23, class T24, class T25,
+    class T26, class T27, class T28, class T29, class T30,
+    class T31, class T32, class T33, class T34, class T35,
+    class T36, class T37, class T38, class T39, class T40,
+    class T41, class T42, class T43>
+class ValueArray43 {
+ public:
+  ValueArray43(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9,
+      T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15, T16 v16, T17 v17,
+      T18 v18, T19 v19, T20 v20, T21 v21, T22 v22, T23 v23, T24 v24, T25 v25,
+      T26 v26, T27 v27, T28 v28, T29 v29, T30 v30, T31 v31, T32 v32, T33 v33,
+      T34 v34, T35 v35, T36 v36, T37 v37, T38 v38, T39 v39, T40 v40, T41 v41,
+      T42 v42, T43 v43) : v1_(v1), v2_(v2), v3_(v3), v4_(v4), v5_(v5), v6_(v6),
+      v7_(v7), v8_(v8), v9_(v9), v10_(v10), v11_(v11), v12_(v12), v13_(v13),
+      v14_(v14), v15_(v15), v16_(v16), v17_(v17), v18_(v18), v19_(v19),
+      v20_(v20), v21_(v21), v22_(v22), v23_(v23), v24_(v24), v25_(v25),
+      v26_(v26), v27_(v27), v28_(v28), v29_(v29), v30_(v30), v31_(v31),
+      v32_(v32), v33_(v33), v34_(v34), v35_(v35), v36_(v36), v37_(v37),
+      v38_(v38), v39_(v39), v40_(v40), v41_(v41), v42_(v42), v43_(v43) {}
+  // Casts each stored value to T and feeds the array to ValuesIn().
+  template <class T> operator ParamGenerator<T>() const {
+    const T values[] = {static_cast<T>(v1_), static_cast<T>(v2_),
+        static_cast<T>(v3_), static_cast<T>(v4_), static_cast<T>(v5_),
+        static_cast<T>(v6_), static_cast<T>(v7_), static_cast<T>(v8_),
+        static_cast<T>(v9_), static_cast<T>(v10_), static_cast<T>(v11_),
+        static_cast<T>(v12_), static_cast<T>(v13_), static_cast<T>(v14_),
+        static_cast<T>(v15_), static_cast<T>(v16_), static_cast<T>(v17_),
+        static_cast<T>(v18_), static_cast<T>(v19_), static_cast<T>(v20_),
+        static_cast<T>(v21_), static_cast<T>(v22_), static_cast<T>(v23_),
+        static_cast<T>(v24_), static_cast<T>(v25_), static_cast<T>(v26_),
+        static_cast<T>(v27_), static_cast<T>(v28_), static_cast<T>(v29_),
+        static_cast<T>(v30_), static_cast<T>(v31_), static_cast<T>(v32_),
+        static_cast<T>(v33_), static_cast<T>(v34_), static_cast<T>(v35_),
+        static_cast<T>(v36_), static_cast<T>(v37_), static_cast<T>(v38_),
+        static_cast<T>(v39_), static_cast<T>(v40_), static_cast<T>(v41_),
+        static_cast<T>(v42_), static_cast<T>(v43_)};
+    return ValuesIn(values);
+  }
+
+ private:
+  // Declared only (no implementation): assignment is unsupported.
+  void operator=(const ValueArray43&);
+  // Values captured by the constructor, one member per argument.
+  T1 const v1_;
+  T2 const v2_;
+  T3 const v3_;
+  T4 const v4_;
+  T5 const v5_;
+  T6 const v6_;
+  T7 const v7_;
+  T8 const v8_;
+  T9 const v9_;
+  T10 const v10_;
+  T11 const v11_;
+  T12 const v12_;
+  T13 const v13_;
+  T14 const v14_;
+  T15 const v15_;
+  T16 const v16_;
+  T17 const v17_;
+  T18 const v18_;
+  T19 const v19_;
+  T20 const v20_;
+  T21 const v21_;
+  T22 const v22_;
+  T23 const v23_;
+  T24 const v24_;
+  T25 const v25_;
+  T26 const v26_;
+  T27 const v27_;
+  T28 const v28_;
+  T29 const v29_;
+  T30 const v30_;
+  T31 const v31_;
+  T32 const v32_;
+  T33 const v33_;
+  T34 const v34_;
+  T35 const v35_;
+  T36 const v36_;
+  T37 const v37_;
+  T38 const v38_;
+  T39 const v39_;
+  T40 const v40_;
+  T41 const v41_;
+  T42 const v42_;
+  T43 const v43_;
+};
+
+// ValueArray44 stores 44 values and converts them into a ParamGenerator<T>.
+template <class T1, class T2, class T3, class T4, class T5,
+    class T6, class T7, class T8, class T9, class T10,
+    class T11, class T12, class T13, class T14, class T15,
+    class T16, class T17, class T18, class T19, class T20,
+    class T21, class T22, class T23, class T24, class T25,
+    class T26, class T27, class T28, class T29, class T30,
+    class T31, class T32, class T33, class T34, class T35,
+    class T36, class T37, class T38, class T39, class T40,
+    class T41, class T42, class T43, class T44>
+class ValueArray44 {
+ public:
+  ValueArray44(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9,
+      T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15, T16 v16, T17 v17,
+      T18 v18, T19 v19, T20 v20, T21 v21, T22 v22, T23 v23, T24 v24, T25 v25,
+      T26 v26, T27 v27, T28 v28, T29 v29, T30 v30, T31 v31, T32 v32, T33 v33,
+      T34 v34, T35 v35, T36 v36, T37 v37, T38 v38, T39 v39, T40 v40, T41 v41,
+      T42 v42, T43 v43, T44 v44) : v1_(v1), v2_(v2), v3_(v3), v4_(v4), v5_(v5),
+      v6_(v6), v7_(v7), v8_(v8), v9_(v9), v10_(v10), v11_(v11), v12_(v12),
+      v13_(v13), v14_(v14), v15_(v15), v16_(v16), v17_(v17), v18_(v18),
+      v19_(v19), v20_(v20), v21_(v21), v22_(v22), v23_(v23), v24_(v24),
+      v25_(v25), v26_(v26), v27_(v27), v28_(v28), v29_(v29), v30_(v30),
+      v31_(v31), v32_(v32), v33_(v33), v34_(v34), v35_(v35), v36_(v36),
+      v37_(v37), v38_(v38), v39_(v39), v40_(v40), v41_(v41), v42_(v42),
+      v43_(v43), v44_(v44) {}
+  // Casts each stored value to T and feeds the array to ValuesIn().
+  template <class T> operator ParamGenerator<T>() const {
+    const T values[] = {static_cast<T>(v1_), static_cast<T>(v2_),
+        static_cast<T>(v3_), static_cast<T>(v4_), static_cast<T>(v5_),
+        static_cast<T>(v6_), static_cast<T>(v7_), static_cast<T>(v8_),
+        static_cast<T>(v9_), static_cast<T>(v10_), static_cast<T>(v11_),
+        static_cast<T>(v12_), static_cast<T>(v13_), static_cast<T>(v14_),
+        static_cast<T>(v15_), static_cast<T>(v16_), static_cast<T>(v17_),
+        static_cast<T>(v18_), static_cast<T>(v19_), static_cast<T>(v20_),
+        static_cast<T>(v21_), static_cast<T>(v22_), static_cast<T>(v23_),
+        static_cast<T>(v24_), static_cast<T>(v25_), static_cast<T>(v26_),
+        static_cast<T>(v27_), static_cast<T>(v28_), static_cast<T>(v29_),
+        static_cast<T>(v30_), static_cast<T>(v31_), static_cast<T>(v32_),
+        static_cast<T>(v33_), static_cast<T>(v34_), static_cast<T>(v35_),
+        static_cast<T>(v36_), static_cast<T>(v37_), static_cast<T>(v38_),
+        static_cast<T>(v39_), static_cast<T>(v40_), static_cast<T>(v41_),
+        static_cast<T>(v42_), static_cast<T>(v43_), static_cast<T>(v44_)};
+    return ValuesIn(values);
+  }
+
+ private:
+  // Declared only (no implementation): assignment is unsupported.
+  void operator=(const ValueArray44&);
+  // Values captured by the constructor, one member per argument.
+  T1 const v1_;
+  T2 const v2_;
+  T3 const v3_;
+  T4 const v4_;
+  T5 const v5_;
+  T6 const v6_;
+  T7 const v7_;
+  T8 const v8_;
+  T9 const v9_;
+  T10 const v10_;
+  T11 const v11_;
+  T12 const v12_;
+  T13 const v13_;
+  T14 const v14_;
+  T15 const v15_;
+  T16 const v16_;
+  T17 const v17_;
+  T18 const v18_;
+  T19 const v19_;
+  T20 const v20_;
+  T21 const v21_;
+  T22 const v22_;
+  T23 const v23_;
+  T24 const v24_;
+  T25 const v25_;
+  T26 const v26_;
+  T27 const v27_;
+  T28 const v28_;
+  T29 const v29_;
+  T30 const v30_;
+  T31 const v31_;
+  T32 const v32_;
+  T33 const v33_;
+  T34 const v34_;
+  T35 const v35_;
+  T36 const v36_;
+  T37 const v37_;
+  T38 const v38_;
+  T39 const v39_;
+  T40 const v40_;
+  T41 const v41_;
+  T42 const v42_;
+  T43 const v43_;
+  T44 const v44_;
+};
+
+// ValueArray45 stores 45 values and converts them into a ParamGenerator<T>.
+template <class T1, class T2, class T3, class T4, class T5,
+    class T6, class T7, class T8, class T9, class T10,
+    class T11, class T12, class T13, class T14, class T15,
+    class T16, class T17, class T18, class T19, class T20,
+    class T21, class T22, class T23, class T24, class T25,
+    class T26, class T27, class T28, class T29, class T30,
+    class T31, class T32, class T33, class T34, class T35,
+    class T36, class T37, class T38, class T39, class T40,
+    class T41, class T42, class T43, class T44, class T45>
+class ValueArray45 {
+ public:
+  ValueArray45(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9,
+      T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15, T16 v16, T17 v17,
+      T18 v18, T19 v19, T20 v20, T21 v21, T22 v22, T23 v23, T24 v24, T25 v25,
+      T26 v26, T27 v27, T28 v28, T29 v29, T30 v30, T31 v31, T32 v32, T33 v33,
+      T34 v34, T35 v35, T36 v36, T37 v37, T38 v38, T39 v39, T40 v40, T41 v41,
+      T42 v42, T43 v43, T44 v44, T45 v45) : v1_(v1), v2_(v2), v3_(v3), v4_(v4),
+      v5_(v5), v6_(v6), v7_(v7), v8_(v8), v9_(v9), v10_(v10), v11_(v11),
+      v12_(v12), v13_(v13), v14_(v14), v15_(v15), v16_(v16), v17_(v17),
+      v18_(v18), v19_(v19), v20_(v20), v21_(v21), v22_(v22), v23_(v23),
+      v24_(v24), v25_(v25), v26_(v26), v27_(v27), v28_(v28), v29_(v29),
+      v30_(v30), v31_(v31), v32_(v32), v33_(v33), v34_(v34), v35_(v35),
+      v36_(v36), v37_(v37), v38_(v38), v39_(v39), v40_(v40), v41_(v41),
+      v42_(v42), v43_(v43), v44_(v44), v45_(v45) {}
+  // Casts each stored value to T and feeds the array to ValuesIn().
+  template <class T> operator ParamGenerator<T>() const {
+    const T values[] = {static_cast<T>(v1_), static_cast<T>(v2_),
+        static_cast<T>(v3_), static_cast<T>(v4_), static_cast<T>(v5_),
+        static_cast<T>(v6_), static_cast<T>(v7_), static_cast<T>(v8_),
+        static_cast<T>(v9_), static_cast<T>(v10_), static_cast<T>(v11_),
+        static_cast<T>(v12_), static_cast<T>(v13_), static_cast<T>(v14_),
+        static_cast<T>(v15_), static_cast<T>(v16_), static_cast<T>(v17_),
+        static_cast<T>(v18_), static_cast<T>(v19_), static_cast<T>(v20_),
+        static_cast<T>(v21_), static_cast<T>(v22_), static_cast<T>(v23_),
+        static_cast<T>(v24_), static_cast<T>(v25_), static_cast<T>(v26_),
+        static_cast<T>(v27_), static_cast<T>(v28_), static_cast<T>(v29_),
+        static_cast<T>(v30_), static_cast<T>(v31_), static_cast<T>(v32_),
+        static_cast<T>(v33_), static_cast<T>(v34_), static_cast<T>(v35_),
+        static_cast<T>(v36_), static_cast<T>(v37_), static_cast<T>(v38_),
+        static_cast<T>(v39_), static_cast<T>(v40_), static_cast<T>(v41_),
+        static_cast<T>(v42_), static_cast<T>(v43_), static_cast<T>(v44_),
+        static_cast<T>(v45_)};
+    return ValuesIn(values);
+  }
+
+ private:
+  // Declared only (no implementation): assignment is unsupported.
+  void operator=(const ValueArray45&);
+  // Values captured by the constructor, one member per argument.
+  T1 const v1_;
+  T2 const v2_;
+  T3 const v3_;
+  T4 const v4_;
+  T5 const v5_;
+  T6 const v6_;
+  T7 const v7_;
+  T8 const v8_;
+  T9 const v9_;
+  T10 const v10_;
+  T11 const v11_;
+  T12 const v12_;
+  T13 const v13_;
+  T14 const v14_;
+  T15 const v15_;
+  T16 const v16_;
+  T17 const v17_;
+  T18 const v18_;
+  T19 const v19_;
+  T20 const v20_;
+  T21 const v21_;
+  T22 const v22_;
+  T23 const v23_;
+  T24 const v24_;
+  T25 const v25_;
+  T26 const v26_;
+  T27 const v27_;
+  T28 const v28_;
+  T29 const v29_;
+  T30 const v30_;
+  T31 const v31_;
+  T32 const v32_;
+  T33 const v33_;
+  T34 const v34_;
+  T35 const v35_;
+  T36 const v36_;
+  T37 const v37_;
+  T38 const v38_;
+  T39 const v39_;
+  T40 const v40_;
+  T41 const v41_;
+  T42 const v42_;
+  T43 const v43_;
+  T44 const v44_;
+  T45 const v45_;
+};
+
+// ValueArray46 stores 46 values and converts them into a ParamGenerator<T>.
+template <class T1, class T2, class T3, class T4, class T5,
+    class T6, class T7, class T8, class T9, class T10,
+    class T11, class T12, class T13, class T14, class T15,
+    class T16, class T17, class T18, class T19, class T20,
+    class T21, class T22, class T23, class T24, class T25,
+    class T26, class T27, class T28, class T29, class T30,
+    class T31, class T32, class T33, class T34, class T35,
+    class T36, class T37, class T38, class T39, class T40,
+    class T41, class T42, class T43, class T44, class T45,
+    class T46>
+class ValueArray46 {
+ public:
+  ValueArray46(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9,
+      T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15, T16 v16, T17 v17,
+      T18 v18, T19 v19, T20 v20, T21 v21, T22 v22, T23 v23, T24 v24, T25 v25,
+      T26 v26, T27 v27, T28 v28, T29 v29, T30 v30, T31 v31, T32 v32, T33 v33,
+      T34 v34, T35 v35, T36 v36, T37 v37, T38 v38, T39 v39, T40 v40, T41 v41,
+      T42 v42, T43 v43, T44 v44, T45 v45, T46 v46) : v1_(v1), v2_(v2), v3_(v3),
+      v4_(v4), v5_(v5), v6_(v6), v7_(v7), v8_(v8), v9_(v9), v10_(v10),
+      v11_(v11), v12_(v12), v13_(v13), v14_(v14), v15_(v15), v16_(v16),
+      v17_(v17), v18_(v18), v19_(v19), v20_(v20), v21_(v21), v22_(v22),
+      v23_(v23), v24_(v24), v25_(v25), v26_(v26), v27_(v27), v28_(v28),
+      v29_(v29), v30_(v30), v31_(v31), v32_(v32), v33_(v33), v34_(v34),
+      v35_(v35), v36_(v36), v37_(v37), v38_(v38), v39_(v39), v40_(v40),
+      v41_(v41), v42_(v42), v43_(v43), v44_(v44), v45_(v45), v46_(v46) {}
+  // Casts each stored value to T and feeds the array to ValuesIn().
+  template <class T> operator ParamGenerator<T>() const {
+    const T values[] = {static_cast<T>(v1_), static_cast<T>(v2_),
+        static_cast<T>(v3_), static_cast<T>(v4_), static_cast<T>(v5_),
+        static_cast<T>(v6_), static_cast<T>(v7_), static_cast<T>(v8_),
+        static_cast<T>(v9_), static_cast<T>(v10_), static_cast<T>(v11_),
+        static_cast<T>(v12_), static_cast<T>(v13_), static_cast<T>(v14_),
+        static_cast<T>(v15_), static_cast<T>(v16_), static_cast<T>(v17_),
+        static_cast<T>(v18_), static_cast<T>(v19_), static_cast<T>(v20_),
+        static_cast<T>(v21_), static_cast<T>(v22_), static_cast<T>(v23_),
+        static_cast<T>(v24_), static_cast<T>(v25_), static_cast<T>(v26_),
+        static_cast<T>(v27_), static_cast<T>(v28_), static_cast<T>(v29_),
+        static_cast<T>(v30_), static_cast<T>(v31_), static_cast<T>(v32_),
+        static_cast<T>(v33_), static_cast<T>(v34_), static_cast<T>(v35_),
+        static_cast<T>(v36_), static_cast<T>(v37_), static_cast<T>(v38_),
+        static_cast<T>(v39_), static_cast<T>(v40_), static_cast<T>(v41_),
+        static_cast<T>(v42_), static_cast<T>(v43_), static_cast<T>(v44_),
+        static_cast<T>(v45_), static_cast<T>(v46_)};
+    return ValuesIn(values);
+  }
+
+ private:
+  // Declared only (no implementation): assignment is unsupported.
+  void operator=(const ValueArray46&);
+  // Values captured by the constructor, one member per argument.
+  T1 const v1_;
+  T2 const v2_;
+  T3 const v3_;
+  T4 const v4_;
+  T5 const v5_;
+  T6 const v6_;
+  T7 const v7_;
+  T8 const v8_;
+  T9 const v9_;
+  T10 const v10_;
+  T11 const v11_;
+  T12 const v12_;
+  T13 const v13_;
+  T14 const v14_;
+  T15 const v15_;
+  T16 const v16_;
+  T17 const v17_;
+  T18 const v18_;
+  T19 const v19_;
+  T20 const v20_;
+  T21 const v21_;
+  T22 const v22_;
+  T23 const v23_;
+  T24 const v24_;
+  T25 const v25_;
+  T26 const v26_;
+  T27 const v27_;
+  T28 const v28_;
+  T29 const v29_;
+  T30 const v30_;
+  T31 const v31_;
+  T32 const v32_;
+  T33 const v33_;
+  T34 const v34_;
+  T35 const v35_;
+  T36 const v36_;
+  T37 const v37_;
+  T38 const v38_;
+  T39 const v39_;
+  T40 const v40_;
+  T41 const v41_;
+  T42 const v42_;
+  T43 const v43_;
+  T44 const v44_;
+  T45 const v45_;
+  T46 const v46_;
+};
+
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+    typename T6, typename T7, typename T8, typename T9, typename T10,
+    typename T11, typename T12, typename T13, typename T14, typename T15,
+    typename T16, typename T17, typename T18, typename T19, typename T20,
+    typename T21, typename T22, typename T23, typename T24, typename T25,
+    typename T26, typename T27, typename T28, typename T29, typename T30,
+    typename T31, typename T32, typename T33, typename T34, typename T35,
+    typename T36, typename T37, typename T38, typename T39, typename T40,
+    typename T41, typename T42, typename T43, typename T44, typename T45,
+    typename T46, typename T47>
+class ValueArray47 {  // Holds 47 values of independently chosen types.
+ public:
+  ValueArray47(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9,
+      T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15, T16 v16, T17 v17,
+      T18 v18, T19 v19, T20 v20, T21 v21, T22 v22, T23 v23, T24 v24, T25 v25,
+      T26 v26, T27 v27, T28 v28, T29 v29, T30 v30, T31 v31, T32 v32, T33 v33,
+      T34 v34, T35 v35, T36 v36, T37 v37, T38 v38, T39 v39, T40 v40, T41 v41,
+      T42 v42, T43 v43, T44 v44, T45 v45, T46 v46, T47 v47) : v1_(v1), v2_(v2),
+      v3_(v3), v4_(v4), v5_(v5), v6_(v6), v7_(v7), v8_(v8), v9_(v9), v10_(v10),
+      v11_(v11), v12_(v12), v13_(v13), v14_(v14), v15_(v15), v16_(v16),
+      v17_(v17), v18_(v18), v19_(v19), v20_(v20), v21_(v21), v22_(v22),
+      v23_(v23), v24_(v24), v25_(v25), v26_(v26), v27_(v27), v28_(v28),
+      v29_(v29), v30_(v30), v31_(v31), v32_(v32), v33_(v33), v34_(v34),
+      v35_(v35), v36_(v36), v37_(v37), v38_(v38), v39_(v39), v40_(v40),
+      v41_(v41), v42_(v42), v43_(v43), v44_(v44), v45_(v45), v46_(v46),
+      v47_(v47) {}
+  // Casts each stored value to T, in order, and passes them to ValuesIn().
+  template <typename T>
+  operator ParamGenerator<T>() const {
+    const T array[] = {static_cast<T>(v1_), static_cast<T>(v2_),
+        static_cast<T>(v3_), static_cast<T>(v4_), static_cast<T>(v5_),
+        static_cast<T>(v6_), static_cast<T>(v7_), static_cast<T>(v8_),
+        static_cast<T>(v9_), static_cast<T>(v10_), static_cast<T>(v11_),
+        static_cast<T>(v12_), static_cast<T>(v13_), static_cast<T>(v14_),
+        static_cast<T>(v15_), static_cast<T>(v16_), static_cast<T>(v17_),
+        static_cast<T>(v18_), static_cast<T>(v19_), static_cast<T>(v20_),
+        static_cast<T>(v21_), static_cast<T>(v22_), static_cast<T>(v23_),
+        static_cast<T>(v24_), static_cast<T>(v25_), static_cast<T>(v26_),
+        static_cast<T>(v27_), static_cast<T>(v28_), static_cast<T>(v29_),
+        static_cast<T>(v30_), static_cast<T>(v31_), static_cast<T>(v32_),
+        static_cast<T>(v33_), static_cast<T>(v34_), static_cast<T>(v35_),
+        static_cast<T>(v36_), static_cast<T>(v37_), static_cast<T>(v38_),
+        static_cast<T>(v39_), static_cast<T>(v40_), static_cast<T>(v41_),
+        static_cast<T>(v42_), static_cast<T>(v43_), static_cast<T>(v44_),
+        static_cast<T>(v45_), static_cast<T>(v46_), static_cast<T>(v47_)};
+    return ValuesIn(array);
+  }
+
+ private:
+  // Declared only (no implementation) - assignment is unsupported.
+  void operator=(const ValueArray47& other);
+  // Const copies of the constructor arguments, in argument order.
+  const T1 v1_;
+  const T2 v2_;
+  const T3 v3_;
+  const T4 v4_;
+  const T5 v5_;
+  const T6 v6_;
+  const T7 v7_;
+  const T8 v8_;
+  const T9 v9_;
+  const T10 v10_;
+  const T11 v11_;
+  const T12 v12_;
+  const T13 v13_;
+  const T14 v14_;
+  const T15 v15_;
+  const T16 v16_;
+  const T17 v17_;
+  const T18 v18_;
+  const T19 v19_;
+  const T20 v20_;
+  const T21 v21_;
+  const T22 v22_;
+  const T23 v23_;
+  const T24 v24_;
+  const T25 v25_;
+  const T26 v26_;
+  const T27 v27_;
+  const T28 v28_;
+  const T29 v29_;
+  const T30 v30_;
+  const T31 v31_;
+  const T32 v32_;
+  const T33 v33_;
+  const T34 v34_;
+  const T35 v35_;
+  const T36 v36_;
+  const T37 v37_;
+  const T38 v38_;
+  const T39 v39_;
+  const T40 v40_;
+  const T41 v41_;
+  const T42 v42_;
+  const T43 v43_;
+  const T44 v44_;
+  const T45 v45_;
+  const T46 v46_;
+  const T47 v47_;
+};
+
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+    typename T6, typename T7, typename T8, typename T9, typename T10,
+    typename T11, typename T12, typename T13, typename T14, typename T15,
+    typename T16, typename T17, typename T18, typename T19, typename T20,
+    typename T21, typename T22, typename T23, typename T24, typename T25,
+    typename T26, typename T27, typename T28, typename T29, typename T30,
+    typename T31, typename T32, typename T33, typename T34, typename T35,
+    typename T36, typename T37, typename T38, typename T39, typename T40,
+    typename T41, typename T42, typename T43, typename T44, typename T45,
+    typename T46, typename T47, typename T48>
+class ValueArray48 {  // Holds 48 values of independently chosen types.
+ public:
+  ValueArray48(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9,
+      T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15, T16 v16, T17 v17,
+      T18 v18, T19 v19, T20 v20, T21 v21, T22 v22, T23 v23, T24 v24, T25 v25,
+      T26 v26, T27 v27, T28 v28, T29 v29, T30 v30, T31 v31, T32 v32, T33 v33,
+      T34 v34, T35 v35, T36 v36, T37 v37, T38 v38, T39 v39, T40 v40, T41 v41,
+      T42 v42, T43 v43, T44 v44, T45 v45, T46 v46, T47 v47, T48 v48) : v1_(v1),
+      v2_(v2), v3_(v3), v4_(v4), v5_(v5), v6_(v6), v7_(v7), v8_(v8), v9_(v9),
+      v10_(v10), v11_(v11), v12_(v12), v13_(v13), v14_(v14), v15_(v15),
+      v16_(v16), v17_(v17), v18_(v18), v19_(v19), v20_(v20), v21_(v21),
+      v22_(v22), v23_(v23), v24_(v24), v25_(v25), v26_(v26), v27_(v27),
+      v28_(v28), v29_(v29), v30_(v30), v31_(v31), v32_(v32), v33_(v33),
+      v34_(v34), v35_(v35), v36_(v36), v37_(v37), v38_(v38), v39_(v39),
+      v40_(v40), v41_(v41), v42_(v42), v43_(v43), v44_(v44), v45_(v45),
+      v46_(v46), v47_(v47), v48_(v48) {}
+  // Casts each stored value to T, in order, and passes them to ValuesIn().
+  template <typename T>
+  operator ParamGenerator<T>() const {
+    const T array[] = {static_cast<T>(v1_), static_cast<T>(v2_),
+        static_cast<T>(v3_), static_cast<T>(v4_), static_cast<T>(v5_),
+        static_cast<T>(v6_), static_cast<T>(v7_), static_cast<T>(v8_),
+        static_cast<T>(v9_), static_cast<T>(v10_), static_cast<T>(v11_),
+        static_cast<T>(v12_), static_cast<T>(v13_), static_cast<T>(v14_),
+        static_cast<T>(v15_), static_cast<T>(v16_), static_cast<T>(v17_),
+        static_cast<T>(v18_), static_cast<T>(v19_), static_cast<T>(v20_),
+        static_cast<T>(v21_), static_cast<T>(v22_), static_cast<T>(v23_),
+        static_cast<T>(v24_), static_cast<T>(v25_), static_cast<T>(v26_),
+        static_cast<T>(v27_), static_cast<T>(v28_), static_cast<T>(v29_),
+        static_cast<T>(v30_), static_cast<T>(v31_), static_cast<T>(v32_),
+        static_cast<T>(v33_), static_cast<T>(v34_), static_cast<T>(v35_),
+        static_cast<T>(v36_), static_cast<T>(v37_), static_cast<T>(v38_),
+        static_cast<T>(v39_), static_cast<T>(v40_), static_cast<T>(v41_),
+        static_cast<T>(v42_), static_cast<T>(v43_), static_cast<T>(v44_),
+        static_cast<T>(v45_), static_cast<T>(v46_), static_cast<T>(v47_),
+        static_cast<T>(v48_)};
+    return ValuesIn(array);
+  }
+
+ private:
+  // Declared only (no implementation) - assignment is unsupported.
+  void operator=(const ValueArray48& other);
+  // Const copies of the constructor arguments, in argument order.
+  const T1 v1_;
+  const T2 v2_;
+  const T3 v3_;
+  const T4 v4_;
+  const T5 v5_;
+  const T6 v6_;
+  const T7 v7_;
+  const T8 v8_;
+  const T9 v9_;
+  const T10 v10_;
+  const T11 v11_;
+  const T12 v12_;
+  const T13 v13_;
+  const T14 v14_;
+  const T15 v15_;
+  const T16 v16_;
+  const T17 v17_;
+  const T18 v18_;
+  const T19 v19_;
+  const T20 v20_;
+  const T21 v21_;
+  const T22 v22_;
+  const T23 v23_;
+  const T24 v24_;
+  const T25 v25_;
+  const T26 v26_;
+  const T27 v27_;
+  const T28 v28_;
+  const T29 v29_;
+  const T30 v30_;
+  const T31 v31_;
+  const T32 v32_;
+  const T33 v33_;
+  const T34 v34_;
+  const T35 v35_;
+  const T36 v36_;
+  const T37 v37_;
+  const T38 v38_;
+  const T39 v39_;
+  const T40 v40_;
+  const T41 v41_;
+  const T42 v42_;
+  const T43 v43_;
+  const T44 v44_;
+  const T45 v45_;
+  const T46 v46_;
+  const T47 v47_;
+  const T48 v48_;
+};
+
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+    typename T6, typename T7, typename T8, typename T9, typename T10,
+    typename T11, typename T12, typename T13, typename T14, typename T15,
+    typename T16, typename T17, typename T18, typename T19, typename T20,
+    typename T21, typename T22, typename T23, typename T24, typename T25,
+    typename T26, typename T27, typename T28, typename T29, typename T30,
+    typename T31, typename T32, typename T33, typename T34, typename T35,
+    typename T36, typename T37, typename T38, typename T39, typename T40,
+    typename T41, typename T42, typename T43, typename T44, typename T45,
+    typename T46, typename T47, typename T48, typename T49>
+class ValueArray49 {  // Holds 49 values of independently chosen types.
+ public:
+  ValueArray49(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9,
+      T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15, T16 v16, T17 v17,
+      T18 v18, T19 v19, T20 v20, T21 v21, T22 v22, T23 v23, T24 v24, T25 v25,
+      T26 v26, T27 v27, T28 v28, T29 v29, T30 v30, T31 v31, T32 v32, T33 v33,
+      T34 v34, T35 v35, T36 v36, T37 v37, T38 v38, T39 v39, T40 v40, T41 v41,
+      T42 v42, T43 v43, T44 v44, T45 v45, T46 v46, T47 v47, T48 v48,
+      T49 v49) : v1_(v1), v2_(v2), v3_(v3), v4_(v4), v5_(v5), v6_(v6), v7_(v7),
+      v8_(v8), v9_(v9), v10_(v10), v11_(v11), v12_(v12), v13_(v13), v14_(v14),
+      v15_(v15), v16_(v16), v17_(v17), v18_(v18), v19_(v19), v20_(v20),
+      v21_(v21), v22_(v22), v23_(v23), v24_(v24), v25_(v25), v26_(v26),
+      v27_(v27), v28_(v28), v29_(v29), v30_(v30), v31_(v31), v32_(v32),
+      v33_(v33), v34_(v34), v35_(v35), v36_(v36), v37_(v37), v38_(v38),
+      v39_(v39), v40_(v40), v41_(v41), v42_(v42), v43_(v43), v44_(v44),
+      v45_(v45), v46_(v46), v47_(v47), v48_(v48), v49_(v49) {}
+  // Casts each stored value to T, in order, and passes them to ValuesIn().
+  template <typename T>
+  operator ParamGenerator<T>() const {
+    const T array[] = {static_cast<T>(v1_), static_cast<T>(v2_),
+        static_cast<T>(v3_), static_cast<T>(v4_), static_cast<T>(v5_),
+        static_cast<T>(v6_), static_cast<T>(v7_), static_cast<T>(v8_),
+        static_cast<T>(v9_), static_cast<T>(v10_), static_cast<T>(v11_),
+        static_cast<T>(v12_), static_cast<T>(v13_), static_cast<T>(v14_),
+        static_cast<T>(v15_), static_cast<T>(v16_), static_cast<T>(v17_),
+        static_cast<T>(v18_), static_cast<T>(v19_), static_cast<T>(v20_),
+        static_cast<T>(v21_), static_cast<T>(v22_), static_cast<T>(v23_),
+        static_cast<T>(v24_), static_cast<T>(v25_), static_cast<T>(v26_),
+        static_cast<T>(v27_), static_cast<T>(v28_), static_cast<T>(v29_),
+        static_cast<T>(v30_), static_cast<T>(v31_), static_cast<T>(v32_),
+        static_cast<T>(v33_), static_cast<T>(v34_), static_cast<T>(v35_),
+        static_cast<T>(v36_), static_cast<T>(v37_), static_cast<T>(v38_),
+        static_cast<T>(v39_), static_cast<T>(v40_), static_cast<T>(v41_),
+        static_cast<T>(v42_), static_cast<T>(v43_), static_cast<T>(v44_),
+        static_cast<T>(v45_), static_cast<T>(v46_), static_cast<T>(v47_),
+        static_cast<T>(v48_), static_cast<T>(v49_)};
+    return ValuesIn(array);
+  }
+
+ private:
+  // Declared only (no implementation) - assignment is unsupported.
+  void operator=(const ValueArray49& other);
+  // Const copies of the constructor arguments, in argument order.
+  const T1 v1_;
+  const T2 v2_;
+  const T3 v3_;
+  const T4 v4_;
+  const T5 v5_;
+  const T6 v6_;
+  const T7 v7_;
+  const T8 v8_;
+  const T9 v9_;
+  const T10 v10_;
+  const T11 v11_;
+  const T12 v12_;
+  const T13 v13_;
+  const T14 v14_;
+  const T15 v15_;
+  const T16 v16_;
+  const T17 v17_;
+  const T18 v18_;
+  const T19 v19_;
+  const T20 v20_;
+  const T21 v21_;
+  const T22 v22_;
+  const T23 v23_;
+  const T24 v24_;
+  const T25 v25_;
+  const T26 v26_;
+  const T27 v27_;
+  const T28 v28_;
+  const T29 v29_;
+  const T30 v30_;
+  const T31 v31_;
+  const T32 v32_;
+  const T33 v33_;
+  const T34 v34_;
+  const T35 v35_;
+  const T36 v36_;
+  const T37 v37_;
+  const T38 v38_;
+  const T39 v39_;
+  const T40 v40_;
+  const T41 v41_;
+  const T42 v42_;
+  const T43 v43_;
+  const T44 v44_;
+  const T45 v45_;
+  const T46 v46_;
+  const T47 v47_;
+  const T48 v48_;
+  const T49 v49_;
+};
+
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+    typename T6, typename T7, typename T8, typename T9, typename T10,
+    typename T11, typename T12, typename T13, typename T14, typename T15,
+    typename T16, typename T17, typename T18, typename T19, typename T20,
+    typename T21, typename T22, typename T23, typename T24, typename T25,
+    typename T26, typename T27, typename T28, typename T29, typename T30,
+    typename T31, typename T32, typename T33, typename T34, typename T35,
+    typename T36, typename T37, typename T38, typename T39, typename T40,
+    typename T41, typename T42, typename T43, typename T44, typename T45,
+    typename T46, typename T47, typename T48, typename T49, typename T50>
+class ValueArray50 {  // Holds 50 values of independently chosen types.
+ public:
+  ValueArray50(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9,
+      T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15, T16 v16, T17 v17,
+      T18 v18, T19 v19, T20 v20, T21 v21, T22 v22, T23 v23, T24 v24, T25 v25,
+      T26 v26, T27 v27, T28 v28, T29 v29, T30 v30, T31 v31, T32 v32, T33 v33,
+      T34 v34, T35 v35, T36 v36, T37 v37, T38 v38, T39 v39, T40 v40, T41 v41,
+      T42 v42, T43 v43, T44 v44, T45 v45, T46 v46, T47 v47, T48 v48, T49 v49,
+      T50 v50) : v1_(v1), v2_(v2), v3_(v3), v4_(v4), v5_(v5), v6_(v6), v7_(v7),
+      v8_(v8), v9_(v9), v10_(v10), v11_(v11), v12_(v12), v13_(v13), v14_(v14),
+      v15_(v15), v16_(v16), v17_(v17), v18_(v18), v19_(v19), v20_(v20),
+      v21_(v21), v22_(v22), v23_(v23), v24_(v24), v25_(v25), v26_(v26),
+      v27_(v27), v28_(v28), v29_(v29), v30_(v30), v31_(v31), v32_(v32),
+      v33_(v33), v34_(v34), v35_(v35), v36_(v36), v37_(v37), v38_(v38),
+      v39_(v39), v40_(v40), v41_(v41), v42_(v42), v43_(v43), v44_(v44),
+      v45_(v45), v46_(v46), v47_(v47), v48_(v48), v49_(v49), v50_(v50) {}
+  // Casts each stored value to T, in order, and passes them to ValuesIn().
+  template <typename T>
+  operator ParamGenerator<T>() const {
+    const T array[] = {static_cast<T>(v1_), static_cast<T>(v2_),
+        static_cast<T>(v3_), static_cast<T>(v4_), static_cast<T>(v5_),
+        static_cast<T>(v6_), static_cast<T>(v7_), static_cast<T>(v8_),
+        static_cast<T>(v9_), static_cast<T>(v10_), static_cast<T>(v11_),
+        static_cast<T>(v12_), static_cast<T>(v13_), static_cast<T>(v14_),
+        static_cast<T>(v15_), static_cast<T>(v16_), static_cast<T>(v17_),
+        static_cast<T>(v18_), static_cast<T>(v19_), static_cast<T>(v20_),
+        static_cast<T>(v21_), static_cast<T>(v22_), static_cast<T>(v23_),
+        static_cast<T>(v24_), static_cast<T>(v25_), static_cast<T>(v26_),
+        static_cast<T>(v27_), static_cast<T>(v28_), static_cast<T>(v29_),
+        static_cast<T>(v30_), static_cast<T>(v31_), static_cast<T>(v32_),
+        static_cast<T>(v33_), static_cast<T>(v34_), static_cast<T>(v35_),
+        static_cast<T>(v36_), static_cast<T>(v37_), static_cast<T>(v38_),
+        static_cast<T>(v39_), static_cast<T>(v40_), static_cast<T>(v41_),
+        static_cast<T>(v42_), static_cast<T>(v43_), static_cast<T>(v44_),
+        static_cast<T>(v45_), static_cast<T>(v46_), static_cast<T>(v47_),
+        static_cast<T>(v48_), static_cast<T>(v49_), static_cast<T>(v50_)};
+    return ValuesIn(array);
+  }
+
+ private:
+  // Declared only (no implementation) - assignment is unsupported.
+  void operator=(const ValueArray50& other);
+  // Const copies of the constructor arguments, in argument order.
+  const T1 v1_;
+  const T2 v2_;
+  const T3 v3_;
+  const T4 v4_;
+  const T5 v5_;
+  const T6 v6_;
+  const T7 v7_;
+  const T8 v8_;
+  const T9 v9_;
+  const T10 v10_;
+  const T11 v11_;
+  const T12 v12_;
+  const T13 v13_;
+  const T14 v14_;
+  const T15 v15_;
+  const T16 v16_;
+  const T17 v17_;
+  const T18 v18_;
+  const T19 v19_;
+  const T20 v20_;
+  const T21 v21_;
+  const T22 v22_;
+  const T23 v23_;
+  const T24 v24_;
+  const T25 v25_;
+  const T26 v26_;
+  const T27 v27_;
+  const T28 v28_;
+  const T29 v29_;
+  const T30 v30_;
+  const T31 v31_;
+  const T32 v32_;
+  const T33 v33_;
+  const T34 v34_;
+  const T35 v35_;
+  const T36 v36_;
+  const T37 v37_;
+  const T38 v38_;
+  const T39 v39_;
+  const T40 v40_;
+  const T41 v41_;
+  const T42 v42_;
+  const T43 v43_;
+  const T44 v44_;
+  const T45 v45_;
+  const T46 v46_;
+  const T47 v47_;
+  const T48 v48_;
+  const T49 v49_;
+  const T50 v50_;
+};
+
+# if GTEST_HAS_COMBINE
+// INTERNAL IMPLEMENTATION - DO NOT USE IN USER CODE.
+//
+// Generates ::std::tr1::tuple values from the Cartesian product of the values
+// produced by the argument generators. The last generator varies fastest,
+// and iteration ends as soon as any component iterator reaches its end.
+template <typename T1, typename T2>
+class CartesianProductGenerator2
+    : public ParamGeneratorInterface< ::std::tr1::tuple<T1, T2> > {
+ public:
+  typedef ::std::tr1::tuple<T1, T2> ParamType;
+  // Stores private copies of the component generators.
+  CartesianProductGenerator2(const ParamGenerator<T1>& g1,
+      const ParamGenerator<T2>& g2)
+      : g1_(g1), g2_(g2) {}
+  virtual ~CartesianProductGenerator2() {}
+  // Begin() and End() each return a newly allocated Iterator.
+  virtual ParamIteratorInterface<ParamType>* Begin() const {
+    return new Iterator(this, g1_, g1_.begin(), g2_, g2_.begin());
+  }
+  virtual ParamIteratorInterface<ParamType>* End() const {
+    return new Iterator(this, g1_, g1_.end(), g2_, g2_.end());
+  }
+
+ private:
+  class Iterator : public ParamIteratorInterface<ParamType> {
+   public:
+    Iterator(const ParamGeneratorInterface<ParamType>* base,
+      const ParamGenerator<T1>& g1,
+      const typename ParamGenerator<T1>::iterator& current1,
+      const ParamGenerator<T2>& g2,
+      const typename ParamGenerator<T2>::iterator& current2)
+        : base_(base),
+          begin1_(g1.begin()), end1_(g1.end()), current1_(current1),
+          begin2_(g2.begin()), end2_(g2.end()), current2_(current2)    {
+      ComputeCurrentValue();
+    }
+    virtual ~Iterator() {}
+
+    virtual const ParamGeneratorInterface<ParamType>* BaseGenerator() const {
+      return base_;
+    }
+    // Must not be called on a past-the-end iterator. Steps the last component
+    // and carries into the earlier ones; current1_ alone is never reset.
+    virtual void Advance() {
+      assert(!AtEnd());
+      ++current2_;
+      if (current2_ == end2_) {
+        current2_ = begin2_;
+        ++current1_;
+      }
+      ComputeCurrentValue();
+    }
+    virtual ParamIteratorInterface<ParamType>* Clone() const {
+      return new Iterator(*this);
+    }
+    virtual const ParamType* Current() const { return &current_value_; }
+    virtual bool Equals(const ParamIteratorInterface<ParamType>& other) const {
+      // Having the same base generator guarantees that the other
+      // iterator is of the same type and we can downcast.
+      GTEST_CHECK_(BaseGenerator() == other.BaseGenerator())
+          << "The program attempted to compare iterators "
+          << "from different generators." << std::endl;
+      const Iterator* typed_other =
+          CheckedDowncastToActualType<const Iterator>(&other);
+      // Two past-the-end iterators must compare equal even when their
+      // component positions differ (End() vs. an exhausted Advance()),
+      // so consult AtEnd() before comparing the components.
+      return (AtEnd() && typed_other->AtEnd()) ||
+         (
+          current1_ == typed_other->current1_ &&
+          current2_ == typed_other->current2_);
+    }
+
+   private:
+    Iterator(const Iterator& other)
+        : base_(other.base_),
+        begin1_(other.begin1_),
+        end1_(other.end1_),
+        current1_(other.current1_),
+        begin2_(other.begin2_),
+        end2_(other.end2_),
+        current2_(other.current2_) {
+      ComputeCurrentValue();
+    }
+    // Refreshes the cached tuple; leaves it untouched once AtEnd().
+    void ComputeCurrentValue() {
+      if (!AtEnd())
+        current_value_ = ParamType(*current1_, *current2_);
+    }
+    bool AtEnd() const {
+      // We must report the iterator as past the end of the range when any
+      // of the component iterators has reached the end of its range.
+      return
+          current1_ == end1_ ||
+          current2_ == end2_;
+    }
+
+    // Declared only (no implementation) - assignment is unsupported.
+    void operator=(const Iterator& other);
+
+    const ParamGeneratorInterface<ParamType>* const base_;
+    // begin[i]_ and end[i]_ delimit the i-th range that Iterator traverses;
+    // current[i]_ is the iterator actually moving through that range.
+    const typename ParamGenerator<T1>::iterator begin1_;
+    const typename ParamGenerator<T1>::iterator end1_;
+    typename ParamGenerator<T1>::iterator current1_;
+    const typename ParamGenerator<T2>::iterator begin2_;
+    const typename ParamGenerator<T2>::iterator end2_;
+    typename ParamGenerator<T2>::iterator current2_;
+    ParamType current_value_;  // Tuple cached by ComputeCurrentValue().
+  };  // class CartesianProductGenerator2::Iterator
+
+  // Declared only (no implementation) - assignment is unsupported.
+  void operator=(const CartesianProductGenerator2& other);
+
+  const ParamGenerator<T1> g1_;
+  const ParamGenerator<T2> g2_;
+};  // class CartesianProductGenerator2
+
+// Cartesian product of three generators; the last one varies fastest.
+template <typename T1, typename T2, typename T3>
+class CartesianProductGenerator3
+    : public ParamGeneratorInterface< ::std::tr1::tuple<T1, T2, T3> > {
+ public:
+  typedef ::std::tr1::tuple<T1, T2, T3> ParamType;
+  // Stores private copies of the component generators.
+  CartesianProductGenerator3(const ParamGenerator<T1>& g1,
+      const ParamGenerator<T2>& g2, const ParamGenerator<T3>& g3)
+      : g1_(g1), g2_(g2), g3_(g3) {}
+  virtual ~CartesianProductGenerator3() {}
+  // Begin() and End() each return a newly allocated Iterator.
+  virtual ParamIteratorInterface<ParamType>* Begin() const {
+    return new Iterator(this, g1_, g1_.begin(), g2_, g2_.begin(), g3_,
+        g3_.begin());
+  }
+  virtual ParamIteratorInterface<ParamType>* End() const {
+    return new Iterator(this, g1_, g1_.end(), g2_, g2_.end(), g3_, g3_.end());
+  }
+
+ private:
+  class Iterator : public ParamIteratorInterface<ParamType> {
+   public:
+    Iterator(const ParamGeneratorInterface<ParamType>* base,
+      const ParamGenerator<T1>& g1,
+      const typename ParamGenerator<T1>::iterator& current1,
+      const ParamGenerator<T2>& g2,
+      const typename ParamGenerator<T2>::iterator& current2,
+      const ParamGenerator<T3>& g3,
+      const typename ParamGenerator<T3>::iterator& current3)
+        : base_(base),
+          begin1_(g1.begin()), end1_(g1.end()), current1_(current1),
+          begin2_(g2.begin()), end2_(g2.end()), current2_(current2),
+          begin3_(g3.begin()), end3_(g3.end()), current3_(current3)    {
+      ComputeCurrentValue();
+    }
+    virtual ~Iterator() {}
+
+    virtual const ParamGeneratorInterface<ParamType>* BaseGenerator() const {
+      return base_;
+    }
+    // Must not be called on a past-the-end iterator. Steps the last component
+    // and carries into the earlier ones; current1_ alone is never reset.
+    virtual void Advance() {
+      assert(!AtEnd());
+      ++current3_;
+      if (current3_ == end3_) {
+        current3_ = begin3_;
+        ++current2_;
+      }
+      if (current2_ == end2_) {
+        current2_ = begin2_;
+        ++current1_;
+      }
+      ComputeCurrentValue();
+    }
+    virtual ParamIteratorInterface<ParamType>* Clone() const {
+      return new Iterator(*this);
+    }
+    virtual const ParamType* Current() const { return &current_value_; }
+    virtual bool Equals(const ParamIteratorInterface<ParamType>& other) const {
+      // Having the same base generator guarantees that the other
+      // iterator is of the same type and we can downcast.
+      GTEST_CHECK_(BaseGenerator() == other.BaseGenerator())
+          << "The program attempted to compare iterators "
+          << "from different generators." << std::endl;
+      const Iterator* typed_other =
+          CheckedDowncastToActualType<const Iterator>(&other);
+      // Two past-the-end iterators must compare equal even when their
+      // component positions differ (End() vs. an exhausted Advance()),
+      // so consult AtEnd() before comparing the components.
+      return (AtEnd() && typed_other->AtEnd()) ||
+         (
+          current1_ == typed_other->current1_ &&
+          current2_ == typed_other->current2_ &&
+          current3_ == typed_other->current3_);
+    }
+
+   private:
+    Iterator(const Iterator& other)
+        : base_(other.base_),
+        begin1_(other.begin1_),
+        end1_(other.end1_),
+        current1_(other.current1_),
+        begin2_(other.begin2_),
+        end2_(other.end2_),
+        current2_(other.current2_),
+        begin3_(other.begin3_),
+        end3_(other.end3_),
+        current3_(other.current3_) {
+      ComputeCurrentValue();
+    }
+    // Refreshes the cached tuple; leaves it untouched once AtEnd().
+    void ComputeCurrentValue() {
+      if (!AtEnd())
+        current_value_ = ParamType(*current1_, *current2_, *current3_);
+    }
+    bool AtEnd() const {
+      // We must report the iterator as past the end of the range when any
+      // of the component iterators has reached the end of its range.
+      return
+          current1_ == end1_ ||
+          current2_ == end2_ ||
+          current3_ == end3_;
+    }
+
+    // Declared only (no implementation) - assignment is unsupported.
+    void operator=(const Iterator& other);
+
+    const ParamGeneratorInterface<ParamType>* const base_;
+    // begin[i]_ and end[i]_ delimit the i-th range that Iterator traverses;
+    // current[i]_ is the iterator actually moving through that range.
+    const typename ParamGenerator<T1>::iterator begin1_;
+    const typename ParamGenerator<T1>::iterator end1_;
+    typename ParamGenerator<T1>::iterator current1_;
+    const typename ParamGenerator<T2>::iterator begin2_;
+    const typename ParamGenerator<T2>::iterator end2_;
+    typename ParamGenerator<T2>::iterator current2_;
+    const typename ParamGenerator<T3>::iterator begin3_;
+    const typename ParamGenerator<T3>::iterator end3_;
+    typename ParamGenerator<T3>::iterator current3_;
+    ParamType current_value_;  // Tuple cached by ComputeCurrentValue().
+  };  // class CartesianProductGenerator3::Iterator
+
+  // Declared only (no implementation) - assignment is unsupported.
+  void operator=(const CartesianProductGenerator3& other);
+
+  const ParamGenerator<T1> g1_;
+  const ParamGenerator<T2> g2_;
+  const ParamGenerator<T3> g3_;
+};  // class CartesianProductGenerator3
+
+// Cartesian product of four generators; the last one varies fastest.
+template <typename T1, typename T2, typename T3, typename T4>
+class CartesianProductGenerator4
+    : public ParamGeneratorInterface< ::std::tr1::tuple<T1, T2, T3, T4> > {
+ public:
+  typedef ::std::tr1::tuple<T1, T2, T3, T4> ParamType;
+  // Stores private copies of the component generators.
+  CartesianProductGenerator4(const ParamGenerator<T1>& g1,
+      const ParamGenerator<T2>& g2, const ParamGenerator<T3>& g3,
+      const ParamGenerator<T4>& g4)
+      : g1_(g1), g2_(g2), g3_(g3), g4_(g4) {}
+  virtual ~CartesianProductGenerator4() {}
+  // Begin() and End() each return a newly allocated Iterator.
+  virtual ParamIteratorInterface<ParamType>* Begin() const {
+    return new Iterator(this, g1_, g1_.begin(), g2_, g2_.begin(), g3_,
+        g3_.begin(), g4_, g4_.begin());
+  }
+  virtual ParamIteratorInterface<ParamType>* End() const {
+    return new Iterator(this, g1_, g1_.end(), g2_, g2_.end(), g3_, g3_.end(),
+        g4_, g4_.end());
+  }
+
+ private:
+  class Iterator : public ParamIteratorInterface<ParamType> {
+   public:
+    Iterator(const ParamGeneratorInterface<ParamType>* base,
+      const ParamGenerator<T1>& g1,
+      const typename ParamGenerator<T1>::iterator& current1,
+      const ParamGenerator<T2>& g2,
+      const typename ParamGenerator<T2>::iterator& current2,
+      const ParamGenerator<T3>& g3,
+      const typename ParamGenerator<T3>::iterator& current3,
+      const ParamGenerator<T4>& g4,
+      const typename ParamGenerator<T4>::iterator& current4)
+        : base_(base),
+          begin1_(g1.begin()), end1_(g1.end()), current1_(current1),
+          begin2_(g2.begin()), end2_(g2.end()), current2_(current2),
+          begin3_(g3.begin()), end3_(g3.end()), current3_(current3),
+          begin4_(g4.begin()), end4_(g4.end()), current4_(current4)    {
+      ComputeCurrentValue();
+    }
+    virtual ~Iterator() {}
+
+    virtual const ParamGeneratorInterface<ParamType>* BaseGenerator() const {
+      return base_;
+    }
+    // Must not be called on a past-the-end iterator. Steps the last component
+    // and carries into the earlier ones; current1_ alone is never reset.
+    virtual void Advance() {
+      assert(!AtEnd());
+      ++current4_;
+      if (current4_ == end4_) {
+        current4_ = begin4_;
+        ++current3_;
+      }
+      if (current3_ == end3_) {
+        current3_ = begin3_;
+        ++current2_;
+      }
+      if (current2_ == end2_) {
+        current2_ = begin2_;
+        ++current1_;
+      }
+      ComputeCurrentValue();
+    }
+    virtual ParamIteratorInterface<ParamType>* Clone() const {
+      return new Iterator(*this);
+    }
+    virtual const ParamType* Current() const { return &current_value_; }
+    virtual bool Equals(const ParamIteratorInterface<ParamType>& other) const {
+      // Having the same base generator guarantees that the other
+      // iterator is of the same type and we can downcast.
+      GTEST_CHECK_(BaseGenerator() == other.BaseGenerator())
+          << "The program attempted to compare iterators "
+          << "from different generators." << std::endl;
+      const Iterator* typed_other =
+          CheckedDowncastToActualType<const Iterator>(&other);
+      // Two past-the-end iterators must compare equal even when their
+      // component positions differ (End() vs. an exhausted Advance()),
+      // so consult AtEnd() before comparing the components.
+      return (AtEnd() && typed_other->AtEnd()) ||
+         (
+          current1_ == typed_other->current1_ &&
+          current2_ == typed_other->current2_ &&
+          current3_ == typed_other->current3_ &&
+          current4_ == typed_other->current4_);
+    }
+
+   private:
+    Iterator(const Iterator& other)
+        : base_(other.base_),
+        begin1_(other.begin1_),
+        end1_(other.end1_),
+        current1_(other.current1_),
+        begin2_(other.begin2_),
+        end2_(other.end2_),
+        current2_(other.current2_),
+        begin3_(other.begin3_),
+        end3_(other.end3_),
+        current3_(other.current3_),
+        begin4_(other.begin4_),
+        end4_(other.end4_),
+        current4_(other.current4_) {
+      ComputeCurrentValue();
+    }
+    // Refreshes the cached tuple; leaves it untouched once AtEnd().
+    void ComputeCurrentValue() {
+      if (!AtEnd())
+        current_value_ = ParamType(*current1_, *current2_, *current3_,
+            *current4_);
+    }
+    bool AtEnd() const {
+      // We must report the iterator as past the end of the range when any
+      // of the component iterators has reached the end of its range.
+      return
+          current1_ == end1_ ||
+          current2_ == end2_ ||
+          current3_ == end3_ ||
+          current4_ == end4_;
+    }
+
+    // Declared only (no implementation) - assignment is unsupported.
+    void operator=(const Iterator& other);
+
+    const ParamGeneratorInterface<ParamType>* const base_;
+    // begin[i]_ and end[i]_ delimit the i-th range that Iterator traverses;
+    // current[i]_ is the iterator actually moving through that range.
+    const typename ParamGenerator<T1>::iterator begin1_;
+    const typename ParamGenerator<T1>::iterator end1_;
+    typename ParamGenerator<T1>::iterator current1_;
+    const typename ParamGenerator<T2>::iterator begin2_;
+    const typename ParamGenerator<T2>::iterator end2_;
+    typename ParamGenerator<T2>::iterator current2_;
+    const typename ParamGenerator<T3>::iterator begin3_;
+    const typename ParamGenerator<T3>::iterator end3_;
+    typename ParamGenerator<T3>::iterator current3_;
+    const typename ParamGenerator<T4>::iterator begin4_;
+    const typename ParamGenerator<T4>::iterator end4_;
+    typename ParamGenerator<T4>::iterator current4_;
+    ParamType current_value_;  // Tuple cached by ComputeCurrentValue().
+  };  // class CartesianProductGenerator4::Iterator
+
+  // Declared only (no implementation) - assignment is unsupported.
+  void operator=(const CartesianProductGenerator4& other);
+
+  const ParamGenerator<T1> g1_;
+  const ParamGenerator<T2> g2_;
+  const ParamGenerator<T3> g3_;
+  const ParamGenerator<T4> g4_;
+};  // class CartesianProductGenerator4
+
+// Yields every combination of values from five generators as a 5-tuple.
+template <typename T1, typename T2, typename T3, typename T4, typename T5>
+class CartesianProductGenerator5
+    : public ParamGeneratorInterface< ::std::tr1::tuple<T1, T2, T3, T4, T5> > {
+ public:
+  typedef ::std::tr1::tuple<T1, T2, T3, T4, T5> ParamType;
+
+  CartesianProductGenerator5(const ParamGenerator<T1>& g1,
+      const ParamGenerator<T2>& g2, const ParamGenerator<T3>& g3,
+      const ParamGenerator<T4>& g4, const ParamGenerator<T5>& g5)
+      : g1_(g1), g2_(g2), g3_(g3), g4_(g4), g5_(g5) {}
+  virtual ~CartesianProductGenerator5() {}
+  // Begin()/End() allocate a new Iterator with every component at begin()/end().
+  virtual ParamIteratorInterface<ParamType>* Begin() const {
+    return new Iterator(this, g1_, g1_.begin(), g2_, g2_.begin(), g3_,
+        g3_.begin(), g4_, g4_.begin(), g5_, g5_.begin());
+  }
+  virtual ParamIteratorInterface<ParamType>* End() const {
+    return new Iterator(this, g1_, g1_.end(), g2_, g2_.end(), g3_, g3_.end(),
+        g4_, g4_.end(), g5_, g5_.end());
+  }
+
+ private:
+  class Iterator : public ParamIteratorInterface<ParamType> {
+   public:
+    Iterator(const ParamGeneratorInterface<ParamType>* base,
+      const ParamGenerator<T1>& g1,
+      const typename ParamGenerator<T1>::iterator& current1,
+      const ParamGenerator<T2>& g2,
+      const typename ParamGenerator<T2>::iterator& current2,
+      const ParamGenerator<T3>& g3,
+      const typename ParamGenerator<T3>::iterator& current3,
+      const ParamGenerator<T4>& g4,
+      const typename ParamGenerator<T4>::iterator& current4,
+      const ParamGenerator<T5>& g5,
+      const typename ParamGenerator<T5>::iterator& current5)
+        : base_(base),
+          begin1_(g1.begin()), end1_(g1.end()), current1_(current1),
+          begin2_(g2.begin()), end2_(g2.end()), current2_(current2),
+          begin3_(g3.begin()), end3_(g3.end()), current3_(current3),
+          begin4_(g4.begin()), end4_(g4.end()), current4_(current4),
+          begin5_(g5.begin()), end5_(g5.end()), current5_(current5)    {
+      ComputeCurrentValue();  // cache the tuple that Current() will return
+    }
+    virtual ~Iterator() {}
+
+    virtual const ParamGeneratorInterface<ParamType>* BaseGenerator() const {
+      return base_;
+    }
+    // Steps to the next tuple: the last component moves fastest, and a
+    // component that wraps to its begin carries into the one before it.
+    virtual void Advance() {
+      assert(!AtEnd());  // must not be called on a past-the-end iterator
+      ++current5_;
+      if (current5_ == end5_) {
+        current5_ = begin5_;
+        ++current4_;
+      }
+      if (current4_ == end4_) {
+        current4_ = begin4_;
+        ++current3_;
+      }
+      if (current3_ == end3_) {
+        current3_ = begin3_;
+        ++current2_;
+      }
+      if (current2_ == end2_) {
+        current2_ = begin2_;
+        ++current1_;  // never wraps: reaching end1_ makes AtEnd() true
+      }
+      ComputeCurrentValue();
+    }
+    virtual ParamIteratorInterface<ParamType>* Clone() const {
+      return new Iterator(*this);
+    }
+    virtual const ParamType* Current() const { return &current_value_; }
+    virtual bool Equals(const ParamIteratorInterface<ParamType>& other) const {
+      // Having the same base generator guarantees that the other
+      // iterator is of the same type and we can downcast.
+      GTEST_CHECK_(BaseGenerator() == other.BaseGenerator())
+          << "The program attempted to compare iterators "
+          << "from different generators." << std::endl;
+      const Iterator* typed_other =
+          CheckedDowncastToActualType<const Iterator>(&other);
+      // Past-the-end iterators must compare equal to each other even though
+      // their component positions can differ (any exhausted component makes
+      // AtEnd() true), so AtEnd() is consulted before the positions.
+      return (AtEnd() && typed_other->AtEnd()) ||
+         (
+          current1_ == typed_other->current1_ &&
+          current2_ == typed_other->current2_ &&
+          current3_ == typed_other->current3_ &&
+          current4_ == typed_other->current4_ &&
+          current5_ == typed_other->current5_);
+    }
+
+   private:
+    Iterator(const Iterator& other)
+        : base_(other.base_),
+        begin1_(other.begin1_),
+        end1_(other.end1_),
+        current1_(other.current1_),
+        begin2_(other.begin2_),
+        end2_(other.end2_),
+        current2_(other.current2_),
+        begin3_(other.begin3_),
+        end3_(other.end3_),
+        current3_(other.current3_),
+        begin4_(other.begin4_),
+        end4_(other.end4_),
+        current4_(other.current4_),
+        begin5_(other.begin5_),
+        end5_(other.end5_),
+        current5_(other.current5_) {
+      ComputeCurrentValue();  // rebuilt, not copied from other.current_value_
+    }
+    // Refreshes current_value_ from the component iterators unless AtEnd().
+    void ComputeCurrentValue() {
+      if (!AtEnd())
+        current_value_ = ParamType(*current1_, *current2_, *current3_,
+            *current4_, *current5_);
+    }
+    bool AtEnd() const {
+      // The iterator is past the end of the range as soon as any one of the
+      // component iterators has reached the end of its own range.
+      return
+          current1_ == end1_ ||
+          current2_ == end2_ ||
+          current3_ == end3_ ||
+          current4_ == end4_ ||
+          current5_ == end5_;
+    }
+
+    // Declared only, with no implementation here - assignment is unsupported.
+    void operator=(const Iterator& other);
+
+    const ParamGeneratorInterface<ParamType>* const base_;
+    // begin[i]_ and end[i]_ define the i-th range that Iterator traverses.
+    // current[i]_ is the actual traversing iterator.
+    const typename ParamGenerator<T1>::iterator begin1_;
+    const typename ParamGenerator<T1>::iterator end1_;
+    typename ParamGenerator<T1>::iterator current1_;
+    const typename ParamGenerator<T2>::iterator begin2_;
+    const typename ParamGenerator<T2>::iterator end2_;
+    typename ParamGenerator<T2>::iterator current2_;
+    const typename ParamGenerator<T3>::iterator begin3_;
+    const typename ParamGenerator<T3>::iterator end3_;
+    typename ParamGenerator<T3>::iterator current3_;
+    const typename ParamGenerator<T4>::iterator begin4_;
+    const typename ParamGenerator<T4>::iterator end4_;
+    typename ParamGenerator<T4>::iterator current4_;
+    const typename ParamGenerator<T5>::iterator begin5_;
+    const typename ParamGenerator<T5>::iterator end5_;
+    typename ParamGenerator<T5>::iterator current5_;
+    ParamType current_value_;
+  };  // class CartesianProductGenerator5::Iterator
+
+  // Declared only, with no implementation here - assignment is unsupported.
+  void operator=(const CartesianProductGenerator5& other);
+
+  const ParamGenerator<T1> g1_;
+  const ParamGenerator<T2> g2_;
+  const ParamGenerator<T3> g3_;
+  const ParamGenerator<T4> g4_;
+  const ParamGenerator<T5> g5_;
+};  // class CartesianProductGenerator5
+
+// Yields every combination of values from six generators as a 6-tuple.
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+    typename T6>
+class CartesianProductGenerator6
+    : public ParamGeneratorInterface< ::std::tr1::tuple<T1, T2, T3, T4, T5,
+        T6> > {
+ public:
+  typedef ::std::tr1::tuple<T1, T2, T3, T4, T5, T6> ParamType;
+
+  CartesianProductGenerator6(const ParamGenerator<T1>& g1,
+      const ParamGenerator<T2>& g2, const ParamGenerator<T3>& g3,
+      const ParamGenerator<T4>& g4, const ParamGenerator<T5>& g5,
+      const ParamGenerator<T6>& g6)
+      : g1_(g1), g2_(g2), g3_(g3), g4_(g4), g5_(g5), g6_(g6) {}
+  virtual ~CartesianProductGenerator6() {}
+  // Begin()/End() allocate a new Iterator with every component at begin()/end().
+  virtual ParamIteratorInterface<ParamType>* Begin() const {
+    return new Iterator(this, g1_, g1_.begin(), g2_, g2_.begin(), g3_,
+        g3_.begin(), g4_, g4_.begin(), g5_, g5_.begin(), g6_, g6_.begin());
+  }
+  virtual ParamIteratorInterface<ParamType>* End() const {
+    return new Iterator(this, g1_, g1_.end(), g2_, g2_.end(), g3_, g3_.end(),
+        g4_, g4_.end(), g5_, g5_.end(), g6_, g6_.end());
+  }
+
+ private:
+  class Iterator : public ParamIteratorInterface<ParamType> {
+   public:
+    Iterator(const ParamGeneratorInterface<ParamType>* base,
+      const ParamGenerator<T1>& g1,
+      const typename ParamGenerator<T1>::iterator& current1,
+      const ParamGenerator<T2>& g2,
+      const typename ParamGenerator<T2>::iterator& current2,
+      const ParamGenerator<T3>& g3,
+      const typename ParamGenerator<T3>::iterator& current3,
+      const ParamGenerator<T4>& g4,
+      const typename ParamGenerator<T4>::iterator& current4,
+      const ParamGenerator<T5>& g5,
+      const typename ParamGenerator<T5>::iterator& current5,
+      const ParamGenerator<T6>& g6,
+      const typename ParamGenerator<T6>::iterator& current6)
+        : base_(base),
+          begin1_(g1.begin()), end1_(g1.end()), current1_(current1),
+          begin2_(g2.begin()), end2_(g2.end()), current2_(current2),
+          begin3_(g3.begin()), end3_(g3.end()), current3_(current3),
+          begin4_(g4.begin()), end4_(g4.end()), current4_(current4),
+          begin5_(g5.begin()), end5_(g5.end()), current5_(current5),
+          begin6_(g6.begin()), end6_(g6.end()), current6_(current6)    {
+      ComputeCurrentValue();  // cache the tuple that Current() will return
+    }
+    virtual ~Iterator() {}
+
+    virtual const ParamGeneratorInterface<ParamType>* BaseGenerator() const {
+      return base_;
+    }
+    // Steps to the next tuple: the last component moves fastest, and a
+    // component that wraps to its begin carries into the one before it.
+    virtual void Advance() {
+      assert(!AtEnd());  // must not be called on a past-the-end iterator
+      ++current6_;
+      if (current6_ == end6_) {
+        current6_ = begin6_;
+        ++current5_;
+      }
+      if (current5_ == end5_) {
+        current5_ = begin5_;
+        ++current4_;
+      }
+      if (current4_ == end4_) {
+        current4_ = begin4_;
+        ++current3_;
+      }
+      if (current3_ == end3_) {
+        current3_ = begin3_;
+        ++current2_;
+      }
+      if (current2_ == end2_) {
+        current2_ = begin2_;
+        ++current1_;  // never wraps: reaching end1_ makes AtEnd() true
+      }
+      ComputeCurrentValue();
+    }
+    virtual ParamIteratorInterface<ParamType>* Clone() const {
+      return new Iterator(*this);
+    }
+    virtual const ParamType* Current() const { return &current_value_; }
+    virtual bool Equals(const ParamIteratorInterface<ParamType>& other) const {
+      // Having the same base generator guarantees that the other
+      // iterator is of the same type and we can downcast.
+      GTEST_CHECK_(BaseGenerator() == other.BaseGenerator())
+          << "The program attempted to compare iterators "
+          << "from different generators." << std::endl;
+      const Iterator* typed_other =
+          CheckedDowncastToActualType<const Iterator>(&other);
+      // Past-the-end iterators must compare equal to each other even though
+      // their component positions can differ (any exhausted component makes
+      // AtEnd() true), so AtEnd() is consulted before the positions.
+      return (AtEnd() && typed_other->AtEnd()) ||
+         (
+          current1_ == typed_other->current1_ &&
+          current2_ == typed_other->current2_ &&
+          current3_ == typed_other->current3_ &&
+          current4_ == typed_other->current4_ &&
+          current5_ == typed_other->current5_ &&
+          current6_ == typed_other->current6_);
+    }
+
+   private:
+    Iterator(const Iterator& other)
+        : base_(other.base_),
+        begin1_(other.begin1_),
+        end1_(other.end1_),
+        current1_(other.current1_),
+        begin2_(other.begin2_),
+        end2_(other.end2_),
+        current2_(other.current2_),
+        begin3_(other.begin3_),
+        end3_(other.end3_),
+        current3_(other.current3_),
+        begin4_(other.begin4_),
+        end4_(other.end4_),
+        current4_(other.current4_),
+        begin5_(other.begin5_),
+        end5_(other.end5_),
+        current5_(other.current5_),
+        begin6_(other.begin6_),
+        end6_(other.end6_),
+        current6_(other.current6_) {
+      ComputeCurrentValue();  // rebuilt, not copied from other.current_value_
+    }
+    // Refreshes current_value_ from the component iterators unless AtEnd().
+    void ComputeCurrentValue() {
+      if (!AtEnd())
+        current_value_ = ParamType(*current1_, *current2_, *current3_,
+            *current4_, *current5_, *current6_);
+    }
+    bool AtEnd() const {
+      // The iterator is past the end of the range as soon as any one of the
+      // component iterators has reached the end of its own range.
+      return
+          current1_ == end1_ ||
+          current2_ == end2_ ||
+          current3_ == end3_ ||
+          current4_ == end4_ ||
+          current5_ == end5_ ||
+          current6_ == end6_;
+    }
+
+    // Declared only, with no implementation here - assignment is unsupported.
+    void operator=(const Iterator& other);
+
+    const ParamGeneratorInterface<ParamType>* const base_;
+    // begin[i]_ and end[i]_ define the i-th range that Iterator traverses.
+    // current[i]_ is the actual traversing iterator.
+    const typename ParamGenerator<T1>::iterator begin1_;
+    const typename ParamGenerator<T1>::iterator end1_;
+    typename ParamGenerator<T1>::iterator current1_;
+    const typename ParamGenerator<T2>::iterator begin2_;
+    const typename ParamGenerator<T2>::iterator end2_;
+    typename ParamGenerator<T2>::iterator current2_;
+    const typename ParamGenerator<T3>::iterator begin3_;
+    const typename ParamGenerator<T3>::iterator end3_;
+    typename ParamGenerator<T3>::iterator current3_;
+    const typename ParamGenerator<T4>::iterator begin4_;
+    const typename ParamGenerator<T4>::iterator end4_;
+    typename ParamGenerator<T4>::iterator current4_;
+    const typename ParamGenerator<T5>::iterator begin5_;
+    const typename ParamGenerator<T5>::iterator end5_;
+    typename ParamGenerator<T5>::iterator current5_;
+    const typename ParamGenerator<T6>::iterator begin6_;
+    const typename ParamGenerator<T6>::iterator end6_;
+    typename ParamGenerator<T6>::iterator current6_;
+    ParamType current_value_;
+  };  // class CartesianProductGenerator6::Iterator
+
+  // Declared only, with no implementation here - assignment is unsupported.
+  void operator=(const CartesianProductGenerator6& other);
+
+  const ParamGenerator<T1> g1_;
+  const ParamGenerator<T2> g2_;
+  const ParamGenerator<T3> g3_;
+  const ParamGenerator<T4> g4_;
+  const ParamGenerator<T5> g5_;
+  const ParamGenerator<T6> g6_;
+};  // class CartesianProductGenerator6
+
+// Yields every combination of values from seven generators as a 7-tuple.
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+    typename T6, typename T7>
+class CartesianProductGenerator7
+    : public ParamGeneratorInterface< ::std::tr1::tuple<T1, T2, T3, T4, T5, T6,
+        T7> > {
+ public:
+  typedef ::std::tr1::tuple<T1, T2, T3, T4, T5, T6, T7> ParamType;
+
+  CartesianProductGenerator7(const ParamGenerator<T1>& g1,
+      const ParamGenerator<T2>& g2, const ParamGenerator<T3>& g3,
+      const ParamGenerator<T4>& g4, const ParamGenerator<T5>& g5,
+      const ParamGenerator<T6>& g6, const ParamGenerator<T7>& g7)
+      : g1_(g1), g2_(g2), g3_(g3), g4_(g4), g5_(g5), g6_(g6), g7_(g7) {}
+  virtual ~CartesianProductGenerator7() {}
+  // Begin()/End() allocate a new Iterator with every component at begin()/end().
+  virtual ParamIteratorInterface<ParamType>* Begin() const {
+    return new Iterator(this, g1_, g1_.begin(), g2_, g2_.begin(), g3_,
+        g3_.begin(), g4_, g4_.begin(), g5_, g5_.begin(), g6_, g6_.begin(), g7_,
+        g7_.begin());
+  }
+  virtual ParamIteratorInterface<ParamType>* End() const {
+    return new Iterator(this, g1_, g1_.end(), g2_, g2_.end(), g3_, g3_.end(),
+        g4_, g4_.end(), g5_, g5_.end(), g6_, g6_.end(), g7_, g7_.end());
+  }
+
+ private:
+  class Iterator : public ParamIteratorInterface<ParamType> {
+   public:
+    Iterator(const ParamGeneratorInterface<ParamType>* base,
+      const ParamGenerator<T1>& g1,
+      const typename ParamGenerator<T1>::iterator& current1,
+      const ParamGenerator<T2>& g2,
+      const typename ParamGenerator<T2>::iterator& current2,
+      const ParamGenerator<T3>& g3,
+      const typename ParamGenerator<T3>::iterator& current3,
+      const ParamGenerator<T4>& g4,
+      const typename ParamGenerator<T4>::iterator& current4,
+      const ParamGenerator<T5>& g5,
+      const typename ParamGenerator<T5>::iterator& current5,
+      const ParamGenerator<T6>& g6,
+      const typename ParamGenerator<T6>::iterator& current6,
+      const ParamGenerator<T7>& g7,
+      const typename ParamGenerator<T7>::iterator& current7)
+        : base_(base),
+          begin1_(g1.begin()), end1_(g1.end()), current1_(current1),
+          begin2_(g2.begin()), end2_(g2.end()), current2_(current2),
+          begin3_(g3.begin()), end3_(g3.end()), current3_(current3),
+          begin4_(g4.begin()), end4_(g4.end()), current4_(current4),
+          begin5_(g5.begin()), end5_(g5.end()), current5_(current5),
+          begin6_(g6.begin()), end6_(g6.end()), current6_(current6),
+          begin7_(g7.begin()), end7_(g7.end()), current7_(current7)    {
+      ComputeCurrentValue();  // cache the tuple that Current() will return
+    }
+    virtual ~Iterator() {}
+
+    virtual const ParamGeneratorInterface<ParamType>* BaseGenerator() const {
+      return base_;
+    }
+    // Steps to the next tuple: the last component moves fastest, and a
+    // component that wraps to its begin carries into the one before it.
+    virtual void Advance() {
+      assert(!AtEnd());  // must not be called on a past-the-end iterator
+      ++current7_;
+      if (current7_ == end7_) {
+        current7_ = begin7_;
+        ++current6_;
+      }
+      if (current6_ == end6_) {
+        current6_ = begin6_;
+        ++current5_;
+      }
+      if (current5_ == end5_) {
+        current5_ = begin5_;
+        ++current4_;
+      }
+      if (current4_ == end4_) {
+        current4_ = begin4_;
+        ++current3_;
+      }
+      if (current3_ == end3_) {
+        current3_ = begin3_;
+        ++current2_;
+      }
+      if (current2_ == end2_) {
+        current2_ = begin2_;
+        ++current1_;  // never wraps: reaching end1_ makes AtEnd() true
+      }
+      ComputeCurrentValue();
+    }
+    virtual ParamIteratorInterface<ParamType>* Clone() const {
+      return new Iterator(*this);
+    }
+    virtual const ParamType* Current() const { return &current_value_; }
+    virtual bool Equals(const ParamIteratorInterface<ParamType>& other) const {
+      // Having the same base generator guarantees that the other
+      // iterator is of the same type and we can downcast.
+      GTEST_CHECK_(BaseGenerator() == other.BaseGenerator())
+          << "The program attempted to compare iterators "
+          << "from different generators." << std::endl;
+      const Iterator* typed_other =
+          CheckedDowncastToActualType<const Iterator>(&other);
+      // Past-the-end iterators must compare equal to each other even though
+      // their component positions can differ (any exhausted component makes
+      // AtEnd() true), so AtEnd() is consulted before the positions.
+      return (AtEnd() && typed_other->AtEnd()) ||
+         (
+          current1_ == typed_other->current1_ &&
+          current2_ == typed_other->current2_ &&
+          current3_ == typed_other->current3_ &&
+          current4_ == typed_other->current4_ &&
+          current5_ == typed_other->current5_ &&
+          current6_ == typed_other->current6_ &&
+          current7_ == typed_other->current7_);
+    }
+
+   private:
+    Iterator(const Iterator& other)
+        : base_(other.base_),
+        begin1_(other.begin1_),
+        end1_(other.end1_),
+        current1_(other.current1_),
+        begin2_(other.begin2_),
+        end2_(other.end2_),
+        current2_(other.current2_),
+        begin3_(other.begin3_),
+        end3_(other.end3_),
+        current3_(other.current3_),
+        begin4_(other.begin4_),
+        end4_(other.end4_),
+        current4_(other.current4_),
+        begin5_(other.begin5_),
+        end5_(other.end5_),
+        current5_(other.current5_),
+        begin6_(other.begin6_),
+        end6_(other.end6_),
+        current6_(other.current6_),
+        begin7_(other.begin7_),
+        end7_(other.end7_),
+        current7_(other.current7_) {
+      ComputeCurrentValue();  // rebuilt, not copied from other.current_value_
+    }
+    // Refreshes current_value_ from the component iterators unless AtEnd().
+    void ComputeCurrentValue() {
+      if (!AtEnd())
+        current_value_ = ParamType(*current1_, *current2_, *current3_,
+            *current4_, *current5_, *current6_, *current7_);
+    }
+    bool AtEnd() const {
+      // The iterator is past the end of the range as soon as any one of the
+      // component iterators has reached the end of its own range.
+      return
+          current1_ == end1_ ||
+          current2_ == end2_ ||
+          current3_ == end3_ ||
+          current4_ == end4_ ||
+          current5_ == end5_ ||
+          current6_ == end6_ ||
+          current7_ == end7_;
+    }
+
+    // Declared only, with no implementation here - assignment is unsupported.
+    void operator=(const Iterator& other);
+
+    const ParamGeneratorInterface<ParamType>* const base_;
+    // begin[i]_ and end[i]_ define the i-th range that Iterator traverses.
+    // current[i]_ is the actual traversing iterator.
+    const typename ParamGenerator<T1>::iterator begin1_;
+    const typename ParamGenerator<T1>::iterator end1_;
+    typename ParamGenerator<T1>::iterator current1_;
+    const typename ParamGenerator<T2>::iterator begin2_;
+    const typename ParamGenerator<T2>::iterator end2_;
+    typename ParamGenerator<T2>::iterator current2_;
+    const typename ParamGenerator<T3>::iterator begin3_;
+    const typename ParamGenerator<T3>::iterator end3_;
+    typename ParamGenerator<T3>::iterator current3_;
+    const typename ParamGenerator<T4>::iterator begin4_;
+    const typename ParamGenerator<T4>::iterator end4_;
+    typename ParamGenerator<T4>::iterator current4_;
+    const typename ParamGenerator<T5>::iterator begin5_;
+    const typename ParamGenerator<T5>::iterator end5_;
+    typename ParamGenerator<T5>::iterator current5_;
+    const typename ParamGenerator<T6>::iterator begin6_;
+    const typename ParamGenerator<T6>::iterator end6_;
+    typename ParamGenerator<T6>::iterator current6_;
+    const typename ParamGenerator<T7>::iterator begin7_;
+    const typename ParamGenerator<T7>::iterator end7_;
+    typename ParamGenerator<T7>::iterator current7_;
+    ParamType current_value_;
+  };  // class CartesianProductGenerator7::Iterator
+
+  // Declared only, with no implementation here - assignment is unsupported.
+  void operator=(const CartesianProductGenerator7& other);
+
+  const ParamGenerator<T1> g1_;
+  const ParamGenerator<T2> g2_;
+  const ParamGenerator<T3> g3_;
+  const ParamGenerator<T4> g4_;
+  const ParamGenerator<T5> g5_;
+  const ParamGenerator<T6> g6_;
+  const ParamGenerator<T7> g7_;
+};  // class CartesianProductGenerator7
+
+// Yields every combination of values from eight generators as an 8-tuple.
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+    typename T6, typename T7, typename T8>
+class CartesianProductGenerator8
+    : public ParamGeneratorInterface< ::std::tr1::tuple<T1, T2, T3, T4, T5, T6,
+        T7, T8> > {
+ public:
+  typedef ::std::tr1::tuple<T1, T2, T3, T4, T5, T6, T7, T8> ParamType;
+
+  CartesianProductGenerator8(const ParamGenerator<T1>& g1,
+      const ParamGenerator<T2>& g2, const ParamGenerator<T3>& g3,
+      const ParamGenerator<T4>& g4, const ParamGenerator<T5>& g5,
+      const ParamGenerator<T6>& g6, const ParamGenerator<T7>& g7,
+      const ParamGenerator<T8>& g8)
+      : g1_(g1), g2_(g2), g3_(g3), g4_(g4), g5_(g5), g6_(g6), g7_(g7),
+          g8_(g8) {}
+  virtual ~CartesianProductGenerator8() {}
+  // Begin()/End() allocate a new Iterator with every component at begin()/end().
+  virtual ParamIteratorInterface<ParamType>* Begin() const {
+    return new Iterator(this, g1_, g1_.begin(), g2_, g2_.begin(), g3_,
+        g3_.begin(), g4_, g4_.begin(), g5_, g5_.begin(), g6_, g6_.begin(), g7_,
+        g7_.begin(), g8_, g8_.begin());
+  }
+  virtual ParamIteratorInterface<ParamType>* End() const {
+    return new Iterator(this, g1_, g1_.end(), g2_, g2_.end(), g3_, g3_.end(),
+        g4_, g4_.end(), g5_, g5_.end(), g6_, g6_.end(), g7_, g7_.end(), g8_,
+        g8_.end());
+  }
+
+ private:
+  class Iterator : public ParamIteratorInterface<ParamType> {
+   public:
+    Iterator(const ParamGeneratorInterface<ParamType>* base,
+      const ParamGenerator<T1>& g1,
+      const typename ParamGenerator<T1>::iterator& current1,
+      const ParamGenerator<T2>& g2,
+      const typename ParamGenerator<T2>::iterator& current2,
+      const ParamGenerator<T3>& g3,
+      const typename ParamGenerator<T3>::iterator& current3,
+      const ParamGenerator<T4>& g4,
+      const typename ParamGenerator<T4>::iterator& current4,
+      const ParamGenerator<T5>& g5,
+      const typename ParamGenerator<T5>::iterator& current5,
+      const ParamGenerator<T6>& g6,
+      const typename ParamGenerator<T6>::iterator& current6,
+      const ParamGenerator<T7>& g7,
+      const typename ParamGenerator<T7>::iterator& current7,
+      const ParamGenerator<T8>& g8,
+      const typename ParamGenerator<T8>::iterator& current8)
+        : base_(base),
+          begin1_(g1.begin()), end1_(g1.end()), current1_(current1),
+          begin2_(g2.begin()), end2_(g2.end()), current2_(current2),
+          begin3_(g3.begin()), end3_(g3.end()), current3_(current3),
+          begin4_(g4.begin()), end4_(g4.end()), current4_(current4),
+          begin5_(g5.begin()), end5_(g5.end()), current5_(current5),
+          begin6_(g6.begin()), end6_(g6.end()), current6_(current6),
+          begin7_(g7.begin()), end7_(g7.end()), current7_(current7),
+          begin8_(g8.begin()), end8_(g8.end()), current8_(current8)    {
+      ComputeCurrentValue();  // cache the tuple that Current() will return
+    }
+    virtual ~Iterator() {}
+
+    virtual const ParamGeneratorInterface<ParamType>* BaseGenerator() const {
+      return base_;
+    }
+    // Steps to the next tuple: the last component moves fastest, and a
+    // component that wraps to its begin carries into the one before it.
+    virtual void Advance() {
+      assert(!AtEnd());  // must not be called on a past-the-end iterator
+      ++current8_;
+      if (current8_ == end8_) {
+        current8_ = begin8_;
+        ++current7_;
+      }
+      if (current7_ == end7_) {
+        current7_ = begin7_;
+        ++current6_;
+      }
+      if (current6_ == end6_) {
+        current6_ = begin6_;
+        ++current5_;
+      }
+      if (current5_ == end5_) {
+        current5_ = begin5_;
+        ++current4_;
+      }
+      if (current4_ == end4_) {
+        current4_ = begin4_;
+        ++current3_;
+      }
+      if (current3_ == end3_) {
+        current3_ = begin3_;
+        ++current2_;
+      }
+      if (current2_ == end2_) {
+        current2_ = begin2_;
+        ++current1_;  // never wraps: reaching end1_ makes AtEnd() true
+      }
+      ComputeCurrentValue();
+    }
+    virtual ParamIteratorInterface<ParamType>* Clone() const {
+      return new Iterator(*this);
+    }
+    virtual const ParamType* Current() const { return &current_value_; }
+    virtual bool Equals(const ParamIteratorInterface<ParamType>& other) const {
+      // Having the same base generator guarantees that the other
+      // iterator is of the same type and we can downcast.
+      GTEST_CHECK_(BaseGenerator() == other.BaseGenerator())
+          << "The program attempted to compare iterators "
+          << "from different generators." << std::endl;
+      const Iterator* typed_other =
+          CheckedDowncastToActualType<const Iterator>(&other);
+      // Past-the-end iterators must compare equal to each other even though
+      // their component positions can differ (any exhausted component makes
+      // AtEnd() true), so AtEnd() is consulted before the positions.
+      return (AtEnd() && typed_other->AtEnd()) ||
+         (
+          current1_ == typed_other->current1_ &&
+          current2_ == typed_other->current2_ &&
+          current3_ == typed_other->current3_ &&
+          current4_ == typed_other->current4_ &&
+          current5_ == typed_other->current5_ &&
+          current6_ == typed_other->current6_ &&
+          current7_ == typed_other->current7_ &&
+          current8_ == typed_other->current8_);
+    }
+
+   private:
+    Iterator(const Iterator& other)
+        : base_(other.base_),
+        begin1_(other.begin1_),
+        end1_(other.end1_),
+        current1_(other.current1_),
+        begin2_(other.begin2_),
+        end2_(other.end2_),
+        current2_(other.current2_),
+        begin3_(other.begin3_),
+        end3_(other.end3_),
+        current3_(other.current3_),
+        begin4_(other.begin4_),
+        end4_(other.end4_),
+        current4_(other.current4_),
+        begin5_(other.begin5_),
+        end5_(other.end5_),
+        current5_(other.current5_),
+        begin6_(other.begin6_),
+        end6_(other.end6_),
+        current6_(other.current6_),
+        begin7_(other.begin7_),
+        end7_(other.end7_),
+        current7_(other.current7_),
+        begin8_(other.begin8_),
+        end8_(other.end8_),
+        current8_(other.current8_) {
+      ComputeCurrentValue();  // rebuilt, not copied from other.current_value_
+    }
+    // Refreshes current_value_ from the component iterators unless AtEnd().
+    void ComputeCurrentValue() {
+      if (!AtEnd())
+        current_value_ = ParamType(*current1_, *current2_, *current3_,
+            *current4_, *current5_, *current6_, *current7_, *current8_);
+    }
+    bool AtEnd() const {
+      // The iterator is past the end of the range as soon as any one of the
+      // component iterators has reached the end of its own range.
+      return
+          current1_ == end1_ ||
+          current2_ == end2_ ||
+          current3_ == end3_ ||
+          current4_ == end4_ ||
+          current5_ == end5_ ||
+          current6_ == end6_ ||
+          current7_ == end7_ ||
+          current8_ == end8_;
+    }
+
+    // Declared only, with no implementation here - assignment is unsupported.
+    void operator=(const Iterator& other);
+
+    const ParamGeneratorInterface<ParamType>* const base_;
+    // begin[i]_ and end[i]_ define the i-th range that Iterator traverses.
+    // current[i]_ is the actual traversing iterator.
+    const typename ParamGenerator<T1>::iterator begin1_;
+    const typename ParamGenerator<T1>::iterator end1_;
+    typename ParamGenerator<T1>::iterator current1_;
+    const typename ParamGenerator<T2>::iterator begin2_;
+    const typename ParamGenerator<T2>::iterator end2_;
+    typename ParamGenerator<T2>::iterator current2_;
+    const typename ParamGenerator<T3>::iterator begin3_;
+    const typename ParamGenerator<T3>::iterator end3_;
+    typename ParamGenerator<T3>::iterator current3_;
+    const typename ParamGenerator<T4>::iterator begin4_;
+    const typename ParamGenerator<T4>::iterator end4_;
+    typename ParamGenerator<T4>::iterator current4_;
+    const typename ParamGenerator<T5>::iterator begin5_;
+    const typename ParamGenerator<T5>::iterator end5_;
+    typename ParamGenerator<T5>::iterator current5_;
+    const typename ParamGenerator<T6>::iterator begin6_;
+    const typename ParamGenerator<T6>::iterator end6_;
+    typename ParamGenerator<T6>::iterator current6_;
+    const typename ParamGenerator<T7>::iterator begin7_;
+    const typename ParamGenerator<T7>::iterator end7_;
+    typename ParamGenerator<T7>::iterator current7_;
+    const typename ParamGenerator<T8>::iterator begin8_;
+    const typename ParamGenerator<T8>::iterator end8_;
+    typename ParamGenerator<T8>::iterator current8_;
+    ParamType current_value_;
+  };  // class CartesianProductGenerator8::Iterator
+
+  // Declared only, with no implementation here - assignment is unsupported.
+  void operator=(const CartesianProductGenerator8& other);
+
+  const ParamGenerator<T1> g1_;
+  const ParamGenerator<T2> g2_;
+  const ParamGenerator<T3> g3_;
+  const ParamGenerator<T4> g4_;
+  const ParamGenerator<T5> g5_;
+  const ParamGenerator<T6> g6_;
+  const ParamGenerator<T7> g7_;
+  const ParamGenerator<T8> g8_;
+};  // class CartesianProductGenerator8
+
+// Cartesian product of nine generators; each element is a 9-tuple of params.
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+    typename T6, typename T7, typename T8, typename T9>
+class CartesianProductGenerator9
+    : public ParamGeneratorInterface< ::std::tr1::tuple<T1, T2, T3, T4, T5, T6,
+        T7, T8, T9> > {
+ public:
+  typedef ::std::tr1::tuple<T1, T2, T3, T4, T5, T6, T7, T8, T9> ParamType;
+  // Stores its own copy of each of the nine component generators.
+  CartesianProductGenerator9(const ParamGenerator<T1>& g1,
+      const ParamGenerator<T2>& g2, const ParamGenerator<T3>& g3,
+      const ParamGenerator<T4>& g4, const ParamGenerator<T5>& g5,
+      const ParamGenerator<T6>& g6, const ParamGenerator<T7>& g7,
+      const ParamGenerator<T8>& g8, const ParamGenerator<T9>& g9)
+      : g1_(g1), g2_(g2), g3_(g3), g4_(g4), g5_(g5), g6_(g6), g7_(g7), g8_(g8),
+          g9_(g9) {}
+  virtual ~CartesianProductGenerator9() {}
+  // Begin() and End() each return an Iterator allocated with new.
+  virtual ParamIteratorInterface<ParamType>* Begin() const {
+    return new Iterator(this, g1_, g1_.begin(), g2_, g2_.begin(), g3_,
+        g3_.begin(), g4_, g4_.begin(), g5_, g5_.begin(), g6_, g6_.begin(), g7_,
+        g7_.begin(), g8_, g8_.begin(), g9_, g9_.begin());
+  }
+  virtual ParamIteratorInterface<ParamType>* End() const {
+    return new Iterator(this, g1_, g1_.end(), g2_, g2_.end(), g3_, g3_.end(),
+        g4_, g4_.end(), g5_, g5_.end(), g6_, g6_.end(), g7_, g7_.end(), g8_,
+        g8_.end(), g9_, g9_.end());
+  }
+
+ private:
+  class Iterator : public ParamIteratorInterface<ParamType> {
+   public:
+    Iterator(const ParamGeneratorInterface<ParamType>* base,
+      const ParamGenerator<T1>& g1,
+      const typename ParamGenerator<T1>::iterator& current1,
+      const ParamGenerator<T2>& g2,
+      const typename ParamGenerator<T2>::iterator& current2,
+      const ParamGenerator<T3>& g3,
+      const typename ParamGenerator<T3>::iterator& current3,
+      const ParamGenerator<T4>& g4,
+      const typename ParamGenerator<T4>::iterator& current4,
+      const ParamGenerator<T5>& g5,
+      const typename ParamGenerator<T5>::iterator& current5,
+      const ParamGenerator<T6>& g6,
+      const typename ParamGenerator<T6>::iterator& current6,
+      const ParamGenerator<T7>& g7,
+      const typename ParamGenerator<T7>::iterator& current7,
+      const ParamGenerator<T8>& g8,
+      const typename ParamGenerator<T8>::iterator& current8,
+      const ParamGenerator<T9>& g9,
+      const typename ParamGenerator<T9>::iterator& current9)
+        : base_(base),
+          begin1_(g1.begin()), end1_(g1.end()), current1_(current1),
+          begin2_(g2.begin()), end2_(g2.end()), current2_(current2),
+          begin3_(g3.begin()), end3_(g3.end()), current3_(current3),
+          begin4_(g4.begin()), end4_(g4.end()), current4_(current4),
+          begin5_(g5.begin()), end5_(g5.end()), current5_(current5),
+          begin6_(g6.begin()), end6_(g6.end()), current6_(current6),
+          begin7_(g7.begin()), end7_(g7.end()), current7_(current7),
+          begin8_(g8.begin()), end8_(g8.end()), current8_(current8),
+          begin9_(g9.begin()), end9_(g9.end()), current9_(current9)    {
+      ComputeCurrentValue();
+    }
+    virtual ~Iterator() {}
+
+    virtual const ParamGeneratorInterface<ParamType>* BaseGenerator() const {
+      return base_;
+    }
+    // Advance must not be called on a past-the-end iterator, so on entry no
+    // component iterator may be at the end of its range.
+    virtual void Advance() {
+      assert(!AtEnd());
+      ++current9_;  // the last component varies fastest
+      if (current9_ == end9_) {
+        current9_ = begin9_;  // wrap around and carry into component 8
+        ++current8_;
+      }
+      if (current8_ == end8_) {
+        current8_ = begin8_;
+        ++current7_;
+      }
+      if (current7_ == end7_) {
+        current7_ = begin7_;
+        ++current6_;
+      }
+      if (current6_ == end6_) {
+        current6_ = begin6_;
+        ++current5_;
+      }
+      if (current5_ == end5_) {
+        current5_ = begin5_;
+        ++current4_;
+      }
+      if (current4_ == end4_) {
+        current4_ = begin4_;
+        ++current3_;
+      }
+      if (current3_ == end3_) {
+        current3_ = begin3_;
+        ++current2_;
+      }
+      if (current2_ == end2_) {
+        current2_ = begin2_;
+        ++current1_;  // never reset: current1_ == end1_ means AtEnd()
+      }
+      ComputeCurrentValue();
+    }
+    virtual ParamIteratorInterface<ParamType>* Clone() const {
+      return new Iterator(*this);  // via the private copy constructor
+    }
+    virtual const ParamType* Current() const { return &current_value_; }
+    virtual bool Equals(const ParamIteratorInterface<ParamType>& other) const {
+      // Having the same base generator guarantees that the other
+      // iterator is of the same type and we can downcast.
+      GTEST_CHECK_(BaseGenerator() == other.BaseGenerator())
+          << "The program attempted to compare iterators "
+          << "from different generators." << std::endl;
+      const Iterator* typed_other =
+          CheckedDowncastToActualType<const Iterator>(&other);
+      // We must report iterators equal if they both point beyond their
+      // respective ranges. That can happen in a variety of fashions,
+      // so we have to consult AtEnd().
+      return (AtEnd() && typed_other->AtEnd()) ||
+         (
+          current1_ == typed_other->current1_ &&
+          current2_ == typed_other->current2_ &&
+          current3_ == typed_other->current3_ &&
+          current4_ == typed_other->current4_ &&
+          current5_ == typed_other->current5_ &&
+          current6_ == typed_other->current6_ &&
+          current7_ == typed_other->current7_ &&
+          current8_ == typed_other->current8_ &&
+          current9_ == typed_other->current9_);
+    }
+
+   private:
+    Iterator(const Iterator& other)
+        : base_(other.base_),
+        begin1_(other.begin1_),
+        end1_(other.end1_),
+        current1_(other.current1_),
+        begin2_(other.begin2_),
+        end2_(other.end2_),
+        current2_(other.current2_),
+        begin3_(other.begin3_),
+        end3_(other.end3_),
+        current3_(other.current3_),
+        begin4_(other.begin4_),
+        end4_(other.end4_),
+        current4_(other.current4_),
+        begin5_(other.begin5_),
+        end5_(other.end5_),
+        current5_(other.current5_),
+        begin6_(other.begin6_),
+        end6_(other.end6_),
+        current6_(other.current6_),
+        begin7_(other.begin7_),
+        end7_(other.end7_),
+        current7_(other.current7_),
+        begin8_(other.begin8_),
+        end8_(other.end8_),
+        current8_(other.current8_),
+        begin9_(other.begin9_),
+        end9_(other.end9_),
+        current9_(other.current9_) {
+      ComputeCurrentValue();  // current_value_ is rebuilt, not copied
+    }
+    // Refreshes current_value_ from the nine iterators unless AtEnd().
+    void ComputeCurrentValue() {
+      if (!AtEnd())
+        current_value_ = ParamType(*current1_, *current2_, *current3_,
+            *current4_, *current5_, *current6_, *current7_, *current8_,
+            *current9_);
+    }
+    bool AtEnd() const {
+      // We must report the iterator as past the end of the range when any of
+      // the component iterators has reached the end of its range.
+      return
+          current1_ == end1_ ||
+          current2_ == end2_ ||
+          current3_ == end3_ ||
+          current4_ == end4_ ||
+          current5_ == end5_ ||
+          current6_ == end6_ ||
+          current7_ == end7_ ||
+          current8_ == end8_ ||
+          current9_ == end9_;
+    }
+
+    // No implementation - assignment is unsupported.
+    void operator=(const Iterator& other);
+
+    const ParamGeneratorInterface<ParamType>* const base_;
+    // begin[i]_ and end[i]_ define the i-th range that Iterator traverses.
+    // current[i]_ is the actual traversing iterator.
+    const typename ParamGenerator<T1>::iterator begin1_;
+    const typename ParamGenerator<T1>::iterator end1_;
+    typename ParamGenerator<T1>::iterator current1_;
+    const typename ParamGenerator<T2>::iterator begin2_;
+    const typename ParamGenerator<T2>::iterator end2_;
+    typename ParamGenerator<T2>::iterator current2_;
+    const typename ParamGenerator<T3>::iterator begin3_;
+    const typename ParamGenerator<T3>::iterator end3_;
+    typename ParamGenerator<T3>::iterator current3_;
+    const typename ParamGenerator<T4>::iterator begin4_;
+    const typename ParamGenerator<T4>::iterator end4_;
+    typename ParamGenerator<T4>::iterator current4_;
+    const typename ParamGenerator<T5>::iterator begin5_;
+    const typename ParamGenerator<T5>::iterator end5_;
+    typename ParamGenerator<T5>::iterator current5_;
+    const typename ParamGenerator<T6>::iterator begin6_;
+    const typename ParamGenerator<T6>::iterator end6_;
+    typename ParamGenerator<T6>::iterator current6_;
+    const typename ParamGenerator<T7>::iterator begin7_;
+    const typename ParamGenerator<T7>::iterator end7_;
+    typename ParamGenerator<T7>::iterator current7_;
+    const typename ParamGenerator<T8>::iterator begin8_;
+    const typename ParamGenerator<T8>::iterator end8_;
+    typename ParamGenerator<T8>::iterator current8_;
+    const typename ParamGenerator<T9>::iterator begin9_;
+    const typename ParamGenerator<T9>::iterator end9_;
+    typename ParamGenerator<T9>::iterator current9_;
+    ParamType current_value_;  // tuple that Current() points to
+  };  // class CartesianProductGenerator9::Iterator
+
+  // No implementation - assignment is unsupported.
+  void operator=(const CartesianProductGenerator9& other);
+
+  const ParamGenerator<T1> g1_;
+  const ParamGenerator<T2> g2_;
+  const ParamGenerator<T3> g3_;
+  const ParamGenerator<T4> g4_;
+  const ParamGenerator<T5> g5_;
+  const ParamGenerator<T6> g6_;
+  const ParamGenerator<T7> g7_;
+  const ParamGenerator<T8> g8_;
+  const ParamGenerator<T9> g9_;
+};  // class CartesianProductGenerator9
+
+// Cartesian product of ten generators; each element is a 10-tuple of params.
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+    typename T6, typename T7, typename T8, typename T9, typename T10>
+class CartesianProductGenerator10
+    : public ParamGeneratorInterface< ::std::tr1::tuple<T1, T2, T3, T4, T5, T6,
+        T7, T8, T9, T10> > {
+ public:
+  typedef ::std::tr1::tuple<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10> ParamType;
+  // Stores its own copy of each of the ten component generators.
+  CartesianProductGenerator10(const ParamGenerator<T1>& g1,
+      const ParamGenerator<T2>& g2, const ParamGenerator<T3>& g3,
+      const ParamGenerator<T4>& g4, const ParamGenerator<T5>& g5,
+      const ParamGenerator<T6>& g6, const ParamGenerator<T7>& g7,
+      const ParamGenerator<T8>& g8, const ParamGenerator<T9>& g9,
+      const ParamGenerator<T10>& g10)
+      : g1_(g1), g2_(g2), g3_(g3), g4_(g4), g5_(g5), g6_(g6), g7_(g7), g8_(g8),
+          g9_(g9), g10_(g10) {}
+  virtual ~CartesianProductGenerator10() {}
+  // Begin() and End() each return an Iterator allocated with new.
+  virtual ParamIteratorInterface<ParamType>* Begin() const {
+    return new Iterator(this, g1_, g1_.begin(), g2_, g2_.begin(), g3_,
+        g3_.begin(), g4_, g4_.begin(), g5_, g5_.begin(), g6_, g6_.begin(), g7_,
+        g7_.begin(), g8_, g8_.begin(), g9_, g9_.begin(), g10_, g10_.begin());
+  }
+  virtual ParamIteratorInterface<ParamType>* End() const {
+    return new Iterator(this, g1_, g1_.end(), g2_, g2_.end(), g3_, g3_.end(),
+        g4_, g4_.end(), g5_, g5_.end(), g6_, g6_.end(), g7_, g7_.end(), g8_,
+        g8_.end(), g9_, g9_.end(), g10_, g10_.end());
+  }
+
+ private:
+  class Iterator : public ParamIteratorInterface<ParamType> {
+   public:
+    Iterator(const ParamGeneratorInterface<ParamType>* base,
+      const ParamGenerator<T1>& g1,
+      const typename ParamGenerator<T1>::iterator& current1,
+      const ParamGenerator<T2>& g2,
+      const typename ParamGenerator<T2>::iterator& current2,
+      const ParamGenerator<T3>& g3,
+      const typename ParamGenerator<T3>::iterator& current3,
+      const ParamGenerator<T4>& g4,
+      const typename ParamGenerator<T4>::iterator& current4,
+      const ParamGenerator<T5>& g5,
+      const typename ParamGenerator<T5>::iterator& current5,
+      const ParamGenerator<T6>& g6,
+      const typename ParamGenerator<T6>::iterator& current6,
+      const ParamGenerator<T7>& g7,
+      const typename ParamGenerator<T7>::iterator& current7,
+      const ParamGenerator<T8>& g8,
+      const typename ParamGenerator<T8>::iterator& current8,
+      const ParamGenerator<T9>& g9,
+      const typename ParamGenerator<T9>::iterator& current9,
+      const ParamGenerator<T10>& g10,
+      const typename ParamGenerator<T10>::iterator& current10)
+        : base_(base),
+          begin1_(g1.begin()), end1_(g1.end()), current1_(current1),
+          begin2_(g2.begin()), end2_(g2.end()), current2_(current2),
+          begin3_(g3.begin()), end3_(g3.end()), current3_(current3),
+          begin4_(g4.begin()), end4_(g4.end()), current4_(current4),
+          begin5_(g5.begin()), end5_(g5.end()), current5_(current5),
+          begin6_(g6.begin()), end6_(g6.end()), current6_(current6),
+          begin7_(g7.begin()), end7_(g7.end()), current7_(current7),
+          begin8_(g8.begin()), end8_(g8.end()), current8_(current8),
+          begin9_(g9.begin()), end9_(g9.end()), current9_(current9),
+          begin10_(g10.begin()), end10_(g10.end()), current10_(current10)    {
+      ComputeCurrentValue();
+    }
+    virtual ~Iterator() {}
+
+    virtual const ParamGeneratorInterface<ParamType>* BaseGenerator() const {
+      return base_;
+    }
+    // Advance must not be called on a past-the-end iterator, so on entry no
+    // component iterator may be at the end of its range.
+    virtual void Advance() {
+      assert(!AtEnd());
+      ++current10_;  // the last component varies fastest
+      if (current10_ == end10_) {
+        current10_ = begin10_;  // wrap around and carry into component 9
+        ++current9_;
+      }
+      if (current9_ == end9_) {
+        current9_ = begin9_;
+        ++current8_;
+      }
+      if (current8_ == end8_) {
+        current8_ = begin8_;
+        ++current7_;
+      }
+      if (current7_ == end7_) {
+        current7_ = begin7_;
+        ++current6_;
+      }
+      if (current6_ == end6_) {
+        current6_ = begin6_;
+        ++current5_;
+      }
+      if (current5_ == end5_) {
+        current5_ = begin5_;
+        ++current4_;
+      }
+      if (current4_ == end4_) {
+        current4_ = begin4_;
+        ++current3_;
+      }
+      if (current3_ == end3_) {
+        current3_ = begin3_;
+        ++current2_;
+      }
+      if (current2_ == end2_) {
+        current2_ = begin2_;
+        ++current1_;  // never reset: current1_ == end1_ means AtEnd()
+      }
+      ComputeCurrentValue();
+    }
+    virtual ParamIteratorInterface<ParamType>* Clone() const {
+      return new Iterator(*this);  // via the private copy constructor
+    }
+    virtual const ParamType* Current() const { return &current_value_; }
+    virtual bool Equals(const ParamIteratorInterface<ParamType>& other) const {
+      // Having the same base generator guarantees that the other
+      // iterator is of the same type and we can downcast.
+      GTEST_CHECK_(BaseGenerator() == other.BaseGenerator())
+          << "The program attempted to compare iterators "
+          << "from different generators." << std::endl;
+      const Iterator* typed_other =
+          CheckedDowncastToActualType<const Iterator>(&other);
+      // We must report iterators equal if they both point beyond their
+      // respective ranges. That can happen in a variety of fashions,
+      // so we have to consult AtEnd().
+      return (AtEnd() && typed_other->AtEnd()) ||
+         (
+          current1_ == typed_other->current1_ &&
+          current2_ == typed_other->current2_ &&
+          current3_ == typed_other->current3_ &&
+          current4_ == typed_other->current4_ &&
+          current5_ == typed_other->current5_ &&
+          current6_ == typed_other->current6_ &&
+          current7_ == typed_other->current7_ &&
+          current8_ == typed_other->current8_ &&
+          current9_ == typed_other->current9_ &&
+          current10_ == typed_other->current10_);
+    }
+
+   private:
+    Iterator(const Iterator& other)
+        : base_(other.base_),
+        begin1_(other.begin1_),
+        end1_(other.end1_),
+        current1_(other.current1_),
+        begin2_(other.begin2_),
+        end2_(other.end2_),
+        current2_(other.current2_),
+        begin3_(other.begin3_),
+        end3_(other.end3_),
+        current3_(other.current3_),
+        begin4_(other.begin4_),
+        end4_(other.end4_),
+        current4_(other.current4_),
+        begin5_(other.begin5_),
+        end5_(other.end5_),
+        current5_(other.current5_),
+        begin6_(other.begin6_),
+        end6_(other.end6_),
+        current6_(other.current6_),
+        begin7_(other.begin7_),
+        end7_(other.end7_),
+        current7_(other.current7_),
+        begin8_(other.begin8_),
+        end8_(other.end8_),
+        current8_(other.current8_),
+        begin9_(other.begin9_),
+        end9_(other.end9_),
+        current9_(other.current9_),
+        begin10_(other.begin10_),
+        end10_(other.end10_),
+        current10_(other.current10_) {
+      ComputeCurrentValue();  // current_value_ is rebuilt, not copied
+    }
+    // Refreshes current_value_ from the ten iterators unless AtEnd().
+    void ComputeCurrentValue() {
+      if (!AtEnd())
+        current_value_ = ParamType(*current1_, *current2_, *current3_,
+            *current4_, *current5_, *current6_, *current7_, *current8_,
+            *current9_, *current10_);
+    }
+    bool AtEnd() const {
+      // We must report the iterator as past the end of the range when any of
+      // the component iterators has reached the end of its range.
+      return
+          current1_ == end1_ ||
+          current2_ == end2_ ||
+          current3_ == end3_ ||
+          current4_ == end4_ ||
+          current5_ == end5_ ||
+          current6_ == end6_ ||
+          current7_ == end7_ ||
+          current8_ == end8_ ||
+          current9_ == end9_ ||
+          current10_ == end10_;
+    }
+
+    // No implementation - assignment is unsupported.
+    void operator=(const Iterator& other);
+
+    const ParamGeneratorInterface<ParamType>* const base_;
+    // begin[i]_ and end[i]_ define the i-th range that Iterator traverses.
+    // current[i]_ is the actual traversing iterator.
+    const typename ParamGenerator<T1>::iterator begin1_;
+    const typename ParamGenerator<T1>::iterator end1_;
+    typename ParamGenerator<T1>::iterator current1_;
+    const typename ParamGenerator<T2>::iterator begin2_;
+    const typename ParamGenerator<T2>::iterator end2_;
+    typename ParamGenerator<T2>::iterator current2_;
+    const typename ParamGenerator<T3>::iterator begin3_;
+    const typename ParamGenerator<T3>::iterator end3_;
+    typename ParamGenerator<T3>::iterator current3_;
+    const typename ParamGenerator<T4>::iterator begin4_;
+    const typename ParamGenerator<T4>::iterator end4_;
+    typename ParamGenerator<T4>::iterator current4_;
+    const typename ParamGenerator<T5>::iterator begin5_;
+    const typename ParamGenerator<T5>::iterator end5_;
+    typename ParamGenerator<T5>::iterator current5_;
+    const typename ParamGenerator<T6>::iterator begin6_;
+    const typename ParamGenerator<T6>::iterator end6_;
+    typename ParamGenerator<T6>::iterator current6_;
+    const typename ParamGenerator<T7>::iterator begin7_;
+    const typename ParamGenerator<T7>::iterator end7_;
+    typename ParamGenerator<T7>::iterator current7_;
+    const typename ParamGenerator<T8>::iterator begin8_;
+    const typename ParamGenerator<T8>::iterator end8_;
+    typename ParamGenerator<T8>::iterator current8_;
+    const typename ParamGenerator<T9>::iterator begin9_;
+    const typename ParamGenerator<T9>::iterator end9_;
+    typename ParamGenerator<T9>::iterator current9_;
+    const typename ParamGenerator<T10>::iterator begin10_;
+    const typename ParamGenerator<T10>::iterator end10_;
+    typename ParamGenerator<T10>::iterator current10_;
+    ParamType current_value_;  // tuple that Current() points to
+  };  // class CartesianProductGenerator10::Iterator
+
+  // No implementation - assignment is unsupported.
+  void operator=(const CartesianProductGenerator10& other);
+
+  const ParamGenerator<T1> g1_;
+  const ParamGenerator<T2> g2_;
+  const ParamGenerator<T3> g3_;
+  const ParamGenerator<T4> g4_;
+  const ParamGenerator<T5> g5_;
+  const ParamGenerator<T6> g6_;
+  const ParamGenerator<T7> g7_;
+  const ParamGenerator<T8> g8_;
+  const ParamGenerator<T9> g9_;
+  const ParamGenerator<T10> g10_;
+};  // class CartesianProductGenerator10
+
+
+// INTERNAL IMPLEMENTATION - DO NOT USE IN USER CODE.
+//
+// Helper classes providing Combine() with polymorphic features. They allow
+// casting CartesianProductGeneratorN<T> to ParamGenerator<U> if T is
+// convertible to U.
+//
+template <class Generator1, class Generator2>
+class CartesianProductHolder2 {
+ public:
+  CartesianProductHolder2(const Generator1& g1, const Generator2& g2)
+      : g1_(g1), g2_(g2) {}
+  // Converts to a ParamGenerator backed by a new CartesianProductGenerator2.
+  template <typename T1, typename T2>
+  operator ParamGenerator< ::std::tr1::tuple<T1, T2> >() const {
+    typedef CartesianProductGenerator2<T1, T2> Product;
+    return ParamGenerator< ::std::tr1::tuple<T1, T2> >(
+        new Product(static_cast<ParamGenerator<T1> >(g1_),
+                    static_cast<ParamGenerator<T2> >(g2_)));
+  }
+
+ private:
+  void operator=(const CartesianProductHolder2& other);  // Not implemented.
+
+  const Generator1 g1_;
+  const Generator2 g2_;
+};  // class CartesianProductHolder2
+
+template <class Generator1, class Generator2, class Generator3>
+class CartesianProductHolder3 {
+ public:
+  CartesianProductHolder3(const Generator1& g1, const Generator2& g2,
+      const Generator3& g3)
+      : g1_(g1), g2_(g2), g3_(g3) {}
+  // Converts to a ParamGenerator backed by a new CartesianProductGenerator3.
+  template <typename T1, typename T2, typename T3>
+  operator ParamGenerator< ::std::tr1::tuple<T1, T2, T3> >() const {
+    typedef CartesianProductGenerator3<T1, T2, T3> Product;
+    return ParamGenerator< ::std::tr1::tuple<T1, T2, T3> >(
+        new Product(static_cast<ParamGenerator<T1> >(g1_),
+                    static_cast<ParamGenerator<T2> >(g2_),
+                    static_cast<ParamGenerator<T3> >(g3_)));
+  }
+
+ private:
+  void operator=(const CartesianProductHolder3& other);  // Not implemented.
+
+  const Generator1 g1_;
+  const Generator2 g2_;
+  const Generator3 g3_;
+};  // class CartesianProductHolder3
+
+template <class Generator1, class Generator2, class Generator3,
+    class Generator4>
+class CartesianProductHolder4 {
+ public:
+  CartesianProductHolder4(const Generator1& g1, const Generator2& g2,
+      const Generator3& g3, const Generator4& g4)
+      : g1_(g1), g2_(g2), g3_(g3), g4_(g4) {}
+  // Converts to a ParamGenerator backed by a new CartesianProductGenerator4.
+  template <typename T1, typename T2, typename T3, typename T4>
+  operator ParamGenerator< ::std::tr1::tuple<T1, T2, T3, T4> >() const {
+    typedef CartesianProductGenerator4<T1, T2, T3, T4> Product;
+    return ParamGenerator< ::std::tr1::tuple<T1, T2, T3, T4> >(
+        new Product(static_cast<ParamGenerator<T1> >(g1_),
+                    static_cast<ParamGenerator<T2> >(g2_),
+                    static_cast<ParamGenerator<T3> >(g3_),
+                    static_cast<ParamGenerator<T4> >(g4_)));
+  }
+
+ private:
+  void operator=(const CartesianProductHolder4& other);  // Not implemented.
+
+  const Generator1 g1_;
+  const Generator2 g2_;
+  const Generator3 g3_;
+  const Generator4 g4_;
+};  // class CartesianProductHolder4
+
+template <class Generator1, class Generator2, class Generator3,
+    class Generator4, class Generator5>
+class CartesianProductHolder5 {
+ public:
+  CartesianProductHolder5(const Generator1& g1, const Generator2& g2,
+      const Generator3& g3, const Generator4& g4, const Generator5& g5)
+      : g1_(g1), g2_(g2), g3_(g3), g4_(g4), g5_(g5) {}
+  // Converts to a ParamGenerator backed by a new CartesianProductGenerator5.
+  template <typename T1, typename T2, typename T3, typename T4, typename T5>
+  operator ParamGenerator< ::std::tr1::tuple<T1, T2, T3, T4, T5> >() const {
+    typedef CartesianProductGenerator5<T1, T2, T3, T4, T5> Product;
+    return ParamGenerator< ::std::tr1::tuple<T1, T2, T3, T4, T5> >(
+        new Product(static_cast<ParamGenerator<T1> >(g1_),
+                    static_cast<ParamGenerator<T2> >(g2_),
+                    static_cast<ParamGenerator<T3> >(g3_),
+                    static_cast<ParamGenerator<T4> >(g4_),
+                    static_cast<ParamGenerator<T5> >(g5_)));
+  }
+
+ private:
+  void operator=(const CartesianProductHolder5& other);  // Not implemented.
+
+  const Generator1 g1_;
+  const Generator2 g2_;
+  const Generator3 g3_;
+  const Generator4 g4_;
+  const Generator5 g5_;
+};  // class CartesianProductHolder5
+
+template <class Generator1, class Generator2, class Generator3,
+    class Generator4, class Generator5, class Generator6>
+class CartesianProductHolder6 {
+ public:
+  CartesianProductHolder6(const Generator1& g1, const Generator2& g2,
+      const Generator3& g3, const Generator4& g4, const Generator5& g5,
+      const Generator6& g6)
+      : g1_(g1), g2_(g2), g3_(g3), g4_(g4), g5_(g5), g6_(g6) {}
+  // Converts to a ParamGenerator backed by a new CartesianProductGenerator6.
+  template <typename T1, typename T2, typename T3, typename T4, typename T5,
+      typename T6>
+  operator ParamGenerator< ::std::tr1::tuple<T1, T2, T3, T4, T5, T6> >() const {
+    typedef CartesianProductGenerator6<T1, T2, T3, T4, T5, T6> Product;
+    return ParamGenerator< ::std::tr1::tuple<T1, T2, T3, T4, T5, T6> >(
+        new Product(static_cast<ParamGenerator<T1> >(g1_),
+                    static_cast<ParamGenerator<T2> >(g2_),
+                    static_cast<ParamGenerator<T3> >(g3_),
+                    static_cast<ParamGenerator<T4> >(g4_),
+                    static_cast<ParamGenerator<T5> >(g5_),
+                    static_cast<ParamGenerator<T6> >(g6_)));
+  }
+
+ private:
+  void operator=(const CartesianProductHolder6& other);  // Not implemented.
+
+  const Generator1 g1_;
+  const Generator2 g2_;
+  const Generator3 g3_;
+  const Generator4 g4_;
+  const Generator5 g5_;
+  const Generator6 g6_;
+};  // class CartesianProductHolder6
+
+template <class Generator1, class Generator2, class Generator3,
+    class Generator4, class Generator5, class Generator6, class Generator7>
+class CartesianProductHolder7 {
+ public:
+  CartesianProductHolder7(const Generator1& g1, const Generator2& g2,
+      const Generator3& g3, const Generator4& g4, const Generator5& g5,
+      const Generator6& g6, const Generator7& g7)
+      : g1_(g1), g2_(g2), g3_(g3), g4_(g4), g5_(g5), g6_(g6), g7_(g7) {}
+  // Converts to a ParamGenerator backed by a new CartesianProductGenerator7.
+  template <typename T1, typename T2, typename T3, typename T4, typename T5,
+      typename T6, typename T7>
+  operator ParamGenerator< ::std::tr1::tuple<T1, T2, T3, T4, T5, T6,
+      T7> >() const {
+    typedef CartesianProductGenerator7<T1, T2, T3, T4, T5, T6, T7> Product;
+    return ParamGenerator< ::std::tr1::tuple<T1, T2, T3, T4, T5, T6, T7> >(
+        new Product(static_cast<ParamGenerator<T1> >(g1_),
+                    static_cast<ParamGenerator<T2> >(g2_),
+                    static_cast<ParamGenerator<T3> >(g3_),
+                    static_cast<ParamGenerator<T4> >(g4_),
+                    static_cast<ParamGenerator<T5> >(g5_),
+                    static_cast<ParamGenerator<T6> >(g6_),
+                    static_cast<ParamGenerator<T7> >(g7_)));
+  }
+
+ private:
+  void operator=(const CartesianProductHolder7& other);  // Not implemented.
+
+  const Generator1 g1_;
+  const Generator2 g2_;
+  const Generator3 g3_;
+  const Generator4 g4_;
+  const Generator5 g5_;
+  const Generator6 g6_;
+  const Generator7 g7_;
+};  // class CartesianProductHolder7
+
+template <class Generator1, class Generator2, class Generator3,
+    class Generator4, class Generator5, class Generator6, class Generator7,
+    class Generator8>
+class CartesianProductHolder8 {
+ public:
+  CartesianProductHolder8(const Generator1& g1, const Generator2& g2,
+      const Generator3& g3, const Generator4& g4, const Generator5& g5,
+      const Generator6& g6, const Generator7& g7, const Generator8& g8)
+      : g1_(g1), g2_(g2), g3_(g3), g4_(g4), g5_(g5), g6_(g6), g7_(g7),
+          g8_(g8) {}
+  // Converts to a ParamGenerator backed by a new CartesianProductGenerator8.
+  template <typename T1, typename T2, typename T3, typename T4, typename T5,
+      typename T6, typename T7, typename T8>
+  operator ParamGenerator< ::std::tr1::tuple<T1, T2, T3, T4, T5, T6, T7,
+      T8> >() const {
+    typedef CartesianProductGenerator8<T1, T2, T3, T4, T5, T6, T7, T8> Product;
+    return ParamGenerator< ::std::tr1::tuple<T1, T2, T3, T4, T5, T6, T7, T8> >(
+        new Product(static_cast<ParamGenerator<T1> >(g1_),
+                    static_cast<ParamGenerator<T2> >(g2_),
+                    static_cast<ParamGenerator<T3> >(g3_),
+                    static_cast<ParamGenerator<T4> >(g4_),
+                    static_cast<ParamGenerator<T5> >(g5_),
+                    static_cast<ParamGenerator<T6> >(g6_),
+                    static_cast<ParamGenerator<T7> >(g7_),
+                    static_cast<ParamGenerator<T8> >(g8_)));
+  }
+
+ private:
+  void operator=(const CartesianProductHolder8& other);  // Not implemented.
+
+  const Generator1 g1_;
+  const Generator2 g2_;
+  const Generator3 g3_;
+  const Generator4 g4_;
+  const Generator5 g5_;
+  const Generator6 g6_;
+  const Generator7 g7_;
+  const Generator8 g8_;
+};  // class CartesianProductHolder8
+
+template <class Generator1, class Generator2, class Generator3,
+    class Generator4, class Generator5, class Generator6, class Generator7,
+    class Generator8, class Generator9>
+class CartesianProductHolder9 {
+ public:
+  CartesianProductHolder9(const Generator1& g1, const Generator2& g2,
+      const Generator3& g3, const Generator4& g4, const Generator5& g5,
+      const Generator6& g6, const Generator7& g7, const Generator8& g8,
+      const Generator9& g9)
+      : g1_(g1), g2_(g2), g3_(g3), g4_(g4), g5_(g5), g6_(g6), g7_(g7), g8_(g8),
+          g9_(g9) {}
+  // Converts to a ParamGenerator backed by a new CartesianProductGenerator9.
+  template <typename T1, typename T2, typename T3, typename T4, typename T5,
+      typename T6, typename T7, typename T8, typename T9>
+  operator ParamGenerator< ::std::tr1::tuple<T1, T2, T3, T4, T5, T6, T7, T8,
+      T9> >() const {
+    typedef ::std::tr1::tuple<T1, T2, T3, T4, T5, T6, T7, T8, T9> Tuple;
+    return ParamGenerator<Tuple>(
+        new CartesianProductGenerator9<T1, T2, T3, T4, T5, T6, T7, T8, T9>(
+            static_cast<ParamGenerator<T1> >(g1_),
+            static_cast<ParamGenerator<T2> >(g2_),
+            static_cast<ParamGenerator<T3> >(g3_),
+            static_cast<ParamGenerator<T4> >(g4_),
+            static_cast<ParamGenerator<T5> >(g5_),
+            static_cast<ParamGenerator<T6> >(g6_),
+            static_cast<ParamGenerator<T7> >(g7_),
+            static_cast<ParamGenerator<T8> >(g8_),
+            static_cast<ParamGenerator<T9> >(g9_)));
+  }
+
+ private:
+  void operator=(const CartesianProductHolder9& other);  // Not implemented.
+
+  const Generator1 g1_;
+  const Generator2 g2_;
+  const Generator3 g3_;
+  const Generator4 g4_;
+  const Generator5 g5_;
+  const Generator6 g6_;
+  const Generator7 g7_;
+  const Generator8 g8_;
+  const Generator9 g9_;
+};  // class CartesianProductHolder9
+
+template <class Generator1, class Generator2, class Generator3,
+    class Generator4, class Generator5, class Generator6, class Generator7,
+    class Generator8, class Generator9, class Generator10>
+class CartesianProductHolder10 {
+ public:
+  CartesianProductHolder10(const Generator1& g1, const Generator2& g2,
+      const Generator3& g3, const Generator4& g4, const Generator5& g5,
+      const Generator6& g6, const Generator7& g7, const Generator8& g8,
+      const Generator9& g9, const Generator10& g10)
+      : g1_(g1), g2_(g2), g3_(g3), g4_(g4), g5_(g5), g6_(g6), g7_(g7), g8_(g8),
+          g9_(g9), g10_(g10) {}
+  // Converts to a ParamGenerator backed by a new CartesianProductGenerator10.
+  template <typename T1, typename T2, typename T3, typename T4, typename T5,
+      typename T6, typename T7, typename T8, typename T9, typename T10>
+  operator ParamGenerator< ::std::tr1::tuple<T1, T2, T3, T4, T5, T6, T7, T8,
+      T9, T10> >() const {
+    typedef ::std::tr1::tuple<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10> Tuple;
+    return ParamGenerator<Tuple>(
+        new CartesianProductGenerator10<T1, T2, T3, T4, T5, T6, T7, T8, T9,
+            T10>(
+            static_cast<ParamGenerator<T1> >(g1_),
+            static_cast<ParamGenerator<T2> >(g2_),
+            static_cast<ParamGenerator<T3> >(g3_),
+            static_cast<ParamGenerator<T4> >(g4_),
+            static_cast<ParamGenerator<T5> >(g5_),
+            static_cast<ParamGenerator<T6> >(g6_),
+            static_cast<ParamGenerator<T7> >(g7_),
+            static_cast<ParamGenerator<T8> >(g8_),
+            static_cast<ParamGenerator<T9> >(g9_),
+            static_cast<ParamGenerator<T10> >(g10_)));
+  }
+
+ private:
+  void operator=(const CartesianProductHolder10& other);  // Not implemented.
+
+  const Generator1 g1_;
+  const Generator2 g2_;
+  const Generator3 g3_;
+  const Generator4 g4_;
+  const Generator5 g5_;
+  const Generator6 g6_;
+  const Generator7 g7_;
+  const Generator8 g8_;
+  const Generator9 g9_;
+  const Generator10 g10_;
+};  // class CartesianProductHolder10
+
+# endif  // GTEST_HAS_COMBINE
+
+}  // namespace internal
+}  // namespace testing
+
+#endif  //  GTEST_HAS_PARAM_TEST
+
+#endif  // GTEST_INCLUDE_GTEST_INTERNAL_GTEST_PARAM_UTIL_GENERATED_H_
+
+#if GTEST_HAS_PARAM_TEST
+
+namespace testing {
+
+// Functions producing parameter generators.
+//
+// Google Test uses these generators to produce parameters for value-
+// parameterized tests. When a parameterized test case is instantiated
+// with a particular generator, Google Test creates and runs tests
+// for each element in the sequence produced by the generator.
+//
+// In the following sample, tests from test case FooTest are instantiated
+// each three times with parameter values 3, 5, and 8:
+//
+// class FooTest : public TestWithParam<int> { ... };
+//
+// TEST_P(FooTest, TestThis) {
+// }
+// TEST_P(FooTest, TestThat) {
+// }
+// INSTANTIATE_TEST_CASE_P(TestSequence, FooTest, Values(3, 5, 8));
+//
+
+// Range() returns generators providing sequences of values in a range.
+//
+// Synopsis:
+// Range(start, end)
+//   - returns a generator producing a sequence of values {start, start+1,
+//     start+2, ...}.
+// Range(start, end, step)
+//   - returns a generator producing a sequence of values {start, start+step,
+//     start+step+step, ...}.
+// Notes:
+//   * The generated sequences never include end. For example, Range(1, 5)
+//     returns a generator producing a sequence {1, 2, 3, 4}. Range(1, 9, 2)
+//     returns a generator producing {1, 3, 5, 7}.
+//   * start and end must have the same type. That type may be any integral or
+//     floating-point type or a user defined type satisfying these conditions:
+//     * It must be assignable (have operator=() defined).
+//     * It must have operator+() (operator+(int-compatible type) for
+//       two-operand version).
+//     * It must have operator<() defined.
+//     Elements in the resulting sequences will also have that type.
+//   * Condition start < end must be satisfied in order for resulting sequences
+//     to contain any elements.
+//
+template <typename T, typename IncrementT>
+internal::ParamGenerator<T> Range(T start, T end, IncrementT step) {
+  return internal::ParamGenerator<T>(  // wraps the heap-allocated generator
+      new internal::RangeGenerator<T, IncrementT>(start, end, step));
+}  // Range(start, end, step): explicit-step overload
+
+template <typename T>
+internal::ParamGenerator<T> Range(T start, T end) {
+  return Range(start, end, 1);  // delegates to the overload above, step = 1
+}  // Range(start, end): unit-step overload
+
+// ValuesIn() function allows generation of tests with parameters coming from
+// a container.
+//
+// Synopsis:
+// ValuesIn(const T (&array)[N])
+//   - returns a generator producing sequences with elements from
+//     a C-style array.
+// ValuesIn(const Container& container)
+//   - returns a generator producing sequences with elements from
+//     an STL-style container.
+// ValuesIn(Iterator begin, Iterator end)
+//   - returns a generator producing sequences with elements from
+//     a range [begin, end) defined by a pair of STL-style iterators. These
+//     iterators can also be plain C pointers.
+//
+// Please note that ValuesIn copies the values from the containers
+// passed in and keeps them to generate tests in RUN_ALL_TESTS().
+//
+// Examples:
+//
+// This instantiates tests from test case StringTest
+// each with C-string values of "foo", "bar", and "baz":
+//
+// const char* strings[] = {"foo", "bar", "baz"};
+// INSTANTIATE_TEST_CASE_P(StringSequence, StringTest, ValuesIn(strings));
+//
+// This instantiates tests from test case StlStringTest
+// each with STL strings with values "a" and "b":
+//
+// ::std::vector< ::std::string> GetParameterStrings() {
+//   ::std::vector< ::std::string> v;
+//   v.push_back("a");
+//   v.push_back("b");
+//   return v;
+// }
+//
+// INSTANTIATE_TEST_CASE_P(CharSequence,
+//                         StlStringTest,
+//                         ValuesIn(GetParameterStrings()));
+//
+//
+// This will also instantiate tests from CharTest
+// each with parameter values 'a' and 'b':
+//
+// ::std::list<char> GetParameterChars() {
+//   ::std::list<char> list;
+//   list.push_back('a');
+//   list.push_back('b');
+//   return list;
+// }
+// ::std::list<char> l = GetParameterChars();
+// INSTANTIATE_TEST_CASE_P(CharSequence2,
+//                         CharTest,
+//                         ValuesIn(l.begin(), l.end()));
+//
+template <typename ForwardIterator>
+internal::ParamGenerator<
+  typename ::testing::internal::IteratorTraits<ForwardIterator>::value_type>
+ValuesIn(ForwardIterator begin, ForwardIterator end) {
+  typedef typename ::testing::internal::IteratorTraits<ForwardIterator>
+      ::value_type ParamType;  // value_type reported by IteratorTraits
+  return internal::ParamGenerator<ParamType>(
+      new internal::ValuesInIteratorRangeGenerator<ParamType>(begin, end));
+}  // ValuesIn(begin, end): iterator-pair overload
+
+template <typename T, size_t N>
+internal::ParamGenerator<T> ValuesIn(const T (&array)[N]) {
+  return ValuesIn(array, array + N);  // pointers to first and one-past-last
+}  // ValuesIn(array): C-array overload; N is deduced from the argument
+
+template <class Container>
+internal::ParamGenerator<typename Container::value_type> ValuesIn(
+    const Container& container) {
+  return ValuesIn(container.begin(), container.end());  // whole container
+}  // ValuesIn(container): requires value_type, begin() and end()
+
+// Values() allows generating tests from explicitly specified list of
+// parameters.
+//
+// Synopsis:
+// Values(T v1, T v2, ..., T vN)
+//   - returns a generator producing sequences with elements v1, v2, ..., vN.
+//
+// For example, this instantiates tests from test case BarTest each
+// with values "one", "two", and "three":
+//
+// INSTANTIATE_TEST_CASE_P(NumSequence, BarTest, Values("one", "two", "three"));
+//
+// This instantiates tests from test case BazTest each with values 1, 2, 3.5.
+// The exact type of values will depend on the type of parameter in BazTest.
+//
+// INSTANTIATE_TEST_CASE_P(FloatingNumbers, BazTest, Values(1, 2, 3.5));
+//
+// Currently, Values() supports from 1 to 50 parameters.
+//
+template <typename T1>
+internal::ValueArray1<T1> Values(T1 v1) {
+  return internal::ValueArray1<T1>(v1);
+}  // Values(): 1-value overload; returns ValueArray1 built from v1
+
+template <typename T1, typename T2>
+internal::ValueArray2<T1, T2> Values(T1 v1, T2 v2) {
+  return internal::ValueArray2<T1, T2>(v1, v2);
+}  // Values(): 2-value overload; returns ValueArray2 built from v1..v2
+
+template <typename T1, typename T2, typename T3>
+internal::ValueArray3<T1, T2, T3> Values(T1 v1, T2 v2, T3 v3) {
+  return internal::ValueArray3<T1, T2, T3>(v1, v2, v3);
+}  // Values(): 3-value overload; returns ValueArray3 built from v1..v3
+
+template <typename T1, typename T2, typename T3, typename T4>
+internal::ValueArray4<T1, T2, T3, T4> Values(T1 v1, T2 v2, T3 v3, T4 v4) {
+  return internal::ValueArray4<T1, T2, T3, T4>(v1, v2, v3, v4);
+}  // Values(): 4-value overload; returns ValueArray4 built from v1..v4
+
+template <typename T1, typename T2, typename T3, typename T4, typename T5>
+internal::ValueArray5<T1, T2, T3, T4, T5> Values(T1 v1, T2 v2, T3 v3, T4 v4,
+    T5 v5) {
+  return internal::ValueArray5<T1, T2, T3, T4, T5>(v1, v2, v3, v4, v5);
+}  // Values(): 5-value overload; returns ValueArray5 built from v1..v5
+
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+    typename T6>
+internal::ValueArray6<T1, T2, T3, T4, T5, T6> Values(T1 v1, T2 v2, T3 v3,
+    T4 v4, T5 v5, T6 v6) {
+  return internal::ValueArray6<T1, T2, T3, T4, T5, T6>(v1, v2, v3, v4, v5, v6);
+}  // Values(): 6-value overload; returns ValueArray6 built from v1..v6
+
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+    typename T6, typename T7>
+internal::ValueArray7<T1, T2, T3, T4, T5, T6, T7> Values(T1 v1, T2 v2, T3 v3,
+    T4 v4, T5 v5, T6 v6, T7 v7) {
+  return internal::ValueArray7<T1, T2, T3, T4, T5, T6, T7>(v1, v2, v3, v4, v5,
+      v6, v7);
+}  // Values(): 7-value overload; returns ValueArray7 built from v1..v7
+
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+    typename T6, typename T7, typename T8>
+internal::ValueArray8<T1, T2, T3, T4, T5, T6, T7, T8> Values(T1 v1, T2 v2,
+    T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8) {
+  return internal::ValueArray8<T1, T2, T3, T4, T5, T6, T7, T8>(v1, v2, v3, v4,
+      v5, v6, v7, v8);
+}  // Values(): 8-value overload; returns ValueArray8 built from v1..v8
+
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+    typename T6, typename T7, typename T8, typename T9>
+internal::ValueArray9<T1, T2, T3, T4, T5, T6, T7, T8, T9> Values(T1 v1, T2 v2,
+    T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9) {
+  return internal::ValueArray9<T1, T2, T3, T4, T5, T6, T7, T8, T9>(v1, v2, v3,
+      v4, v5, v6, v7, v8, v9);
+}  // Values(): 9-value overload; returns ValueArray9 built from v1..v9
+
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+    typename T6, typename T7, typename T8, typename T9, typename T10>
+internal::ValueArray10<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10> Values(T1 v1,
+    T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9, T10 v10) {
+  return internal::ValueArray10<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10>(v1,
+      v2, v3, v4, v5, v6, v7, v8, v9, v10);
+}  // Values(): 10-value overload; returns ValueArray10 built from v1..v10
+
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+    typename T6, typename T7, typename T8, typename T9, typename T10,
+    typename T11>
+internal::ValueArray11<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10,
+    T11> Values(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9,
+    T10 v10, T11 v11) {
+  return internal::ValueArray11<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10,
+      T11>(v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11);
+}  // Values(): 11-value overload; returns ValueArray11 built from v1..v11
+
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+    typename T6, typename T7, typename T8, typename T9, typename T10,
+    typename T11, typename T12>
+internal::ValueArray12<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11,
+    T12> Values(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9,
+    T10 v10, T11 v11, T12 v12) {
+  return internal::ValueArray12<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11,
+      T12>(v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12);
+}  // Values(): 12-value overload; returns ValueArray12 built from v1..v12
+
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+    typename T6, typename T7, typename T8, typename T9, typename T10,
+    typename T11, typename T12, typename T13>
+internal::ValueArray13<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12,
+    T13> Values(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9,
+    T10 v10, T11 v11, T12 v12, T13 v13) {
+  return internal::ValueArray13<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11,
+      T12, T13>(v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13);
+}  // Values(): 13-value overload; returns ValueArray13 built from v1..v13
+
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+    typename T6, typename T7, typename T8, typename T9, typename T10,
+    typename T11, typename T12, typename T13, typename T14>
+internal::ValueArray14<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13,
+    T14> Values(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9,
+    T10 v10, T11 v11, T12 v12, T13 v13, T14 v14) {
+  return internal::ValueArray14<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11,
+      T12, T13, T14>(v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13,
+      v14);
+}  // Values(): 14-value overload; returns ValueArray14 built from v1..v14
+
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+    typename T6, typename T7, typename T8, typename T9, typename T10,
+    typename T11, typename T12, typename T13, typename T14, typename T15>
+internal::ValueArray15<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13,
+    T14, T15> Values(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8,
+    T9 v9, T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15) {
+  return internal::ValueArray15<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11,
+      T12, T13, T14, T15>(v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12,
+      v13, v14, v15);
+}  // Values(): 15-value overload; returns ValueArray15 built from v1..v15
+
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+    typename T6, typename T7, typename T8, typename T9, typename T10,
+    typename T11, typename T12, typename T13, typename T14, typename T15,
+    typename T16>
+internal::ValueArray16<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13,
+    T14, T15, T16> Values(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7,
+    T8 v8, T9 v9, T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15,
+    T16 v16) {
+  return internal::ValueArray16<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11,
+      T12, T13, T14, T15, T16>(v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11,
+      v12, v13, v14, v15, v16);
+}  // Values(): 16-value overload; returns ValueArray16 built from v1..v16
+
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+    typename T6, typename T7, typename T8, typename T9, typename T10,
+    typename T11, typename T12, typename T13, typename T14, typename T15,
+    typename T16, typename T17>
+internal::ValueArray17<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13,
+    T14, T15, T16, T17> Values(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7,
+    T8 v8, T9 v9, T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15,
+    T16 v16, T17 v17) {
+  return internal::ValueArray17<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11,
+      T12, T13, T14, T15, T16, T17>(v1, v2, v3, v4, v5, v6, v7, v8, v9, v10,
+      v11, v12, v13, v14, v15, v16, v17);
+}  // Values(): 17-value overload; returns ValueArray17 built from v1..v17
+
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+    typename T6, typename T7, typename T8, typename T9, typename T10,
+    typename T11, typename T12, typename T13, typename T14, typename T15,
+    typename T16, typename T17, typename T18>
+internal::ValueArray18<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13,
+    T14, T15, T16, T17, T18> Values(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6,
+    T7 v7, T8 v8, T9 v9, T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15,
+    T16 v16, T17 v17, T18 v18) {
+  return internal::ValueArray18<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11,
+      T12, T13, T14, T15, T16, T17, T18>(v1, v2, v3, v4, v5, v6, v7, v8, v9,
+      v10, v11, v12, v13, v14, v15, v16, v17, v18);
+}  // Values(): 18-value overload; returns ValueArray18 built from v1..v18
+
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+    typename T6, typename T7, typename T8, typename T9, typename T10,
+    typename T11, typename T12, typename T13, typename T14, typename T15,
+    typename T16, typename T17, typename T18, typename T19>
+internal::ValueArray19<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13,
+    T14, T15, T16, T17, T18, T19> Values(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5,
+    T6 v6, T7 v7, T8 v8, T9 v9, T10 v10, T11 v11, T12 v12, T13 v13, T14 v14,
+    T15 v15, T16 v16, T17 v17, T18 v18, T19 v19) {
+  return internal::ValueArray19<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11,
+      T12, T13, T14, T15, T16, T17, T18, T19>(v1, v2, v3, v4, v5, v6, v7, v8,
+      v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19);
+}  // Values(): 19-value overload; returns ValueArray19 built from v1..v19
+
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+    typename T6, typename T7, typename T8, typename T9, typename T10,
+    typename T11, typename T12, typename T13, typename T14, typename T15,
+    typename T16, typename T17, typename T18, typename T19, typename T20>
+internal::ValueArray20<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13,
+    T14, T15, T16, T17, T18, T19, T20> Values(T1 v1, T2 v2, T3 v3, T4 v4,
+    T5 v5, T6 v6, T7 v7, T8 v8, T9 v9, T10 v10, T11 v11, T12 v12, T13 v13,
+    T14 v14, T15 v15, T16 v16, T17 v17, T18 v18, T19 v19, T20 v20) {
+  return internal::ValueArray20<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11,
+      T12, T13, T14, T15, T16, T17, T18, T19, T20>(v1, v2, v3, v4, v5, v6, v7,
+      v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20);
+}  // Values(): 20-value overload; returns ValueArray20 built from v1..v20
+
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+    typename T6, typename T7, typename T8, typename T9, typename T10,
+    typename T11, typename T12, typename T13, typename T14, typename T15,
+    typename T16, typename T17, typename T18, typename T19, typename T20,
+    typename T21>
+internal::ValueArray21<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13,
+    T14, T15, T16, T17, T18, T19, T20, T21> Values(T1 v1, T2 v2, T3 v3, T4 v4,
+    T5 v5, T6 v6, T7 v7, T8 v8, T9 v9, T10 v10, T11 v11, T12 v12, T13 v13,
+    T14 v14, T15 v15, T16 v16, T17 v17, T18 v18, T19 v19, T20 v20, T21 v21) {
+  return internal::ValueArray21<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11,
+      T12, T13, T14, T15, T16, T17, T18, T19, T20, T21>(v1, v2, v3, v4, v5, v6,
+      v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21);
+}  // Values(): 21-value overload; returns ValueArray21 built from v1..v21
+
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+    typename T6, typename T7, typename T8, typename T9, typename T10,
+    typename T11, typename T12, typename T13, typename T14, typename T15,
+    typename T16, typename T17, typename T18, typename T19, typename T20,
+    typename T21, typename T22>
+internal::ValueArray22<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13,
+    T14, T15, T16, T17, T18, T19, T20, T21, T22> Values(T1 v1, T2 v2, T3 v3,
+    T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9, T10 v10, T11 v11, T12 v12,
+    T13 v13, T14 v14, T15 v15, T16 v16, T17 v17, T18 v18, T19 v19, T20 v20,
+    T21 v21, T22 v22) {
+  return internal::ValueArray22<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11,
+      T12, T13, T14, T15, T16, T17, T18, T19, T20, T21, T22>(v1, v2, v3, v4,
+      v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19,
+      v20, v21, v22);
+}  // Values(): 22-value overload; returns ValueArray22 built from v1..v22
+
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+    typename T6, typename T7, typename T8, typename T9, typename T10,
+    typename T11, typename T12, typename T13, typename T14, typename T15,
+    typename T16, typename T17, typename T18, typename T19, typename T20,
+    typename T21, typename T22, typename T23>
+internal::ValueArray23<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13,
+    T14, T15, T16, T17, T18, T19, T20, T21, T22, T23> Values(T1 v1, T2 v2,
+    T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9, T10 v10, T11 v11, T12 v12,
+    T13 v13, T14 v14, T15 v15, T16 v16, T17 v17, T18 v18, T19 v19, T20 v20,
+    T21 v21, T22 v22, T23 v23) {
+  return internal::ValueArray23<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11,
+      T12, T13, T14, T15, T16, T17, T18, T19, T20, T21, T22, T23>(v1, v2, v3,
+      v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19,
+      v20, v21, v22, v23);
+}  // Values(): 23-value overload; returns ValueArray23 built from v1..v23
+
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+    typename T6, typename T7, typename T8, typename T9, typename T10,
+    typename T11, typename T12, typename T13, typename T14, typename T15,
+    typename T16, typename T17, typename T18, typename T19, typename T20,
+    typename T21, typename T22, typename T23, typename T24>
+internal::ValueArray24<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13,
+    T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24> Values(T1 v1, T2 v2,
+    T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9, T10 v10, T11 v11, T12 v12,
+    T13 v13, T14 v14, T15 v15, T16 v16, T17 v17, T18 v18, T19 v19, T20 v20,
+    T21 v21, T22 v22, T23 v23, T24 v24) {
+  return internal::ValueArray24<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11,
+      T12, T13, T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24>(v1, v2,
+      v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18,
+      v19, v20, v21, v22, v23, v24);
+}  // Values(): 24-value overload; returns ValueArray24 built from v1..v24
+
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+    typename T6, typename T7, typename T8, typename T9, typename T10,
+    typename T11, typename T12, typename T13, typename T14, typename T15,
+    typename T16, typename T17, typename T18, typename T19, typename T20,
+    typename T21, typename T22, typename T23, typename T24, typename T25>
+internal::ValueArray25<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13,
+    T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25> Values(T1 v1,
+    T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9, T10 v10, T11 v11,
+    T12 v12, T13 v13, T14 v14, T15 v15, T16 v16, T17 v17, T18 v18, T19 v19,
+    T20 v20, T21 v21, T22 v22, T23 v23, T24 v24, T25 v25) {
+  return internal::ValueArray25<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11,
+      T12, T13, T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25>(v1,
+      v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17,
+      v18, v19, v20, v21, v22, v23, v24, v25);
+}  // Values(): 25-value overload; returns ValueArray25 built from v1..v25
+
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+    typename T6, typename T7, typename T8, typename T9, typename T10,
+    typename T11, typename T12, typename T13, typename T14, typename T15,
+    typename T16, typename T17, typename T18, typename T19, typename T20,
+    typename T21, typename T22, typename T23, typename T24, typename T25,
+    typename T26>
+internal::ValueArray26<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13,
+    T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25,
+    T26> Values(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9,
+    T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15, T16 v16, T17 v17,
+    T18 v18, T19 v19, T20 v20, T21 v21, T22 v22, T23 v23, T24 v24, T25 v25,
+    T26 v26) {
+  return internal::ValueArray26<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11,
+      T12, T13, T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25,
+      T26>(v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15,
+      v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26);
+}  // Values(): 26-value overload; returns ValueArray26 built from v1..v26
+
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+    typename T6, typename T7, typename T8, typename T9, typename T10,
+    typename T11, typename T12, typename T13, typename T14, typename T15,
+    typename T16, typename T17, typename T18, typename T19, typename T20,
+    typename T21, typename T22, typename T23, typename T24, typename T25,
+    typename T26, typename T27>
+internal::ValueArray27<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13,
+    T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26,
+    T27> Values(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9,
+    T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15, T16 v16, T17 v17,
+    T18 v18, T19 v19, T20 v20, T21 v21, T22 v22, T23 v23, T24 v24, T25 v25,
+    T26 v26, T27 v27) {
+  return internal::ValueArray27<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11,
+      T12, T13, T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25,
+      T26, T27>(v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14,
+      v15, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27);
+}  // Values(): 27-value overload; returns ValueArray27 built from v1..v27
+
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+    typename T6, typename T7, typename T8, typename T9, typename T10,
+    typename T11, typename T12, typename T13, typename T14, typename T15,
+    typename T16, typename T17, typename T18, typename T19, typename T20,
+    typename T21, typename T22, typename T23, typename T24, typename T25,
+    typename T26, typename T27, typename T28>
+internal::ValueArray28<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13,
+    T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27,
+    T28> Values(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9,
+    T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15, T16 v16, T17 v17,
+    T18 v18, T19 v19, T20 v20, T21 v21, T22 v22, T23 v23, T24 v24, T25 v25,
+    T26 v26, T27 v27, T28 v28) {
+  return internal::ValueArray28<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11,
+      T12, T13, T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25,
+      T26, T27, T28>(v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13,
+      v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27,
+      v28);
+}  // Values(): 28-value overload; returns ValueArray28 built from v1..v28
+
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+    typename T6, typename T7, typename T8, typename T9, typename T10,
+    typename T11, typename T12, typename T13, typename T14, typename T15,
+    typename T16, typename T17, typename T18, typename T19, typename T20,
+    typename T21, typename T22, typename T23, typename T24, typename T25,
+    typename T26, typename T27, typename T28, typename T29>
+internal::ValueArray29<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13,
+    T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28,
+    T29> Values(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9,
+    T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15, T16 v16, T17 v17,
+    T18 v18, T19 v19, T20 v20, T21 v21, T22 v22, T23 v23, T24 v24, T25 v25,
+    T26 v26, T27 v27, T28 v28, T29 v29) {
+  return internal::ValueArray29<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11,
+      T12, T13, T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25,
+      T26, T27, T28, T29>(v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12,
+      v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26,
+      v27, v28, v29);
+}  // Values(): 29-value overload; returns ValueArray29 built from v1..v29
+
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+    typename T6, typename T7, typename T8, typename T9, typename T10,
+    typename T11, typename T12, typename T13, typename T14, typename T15,
+    typename T16, typename T17, typename T18, typename T19, typename T20,
+    typename T21, typename T22, typename T23, typename T24, typename T25,
+    typename T26, typename T27, typename T28, typename T29, typename T30>
+internal::ValueArray30<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13,
+    T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28,
+    T29, T30> Values(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8,
+    T9 v9, T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15, T16 v16,
+    T17 v17, T18 v18, T19 v19, T20 v20, T21 v21, T22 v22, T23 v23, T24 v24,
+    T25 v25, T26 v26, T27 v27, T28 v28, T29 v29, T30 v30) {
+  return internal::ValueArray30<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11,
+      T12, T13, T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25,
+      T26, T27, T28, T29, T30>(v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11,
+      v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25,
+      v26, v27, v28, v29, v30);
+}  // Values(): 30-value overload; returns ValueArray30 built from v1..v30
+
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+    typename T6, typename T7, typename T8, typename T9, typename T10,
+    typename T11, typename T12, typename T13, typename T14, typename T15,
+    typename T16, typename T17, typename T18, typename T19, typename T20,
+    typename T21, typename T22, typename T23, typename T24, typename T25,
+    typename T26, typename T27, typename T28, typename T29, typename T30,
+    typename T31>
+internal::ValueArray31<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13,
+    T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28,
+    T29, T30, T31> Values(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7,
+    T8 v8, T9 v9, T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15,
+    T16 v16, T17 v17, T18 v18, T19 v19, T20 v20, T21 v21, T22 v22, T23 v23,
+    T24 v24, T25 v25, T26 v26, T27 v27, T28 v28, T29 v29, T30 v30, T31 v31) {
+  return internal::ValueArray31<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11,
+      T12, T13, T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25,
+      T26, T27, T28, T29, T30, T31>(v1, v2, v3, v4, v5, v6, v7, v8, v9, v10,
+      v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, v24,
+      v25, v26, v27, v28, v29, v30, v31);
+}  // Values(): 31-value overload; returns ValueArray31 built from v1..v31
+
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+    typename T6, typename T7, typename T8, typename T9, typename T10,
+    typename T11, typename T12, typename T13, typename T14, typename T15,
+    typename T16, typename T17, typename T18, typename T19, typename T20,
+    typename T21, typename T22, typename T23, typename T24, typename T25,
+    typename T26, typename T27, typename T28, typename T29, typename T30,
+    typename T31, typename T32>
+internal::ValueArray32<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13,
+    T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28,
+    T29, T30, T31, T32> Values(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7,
+    T8 v8, T9 v9, T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15,
+    T16 v16, T17 v17, T18 v18, T19 v19, T20 v20, T21 v21, T22 v22, T23 v23,
+    T24 v24, T25 v25, T26 v26, T27 v27, T28 v28, T29 v29, T30 v30, T31 v31,
+    T32 v32) {
+  return internal::ValueArray32<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11,
+      T12, T13, T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25,
+      T26, T27, T28, T29, T30, T31, T32>(v1, v2, v3, v4, v5, v6, v7, v8, v9,
+      v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23,
+      v24, v25, v26, v27, v28, v29, v30, v31, v32);
+}  // Values(): 32-value overload; returns ValueArray32 built from v1..v32
+
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+    typename T6, typename T7, typename T8, typename T9, typename T10,
+    typename T11, typename T12, typename T13, typename T14, typename T15,
+    typename T16, typename T17, typename T18, typename T19, typename T20,
+    typename T21, typename T22, typename T23, typename T24, typename T25,
+    typename T26, typename T27, typename T28, typename T29, typename T30,
+    typename T31, typename T32, typename T33>
+internal::ValueArray33<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13,
+    T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28,
+    T29, T30, T31, T32, T33> Values(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6,
+    T7 v7, T8 v8, T9 v9, T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15,
+    T16 v16, T17 v17, T18 v18, T19 v19, T20 v20, T21 v21, T22 v22, T23 v23,
+    T24 v24, T25 v25, T26 v26, T27 v27, T28 v28, T29 v29, T30 v30, T31 v31,
+    T32 v32, T33 v33) {
+  return internal::ValueArray33<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11,
+      T12, T13, T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25,
+      T26, T27, T28, T29, T30, T31, T32, T33>(v1, v2, v3, v4, v5, v6, v7, v8,
+      v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23,
+      v24, v25, v26, v27, v28, v29, v30, v31, v32, v33);
+}  // Values(): 33-value overload; returns ValueArray33 built from v1..v33
+
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+    typename T6, typename T7, typename T8, typename T9, typename T10,
+    typename T11, typename T12, typename T13, typename T14, typename T15,
+    typename T16, typename T17, typename T18, typename T19, typename T20,
+    typename T21, typename T22, typename T23, typename T24, typename T25,
+    typename T26, typename T27, typename T28, typename T29, typename T30,
+    typename T31, typename T32, typename T33, typename T34>
+internal::ValueArray34<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13,
+    T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28,
+    T29, T30, T31, T32, T33, T34> Values(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5,
+    T6 v6, T7 v7, T8 v8, T9 v9, T10 v10, T11 v11, T12 v12, T13 v13, T14 v14,
+    T15 v15, T16 v16, T17 v17, T18 v18, T19 v19, T20 v20, T21 v21, T22 v22,
+    T23 v23, T24 v24, T25 v25, T26 v26, T27 v27, T28 v28, T29 v29, T30 v30,
+    T31 v31, T32 v32, T33 v33, T34 v34) {
+  return internal::ValueArray34<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11,
+      T12, T13, T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25,
+      T26, T27, T28, T29, T30, T31, T32, T33, T34>(v1, v2, v3, v4, v5, v6, v7,
+      v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22,
+      v23, v24, v25, v26, v27, v28, v29, v30, v31, v32, v33, v34);
+}  // Values(): 34-value overload; returns ValueArray34 built from v1..v34
+
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+    typename T6, typename T7, typename T8, typename T9, typename T10,
+    typename T11, typename T12, typename T13, typename T14, typename T15,
+    typename T16, typename T17, typename T18, typename T19, typename T20,
+    typename T21, typename T22, typename T23, typename T24, typename T25,
+    typename T26, typename T27, typename T28, typename T29, typename T30,
+    typename T31, typename T32, typename T33, typename T34, typename T35>
+internal::ValueArray35<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13,
+    T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28,
+    T29, T30, T31, T32, T33, T34, T35> Values(T1 v1, T2 v2, T3 v3, T4 v4,
+    T5 v5, T6 v6, T7 v7, T8 v8, T9 v9, T10 v10, T11 v11, T12 v12, T13 v13,
+    T14 v14, T15 v15, T16 v16, T17 v17, T18 v18, T19 v19, T20 v20, T21 v21,
+    T22 v22, T23 v23, T24 v24, T25 v25, T26 v26, T27 v27, T28 v28, T29 v29,
+    T30 v30, T31 v31, T32 v32, T33 v33, T34 v34, T35 v35) {
+  return internal::ValueArray35<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11,
+      T12, T13, T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25,
+      T26, T27, T28, T29, T30, T31, T32, T33, T34, T35>(v1, v2, v3, v4, v5, v6,
+      v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21,
+      v22, v23, v24, v25, v26, v27, v28, v29, v30, v31, v32, v33, v34, v35);
+}  // Values(): 35-value overload; returns ValueArray35 built from v1..v35
+
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+    typename T6, typename T7, typename T8, typename T9, typename T10,
+    typename T11, typename T12, typename T13, typename T14, typename T15,
+    typename T16, typename T17, typename T18, typename T19, typename T20,
+    typename T21, typename T22, typename T23, typename T24, typename T25,
+    typename T26, typename T27, typename T28, typename T29, typename T30,
+    typename T31, typename T32, typename T33, typename T34, typename T35,
+    typename T36>
+internal::ValueArray36<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13,
+    T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28,
+    T29, T30, T31, T32, T33, T34, T35, T36> Values(T1 v1, T2 v2, T3 v3, T4 v4,
+    T5 v5, T6 v6, T7 v7, T8 v8, T9 v9, T10 v10, T11 v11, T12 v12, T13 v13,
+    T14 v14, T15 v15, T16 v16, T17 v17, T18 v18, T19 v19, T20 v20, T21 v21,
+    T22 v22, T23 v23, T24 v24, T25 v25, T26 v26, T27 v27, T28 v28, T29 v29,
+    T30 v30, T31 v31, T32 v32, T33 v33, T34 v34, T35 v35, T36 v36) {
+  return internal::ValueArray36<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11,
+      T12, T13, T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25,
+      T26, T27, T28, T29, T30, T31, T32, T33, T34, T35, T36>(v1, v2, v3, v4,
+      v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19,
+      v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31, v32, v33,
+      v34, v35, v36);
+}  // Values(): 36-value overload; returns ValueArray36 built from v1..v36
+
+// Values() overload accepting exactly 37 parameter values; the arguments are
+// handed, in order, to the internal::ValueArray37 that is returned by value.
+template <
+    typename T1, typename T2, typename T3, typename T4,
+    typename T5, typename T6, typename T7, typename T8,
+    typename T9, typename T10, typename T11, typename T12,
+    typename T13, typename T14, typename T15, typename T16,
+    typename T17, typename T18, typename T19, typename T20,
+    typename T21, typename T22, typename T23, typename T24,
+    typename T25, typename T26, typename T27, typename T28,
+    typename T29, typename T30, typename T31, typename T32,
+    typename T33, typename T34, typename T35, typename T36,
+    typename T37>
+internal::ValueArray37<
+    T1, T2, T3, T4, T5, T6, T7, T8, T9, T10,
+    T11, T12, T13, T14, T15, T16, T17, T18, T19, T20,
+    T21, T22, T23, T24, T25, T26, T27, T28, T29, T30,
+    T31, T32, T33, T34, T35, T36, T37>
+Values(
+    T1 v1, T2 v2, T3 v3, T4 v4, T5 v5,
+    T6 v6, T7 v7, T8 v8, T9 v9, T10 v10,
+    T11 v11, T12 v12, T13 v13, T14 v14, T15 v15,
+    T16 v16, T17 v17, T18 v18, T19 v19, T20 v20,
+    T21 v21, T22 v22, T23 v23, T24 v24, T25 v25,
+    T26 v26, T27 v27, T28 v28, T29 v29, T30 v30,
+    T31 v31, T32 v32, T33 v33, T34 v34, T35 v35,
+    T36 v36, T37 v37) {
+  // Name the result type once so the construction below stays readable.
+  typedef internal::ValueArray37<
+      T1, T2, T3, T4, T5, T6, T7, T8, T9, T10,
+      T11, T12, T13, T14, T15, T16, T17, T18, T19, T20,
+      T21, T22, T23, T24, T25, T26, T27, T28, T29, T30,
+      T31, T32, T33, T34, T35, T36, T37> PackedValues;
+  return PackedValues(
+      v1, v2, v3, v4, v5, v6, v7, v8, v9, v10,
+      v11, v12, v13, v14, v15, v16, v17, v18, v19, v20,
+      v21, v22, v23, v24, v25, v26, v27, v28, v29, v30,
+      v31, v32, v33, v34, v35, v36, v37);
+}
+
+// Values() overload accepting exactly 38 parameter values; the arguments are
+// handed, in order, to the internal::ValueArray38 that is returned by value.
+template <
+    typename T1, typename T2, typename T3, typename T4,
+    typename T5, typename T6, typename T7, typename T8,
+    typename T9, typename T10, typename T11, typename T12,
+    typename T13, typename T14, typename T15, typename T16,
+    typename T17, typename T18, typename T19, typename T20,
+    typename T21, typename T22, typename T23, typename T24,
+    typename T25, typename T26, typename T27, typename T28,
+    typename T29, typename T30, typename T31, typename T32,
+    typename T33, typename T34, typename T35, typename T36,
+    typename T37, typename T38>
+internal::ValueArray38<
+    T1, T2, T3, T4, T5, T6, T7, T8, T9, T10,
+    T11, T12, T13, T14, T15, T16, T17, T18, T19, T20,
+    T21, T22, T23, T24, T25, T26, T27, T28, T29, T30,
+    T31, T32, T33, T34, T35, T36, T37, T38>
+Values(
+    T1 v1, T2 v2, T3 v3, T4 v4, T5 v5,
+    T6 v6, T7 v7, T8 v8, T9 v9, T10 v10,
+    T11 v11, T12 v12, T13 v13, T14 v14, T15 v15,
+    T16 v16, T17 v17, T18 v18, T19 v19, T20 v20,
+    T21 v21, T22 v22, T23 v23, T24 v24, T25 v25,
+    T26 v26, T27 v27, T28 v28, T29 v29, T30 v30,
+    T31 v31, T32 v32, T33 v33, T34 v34, T35 v35,
+    T36 v36, T37 v37, T38 v38) {
+  // Name the result type once so the construction below stays readable.
+  typedef internal::ValueArray38<
+      T1, T2, T3, T4, T5, T6, T7, T8, T9, T10,
+      T11, T12, T13, T14, T15, T16, T17, T18, T19, T20,
+      T21, T22, T23, T24, T25, T26, T27, T28, T29, T30,
+      T31, T32, T33, T34, T35, T36, T37, T38> PackedValues;
+  return PackedValues(
+      v1, v2, v3, v4, v5, v6, v7, v8, v9, v10,
+      v11, v12, v13, v14, v15, v16, v17, v18, v19, v20,
+      v21, v22, v23, v24, v25, v26, v27, v28, v29, v30,
+      v31, v32, v33, v34, v35, v36, v37, v38);
+}
+
+// Values() overload accepting exactly 39 parameter values; the arguments are
+// handed, in order, to the internal::ValueArray39 that is returned by value.
+template <
+    typename T1, typename T2, typename T3, typename T4,
+    typename T5, typename T6, typename T7, typename T8,
+    typename T9, typename T10, typename T11, typename T12,
+    typename T13, typename T14, typename T15, typename T16,
+    typename T17, typename T18, typename T19, typename T20,
+    typename T21, typename T22, typename T23, typename T24,
+    typename T25, typename T26, typename T27, typename T28,
+    typename T29, typename T30, typename T31, typename T32,
+    typename T33, typename T34, typename T35, typename T36,
+    typename T37, typename T38, typename T39>
+internal::ValueArray39<
+    T1, T2, T3, T4, T5, T6, T7, T8, T9, T10,
+    T11, T12, T13, T14, T15, T16, T17, T18, T19, T20,
+    T21, T22, T23, T24, T25, T26, T27, T28, T29, T30,
+    T31, T32, T33, T34, T35, T36, T37, T38, T39>
+Values(
+    T1 v1, T2 v2, T3 v3, T4 v4, T5 v5,
+    T6 v6, T7 v7, T8 v8, T9 v9, T10 v10,
+    T11 v11, T12 v12, T13 v13, T14 v14, T15 v15,
+    T16 v16, T17 v17, T18 v18, T19 v19, T20 v20,
+    T21 v21, T22 v22, T23 v23, T24 v24, T25 v25,
+    T26 v26, T27 v27, T28 v28, T29 v29, T30 v30,
+    T31 v31, T32 v32, T33 v33, T34 v34, T35 v35,
+    T36 v36, T37 v37, T38 v38, T39 v39) {
+  // Name the result type once so the construction below stays readable.
+  typedef internal::ValueArray39<
+      T1, T2, T3, T4, T5, T6, T7, T8, T9, T10,
+      T11, T12, T13, T14, T15, T16, T17, T18, T19, T20,
+      T21, T22, T23, T24, T25, T26, T27, T28, T29, T30,
+      T31, T32, T33, T34, T35, T36, T37, T38, T39> PackedValues;
+  return PackedValues(
+      v1, v2, v3, v4, v5, v6, v7, v8, v9, v10,
+      v11, v12, v13, v14, v15, v16, v17, v18, v19, v20,
+      v21, v22, v23, v24, v25, v26, v27, v28, v29, v30,
+      v31, v32, v33, v34, v35, v36, v37, v38, v39);
+}
+
+// Values() overload accepting exactly 40 parameter values; the arguments are
+// handed, in order, to the internal::ValueArray40 that is returned by value.
+template <
+    typename T1, typename T2, typename T3, typename T4,
+    typename T5, typename T6, typename T7, typename T8,
+    typename T9, typename T10, typename T11, typename T12,
+    typename T13, typename T14, typename T15, typename T16,
+    typename T17, typename T18, typename T19, typename T20,
+    typename T21, typename T22, typename T23, typename T24,
+    typename T25, typename T26, typename T27, typename T28,
+    typename T29, typename T30, typename T31, typename T32,
+    typename T33, typename T34, typename T35, typename T36,
+    typename T37, typename T38, typename T39, typename T40>
+internal::ValueArray40<
+    T1, T2, T3, T4, T5, T6, T7, T8, T9, T10,
+    T11, T12, T13, T14, T15, T16, T17, T18, T19, T20,
+    T21, T22, T23, T24, T25, T26, T27, T28, T29, T30,
+    T31, T32, T33, T34, T35, T36, T37, T38, T39, T40>
+Values(
+    T1 v1, T2 v2, T3 v3, T4 v4, T5 v5,
+    T6 v6, T7 v7, T8 v8, T9 v9, T10 v10,
+    T11 v11, T12 v12, T13 v13, T14 v14, T15 v15,
+    T16 v16, T17 v17, T18 v18, T19 v19, T20 v20,
+    T21 v21, T22 v22, T23 v23, T24 v24, T25 v25,
+    T26 v26, T27 v27, T28 v28, T29 v29, T30 v30,
+    T31 v31, T32 v32, T33 v33, T34 v34, T35 v35,
+    T36 v36, T37 v37, T38 v38, T39 v39, T40 v40) {
+  // Name the result type once so the construction below stays readable.
+  typedef internal::ValueArray40<
+      T1, T2, T3, T4, T5, T6, T7, T8, T9, T10,
+      T11, T12, T13, T14, T15, T16, T17, T18, T19, T20,
+      T21, T22, T23, T24, T25, T26, T27, T28, T29, T30,
+      T31, T32, T33, T34, T35, T36, T37, T38, T39, T40> PackedValues;
+  return PackedValues(
+      v1, v2, v3, v4, v5, v6, v7, v8, v9, v10,
+      v11, v12, v13, v14, v15, v16, v17, v18, v19, v20,
+      v21, v22, v23, v24, v25, v26, v27, v28, v29, v30,
+      v31, v32, v33, v34, v35, v36, v37, v38, v39, v40);
+}
+
+// Values() overload accepting exactly 41 parameter values; the arguments are
+// handed, in order, to the internal::ValueArray41 that is returned by value.
+template <
+    typename T1, typename T2, typename T3, typename T4,
+    typename T5, typename T6, typename T7, typename T8,
+    typename T9, typename T10, typename T11, typename T12,
+    typename T13, typename T14, typename T15, typename T16,
+    typename T17, typename T18, typename T19, typename T20,
+    typename T21, typename T22, typename T23, typename T24,
+    typename T25, typename T26, typename T27, typename T28,
+    typename T29, typename T30, typename T31, typename T32,
+    typename T33, typename T34, typename T35, typename T36,
+    typename T37, typename T38, typename T39, typename T40,
+    typename T41>
+internal::ValueArray41<
+    T1, T2, T3, T4, T5, T6, T7, T8, T9, T10,
+    T11, T12, T13, T14, T15, T16, T17, T18, T19, T20,
+    T21, T22, T23, T24, T25, T26, T27, T28, T29, T30,
+    T31, T32, T33, T34, T35, T36, T37, T38, T39, T40,
+    T41>
+Values(
+    T1 v1, T2 v2, T3 v3, T4 v4, T5 v5,
+    T6 v6, T7 v7, T8 v8, T9 v9, T10 v10,
+    T11 v11, T12 v12, T13 v13, T14 v14, T15 v15,
+    T16 v16, T17 v17, T18 v18, T19 v19, T20 v20,
+    T21 v21, T22 v22, T23 v23, T24 v24, T25 v25,
+    T26 v26, T27 v27, T28 v28, T29 v29, T30 v30,
+    T31 v31, T32 v32, T33 v33, T34 v34, T35 v35,
+    T36 v36, T37 v37, T38 v38, T39 v39, T40 v40,
+    T41 v41) {
+  // Name the result type once so the construction below stays readable.
+  typedef internal::ValueArray41<
+      T1, T2, T3, T4, T5, T6, T7, T8, T9, T10,
+      T11, T12, T13, T14, T15, T16, T17, T18, T19, T20,
+      T21, T22, T23, T24, T25, T26, T27, T28, T29, T30,
+      T31, T32, T33, T34, T35, T36, T37, T38, T39, T40,
+      T41> PackedValues;
+  return PackedValues(
+      v1, v2, v3, v4, v5, v6, v7, v8, v9, v10,
+      v11, v12, v13, v14, v15, v16, v17, v18, v19, v20,
+      v21, v22, v23, v24, v25, v26, v27, v28, v29, v30,
+      v31, v32, v33, v34, v35, v36, v37, v38, v39, v40,
+      v41);
+}
+
+// Values() overload accepting exactly 42 parameter values; the arguments are
+// handed, in order, to the internal::ValueArray42 that is returned by value.
+template <
+    typename T1, typename T2, typename T3, typename T4,
+    typename T5, typename T6, typename T7, typename T8,
+    typename T9, typename T10, typename T11, typename T12,
+    typename T13, typename T14, typename T15, typename T16,
+    typename T17, typename T18, typename T19, typename T20,
+    typename T21, typename T22, typename T23, typename T24,
+    typename T25, typename T26, typename T27, typename T28,
+    typename T29, typename T30, typename T31, typename T32,
+    typename T33, typename T34, typename T35, typename T36,
+    typename T37, typename T38, typename T39, typename T40,
+    typename T41, typename T42>
+internal::ValueArray42<
+    T1, T2, T3, T4, T5, T6, T7, T8, T9, T10,
+    T11, T12, T13, T14, T15, T16, T17, T18, T19, T20,
+    T21, T22, T23, T24, T25, T26, T27, T28, T29, T30,
+    T31, T32, T33, T34, T35, T36, T37, T38, T39, T40,
+    T41, T42>
+Values(
+    T1 v1, T2 v2, T3 v3, T4 v4, T5 v5,
+    T6 v6, T7 v7, T8 v8, T9 v9, T10 v10,
+    T11 v11, T12 v12, T13 v13, T14 v14, T15 v15,
+    T16 v16, T17 v17, T18 v18, T19 v19, T20 v20,
+    T21 v21, T22 v22, T23 v23, T24 v24, T25 v25,
+    T26 v26, T27 v27, T28 v28, T29 v29, T30 v30,
+    T31 v31, T32 v32, T33 v33, T34 v34, T35 v35,
+    T36 v36, T37 v37, T38 v38, T39 v39, T40 v40,
+    T41 v41, T42 v42) {
+  // Name the result type once so the construction below stays readable.
+  typedef internal::ValueArray42<
+      T1, T2, T3, T4, T5, T6, T7, T8, T9, T10,
+      T11, T12, T13, T14, T15, T16, T17, T18, T19, T20,
+      T21, T22, T23, T24, T25, T26, T27, T28, T29, T30,
+      T31, T32, T33, T34, T35, T36, T37, T38, T39, T40,
+      T41, T42> PackedValues;
+  return PackedValues(
+      v1, v2, v3, v4, v5, v6, v7, v8, v9, v10,
+      v11, v12, v13, v14, v15, v16, v17, v18, v19, v20,
+      v21, v22, v23, v24, v25, v26, v27, v28, v29, v30,
+      v31, v32, v33, v34, v35, v36, v37, v38, v39, v40,
+      v41, v42);
+}
+
+// Values() overload accepting exactly 43 parameter values; the arguments are
+// handed, in order, to the internal::ValueArray43 that is returned by value.
+template <
+    typename T1, typename T2, typename T3, typename T4,
+    typename T5, typename T6, typename T7, typename T8,
+    typename T9, typename T10, typename T11, typename T12,
+    typename T13, typename T14, typename T15, typename T16,
+    typename T17, typename T18, typename T19, typename T20,
+    typename T21, typename T22, typename T23, typename T24,
+    typename T25, typename T26, typename T27, typename T28,
+    typename T29, typename T30, typename T31, typename T32,
+    typename T33, typename T34, typename T35, typename T36,
+    typename T37, typename T38, typename T39, typename T40,
+    typename T41, typename T42, typename T43>
+internal::ValueArray43<
+    T1, T2, T3, T4, T5, T6, T7, T8, T9, T10,
+    T11, T12, T13, T14, T15, T16, T17, T18, T19, T20,
+    T21, T22, T23, T24, T25, T26, T27, T28, T29, T30,
+    T31, T32, T33, T34, T35, T36, T37, T38, T39, T40,
+    T41, T42, T43>
+Values(
+    T1 v1, T2 v2, T3 v3, T4 v4, T5 v5,
+    T6 v6, T7 v7, T8 v8, T9 v9, T10 v10,
+    T11 v11, T12 v12, T13 v13, T14 v14, T15 v15,
+    T16 v16, T17 v17, T18 v18, T19 v19, T20 v20,
+    T21 v21, T22 v22, T23 v23, T24 v24, T25 v25,
+    T26 v26, T27 v27, T28 v28, T29 v29, T30 v30,
+    T31 v31, T32 v32, T33 v33, T34 v34, T35 v35,
+    T36 v36, T37 v37, T38 v38, T39 v39, T40 v40,
+    T41 v41, T42 v42, T43 v43) {
+  // Name the result type once so the construction below stays readable.
+  typedef internal::ValueArray43<
+      T1, T2, T3, T4, T5, T6, T7, T8, T9, T10,
+      T11, T12, T13, T14, T15, T16, T17, T18, T19, T20,
+      T21, T22, T23, T24, T25, T26, T27, T28, T29, T30,
+      T31, T32, T33, T34, T35, T36, T37, T38, T39, T40,
+      T41, T42, T43> PackedValues;
+  return PackedValues(
+      v1, v2, v3, v4, v5, v6, v7, v8, v9, v10,
+      v11, v12, v13, v14, v15, v16, v17, v18, v19, v20,
+      v21, v22, v23, v24, v25, v26, v27, v28, v29, v30,
+      v31, v32, v33, v34, v35, v36, v37, v38, v39, v40,
+      v41, v42, v43);
+}
+
+// Values() overload accepting exactly 44 parameter values; the arguments are
+// handed, in order, to the internal::ValueArray44 that is returned by value.
+template <
+    typename T1, typename T2, typename T3, typename T4,
+    typename T5, typename T6, typename T7, typename T8,
+    typename T9, typename T10, typename T11, typename T12,
+    typename T13, typename T14, typename T15, typename T16,
+    typename T17, typename T18, typename T19, typename T20,
+    typename T21, typename T22, typename T23, typename T24,
+    typename T25, typename T26, typename T27, typename T28,
+    typename T29, typename T30, typename T31, typename T32,
+    typename T33, typename T34, typename T35, typename T36,
+    typename T37, typename T38, typename T39, typename T40,
+    typename T41, typename T42, typename T43, typename T44>
+internal::ValueArray44<
+    T1, T2, T3, T4, T5, T6, T7, T8, T9, T10,
+    T11, T12, T13, T14, T15, T16, T17, T18, T19, T20,
+    T21, T22, T23, T24, T25, T26, T27, T28, T29, T30,
+    T31, T32, T33, T34, T35, T36, T37, T38, T39, T40,
+    T41, T42, T43, T44>
+Values(
+    T1 v1, T2 v2, T3 v3, T4 v4, T5 v5,
+    T6 v6, T7 v7, T8 v8, T9 v9, T10 v10,
+    T11 v11, T12 v12, T13 v13, T14 v14, T15 v15,
+    T16 v16, T17 v17, T18 v18, T19 v19, T20 v20,
+    T21 v21, T22 v22, T23 v23, T24 v24, T25 v25,
+    T26 v26, T27 v27, T28 v28, T29 v29, T30 v30,
+    T31 v31, T32 v32, T33 v33, T34 v34, T35 v35,
+    T36 v36, T37 v37, T38 v38, T39 v39, T40 v40,
+    T41 v41, T42 v42, T43 v43, T44 v44) {
+  // Name the result type once so the construction below stays readable.
+  typedef internal::ValueArray44<
+      T1, T2, T3, T4, T5, T6, T7, T8, T9, T10,
+      T11, T12, T13, T14, T15, T16, T17, T18, T19, T20,
+      T21, T22, T23, T24, T25, T26, T27, T28, T29, T30,
+      T31, T32, T33, T34, T35, T36, T37, T38, T39, T40,
+      T41, T42, T43, T44> PackedValues;
+  return PackedValues(
+      v1, v2, v3, v4, v5, v6, v7, v8, v9, v10,
+      v11, v12, v13, v14, v15, v16, v17, v18, v19, v20,
+      v21, v22, v23, v24, v25, v26, v27, v28, v29, v30,
+      v31, v32, v33, v34, v35, v36, v37, v38, v39, v40,
+      v41, v42, v43, v44);
+}
+
+// Values() overload accepting exactly 45 parameter values; the arguments are
+// handed, in order, to the internal::ValueArray45 that is returned by value.
+template <
+    typename T1, typename T2, typename T3, typename T4,
+    typename T5, typename T6, typename T7, typename T8,
+    typename T9, typename T10, typename T11, typename T12,
+    typename T13, typename T14, typename T15, typename T16,
+    typename T17, typename T18, typename T19, typename T20,
+    typename T21, typename T22, typename T23, typename T24,
+    typename T25, typename T26, typename T27, typename T28,
+    typename T29, typename T30, typename T31, typename T32,
+    typename T33, typename T34, typename T35, typename T36,
+    typename T37, typename T38, typename T39, typename T40,
+    typename T41, typename T42, typename T43, typename T44,
+    typename T45>
+internal::ValueArray45<
+    T1, T2, T3, T4, T5, T6, T7, T8, T9, T10,
+    T11, T12, T13, T14, T15, T16, T17, T18, T19, T20,
+    T21, T22, T23, T24, T25, T26, T27, T28, T29, T30,
+    T31, T32, T33, T34, T35, T36, T37, T38, T39, T40,
+    T41, T42, T43, T44, T45>
+Values(
+    T1 v1, T2 v2, T3 v3, T4 v4, T5 v5,
+    T6 v6, T7 v7, T8 v8, T9 v9, T10 v10,
+    T11 v11, T12 v12, T13 v13, T14 v14, T15 v15,
+    T16 v16, T17 v17, T18 v18, T19 v19, T20 v20,
+    T21 v21, T22 v22, T23 v23, T24 v24, T25 v25,
+    T26 v26, T27 v27, T28 v28, T29 v29, T30 v30,
+    T31 v31, T32 v32, T33 v33, T34 v34, T35 v35,
+    T36 v36, T37 v37, T38 v38, T39 v39, T40 v40,
+    T41 v41, T42 v42, T43 v43, T44 v44, T45 v45) {
+  // Name the result type once so the construction below stays readable.
+  typedef internal::ValueArray45<
+      T1, T2, T3, T4, T5, T6, T7, T8, T9, T10,
+      T11, T12, T13, T14, T15, T16, T17, T18, T19, T20,
+      T21, T22, T23, T24, T25, T26, T27, T28, T29, T30,
+      T31, T32, T33, T34, T35, T36, T37, T38, T39, T40,
+      T41, T42, T43, T44, T45> PackedValues;
+  return PackedValues(
+      v1, v2, v3, v4, v5, v6, v7, v8, v9, v10,
+      v11, v12, v13, v14, v15, v16, v17, v18, v19, v20,
+      v21, v22, v23, v24, v25, v26, v27, v28, v29, v30,
+      v31, v32, v33, v34, v35, v36, v37, v38, v39, v40,
+      v41, v42, v43, v44, v45);
+}
+
+// Values() overload accepting exactly 46 parameter values; the arguments are
+// handed, in order, to the internal::ValueArray46 that is returned by value.
+template <
+    typename T1, typename T2, typename T3, typename T4,
+    typename T5, typename T6, typename T7, typename T8,
+    typename T9, typename T10, typename T11, typename T12,
+    typename T13, typename T14, typename T15, typename T16,
+    typename T17, typename T18, typename T19, typename T20,
+    typename T21, typename T22, typename T23, typename T24,
+    typename T25, typename T26, typename T27, typename T28,
+    typename T29, typename T30, typename T31, typename T32,
+    typename T33, typename T34, typename T35, typename T36,
+    typename T37, typename T38, typename T39, typename T40,
+    typename T41, typename T42, typename T43, typename T44,
+    typename T45, typename T46>
+internal::ValueArray46<
+    T1, T2, T3, T4, T5, T6, T7, T8, T9, T10,
+    T11, T12, T13, T14, T15, T16, T17, T18, T19, T20,
+    T21, T22, T23, T24, T25, T26, T27, T28, T29, T30,
+    T31, T32, T33, T34, T35, T36, T37, T38, T39, T40,
+    T41, T42, T43, T44, T45, T46>
+Values(
+    T1 v1, T2 v2, T3 v3, T4 v4, T5 v5,
+    T6 v6, T7 v7, T8 v8, T9 v9, T10 v10,
+    T11 v11, T12 v12, T13 v13, T14 v14, T15 v15,
+    T16 v16, T17 v17, T18 v18, T19 v19, T20 v20,
+    T21 v21, T22 v22, T23 v23, T24 v24, T25 v25,
+    T26 v26, T27 v27, T28 v28, T29 v29, T30 v30,
+    T31 v31, T32 v32, T33 v33, T34 v34, T35 v35,
+    T36 v36, T37 v37, T38 v38, T39 v39, T40 v40,
+    T41 v41, T42 v42, T43 v43, T44 v44, T45 v45,
+    T46 v46) {
+  // Name the result type once so the construction below stays readable.
+  typedef internal::ValueArray46<
+      T1, T2, T3, T4, T5, T6, T7, T8, T9, T10,
+      T11, T12, T13, T14, T15, T16, T17, T18, T19, T20,
+      T21, T22, T23, T24, T25, T26, T27, T28, T29, T30,
+      T31, T32, T33, T34, T35, T36, T37, T38, T39, T40,
+      T41, T42, T43, T44, T45, T46> PackedValues;
+  return PackedValues(
+      v1, v2, v3, v4, v5, v6, v7, v8, v9, v10,
+      v11, v12, v13, v14, v15, v16, v17, v18, v19, v20,
+      v21, v22, v23, v24, v25, v26, v27, v28, v29, v30,
+      v31, v32, v33, v34, v35, v36, v37, v38, v39, v40,
+      v41, v42, v43, v44, v45, v46);
+}
+
+// Values() overload accepting exactly 47 parameter values; the arguments are
+// handed, in order, to the internal::ValueArray47 that is returned by value.
+template <
+    typename T1, typename T2, typename T3, typename T4,
+    typename T5, typename T6, typename T7, typename T8,
+    typename T9, typename T10, typename T11, typename T12,
+    typename T13, typename T14, typename T15, typename T16,
+    typename T17, typename T18, typename T19, typename T20,
+    typename T21, typename T22, typename T23, typename T24,
+    typename T25, typename T26, typename T27, typename T28,
+    typename T29, typename T30, typename T31, typename T32,
+    typename T33, typename T34, typename T35, typename T36,
+    typename T37, typename T38, typename T39, typename T40,
+    typename T41, typename T42, typename T43, typename T44,
+    typename T45, typename T46, typename T47>
+internal::ValueArray47<
+    T1, T2, T3, T4, T5, T6, T7, T8, T9, T10,
+    T11, T12, T13, T14, T15, T16, T17, T18, T19, T20,
+    T21, T22, T23, T24, T25, T26, T27, T28, T29, T30,
+    T31, T32, T33, T34, T35, T36, T37, T38, T39, T40,
+    T41, T42, T43, T44, T45, T46, T47>
+Values(
+    T1 v1, T2 v2, T3 v3, T4 v4, T5 v5,
+    T6 v6, T7 v7, T8 v8, T9 v9, T10 v10,
+    T11 v11, T12 v12, T13 v13, T14 v14, T15 v15,
+    T16 v16, T17 v17, T18 v18, T19 v19, T20 v20,
+    T21 v21, T22 v22, T23 v23, T24 v24, T25 v25,
+    T26 v26, T27 v27, T28 v28, T29 v29, T30 v30,
+    T31 v31, T32 v32, T33 v33, T34 v34, T35 v35,
+    T36 v36, T37 v37, T38 v38, T39 v39, T40 v40,
+    T41 v41, T42 v42, T43 v43, T44 v44, T45 v45,
+    T46 v46, T47 v47) {
+  // Name the result type once so the construction below stays readable.
+  typedef internal::ValueArray47<
+      T1, T2, T3, T4, T5, T6, T7, T8, T9, T10,
+      T11, T12, T13, T14, T15, T16, T17, T18, T19, T20,
+      T21, T22, T23, T24, T25, T26, T27, T28, T29, T30,
+      T31, T32, T33, T34, T35, T36, T37, T38, T39, T40,
+      T41, T42, T43, T44, T45, T46, T47> PackedValues;
+  return PackedValues(
+      v1, v2, v3, v4, v5, v6, v7, v8, v9, v10,
+      v11, v12, v13, v14, v15, v16, v17, v18, v19, v20,
+      v21, v22, v23, v24, v25, v26, v27, v28, v29, v30,
+      v31, v32, v33, v34, v35, v36, v37, v38, v39, v40,
+      v41, v42, v43, v44, v45, v46, v47);
+}
+
+// Values() overload accepting exactly 48 parameter values; the arguments are
+// handed, in order, to the internal::ValueArray48 that is returned by value.
+template <
+    typename T1, typename T2, typename T3, typename T4,
+    typename T5, typename T6, typename T7, typename T8,
+    typename T9, typename T10, typename T11, typename T12,
+    typename T13, typename T14, typename T15, typename T16,
+    typename T17, typename T18, typename T19, typename T20,
+    typename T21, typename T22, typename T23, typename T24,
+    typename T25, typename T26, typename T27, typename T28,
+    typename T29, typename T30, typename T31, typename T32,
+    typename T33, typename T34, typename T35, typename T36,
+    typename T37, typename T38, typename T39, typename T40,
+    typename T41, typename T42, typename T43, typename T44,
+    typename T45, typename T46, typename T47, typename T48>
+internal::ValueArray48<
+    T1, T2, T3, T4, T5, T6, T7, T8, T9, T10,
+    T11, T12, T13, T14, T15, T16, T17, T18, T19, T20,
+    T21, T22, T23, T24, T25, T26, T27, T28, T29, T30,
+    T31, T32, T33, T34, T35, T36, T37, T38, T39, T40,
+    T41, T42, T43, T44, T45, T46, T47, T48>
+Values(
+    T1 v1, T2 v2, T3 v3, T4 v4, T5 v5,
+    T6 v6, T7 v7, T8 v8, T9 v9, T10 v10,
+    T11 v11, T12 v12, T13 v13, T14 v14, T15 v15,
+    T16 v16, T17 v17, T18 v18, T19 v19, T20 v20,
+    T21 v21, T22 v22, T23 v23, T24 v24, T25 v25,
+    T26 v26, T27 v27, T28 v28, T29 v29, T30 v30,
+    T31 v31, T32 v32, T33 v33, T34 v34, T35 v35,
+    T36 v36, T37 v37, T38 v38, T39 v39, T40 v40,
+    T41 v41, T42 v42, T43 v43, T44 v44, T45 v45,
+    T46 v46, T47 v47, T48 v48) {
+  // Name the result type once so the construction below stays readable.
+  typedef internal::ValueArray48<
+      T1, T2, T3, T4, T5, T6, T7, T8, T9, T10,
+      T11, T12, T13, T14, T15, T16, T17, T18, T19, T20,
+      T21, T22, T23, T24, T25, T26, T27, T28, T29, T30,
+      T31, T32, T33, T34, T35, T36, T37, T38, T39, T40,
+      T41, T42, T43, T44, T45, T46, T47, T48> PackedValues;
+  return PackedValues(
+      v1, v2, v3, v4, v5, v6, v7, v8, v9, v10,
+      v11, v12, v13, v14, v15, v16, v17, v18, v19, v20,
+      v21, v22, v23, v24, v25, v26, v27, v28, v29, v30,
+      v31, v32, v33, v34, v35, v36, v37, v38, v39, v40,
+      v41, v42, v43, v44, v45, v46, v47, v48);
+}
+
+// Values() overload accepting exactly 49 parameter values; the arguments are
+// handed, in order, to the internal::ValueArray49 that is returned by value.
+template <
+    typename T1, typename T2, typename T3, typename T4,
+    typename T5, typename T6, typename T7, typename T8,
+    typename T9, typename T10, typename T11, typename T12,
+    typename T13, typename T14, typename T15, typename T16,
+    typename T17, typename T18, typename T19, typename T20,
+    typename T21, typename T22, typename T23, typename T24,
+    typename T25, typename T26, typename T27, typename T28,
+    typename T29, typename T30, typename T31, typename T32,
+    typename T33, typename T34, typename T35, typename T36,
+    typename T37, typename T38, typename T39, typename T40,
+    typename T41, typename T42, typename T43, typename T44,
+    typename T45, typename T46, typename T47, typename T48,
+    typename T49>
+internal::ValueArray49<
+    T1, T2, T3, T4, T5, T6, T7, T8, T9, T10,
+    T11, T12, T13, T14, T15, T16, T17, T18, T19, T20,
+    T21, T22, T23, T24, T25, T26, T27, T28, T29, T30,
+    T31, T32, T33, T34, T35, T36, T37, T38, T39, T40,
+    T41, T42, T43, T44, T45, T46, T47, T48, T49>
+Values(
+    T1 v1, T2 v2, T3 v3, T4 v4, T5 v5,
+    T6 v6, T7 v7, T8 v8, T9 v9, T10 v10,
+    T11 v11, T12 v12, T13 v13, T14 v14, T15 v15,
+    T16 v16, T17 v17, T18 v18, T19 v19, T20 v20,
+    T21 v21, T22 v22, T23 v23, T24 v24, T25 v25,
+    T26 v26, T27 v27, T28 v28, T29 v29, T30 v30,
+    T31 v31, T32 v32, T33 v33, T34 v34, T35 v35,
+    T36 v36, T37 v37, T38 v38, T39 v39, T40 v40,
+    T41 v41, T42 v42, T43 v43, T44 v44, T45 v45,
+    T46 v46, T47 v47, T48 v48, T49 v49) {
+  // Name the result type once so the construction below stays readable.
+  typedef internal::ValueArray49<
+      T1, T2, T3, T4, T5, T6, T7, T8, T9, T10,
+      T11, T12, T13, T14, T15, T16, T17, T18, T19, T20,
+      T21, T22, T23, T24, T25, T26, T27, T28, T29, T30,
+      T31, T32, T33, T34, T35, T36, T37, T38, T39, T40,
+      T41, T42, T43, T44, T45, T46, T47, T48, T49> PackedValues;
+  return PackedValues(
+      v1, v2, v3, v4, v5, v6, v7, v8, v9, v10,
+      v11, v12, v13, v14, v15, v16, v17, v18, v19, v20,
+      v21, v22, v23, v24, v25, v26, v27, v28, v29, v30,
+      v31, v32, v33, v34, v35, v36, v37, v38, v39, v40,
+      v41, v42, v43, v44, v45, v46, v47, v48, v49);
+}
+
+// Values() overload accepting exactly 50 parameter values; the arguments are
+// handed, in order, to the internal::ValueArray50 that is returned by value.
+template <
+    typename T1, typename T2, typename T3, typename T4,
+    typename T5, typename T6, typename T7, typename T8,
+    typename T9, typename T10, typename T11, typename T12,
+    typename T13, typename T14, typename T15, typename T16,
+    typename T17, typename T18, typename T19, typename T20,
+    typename T21, typename T22, typename T23, typename T24,
+    typename T25, typename T26, typename T27, typename T28,
+    typename T29, typename T30, typename T31, typename T32,
+    typename T33, typename T34, typename T35, typename T36,
+    typename T37, typename T38, typename T39, typename T40,
+    typename T41, typename T42, typename T43, typename T44,
+    typename T45, typename T46, typename T47, typename T48,
+    typename T49, typename T50>
+internal::ValueArray50<
+    T1, T2, T3, T4, T5, T6, T7, T8, T9, T10,
+    T11, T12, T13, T14, T15, T16, T17, T18, T19, T20,
+    T21, T22, T23, T24, T25, T26, T27, T28, T29, T30,
+    T31, T32, T33, T34, T35, T36, T37, T38, T39, T40,
+    T41, T42, T43, T44, T45, T46, T47, T48, T49, T50>
+Values(
+    T1 v1, T2 v2, T3 v3, T4 v4, T5 v5,
+    T6 v6, T7 v7, T8 v8, T9 v9, T10 v10,
+    T11 v11, T12 v12, T13 v13, T14 v14, T15 v15,
+    T16 v16, T17 v17, T18 v18, T19 v19, T20 v20,
+    T21 v21, T22 v22, T23 v23, T24 v24, T25 v25,
+    T26 v26, T27 v27, T28 v28, T29 v29, T30 v30,
+    T31 v31, T32 v32, T33 v33, T34 v34, T35 v35,
+    T36 v36, T37 v37, T38 v38, T39 v39, T40 v40,
+    T41 v41, T42 v42, T43 v43, T44 v44, T45 v45,
+    T46 v46, T47 v47, T48 v48, T49 v49, T50 v50) {
+  // Name the result type once so the construction below stays readable.
+  typedef internal::ValueArray50<
+      T1, T2, T3, T4, T5, T6, T7, T8, T9, T10,
+      T11, T12, T13, T14, T15, T16, T17, T18, T19, T20,
+      T21, T22, T23, T24, T25, T26, T27, T28, T29, T30,
+      T31, T32, T33, T34, T35, T36, T37, T38, T39, T40,
+      T41, T42, T43, T44, T45, T46, T47, T48, T49, T50> PackedValues;
+  return PackedValues(
+      v1, v2, v3, v4, v5, v6, v7, v8, v9, v10,
+      v11, v12, v13, v14, v15, v16, v17, v18, v19, v20,
+      v21, v22, v23, v24, v25, v26, v27, v28, v29, v30,
+      v31, v32, v33, v34, v35, v36, v37, v38, v39, v40,
+      v41, v42, v43, v44, v45, v46, v47, v48, v49, v50);
+}
+
+// Bool() allows generating tests with parameters in a set of (false, true).
+//
+// Synopsis:
+// Bool()
+//   - returns a generator producing sequences with elements {false, true}.
+//
+// It is useful when testing code that depends on Boolean flags. Combinations
+// of multiple flags can be tested when several Bool()'s are combined using
+// Combine() function.
+//
+// In the following example all tests in the test case FlagDependentTest
+// will be instantiated twice with parameters false and true.
+//
+// class FlagDependentTest : public testing::TestWithParam<bool> {
+//   virtual void SetUp() {
+//     external_flag = GetParam();
+//   }
+// };
+// INSTANTIATE_TEST_CASE_P(BoolSequence, FlagDependentTest, Bool());
+//
+inline internal::ParamGenerator<bool> Bool() {
+  // Built from Values(false, true), so false is listed before true.
+  const internal::ParamGenerator<bool> both_flags = Values(false, true);
+  return both_flags;
+}
+
+# if GTEST_HAS_COMBINE
+// Combine() allows the user to combine two or more sequences to produce
+// values of a Cartesian product of those sequences' elements.
+//
+// Synopsis:
+// Combine(gen1, gen2, ..., genN)
+//   - returns a generator producing sequences with elements coming from
+//     the Cartesian product of elements from the sequences generated by
+//     gen1, gen2, ..., genN. The sequence elements will have a type of
+//     tuple<T1, T2, ..., TN> where T1, T2, ..., TN are the types
+//     of elements from sequences produced by gen1, gen2, ..., genN.
+//
+// Combine can have up to 10 arguments. This number is currently limited
+// by the maximum number of elements in the tuple implementation used by Google
+// Test.
+//
+// Example:
+//
+// This will instantiate tests in test case AnimalTest each one with
+// the parameter values tuple("cat", BLACK), tuple("cat", WHITE),
+// tuple("dog", BLACK), and tuple("dog", WHITE):
+//
+// enum Color { BLACK, GRAY, WHITE };
+// class AnimalTest
+//     : public testing::TestWithParam<tuple<const char*, Color> > {...};
+//
+// TEST_P(AnimalTest, AnimalLooksNice) {...}
+//
+// INSTANTIATE_TEST_CASE_P(AnimalVariations, AnimalTest,
+//                         Combine(Values("cat", "dog"),
+//                                 Values(BLACK, WHITE)));
+//
+// This will instantiate tests in FlagDependentTest with all variations of two
+// Boolean flags:
+//
+// class FlagDependentTest
+//     : public testing::TestWithParam<tuple<bool, bool> > {
+//   virtual void SetUp() {
+//     // Assigns external_flag_1 and external_flag_2 values from the tuple.
+//     tie(external_flag_1, external_flag_2) = GetParam();
+//   }
+// };
+//
+// TEST_P(FlagDependentTest, TestFeature1) {
+//   // Test your code using external_flag_1 and external_flag_2 here.
+// }
+// INSTANTIATE_TEST_CASE_P(TwoBoolSequence, FlagDependentTest,
+//                         Combine(Bool(), Bool()));
+//
+// Two-generator Combine(): passes g1 and g2, in order, to the
+// internal::CartesianProductHolder2 that is returned by value.
+template <
+    typename Generator1,
+    typename Generator2>
+internal::CartesianProductHolder2<Generator1, Generator2>
+Combine(
+    const Generator1& g1,
+    const Generator2& g2) {
+  typedef internal::CartesianProductHolder2<Generator1, Generator2>
+      ProductHolder;
+  return ProductHolder(g1, g2);
+}
+
+// Three-generator Combine(): passes g1..g3, in order, to the
+// internal::CartesianProductHolder3 that is returned by value.
+template <
+    typename Generator1,
+    typename Generator2,
+    typename Generator3>
+internal::CartesianProductHolder3<Generator1, Generator2, Generator3>
+Combine(
+    const Generator1& g1,
+    const Generator2& g2,
+    const Generator3& g3) {
+  typedef internal::CartesianProductHolder3<
+      Generator1, Generator2, Generator3> ProductHolder;
+  return ProductHolder(g1, g2, g3);
+}
+
+// Four-generator Combine(): passes g1..g4, in order, to the
+// internal::CartesianProductHolder4 that is returned by value.
+template <
+    typename Generator1,
+    typename Generator2,
+    typename Generator3,
+    typename Generator4>
+internal::CartesianProductHolder4<
+    Generator1, Generator2, Generator3, Generator4>
+Combine(
+    const Generator1& g1,
+    const Generator2& g2,
+    const Generator3& g3,
+    const Generator4& g4) {
+  typedef internal::CartesianProductHolder4<
+      Generator1, Generator2, Generator3, Generator4> ProductHolder;
+  return ProductHolder(g1, g2, g3, g4);
+}
+
+// Five-generator Combine(): passes g1..g5, in order, to the
+// internal::CartesianProductHolder5 that is returned by value.
+template <
+    typename Generator1,
+    typename Generator2,
+    typename Generator3,
+    typename Generator4,
+    typename Generator5>
+internal::CartesianProductHolder5<
+    Generator1, Generator2, Generator3, Generator4, Generator5>
+Combine(
+    const Generator1& g1,
+    const Generator2& g2,
+    const Generator3& g3,
+    const Generator4& g4,
+    const Generator5& g5) {
+  typedef internal::CartesianProductHolder5<
+      Generator1, Generator2, Generator3, Generator4, Generator5>
+      ProductHolder;
+  return ProductHolder(g1, g2, g3, g4, g5);
+}
+
+// Six-generator Combine(): passes g1..g6, in order, to the
+// internal::CartesianProductHolder6 that is returned by value.
+template <
+    typename Generator1,
+    typename Generator2,
+    typename Generator3,
+    typename Generator4,
+    typename Generator5,
+    typename Generator6>
+internal::CartesianProductHolder6<
+    Generator1, Generator2, Generator3, Generator4, Generator5,
+    Generator6>
+Combine(
+    const Generator1& g1,
+    const Generator2& g2,
+    const Generator3& g3,
+    const Generator4& g4,
+    const Generator5& g5,
+    const Generator6& g6) {
+  typedef internal::CartesianProductHolder6<
+      Generator1, Generator2, Generator3, Generator4, Generator5,
+      Generator6> ProductHolder;
+  return ProductHolder(g1, g2, g3, g4, g5, g6);
+}
+
+// Seven-generator Combine(): passes g1..g7, in order, to the
+// internal::CartesianProductHolder7 that is returned by value.
+template <
+    typename Generator1,
+    typename Generator2,
+    typename Generator3,
+    typename Generator4,
+    typename Generator5,
+    typename Generator6,
+    typename Generator7>
+internal::CartesianProductHolder7<
+    Generator1, Generator2, Generator3, Generator4, Generator5,
+    Generator6, Generator7>
+Combine(
+    const Generator1& g1,
+    const Generator2& g2,
+    const Generator3& g3,
+    const Generator4& g4,
+    const Generator5& g5,
+    const Generator6& g6,
+    const Generator7& g7) {
+  typedef internal::CartesianProductHolder7<
+      Generator1, Generator2, Generator3, Generator4, Generator5,
+      Generator6, Generator7> ProductHolder;
+  return ProductHolder(g1, g2, g3, g4, g5, g6, g7);
+}
+
+// Eight-generator Combine(): passes g1..g8, in order, to the
+// internal::CartesianProductHolder8 that is returned by value.
+template <
+    typename Generator1,
+    typename Generator2,
+    typename Generator3,
+    typename Generator4,
+    typename Generator5,
+    typename Generator6,
+    typename Generator7,
+    typename Generator8>
+internal::CartesianProductHolder8<
+    Generator1, Generator2, Generator3, Generator4, Generator5,
+    Generator6, Generator7, Generator8>
+Combine(
+    const Generator1& g1,
+    const Generator2& g2,
+    const Generator3& g3,
+    const Generator4& g4,
+    const Generator5& g5,
+    const Generator6& g6,
+    const Generator7& g7,
+    const Generator8& g8) {
+  typedef internal::CartesianProductHolder8<
+      Generator1, Generator2, Generator3, Generator4, Generator5,
+      Generator6, Generator7, Generator8> ProductHolder;
+  return ProductHolder(g1, g2, g3, g4, g5, g6, g7, g8);
+}
+
+// Nine-generator Combine(): passes g1..g9, in order, to the
+// internal::CartesianProductHolder9 that is returned by value.
+template <
+    typename Generator1,
+    typename Generator2,
+    typename Generator3,
+    typename Generator4,
+    typename Generator5,
+    typename Generator6,
+    typename Generator7,
+    typename Generator8,
+    typename Generator9>
+internal::CartesianProductHolder9<
+    Generator1, Generator2, Generator3, Generator4, Generator5,
+    Generator6, Generator7, Generator8, Generator9>
+Combine(
+    const Generator1& g1,
+    const Generator2& g2,
+    const Generator3& g3,
+    const Generator4& g4,
+    const Generator5& g5,
+    const Generator6& g6,
+    const Generator7& g7,
+    const Generator8& g8,
+    const Generator9& g9) {
+  typedef internal::CartesianProductHolder9<
+      Generator1, Generator2, Generator3, Generator4, Generator5,
+      Generator6, Generator7, Generator8, Generator9> ProductHolder;
+  return ProductHolder(g1, g2, g3, g4, g5, g6, g7, g8, g9);
+}
+
+template <typename Generator1, typename Generator2, typename Generator3,
+    typename Generator4, typename Generator5, typename Generator6,
+    typename Generator7, typename Generator8, typename Generator9,
+    typename Generator10>
+internal::CartesianProductHolder10<Generator1, Generator2, Generator3,
+    Generator4, Generator5, Generator6, Generator7, Generator8, Generator9,
+    Generator10>
+Combine(const Generator1& g1, const Generator2& g2, const Generator3& g3,
+        const Generator4& g4, const Generator5& g5, const Generator6& g6,
+        const Generator7& g7, const Generator8& g8, const Generator9& g9,
+        const Generator10& g10) {
+  typedef internal::CartesianProductHolder10<Generator1, Generator2, Generator3,
+      Generator4, Generator5, Generator6, Generator7, Generator8, Generator9,
+      Generator10> Holder;
+  return Holder(g1, g2, g3, g4, g5, g6, g7, g8, g9, g10);  // All ten, in order.
+}
+# endif  // GTEST_HAS_COMBINE
+
+// TEST_P(test_case_name, test_name) declares a test class derived from
+// test_case_name, registers it via AddTestPattern(), and opens TestBody().
+# define TEST_P(test_case_name, test_name) \
+  class GTEST_TEST_CLASS_NAME_(test_case_name, test_name) \
+      : public test_case_name { \
+   public: \
+    GTEST_TEST_CLASS_NAME_(test_case_name, test_name)() {} \
+    virtual void TestBody(); \
+   private: \
+    static int AddToRegistry() { \
+      ::testing::UnitTest::GetInstance()->parameterized_test_registry(). \
+          GetTestCasePatternHolder<test_case_name>(\
+              #test_case_name, __FILE__, __LINE__)->AddTestPattern(\
+                  #test_case_name, \
+                  #test_name, \
+                  new ::testing::internal::TestMetaFactory< \
+                      GTEST_TEST_CLASS_NAME_(test_case_name, test_name)>()); \
+      return 0; \
+    } \
+    static int gtest_registering_dummy_; \
+    GTEST_DISALLOW_COPY_AND_ASSIGN_(\
+        GTEST_TEST_CLASS_NAME_(test_case_name, test_name)); \
+  }; \
+  int GTEST_TEST_CLASS_NAME_(test_case_name, \
+                             test_name)::gtest_registering_dummy_ = \
+      GTEST_TEST_CLASS_NAME_(test_case_name, test_name)::AddToRegistry(); \
+  void GTEST_TEST_CLASS_NAME_(test_case_name, test_name)::TestBody()
+// Registers `generator` (wrapped in a function) as instantiation `prefix`.
+# define INSTANTIATE_TEST_CASE_P(prefix, test_case_name, generator) \
+  ::testing::internal::ParamGenerator<test_case_name::ParamType> \
+      gtest_##prefix##test_case_name##_EvalGenerator_() { return generator; } \
+  int gtest_##prefix##test_case_name##_dummy_ = \
+      ::testing::UnitTest::GetInstance()->parameterized_test_registry(). \
+          GetTestCasePatternHolder<test_case_name>(\
+              #test_case_name, __FILE__, __LINE__)->AddTestCaseInstantiation(\
+                  #prefix, \
+                  &gtest_##prefix##test_case_name##_EvalGenerator_, \
+                  __FILE__, __LINE__)
+
+}  // namespace testing
+
+#endif  // GTEST_HAS_PARAM_TEST
+
+#endif  // GTEST_INCLUDE_GTEST_GTEST_PARAM_TEST_H_
+// Copyright 2006, Google Inc.
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+//     * Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//     * Redistributions in binary form must reproduce the above
+// copyright notice, this list of conditions and the following disclaimer
+// in the documentation and/or other materials provided with the
+// distribution.
+//     * Neither the name of Google Inc. nor the names of its
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Author: wan@google.com (Zhanyong Wan)
+//
+// Google C++ Testing Framework definitions useful in production code.
+
+#ifndef GTEST_INCLUDE_GTEST_GTEST_PROD_H_
+#define GTEST_INCLUDE_GTEST_GTEST_PROD_H_
+
+// FRIEND_TEST(test_case_name, test_name) expands to a friend declaration for
+// the class test_case_name_test_name_Test (no trailing semicolon).  Put it
+// inside a class whose private or protected members a test must reach:
+//
+// class MyClass {
+//  private:
+//   void MyMethod();
+//   FRIEND_TEST(MyClassTest, MyMethod);
+// };
+//
+// class MyClassTest : public testing::Test {
+//   // ...
+// };
+//
+// TEST_F(MyClassTest, MyMethod) {
+//   // Can call MyClass::MyMethod() here.
+// }
+
+#define FRIEND_TEST(test_case_name, test_name)\
+friend class test_case_name##_##test_name##_Test
+
+#endif  // GTEST_INCLUDE_GTEST_GTEST_PROD_H_
+// Copyright 2008, Google Inc.
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+//     * Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//     * Redistributions in binary form must reproduce the above
+// copyright notice, this list of conditions and the following disclaimer
+// in the documentation and/or other materials provided with the
+// distribution.
+//     * Neither the name of Google Inc. nor the names of its
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Author: mheule@google.com (Markus Heule)
+//
+
+#ifndef GTEST_INCLUDE_GTEST_GTEST_TEST_PART_H_
+#define GTEST_INCLUDE_GTEST_GTEST_TEST_PART_H_
+
+#include <iosfwd>
+#include <vector>
+
+namespace testing {
+
+// A copyable object representing the result of a test part (i.e. an
+// assertion or an explicit FAIL(), ADD_FAILURE(), or SUCCEED()).
+//
+// Don't inherit from TestPartResult as its destructor is not virtual.
+class GTEST_API_ TestPartResult {
+ public:
+  // The possible outcomes of a test part (i.e. an assertion or an
+  // explicit SUCCEED(), FAIL(), or ADD_FAILURE()).
+  enum Type {
+    kSuccess,          // Succeeded.
+    kNonFatalFailure,  // Failed but the test can continue.
+    kFatalFailure      // Failed and the test should be terminated.
+  };
+
+  // C'tor.  TestPartResult does NOT have a default constructor.
+  // Always use this constructor (with parameters) to create a
+  // TestPartResult object.  A NULL a_file_name or a_message is stored as "",
+  // because constructing a std::string from a null pointer is undefined.
+  TestPartResult(Type a_type,
+                 const char* a_file_name,
+                 int a_line_number,
+                 const char* a_message)
+      : type_(a_type),
+        file_name_(a_file_name == NULL ? "" : a_file_name),
+        line_number_(a_line_number),
+        summary_(ExtractSummary(a_message == NULL ? "" : a_message)),
+        message_(a_message == NULL ? "" : a_message) {}
+
+  // Gets the outcome of the test part.
+  Type type() const { return type_; }
+
+  // Gets the name of the source file where the test part took place, or
+  // NULL if it's unknown.
+  const char* file_name() const {
+    return file_name_.empty() ? NULL : file_name_.c_str();
+  }
+
+  // Gets the line in the source file where the test part took place,
+  // or -1 if it's unknown.
+  int line_number() const { return line_number_; }
+
+  // Gets the summary of the failure message.
+  const char* summary() const { return summary_.c_str(); }
+
+  // Gets the message associated with the test part.
+  const char* message() const { return message_.c_str(); }
+
+  // Returns true iff the test part passed.
+  bool passed() const { return type_ == kSuccess; }
+
+  // Returns true iff the test part failed.
+  bool failed() const { return type_ != kSuccess; }
+
+  // Returns true iff the test part non-fatally failed.
+  bool nonfatally_failed() const { return type_ == kNonFatalFailure; }
+
+  // Returns true iff the test part fatally failed.
+  bool fatally_failed() const { return type_ == kFatalFailure; }
+
+ private:
+  Type type_;
+
+  // Gets the summary of the failure message by omitting the stack
+  // trace in it.
+  static std::string ExtractSummary(const char* message);
+
+  // The name of the source file where the test part took place, or
+  // "" if the source file is unknown.
+  std::string file_name_;
+  // The line in the source file where the test part took place, or -1
+  // if the line number is unknown.
+  int line_number_;
+  std::string summary_;  // The test failure summary.
+  std::string message_;  // The test failure message.
+};
+
+// Prints a TestPartResult object to the stream os (declaration only).
+std::ostream& operator<<(std::ostream& os, const TestPartResult& result);
+
+// A growable sequence of TestPartResult objects.
+//
+// The destructor is not virtual, so this class must not be used as a
+// base class.
+class GTEST_API_ TestPartResultArray {
+ public:
+  TestPartResultArray() {}
+
+  // Number of TestPartResult objects currently held by the array.
+  int size() const;
+
+  // Read-only access to the TestPartResult at the given 0-based index.
+  const TestPartResult& GetTestPartResult(int index) const;
+
+  // Adds the given TestPartResult at the end of the array.
+  void Append(const TestPartResult& result);
+
+ private:
+  std::vector<TestPartResult> array_;
+
+  GTEST_DISALLOW_COPY_AND_ASSIGN_(TestPartResultArray);
+};
+
+// Abstract interface for objects that can report a test part result.
+class TestPartResultReporterInterface {
+ public:
+  virtual ~TestPartResultReporterInterface() {}
+  // Pure virtual: each implementation defines how `result` is reported.
+  virtual void ReportTestPartResult(const TestPartResult& result) = 0;
+};
+
+namespace internal {
+
+// Helper used by {ASSERT|EXPECT}_NO_FATAL_FAILURE to find out whether a
+// statement generates new fatal failures.  It registers itself as the
+// current test part result reporter, notes whether a fatal failure was
+// reported, and otherwise only delegates to the former result reporter.
+// The original result reporter is restored in the destructor.
+// INTERNAL IMPLEMENTATION - DO NOT USE IN A USER PROGRAM.
+class GTEST_API_ HasNewFatalFailureHelper
+    : public TestPartResultReporterInterface {
+ public:
+  HasNewFatalFailureHelper();
+  virtual ~HasNewFatalFailureHelper();
+  virtual void ReportTestPartResult(const TestPartResult& result);
+  bool has_new_fatal_failure() const { return has_new_fatal_failure_; }
+ private:
+  bool has_new_fatal_failure_;  // Read back through has_new_fatal_failure().
+  TestPartResultReporterInterface* original_reporter_;  // Former reporter.
+
+  GTEST_DISALLOW_COPY_AND_ASSIGN_(HasNewFatalFailureHelper);
+};
+
+}  // namespace internal
+
+}  // namespace testing
+
+#endif  // GTEST_INCLUDE_GTEST_GTEST_TEST_PART_H_
+// Copyright 2008 Google Inc.
+// All Rights Reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+//     * Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//     * Redistributions in binary form must reproduce the above
+// copyright notice, this list of conditions and the following disclaimer
+// in the documentation and/or other materials provided with the
+// distribution.
+//     * Neither the name of Google Inc. nor the names of its
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Author: wan@google.com (Zhanyong Wan)
+
+#ifndef GTEST_INCLUDE_GTEST_GTEST_TYPED_TEST_H_
+#define GTEST_INCLUDE_GTEST_GTEST_TYPED_TEST_H_
+
+// This header implements typed tests and type-parameterized tests.
+
+// Typed (aka type-driven) tests repeat the same test for types in a
+// list.  You must know which types you want to test with when writing
+// typed tests. Here's how you do it:
+
+#if 0
+
+// First, define a fixture class template.  It should be parameterized
+// by a type.  Remember to derive it from testing::Test.
+template <typename T>
+class FooTest : public testing::Test {
+ public:
+  ...
+  typedef std::list<T> List;
+  static T shared_;
+  T value_;
+};
+
+// Next, associate a list of types with the test case, which will be
+// repeated for each type in the list.  The typedef is necessary for
+// the macro to parse correctly.
+typedef testing::Types<char, int, unsigned int> MyTypes;
+TYPED_TEST_CASE(FooTest, MyTypes);
+
+// If the type list contains only one type, you can write that type
+// directly without Types<...>:
+//   TYPED_TEST_CASE(FooTest, int);
+
+// Then, use TYPED_TEST() instead of TEST_F() to define as many typed
+// tests for this test case as you want.
+TYPED_TEST(FooTest, DoesBlah) {
+  // Inside a test, refer to TypeParam to get the type parameter.
+  // Since we are inside a derived class template, C++ requires us to
+  // visit the members of FooTest via 'this'.
+  TypeParam n = this->value_;
+
+  // To visit static members of the fixture, add the TestFixture::
+  // prefix.
+  n += TestFixture::shared_;
+
+  // To refer to typedefs in the fixture, add the "typename
+  // TestFixture::" prefix.
+  typename TestFixture::List values;
+  values.push_back(n);
+  ...
+}
+
+TYPED_TEST(FooTest, HasPropertyA) { ... }
+
+#endif  // 0
+
+// Type-parameterized tests are abstract test patterns parameterized
+// by a type.  Compared with typed tests, type-parameterized tests
+// allow you to define the test pattern without knowing what the type
+// parameters are.  The defined pattern can be instantiated with
+// different types any number of times, in any number of translation
+// units.
+//
+// If you are designing an interface or concept, you can define a
+// suite of type-parameterized tests to verify properties that any
+// valid implementation of the interface/concept should have.  Then,
+// each implementation can easily instantiate the test suite to verify
+// that it conforms to the requirements, without having to write
+// similar tests repeatedly.  Here's an example:
+
+#if 0
+
+// First, define a fixture class template.  It should be parameterized
+// by a type.  Remember to derive it from testing::Test.
+template <typename T>
+class FooTest : public testing::Test {
+  ...
+};
+
+// Next, declare that you will define a type-parameterized test case
+// (the _P suffix is for "parameterized" or "pattern", whichever you
+// prefer):
+TYPED_TEST_CASE_P(FooTest);
+
+// Then, use TYPED_TEST_P() to define as many type-parameterized tests
+// for this type-parameterized test case as you want.
+TYPED_TEST_P(FooTest, DoesBlah) {
+  // Inside a test, refer to TypeParam to get the type parameter.
+  TypeParam n = 0;
+  ...
+}
+
+TYPED_TEST_P(FooTest, HasPropertyA) { ... }
+
+// Now the tricky part: you need to register all test patterns before
+// you can instantiate them.  The first argument of the macro is the
+// test case name; the rest are the names of the tests in this test
+// case.
+REGISTER_TYPED_TEST_CASE_P(FooTest,
+                           DoesBlah, HasPropertyA);
+
+// Finally, you are free to instantiate the pattern with the types you
+// want.  If you put the above code in a header file, you can #include
+// it in multiple C++ source files and instantiate it multiple times.
+//
+// To distinguish different instances of the pattern, the first
+// argument to the INSTANTIATE_* macro is a prefix that will be added
+// to the actual test case name.  Remember to pick unique prefixes for
+// different instances.
+typedef testing::Types<char, int, unsigned int> MyTypes;
+INSTANTIATE_TYPED_TEST_CASE_P(My, FooTest, MyTypes);
+
+// If the type list contains only one type, you can write that type
+// directly without Types<...>:
+//   INSTANTIATE_TYPED_TEST_CASE_P(My, FooTest, int);
+
+#endif  // 0
+
+
+// Implements typed tests.
+
+#if GTEST_HAS_TYPED_TEST
+
+// INTERNAL IMPLEMENTATION - DO NOT USE IN USER CODE.
+//
+// Yields the identifier gtest_type_params_<CaseName>_, i.e. the name of the
+// typedef that holds the type parameters of the given test case.
+# define GTEST_TYPE_PARAMS_(CaseName) gtest_type_params_##CaseName##_
+
+// Keep the spaces around the 'TypeArgs' template argument below: some
+// compilers may choke on the '>>' that would otherwise appear when a
+// template instance (e.g. Types<int>) is passed.
+# define TYPED_TEST_CASE(Fixture, TypeArgs) \
+  typedef ::testing::internal::TypeList< TypeArgs >::type \
+      GTEST_TYPE_PARAMS_(Fixture)
+// Registers TestName with CaseName's type list; the test body follows.
+# define TYPED_TEST(CaseName, TestName) \
+  template <typename gtest_TypeParam_> \
+  class GTEST_TEST_CLASS_NAME_(CaseName, TestName) \
+      : public CaseName<gtest_TypeParam_> { \
+   private: \
+    typedef CaseName<gtest_TypeParam_> TestFixture; \
+    typedef gtest_TypeParam_ TypeParam; \
+    virtual void TestBody(); \
+  }; \
+  bool gtest_##CaseName##_##TestName##_registered_ GTEST_ATTRIBUTE_UNUSED_ = \
+      ::testing::internal::TypeParameterizedTest< \
+          CaseName, \
+          ::testing::internal::TemplateSel< \
+              GTEST_TEST_CLASS_NAME_(CaseName, TestName)>, \
+          GTEST_TYPE_PARAMS_(CaseName)>::Register(\
+              "", #CaseName, #TestName, 0); \
+  template <typename gtest_TypeParam_> \
+  void GTEST_TEST_CLASS_NAME_(CaseName, TestName)<gtest_TypeParam_>::TestBody()
+
+#endif  // GTEST_HAS_TYPED_TEST
+
+// Implements type-parameterized tests.
+
+#if GTEST_HAS_TYPED_TEST_P
+
+// INTERNAL IMPLEMENTATION - DO NOT USE IN USER CODE.
+//
+// Yields gtest_case_<CaseName>_, the name of the namespace in which the
+// type-parameterized tests of the given test case are defined.  The exact
+// name of the namespace is subject to change without notice.
+# define GTEST_CASE_NAMESPACE_(CaseName) \
+  gtest_case_##CaseName##_
+
+// INTERNAL IMPLEMENTATION - DO NOT USE IN USER CODE.
+//
+// Yields gtest_typed_test_case_p_state_<CaseName>_, the name of the variable
+// that remembers the names of the tests defined in the given test case.
+# define GTEST_TYPED_TEST_CASE_P_STATE_(CaseName) \
+  gtest_typed_test_case_p_state_##CaseName##_
+
+// INTERNAL IMPLEMENTATION - DO NOT USE IN USER CODE DIRECTLY.
+//
+// Yields gtest_registered_test_names_<CaseName>_, the name of the variable
+// that remembers the names of the tests registered in the given test case.
+# define GTEST_REGISTERED_TEST_NAMES_(CaseName) \
+  gtest_registered_test_names_##CaseName##_
+
+// Declares the state object for a type-parameterized test case.  It is
+// static because these macros are typically used in a .h file that can be
+// #included in multiple translation units linked together.
+# define TYPED_TEST_CASE_P(Fixture) \
+  static ::testing::internal::TypedTestCasePState \
+      GTEST_TYPED_TEST_CASE_P_STATE_(Fixture)
+// Defines test TestName in CaseName's case namespace and records its name.
+# define TYPED_TEST_P(CaseName, TestName) \
+  namespace GTEST_CASE_NAMESPACE_(CaseName) { \
+  template <typename gtest_TypeParam_> \
+  class TestName : public CaseName<gtest_TypeParam_> { \
+   private: \
+    typedef CaseName<gtest_TypeParam_> TestFixture; \
+    typedef gtest_TypeParam_ TypeParam; \
+    virtual void TestBody(); \
+  }; \
+  static bool gtest_##TestName##_defined_ GTEST_ATTRIBUTE_UNUSED_ = \
+      GTEST_TYPED_TEST_CASE_P_STATE_(CaseName).AddTestName(\
+          __FILE__, __LINE__, #CaseName, #TestName); \
+  } \
+  template <typename gtest_TypeParam_> \
+  void GTEST_CASE_NAMESPACE_(CaseName)::TestName<gtest_TypeParam_>::TestBody()
+// Collects the listed tests into gtest_AllTests_ and verifies their names.
+# define REGISTER_TYPED_TEST_CASE_P(CaseName, ...) \
+  namespace GTEST_CASE_NAMESPACE_(CaseName) { \
+  typedef ::testing::internal::Templates<__VA_ARGS__>::type gtest_AllTests_; \
+  } \
+  static const char* const GTEST_REGISTERED_TEST_NAMES_(CaseName) = \
+      GTEST_TYPED_TEST_CASE_P_STATE_(CaseName).VerifyRegisteredTestNames(\
+          __FILE__, __LINE__, #__VA_ARGS__)
+
+// Calls Register() for CaseName with the given Types, tagged with Prefix.
+// The 'Types' template argument below must have spaces around it since some
+// compilers may choke on '>>' when passing a template instance (Types<int>).
+# define INSTANTIATE_TYPED_TEST_CASE_P(Prefix, CaseName, Types) \
+  bool gtest_##Prefix##_##CaseName GTEST_ATTRIBUTE_UNUSED_ = \
+      ::testing::internal::TypeParameterizedTestCase<CaseName, \
+          GTEST_CASE_NAMESPACE_(CaseName)::gtest_AllTests_, \
+          ::testing::internal::TypeList< Types >::type>::Register(\
+              #Prefix, #CaseName, GTEST_REGISTERED_TEST_NAMES_(CaseName))
+
+#endif  // GTEST_HAS_TYPED_TEST_P
+
+#endif  // GTEST_INCLUDE_GTEST_GTEST_TYPED_TEST_H_
+
+// Depending on the platform, different string classes are available.
+// On Linux, in addition to ::std::string, Google also makes use of
+// class ::string, which has the same interface as ::std::string, but
+// has a different implementation.
+//
+// The user can define GTEST_HAS_GLOBAL_STRING to 1 to indicate that
+// ::string is available AND is a distinct type to ::std::string, or
+// define it to 0 to indicate otherwise.
+//
+// If the user's ::std::string and ::string are the same class due to
+// aliasing, he should define GTEST_HAS_GLOBAL_STRING to 0.
+//
+// If the user doesn't define GTEST_HAS_GLOBAL_STRING, it is defined
+// heuristically.
+
+namespace testing {
+
+// Declares the flags.
+
+// This flag temporarily enables the disabled tests.
+GTEST_DECLARE_bool_(also_run_disabled_tests);
+
+// This flag brings up the debugger on an assertion failure.
+GTEST_DECLARE_bool_(break_on_failure);
+
+// This flag controls whether Google Test catches all test-thrown exceptions
+// and logs them as failures.
+GTEST_DECLARE_bool_(catch_exceptions);
+
+// This flag enables using colors in terminal output. Available values are
+// "yes" to enable colors, "no" (disable colors), or "auto" (the default)
+// to let Google Test decide.
+GTEST_DECLARE_string_(color);
+
+// This flag sets up the filter to select by name using a glob pattern
+// the tests to run. If the filter is not given all tests are executed.
+GTEST_DECLARE_string_(filter);
+
+// This flag causes Google Test to list tests. None of the tests listed
+// are actually run if the flag is provided.
+GTEST_DECLARE_bool_(list_tests);
+
+// This flag controls whether Google Test emits a detailed XML report to a file
+// in addition to its normal textual output.
+GTEST_DECLARE_string_(output);
+
+// This flag controls whether Google Test prints the elapsed time for each
+// test.
+GTEST_DECLARE_bool_(print_time);
+
+// This flag specifies the random number seed.
+GTEST_DECLARE_int32_(random_seed);
+
+// This flag sets how many times the tests are repeated. The default value
+// is 1. If the value is -1 the tests are repeated forever.
+GTEST_DECLARE_int32_(repeat);
+
+// This flag controls whether Google Test includes Google Test internal
+// stack frames in failure stack traces.
+GTEST_DECLARE_bool_(show_internal_stack_frames);
+
+// When this flag is specified, tests' order is randomized on every iteration.
+GTEST_DECLARE_bool_(shuffle);
+
+// This flag specifies the maximum number of stack frames to be
+// printed in a failure message.
+GTEST_DECLARE_int32_(stack_trace_depth);
+
+// When this flag is specified, a failed assertion will throw an
+// exception if exceptions are enabled, or exit the program with a
+// non-zero code otherwise.
+GTEST_DECLARE_bool_(throw_on_failure);
+
+// When this flag is set with a "host:port" string, on supported
+// platforms test results are streamed to the specified port on
+// the specified host machine.
+GTEST_DECLARE_string_(stream_result_to);
+
+// The upper limit for valid stack trace depths.
+const int kMaxStackTraceDepth = 100;
+
+namespace internal {
+// Forward declarations only; none of these classes is defined in this block.
+class AssertHelper;
+class DefaultGlobalTestPartResultReporter;
+class ExecDeathTest;
+class NoExecDeathTest;
+class FinalSuccessChecker;
+class GTestFlagSaver;
+class StreamingListenerTest;
+class TestResultAccessor;
+class TestEventListenersAccessor;
+class TestEventRepeater;
+class UnitTestRecordPropertyTestHelper;
+class WindowsDeathTest;
+class UnitTestImpl* GetUnitTestImpl();  // Also forward-declares UnitTestImpl.
+void ReportFailureInUnknownLocation(TestPartResult::Type result_type,
+                                    const std::string& message);
+
+}  // namespace internal
+
+// The friend relationship of some of these classes is cyclic.
+// If we don't forward declare them, the compiler might confuse the classes
+// named in friend declarations with same-named classes that are in scope.
+class Test;
+class TestCase;
+class TestInfo;
+class UnitTest;
+
+// A class for indicating whether an assertion was successful.  When
+// the assertion wasn't successful, the AssertionResult object
+// remembers a non-empty message that describes how it failed.
+//
+// To create an instance of this class, use one of the factory functions
+// (AssertionSuccess() and AssertionFailure()).
+//
+// This class is useful for two purposes:
+//   1. Defining predicate functions to be used with Boolean test assertions
+//      EXPECT_TRUE/EXPECT_FALSE and their ASSERT_ counterparts
+//   2. Defining predicate-format functions to be
+//      used with predicate assertions (ASSERT_PRED_FORMAT*, etc).
+//
+// For example, if you define IsEven predicate:
+//
+//   testing::AssertionResult IsEven(int n) {
+//     if ((n % 2) == 0)
+//       return testing::AssertionSuccess();
+//     else
+//       return testing::AssertionFailure() << n << " is odd";
+//   }
+//
+// Then the failed expectation EXPECT_TRUE(IsEven(Fib(5)))
+// will print the message
+//
+//   Value of: IsEven(Fib(5))
+//     Actual: false (5 is odd)
+//   Expected: true
+//
+// instead of a more opaque
+//
+//   Value of: IsEven(Fib(5))
+//     Actual: false
+//   Expected: true
+//
+// in case IsEven is a simple Boolean predicate.
+//
+// If you expect your predicate to be reused and want to support informative
+// messages in EXPECT_FALSE and ASSERT_FALSE (negative assertions show up
+// about half as often as positive ones in our tests), supply messages for
+// both success and failure cases:
+//
+//   testing::AssertionResult IsEven(int n) {
+//     if ((n % 2) == 0)
+//       return testing::AssertionSuccess() << n << " is even";
+//     else
+//       return testing::AssertionFailure() << n << " is odd";
+//   }
+//
+// Then a statement EXPECT_FALSE(IsEven(Fib(6))) will print
+//
+//   Value of: IsEven(Fib(6))
+//     Actual: true (8 is even)
+//   Expected: false
+//
+// NB: Predicates that support negative Boolean assertions have reduced
+// performance in positive ones so be careful not to use them in tests
+// that have lots (tens of thousands) of positive Boolean assertions.
+//
+// To use this class with EXPECT_PRED_FORMAT assertions such as:
+//
+//   // Verifies that Foo() returns an even number.
+//   EXPECT_PRED_FORMAT1(IsEven, Foo());
+//
+// you need to define:
+//
+//   testing::AssertionResult IsEven(const char* expr, int n) {
+//     if ((n % 2) == 0)
+//       return testing::AssertionSuccess();
+//     else
+//       return testing::AssertionFailure()
+//         << "Expected: " << expr << " is even\n  Actual: it's " << n;
+//   }
+//
+// If Foo() returns 5, you will see the following message:
+//
+//   Expected: Foo() is even
+//     Actual: it's 5
+//
+class GTEST_API_ AssertionResult {
+ public:
+  // Copy constructor; needed by EXPECT_TRUE/FALSE(assertion_result).
+  // Declared here and defined out of line.
+  AssertionResult(const AssertionResult& other);
+  // Wraps a plain Boolean; needed by EXPECT_TRUE/FALSE(bool_expression).
+  explicit AssertionResult(bool success) : success_(success) {}
+
+  // Implicit conversion: true iff the assertion succeeded.
+  operator bool() const { return success_; }  // NOLINT
+
+  // Produces the negated result. Used with EXPECT/ASSERT_FALSE.
+  AssertionResult operator!() const;
+
+  // Text that has been streamed into this object so far, or "" when
+  // nothing has been streamed.  Test assertions print it when they fail
+  // (i.e., when the predicate's outcome doesn't match the assertion's
+  // expectation).
+  const char* message() const {
+    return message_.get() == NULL ? "" : message_->c_str();
+  }
+  // TODO(vladl@google.com): Remove this after making sure no clients use it.
+  // Deprecated alias that simply forwards to message().
+  const char* failure_message() const { return message(); }
+
+  // Appends an arbitrary streamable value to the stored message.
+  template <typename T> AssertionResult& operator<<(const T& value) {
+    AppendMessage(Message() << value);
+    return *this;
+  }
+
+  // Overload that accepts basic output manipulators such as endl or flush
+  // (function pointers taking and returning ::std::ostream&).
+  AssertionResult& operator<<(
+      ::std::ostream& (*basic_manipulator)(::std::ostream& stream)) {
+    AppendMessage(Message() << basic_manipulator);
+    return *this;
+  }
+
+ private:
+  // Appends the text of a_message to message_, allocating it on first use.
+  void AppendMessage(const Message& a_message) {
+    if (!message_.get())
+      message_.reset(new ::std::string);
+    message_->append(a_message.GetString().c_str());
+  }
+
+  // Outcome of the assertion predicate.
+  bool success_;
+  // Text describing the condition, for the case where the expectation
+  // construct is not satisfied with the predicate's outcome.
+  // Held through a pointer to avoid taking too much stack frame space
+  // with test assertions.
+  internal::scoped_ptr< ::std::string> message_;
+
+  GTEST_DISALLOW_ASSIGN_(AssertionResult);
+};
+
+// Makes a successful assertion result (declaration only).
+GTEST_API_ AssertionResult AssertionSuccess();
+
+// Makes a failed assertion result (declaration only).
+GTEST_API_ AssertionResult AssertionFailure();
+
+// Makes a failed assertion result with the given failure message.
+// Deprecated; use AssertionFailure() << msg instead.
+GTEST_API_ AssertionResult AssertionFailure(const Message& msg);
+
+// The abstract class that all tests inherit from.
+//
+// In Google Test, a unit test program contains one or many TestCases, and
+// each TestCase contains one or many Tests.
+//
+// When you define a test using the TEST macro, you don't need to
+// explicitly derive from Test - the TEST macro automatically does
+// this for you.
+//
+// The only time you derive from Test is when defining a test fixture
+// to be used in a TEST_F.  For example:
+//
+//   class FooTest : public testing::Test {
+//    protected:
+//     virtual void SetUp() { ... }
+//     virtual void TearDown() { ... }
+//     ...
+//   };
+//
+//   TEST_F(FooTest, Bar) { ... }
+//   TEST_F(FooTest, Baz) { ... }
+//
+// Test is not copyable.
+class GTEST_API_ Test {
+ public:
+  friend class TestInfo;
+
+  // Defines types for pointers to functions that set up and tear down
+  // a test case.
+  typedef internal::SetUpTestCaseFunc SetUpTestCaseFunc;
+  typedef internal::TearDownTestCaseFunc TearDownTestCaseFunc;
+
+  // The destructor is virtual because Test is meant to be inherited from.
+  virtual ~Test();
+
+  // Sets up the resources shared by all tests in this test case.
+  //
+  // Google Test will call Foo::SetUpTestCase() before running the first
+  // test in test case Foo.  Hence a sub-class can define its own
+  // SetUpTestCase() method to shadow the one defined in the super
+  // class.  The default implementation here does nothing.
+  static void SetUpTestCase() {}
+
+  // Tears down the resources shared by all tests in this test case.
+  //
+  // Google Test will call Foo::TearDownTestCase() after running the last
+  // test in test case Foo.  Hence a sub-class can define its own
+  // TearDownTestCase() method to shadow the one defined in the super
+  // class.  The default implementation here does nothing.
+  static void TearDownTestCase() {}
+
+  // Returns true iff the current test has a fatal failure.
+  static bool HasFatalFailure();
+
+  // Returns true iff the current test has a non-fatal failure.
+  static bool HasNonfatalFailure();
+
+  // Returns true iff the current test has at least one failure, either
+  // fatal or non-fatal.
+  static bool HasFailure() { return HasFatalFailure() || HasNonfatalFailure(); }
+
+  // Logs a property for the current test, test case, or for the entire
+  // invocation of the test program when used outside of the context of a
+  // test case.  Only the last value for a given key is remembered.  These
+  // are public static so they can be called from utility functions that are
+  // not members of the test fixture.  Calls to RecordProperty made during the
+  // lifespan of the test (from the moment its constructor starts to the
+  // moment its destructor finishes) will be output in XML as attributes of
+  // the <testcase> element.  Properties recorded from fixture's
+  // SetUpTestCase or TearDownTestCase are logged as attributes of the
+  // corresponding <testsuite> element.  Calls to RecordProperty made in the
+  // global context (before or after invocation of RUN_ALL_TESTS and from
+  // SetUp/TearDown method of Environment objects registered with Google
+  // Test) will be output as attributes of the <testsuites> element.
+  static void RecordProperty(const std::string& key, const std::string& value);
+  static void RecordProperty(const std::string& key, int value);
+
+ protected:
+  // Creates a Test object.
+  Test();
+
+  // Sets up the test fixture.
+  virtual void SetUp();
+
+  // Tears down the test fixture.
+  virtual void TearDown();
+
+ private:
+  // Returns true iff the current test has the same fixture class as
+  // the first test in the current test case.
+  static bool HasSameFixtureClass();
+
+  // Runs the test after the test fixture has been set up.
+  //
+  // A sub-class must implement this to define the test logic.
+  //
+  // DO NOT OVERRIDE THIS FUNCTION DIRECTLY IN A USER PROGRAM.
+  // Instead, use the TEST or TEST_F macro.
+  virtual void TestBody() = 0;
+
+  // Sets up, executes, and tears down the test.
+  void Run();
+
+  // Deletes self.  We deliberately pick an unusual name for this
+  // internal method to avoid clashing with names used in user TESTs.
+  void DeleteSelf_() { delete this; }
+
+  // Uses a GTestFlagSaver to save and restore all Google Test flags.
+  const internal::GTestFlagSaver* const gtest_flag_saver_;
+
+  // Often a user mis-spells SetUp() as Setup() and spends a long time
+  // wondering why it is never called by Google Test.  The declaration of
+  // the following method is solely for catching such an error at
+  // compile time:
+  //
+  //   - The return type is deliberately chosen to be not void, so it
+  //   will be a conflict if a user declares void Setup() in their test
+  //   fixture.
+  //
+  //   - This method is private, so it will be another compiler error
+  //   if a user calls it from their test fixture.
+  //
+  // DO NOT OVERRIDE THIS FUNCTION.
+  //
+  // If you see an error about overriding the following function or
+  // about it being private, you have mis-spelled SetUp() as Setup().
+  struct Setup_should_be_spelled_SetUp {};
+  virtual Setup_should_be_spelled_SetUp* Setup() { return NULL; }
+
+  // We disallow copying Tests.
+  GTEST_DISALLOW_COPY_AND_ASSIGN_(Test);
+};
+
+typedef internal::TimeInMillis TimeInMillis;
+
+// A copyable object representing a user specified test property which can be
+// output as a key/value string pair.
+//
+// Don't inherit from TestProperty as its destructor is not virtual.
+class TestProperty {
+ public:
+  // Constructor.  TestProperty does NOT have a default constructor.
+  // Always use this constructor (with parameters) to create a
+  // TestProperty object.
+  TestProperty(const std::string& a_key, const std::string& a_value) :
+    key_(a_key), value_(a_value) {
+  }
+
+  // Gets the user supplied key as a C string owned by this object.
+  const char* key() const {
+    return key_.c_str();
+  }
+
+  // Gets the user supplied value as a C string owned by this object.
+  const char* value() const {
+    return value_.c_str();
+  }
+
+  // Replaces the stored value, overriding the one given to the constructor.
+  void SetValue(const std::string& new_value) {
+    value_ = new_value;
+  }
+
+ private:
+  // The key supplied by the user.
+  std::string key_;
+  // The value supplied by the user.
+  std::string value_;
+};
+
+// The result of a single Test.  This includes a list of
+// TestPartResults, a list of TestProperties, a count of how many
+// death tests there are in the Test, and how much time it took to run
+// the Test.
+//
+// TestResult is not copyable.
+class GTEST_API_ TestResult {
+ public:
+  // Creates an empty TestResult.
+  TestResult();
+
+  // Destructor (non-virtual).  Do not inherit from TestResult.
+  ~TestResult();
+
+  // Gets the number of all test parts.  This is the sum of the number
+  // of successful test parts and the number of failed test parts.
+  int total_part_count() const;
+
+  // Returns the number of the test properties.
+  int test_property_count() const;
+
+  // Returns true iff the test passed (i.e. no test part failed).
+  bool Passed() const { return !Failed(); }
+
+  // Returns true iff the test failed.
+  bool Failed() const;
+
+  // Returns true iff the test fatally failed.
+  bool HasFatalFailure() const;
+
+  // Returns true iff the test has a non-fatal failure.
+  bool HasNonfatalFailure() const;
+
+  // Returns the elapsed time, in milliseconds.
+  TimeInMillis elapsed_time() const { return elapsed_time_; }
+
+  // Returns the i-th test part result among all the results. i can range
+  // from 0 to total_part_count() - 1. If i is not in that range, aborts
+  // the program.
+  const TestPartResult& GetTestPartResult(int i) const;
+
+  // Returns the i-th test property. i can range from 0 to
+  // test_property_count() - 1. If i is not in that range, aborts the
+  // program.
+  const TestProperty& GetTestProperty(int i) const;
+
+ private:
+  friend class TestInfo;
+  friend class TestCase;
+  friend class UnitTest;
+  friend class internal::DefaultGlobalTestPartResultReporter;
+  friend class internal::ExecDeathTest;
+  friend class internal::TestResultAccessor;
+  friend class internal::UnitTestImpl;
+  friend class internal::WindowsDeathTest;
+
+  // Gets the vector of TestPartResults.
+  const std::vector<TestPartResult>& test_part_results() const {
+    return test_part_results_;
+  }
+
+  // Gets the vector of TestProperties.
+  const std::vector<TestProperty>& test_properties() const {
+    return test_properties_;
+  }
+
+  // Sets the elapsed time.
+  void set_elapsed_time(TimeInMillis elapsed) { elapsed_time_ = elapsed; }
+
+  // Adds a test property to the list. The property is validated and may add
+  // a non-fatal failure if invalid (e.g., if it conflicts with reserved
+  // key names). If a property is already recorded for the same key, the
+  // value will be updated, rather than storing multiple values for the same
+  // key.  xml_element specifies the element for which the property is being
+  // recorded and is used for validation.
+  void RecordProperty(const std::string& xml_element,
+                      const TestProperty& test_property);
+
+  // Adds a failure if the key is a reserved attribute of Google Test
+  // testcase tags.  Returns true if the property is valid.
+  // TODO(russr): Validate attribute names are legal and human readable.
+  static bool ValidateTestProperty(const std::string& xml_element,
+                                   const TestProperty& test_property);
+
+  // Adds a test part result to the list.
+  void AddTestPartResult(const TestPartResult& test_part_result);
+
+  // Returns the death test count.
+  int death_test_count() const { return death_test_count_; }
+
+  // Increments the death test count, returning the new count.
+  int increment_death_test_count() { return ++death_test_count_; }
+
+  // Clears the test part results.
+  void ClearTestPartResults();
+
+  // Clears the object.
+  void Clear();
+
+  // Protects mutable state of the property vector and of owned
+  // properties, whose values may be updated.
+  internal::Mutex test_properites_mutex_;
+
+  // The vector of TestPartResults.
+  std::vector<TestPartResult> test_part_results_;
+  // The vector of TestProperties.
+  std::vector<TestProperty> test_properties_;
+  // Running count of death tests.
+  int death_test_count_;
+  // The elapsed time, in milliseconds.
+  TimeInMillis elapsed_time_;
+
+  // We disallow copying TestResult.
+  GTEST_DISALLOW_COPY_AND_ASSIGN_(TestResult);
+};  // class TestResult
+
+// A TestInfo object stores the following information about a test:
+//
+//   Test case name
+//   Test name
+//   Whether the test should be run
+//   A function pointer that creates the test object when invoked
+//   Test result
+//
+// The constructor of TestInfo registers itself with the UnitTest
+// singleton such that the RUN_ALL_TESTS() macro knows which tests to
+// run.
+class GTEST_API_ TestInfo {
+ public:
+  // Destructs a TestInfo object.  This function is not virtual, so
+  // don't inherit from TestInfo.
+  ~TestInfo();
+
+  // Returns the test case name.
+  const char* test_case_name() const { return test_case_name_.c_str(); }
+
+  // Returns the test name.
+  const char* name() const { return name_.c_str(); }
+
+  // Returns the name of the parameter type, or NULL if this is not a typed
+  // or a type-parameterized test.
+  const char* type_param() const {
+    if (type_param_.get() != NULL)
+      return type_param_->c_str();
+    return NULL;
+  }
+
+  // Returns the text representation of the value parameter, or NULL if this
+  // is not a value-parameterized test.
+  const char* value_param() const {
+    if (value_param_.get() != NULL)
+      return value_param_->c_str();
+    return NULL;
+  }
+
+  // Returns true if this test should run, that is if the test is not
+  // disabled (or it is disabled but the also_run_disabled_tests flag has
+  // been specified) and its full name matches the user-specified filter.
+  //
+  // Google Test allows the user to filter the tests by their full names.
+  // The full name of a test Bar in test case Foo is defined as
+  // "Foo.Bar".  Only the tests that match the filter will run.
+  //
+  // A filter is a colon-separated list of glob (not regex) patterns,
+  // optionally followed by a '-' and a colon-separated list of
+  // negative patterns (tests to exclude).  A test is run if it
+  // matches one of the positive patterns and does not match any of
+  // the negative patterns.
+  //
+  // For example, *A*:Foo.* is a filter that matches any string that
+  // contains the character 'A' or starts with "Foo.".
+  bool should_run() const { return should_run_; }
+
+  // Returns true iff this test will appear in the XML report.
+  bool is_reportable() const {
+    // For now, the XML report includes all tests matching the filter.
+    // In the future, we may trim tests that are excluded because of
+    // sharding.
+    return matches_filter_;
+  }
+
+  // Returns the result of the test.
+  const TestResult* result() const { return &result_; }
+
+ private:
+#if GTEST_HAS_DEATH_TEST
+  friend class internal::DefaultDeathTestFactory;
+#endif  // GTEST_HAS_DEATH_TEST
+  friend class Test;
+  friend class TestCase;
+  friend class internal::UnitTestImpl;
+  friend class internal::StreamingListenerTest;
+  friend TestInfo* internal::MakeAndRegisterTestInfo(
+      const char* test_case_name,
+      const char* name,
+      const char* type_param,
+      const char* value_param,
+      internal::TypeId fixture_class_id,
+      Test::SetUpTestCaseFunc set_up_tc,
+      Test::TearDownTestCaseFunc tear_down_tc,
+      internal::TestFactoryBase* factory);
+
+  // Constructs a TestInfo object. The newly constructed instance assumes
+  // ownership of the factory object.
+  TestInfo(const std::string& test_case_name,
+           const std::string& name,
+           const char* a_type_param,   // NULL if not a type-parameterized test
+           const char* a_value_param,  // NULL if not a value-parameterized test
+           internal::TypeId fixture_class_id,
+           internal::TestFactoryBase* factory);
+
+  // Increments the number of death tests encountered in this test so
+  // far, and returns the new count.
+  int increment_death_test_count() {
+    return result_.increment_death_test_count();
+  }
+
+  // Creates the test object, runs it, records its result, and then
+  // deletes it.
+  void Run();
+
+  static void ClearTestResult(TestInfo* test_info) {
+    test_info->result_.Clear();
+  }
+
+  // These fields are immutable properties of the test.
+  const std::string test_case_name_;     // Test case name
+  const std::string name_;               // Test name
+  // Name of the parameter type, or NULL if this is not a typed or a
+  // type-parameterized test.
+  const internal::scoped_ptr<const ::std::string> type_param_;
+  // Text representation of the value parameter, or NULL if this is not a
+  // value-parameterized test.
+  const internal::scoped_ptr<const ::std::string> value_param_;
+  const internal::TypeId fixture_class_id_;   // ID of the test fixture class
+  bool should_run_;                 // True iff this test should run
+  bool is_disabled_;                // True iff this test is disabled
+  bool matches_filter_;             // True iff this test matches the
+                                    // user-specified filter.
+  internal::TestFactoryBase* const factory_;  // The factory that creates
+                                              // the test object
+
+  // This field is mutable and needs to be reset before running the
+  // test for the second time.
+  TestResult result_;
+
+  GTEST_DISALLOW_COPY_AND_ASSIGN_(TestInfo);
+};
+
+// A test case, which consists of a vector of TestInfos.
+//
+// TestCase is not copyable.
+class GTEST_API_ TestCase {
+ public:
+  // Creates a TestCase with the given name.
+  //
+  // TestCase does NOT have a default constructor.  Always use this
+  // constructor to create a TestCase object.
+  //
+  // Arguments:
+  //
+  //   name:         name of the test case
+  //   a_type_param: the name of the test's type parameter, or NULL if
+  //                 this is not a type-parameterized test.
+  //   set_up_tc:    pointer to the function that sets up the test case
+  //   tear_down_tc: pointer to the function that tears down the test case
+  TestCase(const char* name, const char* a_type_param,
+           Test::SetUpTestCaseFunc set_up_tc,
+           Test::TearDownTestCaseFunc tear_down_tc);
+
+  // Destructor of TestCase.
+  virtual ~TestCase();
+
+  // Gets the name of the TestCase.
+  const char* name() const { return name_.c_str(); }
+
+  // Returns the name of the parameter type, or NULL if this is not a
+  // type-parameterized test case.
+  const char* type_param() const {
+    if (type_param_.get() != NULL)
+      return type_param_->c_str();
+    return NULL;
+  }
+
+  // Returns true if any test in this test case should run.
+  bool should_run() const { return should_run_; }
+
+  // Gets the number of successful tests in this test case.
+  int successful_test_count() const;
+
+  // Gets the number of failed tests in this test case.
+  int failed_test_count() const;
+
+  // Gets the number of disabled tests that will be reported in the XML report.
+  int reportable_disabled_test_count() const;
+
+  // Gets the number of disabled tests in this test case.
+  int disabled_test_count() const;
+
+  // Gets the number of tests to be printed in the XML report.
+  int reportable_test_count() const;
+
+  // Gets the number of tests in this test case that should run.
+  int test_to_run_count() const;
+
+  // Gets the number of all tests in this test case.
+  int total_test_count() const;
+
+  // Returns true iff the test case passed.
+  bool Passed() const { return !Failed(); }
+
+  // Returns true iff the test case failed.
+  bool Failed() const { return failed_test_count() > 0; }
+
+  // Returns the elapsed time, in milliseconds.
+  TimeInMillis elapsed_time() const { return elapsed_time_; }
+
+  // Returns the i-th test among all the tests. i can range from 0 to
+  // total_test_count() - 1. If i is not in that range, returns NULL.
+  const TestInfo* GetTestInfo(int i) const;
+
+  // Returns the TestResult that holds test properties recorded during
+  // execution of SetUpTestCase and TearDownTestCase.
+  const TestResult& ad_hoc_test_result() const { return ad_hoc_test_result_; }
+
+ private:
+  friend class Test;
+  friend class internal::UnitTestImpl;
+
+  // Gets the (mutable) vector of TestInfos in this TestCase.
+  std::vector<TestInfo*>& test_info_list() { return test_info_list_; }
+
+  // Gets the (immutable) vector of TestInfos in this TestCase.
+  const std::vector<TestInfo*>& test_info_list() const {
+    return test_info_list_;
+  }
+
+  // Returns the i-th test among all the tests. i can range from 0 to
+  // total_test_count() - 1. If i is not in that range, returns NULL.
+  TestInfo* GetMutableTestInfo(int i);
+
+  // Sets the should_run member.
+  void set_should_run(bool should) { should_run_ = should; }
+
+  // Adds a TestInfo to this test case.  Will delete the TestInfo upon
+  // destruction of the TestCase object.
+  void AddTestInfo(TestInfo * test_info);
+
+  // Clears the results of all tests in this test case.
+  void ClearResult();
+
+  // Clears the results of all tests in the given test case.
+  static void ClearTestCaseResult(TestCase* test_case) {
+    test_case->ClearResult();
+  }
+
+  // Runs every test in this TestCase.
+  void Run();
+
+  // Runs SetUpTestCase() for this TestCase.  This wrapper is needed
+  // for catching exceptions thrown from SetUpTestCase().
+  void RunSetUpTestCase() { (*set_up_tc_)(); }
+
+  // Runs TearDownTestCase() for this TestCase.  This wrapper is
+  // needed for catching exceptions thrown from TearDownTestCase().
+  void RunTearDownTestCase() { (*tear_down_tc_)(); }
+
+  // Returns true iff the test should run and its result is a pass.
+  static bool TestPassed(const TestInfo* test_info) {
+    return test_info->should_run() && test_info->result()->Passed();
+  }
+
+  // Returns true iff the test should run and its result is a failure.
+  static bool TestFailed(const TestInfo* test_info) {
+    return test_info->should_run() && test_info->result()->Failed();
+  }
+
+  // Returns true iff the test is disabled and will be reported in the XML
+  // report.
+  static bool TestReportableDisabled(const TestInfo* test_info) {
+    return test_info->is_reportable() && test_info->is_disabled_;
+  }
+
+  // Returns true iff test is disabled.
+  static bool TestDisabled(const TestInfo* test_info) {
+    return test_info->is_disabled_;
+  }
+
+  // Returns true iff this test will appear in the XML report.
+  static bool TestReportable(const TestInfo* test_info) {
+    return test_info->is_reportable();
+  }
+
+  // Returns true if the given test should run.
+  static bool ShouldRunTest(const TestInfo* test_info) {
+    return test_info->should_run();
+  }
+
+  // Shuffles the tests in this test case.
+  void ShuffleTests(internal::Random* random);
+
+  // Restores the test order to before the first shuffle.
+  void UnshuffleTests();
+
+  // Name of the test case.
+  std::string name_;
+  // Name of the parameter type, or NULL if this is not a typed or a
+  // type-parameterized test.
+  const internal::scoped_ptr<const ::std::string> type_param_;
+  // The vector of TestInfos in their original order.  It owns the
+  // elements in the vector.
+  std::vector<TestInfo*> test_info_list_;
+  // Provides a level of indirection for the test list to allow easy
+  // shuffling and restoring the test order.  The i-th element in this
+  // vector is the index of the i-th test in the shuffled test list.
+  std::vector<int> test_indices_;
+  // Pointer to the function that sets up the test case.
+  Test::SetUpTestCaseFunc set_up_tc_;
+  // Pointer to the function that tears down the test case.
+  Test::TearDownTestCaseFunc tear_down_tc_;
+  // True iff any test in this test case should run.
+  bool should_run_;
+  // Elapsed time, in milliseconds.
+  TimeInMillis elapsed_time_;
+  // Holds test properties recorded during execution of SetUpTestCase and
+  // TearDownTestCase.
+  TestResult ad_hoc_test_result_;
+
+  // We disallow copying TestCases.
+  GTEST_DISALLOW_COPY_AND_ASSIGN_(TestCase);
+};
+
+// An Environment object is capable of setting up and tearing down an
+// environment.  The user should subclass this to define their own
+// environment(s).
+//
+// An Environment object does the set-up and tear-down in virtual
+// methods SetUp() and TearDown() instead of the constructor and the
+// destructor, as:
+//
+//   1. You cannot safely throw from a destructor.  This is a problem
+//      as in some cases Google Test is used where exceptions are enabled, and
+//      we may want to implement ASSERT_* using exceptions where they are
+//      available.
+//   2. You cannot use ASSERT_* directly in a constructor or
+//      destructor.
+class Environment {
+ public:
+  // The destructor is virtual as we need to subclass Environment.
+  virtual ~Environment() {}
+
+  // Override this to define how to set up the environment (no-op default).
+  virtual void SetUp() {}
+
+  // Override this to define how to tear down the environment (no-op default).
+  virtual void TearDown() {}
+ private:
+  // If you see an error about overriding the following function or
+  // about it being private, you have mis-spelled SetUp() as Setup().
+  struct Setup_should_be_spelled_SetUp {};
+  virtual Setup_should_be_spelled_SetUp* Setup() { return NULL; }
+};
+
+// The interface for tracing execution of tests. The methods are organized in
+// the order the corresponding events are fired.
+class TestEventListener {
+ public:
+  virtual ~TestEventListener() {}
+
+  // Fired before any test activity starts.
+  virtual void OnTestProgramStart(const UnitTest& unit_test) = 0;
+
+  // Fired before each iteration of tests starts.  There may be more than
+  // one iteration if GTEST_FLAG(repeat) is set.  The iteration argument is
+  // the iteration index, starting from 0.
+  virtual void OnTestIterationStart(const UnitTest& unit_test,
+                                    int iteration) = 0;
+
+  // Fired before environment set-up for each iteration of tests starts.
+  virtual void OnEnvironmentsSetUpStart(const UnitTest& unit_test) = 0;
+
+  // Fired after environment set-up for each iteration of tests ends.
+  virtual void OnEnvironmentsSetUpEnd(const UnitTest& unit_test) = 0;
+
+  // Fired before the test case starts.
+  virtual void OnTestCaseStart(const TestCase& test_case) = 0;
+
+  // Fired before the test starts.
+  virtual void OnTestStart(const TestInfo& test_info) = 0;
+
+  // Fired after a failed assertion or a SUCCEED() invocation.
+  virtual void OnTestPartResult(const TestPartResult& test_part_result) = 0;
+
+  // Fired after the test ends.
+  virtual void OnTestEnd(const TestInfo& test_info) = 0;
+
+  // Fired after the test case ends.
+  virtual void OnTestCaseEnd(const TestCase& test_case) = 0;
+
+  // Fired before environment tear-down for each iteration of tests starts.
+  virtual void OnEnvironmentsTearDownStart(const UnitTest& unit_test) = 0;
+
+  // Fired after environment tear-down for each iteration of tests ends.
+  virtual void OnEnvironmentsTearDownEnd(const UnitTest& unit_test) = 0;
+
+  // Fired after each iteration of tests finishes.
+  virtual void OnTestIterationEnd(const UnitTest& unit_test,
+                                  int iteration) = 0;
+
+  // Fired after all test activities have ended.
+  virtual void OnTestProgramEnd(const UnitTest& unit_test) = 0;
+};
+
+// A convenience base class for users who need to override just one or two
+// methods and accept that a future change to the signature of a method
+// they override will not be caught during the build.  Every method here
+// has an empty body.  For comments about each method please see the
+// definition of TestEventListener above.
+class EmptyTestEventListener : public TestEventListener {
+ public:
+  virtual void OnTestProgramStart(const UnitTest& /*unit_test*/) {}
+  virtual void OnTestIterationStart(const UnitTest& /*unit_test*/,
+                                    int /*iteration*/) {}
+  virtual void OnEnvironmentsSetUpStart(const UnitTest& /*unit_test*/) {}
+  virtual void OnEnvironmentsSetUpEnd(const UnitTest& /*unit_test*/) {}
+  virtual void OnTestCaseStart(const TestCase& /*test_case*/) {}
+  virtual void OnTestStart(const TestInfo& /*test_info*/) {}
+  virtual void OnTestPartResult(const TestPartResult& /*test_part_result*/) {}
+  virtual void OnTestEnd(const TestInfo& /*test_info*/) {}
+  virtual void OnTestCaseEnd(const TestCase& /*test_case*/) {}
+  virtual void OnEnvironmentsTearDownStart(const UnitTest& /*unit_test*/) {}
+  virtual void OnEnvironmentsTearDownEnd(const UnitTest& /*unit_test*/) {}
+  virtual void OnTestIterationEnd(const UnitTest& /*unit_test*/,
+                                  int /*iteration*/) {}
+  virtual void OnTestProgramEnd(const UnitTest& /*unit_test*/) {}
+};
+
+// TestEventListeners lets users add listeners to track events in Google Test.
+class GTEST_API_ TestEventListeners {
+ public:
+  TestEventListeners();
+  ~TestEventListeners();
+
+  // Appends an event listener to the end of the list. Google Test assumes
+  // the ownership of the listener (i.e. it will delete the listener when
+  // the test program finishes).
+  void Append(TestEventListener* listener);
+
+  // Removes the given event listener from the list and returns it.  It then
+  // becomes the caller's responsibility to delete the listener. Returns
+  // NULL if the listener is not found in the list.
+  TestEventListener* Release(TestEventListener* listener);
+
+  // Returns the standard listener responsible for the default console
+  // output.  Can be removed from the listeners list to shut down default
+  // console output.  Note that removing this object from the listener list
+  // with Release transfers its ownership to the caller and makes this
+  // function return NULL the next time.
+  TestEventListener* default_result_printer() const {
+    return default_result_printer_;
+  }
+
+  // Returns the standard listener responsible for the default XML output
+  // controlled by the --gtest_output=xml flag.  Can be removed from the
+  // listeners list by users who want to shut down the default XML output
+  // controlled by this flag and substitute it with a custom one.  Note that
+  // removing this object from the listener list with Release transfers its
+  // ownership to the caller and makes this function return NULL the next
+  // time.
+  TestEventListener* default_xml_generator() const {
+    return default_xml_generator_;
+  }
+
+ private:
+  friend class TestCase;
+  friend class TestInfo;
+  friend class internal::DefaultGlobalTestPartResultReporter;
+  friend class internal::NoExecDeathTest;
+  friend class internal::TestEventListenersAccessor;
+  friend class internal::UnitTestImpl;
+
+  // Returns the repeater that broadcasts the TestEventListener events to
+  // all subscribers.
+  TestEventListener* repeater();
+
+  // Sets the default_result_printer attribute to the provided listener.
+  // The listener is also added to the listener list and the previous
+  // default_result_printer is removed from it and deleted. The listener can
+  // also be NULL in which case it will not be added to the list. Does
+  // nothing if the previous and the current listener objects are the same.
+  void SetDefaultResultPrinter(TestEventListener* listener);
+
+  // Sets the default_xml_generator attribute to the provided listener.  The
+  // listener is also added to the listener list and the previous
+  // default_xml_generator is removed from it and deleted. The listener can
+  // also be NULL in which case it will not be added to the list. Does
+  // nothing if the previous and the current listener objects are the same.
+  void SetDefaultXmlGenerator(TestEventListener* listener);
+
+  // Controls whether events will be forwarded by the repeater to the
+  // listeners in the list.
+  bool EventForwardingEnabled() const;
+  void SuppressEventForwarding();
+
+  // The actual list of listeners.
+  internal::TestEventRepeater* repeater_;
+  // Listener responsible for the standard result output.
+  TestEventListener* default_result_printer_;
+  // Listener responsible for the creation of the XML output file.
+  TestEventListener* default_xml_generator_;
+
+  // We disallow copying TestEventListeners.
+  GTEST_DISALLOW_COPY_AND_ASSIGN_(TestEventListeners);
+};
+
+// A UnitTest consists of a vector of TestCases.
+//
+// This is a singleton class.  The only instance of UnitTest is
+// created when UnitTest::GetInstance() is first called.  This
+// instance is never deleted.
+//
+// UnitTest is not copyable.
+//
+// This class is thread-safe as long as the methods are called
+// according to their specification.
+class GTEST_API_ UnitTest {
+ public:
+  // Gets the singleton UnitTest object.  The first time this method
+  // is called, a UnitTest object is constructed and returned.
+  // Consecutive calls will return the same object.
+  static UnitTest* GetInstance();
+
+  // Runs all tests in this UnitTest object and prints the result.
+  // Returns 0 if successful, or 1 otherwise.
+  //
+  // This method can only be called from the main thread.
+  //
+  // INTERNAL IMPLEMENTATION - DO NOT USE IN A USER PROGRAM.
+  int Run() GTEST_MUST_USE_RESULT_;
+
+  // Returns the working directory when the first TEST() or TEST_F()
+  // was executed.  The UnitTest object owns the string.
+  const char* original_working_dir() const;
+
+  // Returns the TestCase object for the test that's currently running,
+  // or NULL if no test is running.
+  const TestCase* current_test_case() const
+      GTEST_LOCK_EXCLUDED_(mutex_);
+
+  // Returns the TestInfo object for the test that's currently running,
+  // or NULL if no test is running.
+  const TestInfo* current_test_info() const
+      GTEST_LOCK_EXCLUDED_(mutex_);
+
+  // Returns the random seed used at the start of the current test run.
+  int random_seed() const;
+
+#if GTEST_HAS_PARAM_TEST
+  // Returns the ParameterizedTestCaseRegistry object used to keep track of
+  // value-parameterized tests and instantiate and register them.
+  //
+  // INTERNAL IMPLEMENTATION - DO NOT USE IN A USER PROGRAM.
+  internal::ParameterizedTestCaseRegistry& parameterized_test_registry()
+      GTEST_LOCK_EXCLUDED_(mutex_);
+#endif  // GTEST_HAS_PARAM_TEST
+
+  // Gets the number of successful test cases.
+  int successful_test_case_count() const;
+
+  // Gets the number of failed test cases.
+  int failed_test_case_count() const;
+
+  // Gets the number of all test cases.
+  int total_test_case_count() const;
+
+  // Gets the number of all test cases that contain at least one test
+  // that should run.
+  int test_case_to_run_count() const;
+
+  // Gets the number of successful tests.
+  int successful_test_count() const;
+
+  // Gets the number of failed tests.
+  int failed_test_count() const;
+
+  // Gets the number of disabled tests that will be reported in the XML report.
+  int reportable_disabled_test_count() const;
+
+  // Gets the number of disabled tests.
+  int disabled_test_count() const;
+
+  // Gets the number of tests to be printed in the XML report.
+  int reportable_test_count() const;
+
+  // Gets the number of all tests.
+  int total_test_count() const;
+
+  // Gets the number of tests that should run.
+  int test_to_run_count() const;
+
+  // Gets the time of the test program start, in ms from the start of the
+  // UNIX epoch.
+  TimeInMillis start_timestamp() const;
+
+  // Gets the elapsed time, in milliseconds.
+  TimeInMillis elapsed_time() const;
+
+  // Returns true iff the unit test passed (i.e. all test cases passed).
+  bool Passed() const;
+
+  // Returns true iff the unit test failed (i.e. some test case failed
+  // or something outside of all tests failed).
+  bool Failed() const;
+
+  // Gets the i-th test case among all the test cases. i can range from 0 to
+  // total_test_case_count() - 1. If i is not in that range, returns NULL.
+  const TestCase* GetTestCase(int i) const;
+
+  // Returns the TestResult containing information on test failures and
+  // properties logged outside of individual test cases.
+  const TestResult& ad_hoc_test_result() const;
+
+  // Returns the list of event listeners that can be used to track events
+  // inside Google Test.
+  TestEventListeners& listeners();
+
+ private:
+  // Registers and returns a global test environment.  When a test
+  // program is run, all global test environments will be set-up in
+  // the order they were registered.  After all tests in the program
+  // have finished, all global test environments will be torn-down in
+  // the *reverse* order they were registered.
+  //
+  // The UnitTest object takes ownership of the given environment.
+  //
+  // This method can only be called from the main thread.
+  Environment* AddEnvironment(Environment* env);
+
+  // Adds a TestPartResult to the current TestResult object.  All
+  // Google Test assertion macros (e.g. ASSERT_TRUE, EXPECT_EQ, etc)
+  // eventually call this to report their results.  The user code
+  // should use the assertion macros instead of calling this directly.
+  void AddTestPartResult(TestPartResult::Type result_type,
+                         const char* file_name,
+                         int line_number,
+                         const std::string& message,
+                         const std::string& os_stack_trace)
+      GTEST_LOCK_EXCLUDED_(mutex_);
+
+  // Adds a TestProperty to the current TestResult object when invoked from
+  // inside a test, to current TestCase's ad_hoc_test_result_ when invoked
+  // from SetUpTestCase or TearDownTestCase, or to the global property set
+  // when invoked elsewhere.  If the result already contains a property with
+  // the same key, the value will be updated.
+  void RecordProperty(const std::string& key, const std::string& value);
+
+  // Non-const variant of GetTestCase(): gets the i-th test case. i can range
+  // from 0 to total_test_case_count() - 1; otherwise NULL is returned.
+  TestCase* GetMutableTestCase(int i);
+
+  // Accessors for the implementation object.
+  internal::UnitTestImpl* impl() { return impl_; }
+  const internal::UnitTestImpl* impl() const { return impl_; }
+
+  // These classes and functions are friends as they need to access private
+  // members of UnitTest.
+  friend class Test;
+  friend class internal::AssertHelper;
+  friend class internal::ScopedTrace;
+  friend class internal::StreamingListenerTest;
+  friend class internal::UnitTestRecordPropertyTestHelper;
+  friend Environment* AddGlobalTestEnvironment(Environment* env);
+  friend internal::UnitTestImpl* internal::GetUnitTestImpl();
+  friend void internal::ReportFailureInUnknownLocation(
+      TestPartResult::Type result_type,
+      const std::string& message);
+
+  // Creates an empty UnitTest.
+  UnitTest();
+
+  // Destructor.
+  virtual ~UnitTest();
+
+  // Pushes a trace defined by SCOPED_TRACE() on to the per-thread
+  // Google Test trace stack.
+  void PushGTestTrace(const internal::TraceInfo& trace)
+      GTEST_LOCK_EXCLUDED_(mutex_);
+
+  // Pops a trace from the per-thread Google Test trace stack.
+  void PopGTestTrace()
+      GTEST_LOCK_EXCLUDED_(mutex_);
+
+  // Protects mutable state in *impl_.  This is mutable as some const
+  // methods need to lock it too.
+  mutable internal::Mutex mutex_;
+
+  // Opaque implementation object.  This field is never changed once
+  // the object is constructed.  We don't mark it as const here, as
+  // doing so will cause a warning in the constructor of UnitTest.
+  // Mutable state in *impl_ is protected by mutex_.
+  internal::UnitTestImpl* impl_;
+
+  // We disallow copying UnitTest.
+  GTEST_DISALLOW_COPY_AND_ASSIGN_(UnitTest);
+};
+
+// A convenient wrapper for adding an environment for the test
+// program.
+//
+// You should call this before RUN_ALL_TESTS() is called, probably in
+// main().  If you use gtest_main, you need to call this before main()
+// starts for it to take effect.  For example, you can define a global
+// variable like this:
+//
+//   testing::Environment* const foo_env =
+//       testing::AddGlobalTestEnvironment(new FooEnvironment);
+//
+// However, we strongly recommend that you write your own main() and
+// call AddGlobalTestEnvironment() there, as relying on initialization
+// of global variables makes the code harder to read and may cause
+// problems when you register multiple environments from different
+// translation units and the environments have dependencies among them
+// (remember that the compiler doesn't guarantee the order in which
+// global variables from different translation units are initialized).
+inline Environment* AddGlobalTestEnvironment(Environment* env) {
+  return UnitTest::GetInstance()->AddEnvironment(env);
+}
+
+// Initializes Google Test.  This must be called before calling
+// RUN_ALL_TESTS().  In particular, it parses a command line for the
+// flags that Google Test recognizes.  Whenever a Google Test flag is
+// seen, it is removed from argv, and *argc is decremented.
+//
+// No value is returned.  Instead, the Google Test flag variables are
+// updated.
+//
+// Calling the function for the second time has no user-visible effect.
+GTEST_API_ void InitGoogleTest(int* argc, char** argv);
+
+// This overloaded version can be used in Windows programs compiled in
+// UNICODE mode.
+GTEST_API_ void InitGoogleTest(int* argc, wchar_t** argv);
+
+namespace internal {
+
+// FormatForComparison<ToPrint, OtherOperand>::Format(value) formats a
+// value of type ToPrint that is an operand of a comparison assertion
+// (e.g. ASSERT_EQ).  OtherOperand is the type of the other operand in
+// the comparison, and is used to help determine the best way to
+// format the value.  In particular, when the value is a C string
+// (char pointer) and the other operand is an STL string object, we
+// want to format the C string as a string, since we know it is
+// compared by value with the string object.  If the value is a char
+// pointer but the other operand is not an STL string object, we don't
+// know whether the pointer is supposed to point to a NUL-terminated
+// string, and thus want to print it as a pointer to be safe.
+//
+// INTERNAL IMPLEMENTATION - DO NOT USE IN A USER PROGRAM.
+
+// The default case: formats the value with ::testing::PrintToString().
+template <typename ToPrint, typename OtherOperand>
+class FormatForComparison {
+ public:
+  static ::std::string Format(const ToPrint& value) {
+    return ::testing::PrintToString(value);
+  }
+};
+
+// Array: forwards to the const ToPrint* case with the same OtherOperand.
+template <typename ToPrint, size_t N, typename OtherOperand>
+class FormatForComparison<ToPrint[N], OtherOperand> {
+ public:
+  static ::std::string Format(const ToPrint* value) {
+    return FormatForComparison<const ToPrint*, OtherOperand>::Format(value);
+  }
+};
+
+// By default, print C strings as pointers to be safe, as we don't know
+// whether they actually point to a NUL-terminated string.
+
+#define GTEST_IMPL_FORMAT_C_STRING_AS_POINTER_(CharType)                \
+  template <typename OtherOperand>                                      \
+  class FormatForComparison<CharType*, OtherOperand> {                  \
+   public:                                                              \
+    static ::std::string Format(CharType* value) {                      \
+      return ::testing::PrintToString(static_cast<const void*>(value)); \
+    }                                                                   \
+  }
+
+GTEST_IMPL_FORMAT_C_STRING_AS_POINTER_(char);
+GTEST_IMPL_FORMAT_C_STRING_AS_POINTER_(const char);
+GTEST_IMPL_FORMAT_C_STRING_AS_POINTER_(wchar_t);
+GTEST_IMPL_FORMAT_C_STRING_AS_POINTER_(const wchar_t);
+
+#undef GTEST_IMPL_FORMAT_C_STRING_AS_POINTER_
+
+// If a C string is compared with an STL string object, we know it's meant
+// to point to a NUL-terminated string, and thus can print it as a string.
+
+#define GTEST_IMPL_FORMAT_C_STRING_AS_STRING_(CharType, OtherStringType) \
+  template <>                                                           \
+  class FormatForComparison<CharType*, OtherStringType> {               \
+   public:                                                              \
+    static ::std::string Format(CharType* value) {                      \
+      return ::testing::PrintToString(value);                           \
+    }                                                                   \
+  }
+
+GTEST_IMPL_FORMAT_C_STRING_AS_STRING_(char, ::std::string);
+GTEST_IMPL_FORMAT_C_STRING_AS_STRING_(const char, ::std::string);
+
+#if GTEST_HAS_GLOBAL_STRING
+GTEST_IMPL_FORMAT_C_STRING_AS_STRING_(char, ::string);
+GTEST_IMPL_FORMAT_C_STRING_AS_STRING_(const char, ::string);
+#endif
+
+#if GTEST_HAS_GLOBAL_WSTRING
+GTEST_IMPL_FORMAT_C_STRING_AS_STRING_(wchar_t, ::wstring);
+GTEST_IMPL_FORMAT_C_STRING_AS_STRING_(const wchar_t, ::wstring);
+#endif
+
+#if GTEST_HAS_STD_WSTRING
+GTEST_IMPL_FORMAT_C_STRING_AS_STRING_(wchar_t, ::std::wstring);
+GTEST_IMPL_FORMAT_C_STRING_AS_STRING_(const wchar_t, ::std::wstring);
+#endif
+
+#undef GTEST_IMPL_FORMAT_C_STRING_AS_STRING_
+
+// Formats a comparison assertion (e.g. ASSERT_EQ, EXPECT_LT, etc.)
+// operand to be used in a failure message.  The type (but not value)
+// of the other operand may affect the format.  This allows us to
+// print a char* as a raw pointer when it is compared against another
+// char* or void*, and print it as a C string when it is compared
+// against an std::string object, for example.
+//
+// INTERNAL IMPLEMENTATION - DO NOT USE IN A USER PROGRAM.
+template <typename T1, typename T2>
+std::string FormatForComparisonFailureMessage(
+    const T1& value, const T2& /* other_operand */) {
+  return FormatForComparison<T1, T2>::Format(value);
+}
+
+// The helper function for {ASSERT|EXPECT}_EQ; succeeds iff expected == actual.
+template <typename T1, typename T2>
+AssertionResult CmpHelperEQ(const char* expected_expression,
+                            const char* actual_expression,
+                            const T1& expected,
+                            const T2& actual) {
+#ifdef _MSC_VER
+# pragma warning(push)          // Saves the current warning state.
+# pragma warning(disable:4389)  // Temporarily disables warning on
+                                // signed/unsigned mismatch.
+#endif
+
+  if (expected == actual) {
+    return AssertionSuccess();
+  }
+
+#ifdef _MSC_VER
+# pragma warning(pop)          // Restores the warning state.
+#endif
+
+  return EqFailure(expected_expression,
+                   actual_expression,
+                   FormatForComparisonFailureMessage(expected, actual),
+                   FormatForComparisonFailureMessage(actual, expected),
+                   false);
+}
+
+// With this overloaded version, we allow anonymous enums to be used
+// in {ASSERT|EXPECT}_EQ when compiled with gcc 4, as anonymous enums
+// can be implicitly cast to BiggestInt.
+GTEST_API_ AssertionResult CmpHelperEQ(const char* expected_expression,
+                                       const char* actual_expression,
+                                       BiggestInt expected,
+                                       BiggestInt actual);
+
+// The helper class for {ASSERT|EXPECT}_EQ.  The template argument
+// lhs_is_null_literal is true iff the first argument to ASSERT_EQ()
+// is a null pointer literal.  The following default implementation is
+// for lhs_is_null_literal being false.
+template <bool lhs_is_null_literal>
+class EqHelper {
+ public:
+  // This templatized version is for the general case.
+  template <typename T1, typename T2>
+  static AssertionResult Compare(const char* expected_expression,
+                                 const char* actual_expression,
+                                 const T1& expected,
+                                 const T2& actual) {
+    return CmpHelperEQ(expected_expression, actual_expression, expected,
+                       actual);
+  }
+
+  // With this overloaded version, we allow anonymous enums to be used
+  // in {ASSERT|EXPECT}_EQ when compiled with gcc 4, as anonymous
+  // enums can be implicitly cast to BiggestInt.
+  //
+  // Even though its body looks the same as the above version, we
+  // cannot merge the two, as that would break the anonymous-enum support.
+  static AssertionResult Compare(const char* expected_expression,
+                                 const char* actual_expression,
+                                 BiggestInt expected,
+                                 BiggestInt actual) {
+    return CmpHelperEQ(expected_expression, actual_expression, expected,
+                       actual);
+  }
+};
+
+// This specialization is used when the first argument to ASSERT_EQ()
+// is a null pointer literal, like NULL, false, or 0.
+template <>
+class EqHelper<true> {
+ public:
+  // We define two overloaded versions of Compare().  The first
+  // version will be picked when the second argument to ASSERT_EQ() is
+  // NOT a pointer, e.g. ASSERT_EQ(0, AnIntFunction()) or
+  // EXPECT_EQ(false, a_bool).
+  template <typename T1, typename T2>
+  static AssertionResult Compare(
+      const char* expected_expression,
+      const char* actual_expression,
+      const T1& expected,
+      const T2& actual,
+      // The following line prevents this overload from being considered if T2
+      // is not a pointer type.  We need this because ASSERT_EQ(NULL, my_ptr)
+      // expands to Compare("", "", NULL, my_ptr), which requires a conversion
+      // to match the Secret* in the other overload, which would otherwise make
+      // this template match better.
+      typename EnableIf<!is_pointer<T2>::value>::type* = 0) {
+    return CmpHelperEQ(expected_expression, actual_expression, expected,
+                       actual);
+  }
+
+  // This version will be picked when the second argument to ASSERT_EQ() is a
+  // pointer, e.g. ASSERT_EQ(NULL, a_pointer).
+  template <typename T>
+  static AssertionResult Compare(
+      const char* expected_expression,
+      const char* actual_expression,
+      // We used to have a second template parameter instead of Secret*.  That
+      // template parameter would deduce to 'long', making this a better match
+      // than the first overload even without the first overload's EnableIf.
+      // Unfortunately, gcc with -Wconversion-null warns when "passing NULL to
+      // non-pointer argument" (even a deduced integral argument), so the old
+      // implementation caused warnings in user code.
+      Secret* /* expected (NULL) */,
+      T* actual) {
+    // We already know that 'expected' is a null pointer.
+    return CmpHelperEQ(expected_expression, actual_expression,
+                       static_cast<T*>(NULL), actual);
+  }
+};
+
+// A macro for implementing the helper functions needed to implement
+// ASSERT_?? and EXPECT_??.  It is here just to avoid copy-and-paste
+// of similar code.
+//
+// For each templatized helper function, we also define an overloaded
+// version for BiggestInt in order to reduce code bloat and allow
+// anonymous enums to be used with {ASSERT|EXPECT}_?? when compiled
+// with gcc 4.
+//
+// INTERNAL IMPLEMENTATION - DO NOT USE IN A USER PROGRAM.
+#define GTEST_IMPL_CMP_HELPER_(op_name, op)\
+template <typename T1, typename T2>\
+AssertionResult CmpHelper##op_name(const char* expr1, const char* expr2, \
+                                   const T1& val1, const T2& val2) {\
+  if (val1 op val2) {\
+    return AssertionSuccess();\
+  } else {\
+    return AssertionFailure() \
+        << "Expected: (" << expr1 << ") " #op " (" << expr2\
+        << "), actual: " << FormatForComparisonFailureMessage(val1, val2)\
+        << " vs " << FormatForComparisonFailureMessage(val2, val1);\
+  }\
+}\
+GTEST_API_ AssertionResult CmpHelper##op_name(\
+    const char* expr1, const char* expr2, BiggestInt val1, BiggestInt val2)
+
+// INTERNAL IMPLEMENTATION - DO NOT USE IN A USER PROGRAM.
+
+// Implements the helper function for {ASSERT|EXPECT}_NE
+GTEST_IMPL_CMP_HELPER_(NE, !=);
+// Implements the helper function for {ASSERT|EXPECT}_LE
+GTEST_IMPL_CMP_HELPER_(LE, <=);
+// Implements the helper function for {ASSERT|EXPECT}_LT
+GTEST_IMPL_CMP_HELPER_(LT, <);
+// Implements the helper function for {ASSERT|EXPECT}_GE
+GTEST_IMPL_CMP_HELPER_(GE, >=);
+// Implements the helper function for {ASSERT|EXPECT}_GT
+GTEST_IMPL_CMP_HELPER_(GT, >);
+
+#undef GTEST_IMPL_CMP_HELPER_
+
+// The helper function for {ASSERT|EXPECT}_STREQ.
+//
+// INTERNAL IMPLEMENTATION - DO NOT USE IN A USER PROGRAM.
+GTEST_API_ AssertionResult CmpHelperSTREQ(const char* expected_expression,
+                                          const char* actual_expression,
+                                          const char* expected,
+                                          const char* actual);
+
+// The helper function for {ASSERT|EXPECT}_STRCASEEQ.
+//
+// INTERNAL IMPLEMENTATION - DO NOT USE IN A USER PROGRAM.
+GTEST_API_ AssertionResult CmpHelperSTRCASEEQ(const char* expected_expression,
+                                              const char* actual_expression,
+                                              const char* expected,
+                                              const char* actual);
+
+// The helper function for {ASSERT|EXPECT}_STRNE.
+//
+// INTERNAL IMPLEMENTATION - DO NOT USE IN A USER PROGRAM.
+GTEST_API_ AssertionResult CmpHelperSTRNE(const char* s1_expression,
+                                          const char* s2_expression,
+                                          const char* s1,
+                                          const char* s2);
+
+// The helper function for {ASSERT|EXPECT}_STRCASENE.
+//
+// INTERNAL IMPLEMENTATION - DO NOT USE IN A USER PROGRAM.
+GTEST_API_ AssertionResult CmpHelperSTRCASENE(const char* s1_expression,
+                                              const char* s2_expression,
+                                              const char* s1,
+                                              const char* s2);
+
+
+// Helper function for *_STREQ on wide strings.
+//
+// INTERNAL IMPLEMENTATION - DO NOT USE IN A USER PROGRAM.
+GTEST_API_ AssertionResult CmpHelperSTREQ(const char* expected_expression,
+                                          const char* actual_expression,
+                                          const wchar_t* expected,
+                                          const wchar_t* actual);
+
+// Helper function for *_STRNE on wide strings.
+//
+// INTERNAL IMPLEMENTATION - DO NOT USE IN A USER PROGRAM.
+GTEST_API_ AssertionResult CmpHelperSTRNE(const char* s1_expression,
+                                          const char* s2_expression,
+                                          const wchar_t* s1,
+                                          const wchar_t* s2);
+
+}  // namespace internal
+
+// IsSubstring() and IsNotSubstring() are intended to be used as the
+// first argument to {EXPECT,ASSERT}_PRED_FORMAT2(), not by
+// themselves.  They check whether needle is a substring of haystack
+// (NULL is considered a substring of itself only), and return an
+// appropriate error message when they fail.
+//
+// The {needle,haystack}_expr arguments are the stringified
+// expressions that generated the two real arguments.
+GTEST_API_ AssertionResult IsSubstring(
+    const char* needle_expr, const char* haystack_expr,
+    const char* needle, const char* haystack);
+GTEST_API_ AssertionResult IsSubstring(
+    const char* needle_expr, const char* haystack_expr,
+    const wchar_t* needle, const wchar_t* haystack);
+GTEST_API_ AssertionResult IsNotSubstring(
+    const char* needle_expr, const char* haystack_expr,
+    const char* needle, const char* haystack);
+GTEST_API_ AssertionResult IsNotSubstring(
+    const char* needle_expr, const char* haystack_expr,
+    const wchar_t* needle, const wchar_t* haystack);
+GTEST_API_ AssertionResult IsSubstring(
+    const char* needle_expr, const char* haystack_expr,
+    const ::std::string& needle, const ::std::string& haystack);
+GTEST_API_ AssertionResult IsNotSubstring(
+    const char* needle_expr, const char* haystack_expr,
+    const ::std::string& needle, const ::std::string& haystack);
+
+#if GTEST_HAS_STD_WSTRING
+GTEST_API_ AssertionResult IsSubstring(
+    const char* needle_expr, const char* haystack_expr,
+    const ::std::wstring& needle, const ::std::wstring& haystack);
+GTEST_API_ AssertionResult IsNotSubstring(
+    const char* needle_expr, const char* haystack_expr,
+    const ::std::wstring& needle, const ::std::wstring& haystack);
+#endif  // GTEST_HAS_STD_WSTRING
+
+namespace internal {
+
+// Helper template function for comparing floating-point values.
+//
+// Template parameter:
+//
+//   RawType: the raw floating-point type (either float or double)
+//
+// INTERNAL IMPLEMENTATION - DO NOT USE IN A USER PROGRAM.
+template <typename RawType>
+AssertionResult CmpHelperFloatingPointEQ(const char* expected_expression,
+                                         const char* actual_expression,
+                                         RawType expected,
+                                         RawType actual) {
+  const FloatingPoint<RawType> lhs(expected), rhs(actual);
+
+  if (lhs.AlmostEquals(rhs)) {
+    return AssertionSuccess();
+  }
+
+  ::std::stringstream expected_ss;
+  expected_ss << std::setprecision(std::numeric_limits<RawType>::digits10 + 2)
+              << expected;
+
+  ::std::stringstream actual_ss;
+  actual_ss << std::setprecision(std::numeric_limits<RawType>::digits10 + 2)
+            << actual;
+
+  return EqFailure(expected_expression,
+                   actual_expression,
+                   StringStreamToString(&expected_ss),
+                   StringStreamToString(&actual_ss),
+                   false);
+}
+
+// Helper function for implementing ASSERT_NEAR.
+//
+// INTERNAL IMPLEMENTATION - DO NOT USE IN A USER PROGRAM.
+GTEST_API_ AssertionResult DoubleNearPredFormat(const char* expr1,
+                                                const char* expr2,
+                                                const char* abs_error_expr,
+                                                double val1,
+                                                double val2,
+                                                double abs_error);
+
+// INTERNAL IMPLEMENTATION - DO NOT USE IN USER CODE.
+// A class that enables one to stream messages to assertion macros.
+class GTEST_API_ AssertHelper {
+ public:
+  // Constructor.
+  AssertHelper(TestPartResult::Type type,
+               const char* file,
+               int line,
+               const char* message);
+  ~AssertHelper();
+
+  // Message assignment is a semantic trick to enable assertion
+  // streaming; see the GTEST_MESSAGE_ macro below.
+  void operator=(const Message& message) const;
+
+ private:
+  // We put our data in a struct so that the size of the AssertHelper class can
+  // be as small as possible.  This is important because gcc is incapable of
+  // re-using stack space even for temporary variables, so every EXPECT_EQ
+  // reserves stack space for another AssertHelper.
+  struct AssertHelperData {
+    AssertHelperData(TestPartResult::Type t,
+                     const char* srcfile,
+                     int line_num,
+                     const char* msg)
+        : type(t), file(srcfile), line(line_num), message(msg) { }
+
+    TestPartResult::Type const type;
+    const char* const file;
+    int const line;
+    std::string const message;
+
+   private:
+    GTEST_DISALLOW_COPY_AND_ASSIGN_(AssertHelperData);
+  };
+
+  AssertHelperData* const data_;
+
+  GTEST_DISALLOW_COPY_AND_ASSIGN_(AssertHelper);
+};
+
+}  // namespace internal
+
+#if GTEST_HAS_PARAM_TEST
+// The pure interface class that all value-parameterized tests inherit from.
+// A value-parameterized class must inherit from both ::testing::Test and
+// ::testing::WithParamInterface. In most cases that just means inheriting
+// from ::testing::TestWithParam, but more complicated test hierarchies
+// may need to inherit from Test and WithParamInterface at different levels.
+//
+// This interface has support for accessing the test parameter value via
+// the GetParam() method.
+//
+// Use it with one of the parameter generator defining functions, like Range(),
+// Values(), ValuesIn(), Bool(), and Combine().
+//
+// class FooTest : public ::testing::TestWithParam<int> {
+//  protected:
+//   FooTest() {
+//     // Can use GetParam() here.
+//   }
+//   virtual ~FooTest() {
+//     // Can use GetParam() here.
+//   }
+//   virtual void SetUp() {
+//     // Can use GetParam() here.
+//   }
+//   virtual void TearDown() {
+//     // Can use GetParam() here.
+//   }
+// };
+// TEST_P(FooTest, DoesBar) {
+//   // Can use GetParam() method here.
+//   Foo foo;
+//   ASSERT_TRUE(foo.DoesBar(GetParam()));
+// }
+// INSTANTIATE_TEST_CASE_P(OneToTenRange, FooTest, ::testing::Range(1, 10));
+
+template <typename T>
+class WithParamInterface {
+ public:
+  typedef T ParamType;
+  virtual ~WithParamInterface() {}
+
+  // Returns the current parameter value, which is also available in the
+  // test fixture's constructor. This member function is non-static, even
+  // though it only references static data, to reduce the opportunity for
+  // incorrect uses like writing 'WithParamInterface<bool>::GetParam()' for
+  // a test that uses a fixture whose parameter type is int.
+  const ParamType& GetParam() const {
+    GTEST_CHECK_(parameter_ != NULL)
+        << "GetParam() can only be called inside a value-parameterized test "
+        << "-- did you intend to write TEST_P instead of TEST_F?";
+    return *parameter_;
+  }
+
+ private:
+  // Sets parameter value. The caller is responsible for making sure the value
+  // remains alive and unchanged throughout the current test.
+  static void SetParam(const ParamType* parameter) {
+    parameter_ = parameter;
+  }
+
+  // Static value used for accessing parameter during a test lifetime.
+  static const ParamType* parameter_;
+
+  // TestClass must be a subclass of WithParamInterface<T> and Test.
+  template <class TestClass> friend class internal::ParameterizedTestFactory;
+};
+
+template <typename T>
+const T* WithParamInterface<T>::parameter_ = NULL;
+
+// Most value-parameterized classes can ignore the existence of
+// WithParamInterface, and can just inherit from ::testing::TestWithParam.
+
+template <typename T>
+class TestWithParam : public Test, public WithParamInterface<T> {
+};
+
+#endif  // GTEST_HAS_PARAM_TEST
+
+// Macros for indicating success/failure in test code.
+
+// ADD_FAILURE unconditionally adds a failure to the current test.
+// SUCCEED generates a success - it doesn't automatically make the
+// current test successful, as a test is only successful when it has
+// no failure.
+//
+// EXPECT_* verifies that a certain condition is satisfied.  If not,
+// it behaves like ADD_FAILURE.  In particular:
+//
+//   EXPECT_TRUE  verifies that a Boolean condition is true.
+//   EXPECT_FALSE verifies that a Boolean condition is false.
+//
+// FAIL and ASSERT_* are similar to ADD_FAILURE and EXPECT_*, except
+// that they will also abort the current function on failure.  People
+// usually want the fail-fast behavior of FAIL and ASSERT_*, but those
+// writing data-driven tests often find themselves using ADD_FAILURE
+// and EXPECT_* more.
+
+// Generates a nonfatal failure with a generic message.
+#define ADD_FAILURE() GTEST_NONFATAL_FAILURE_("Failed")
+
+// Generates a nonfatal failure at the given source file location with
+// a generic message.
+#define ADD_FAILURE_AT(file, line) \
+  GTEST_MESSAGE_AT_(file, line, "Failed", \
+                    ::testing::TestPartResult::kNonFatalFailure)
+
+// Generates a fatal failure with a generic message.
+#define GTEST_FAIL() GTEST_FATAL_FAILURE_("Failed")
+
+// Define this macro to 1 to omit the definition of FAIL(), which is a
+// generic name and clashes with some other libraries.
+#if !GTEST_DONT_DEFINE_FAIL
+# define FAIL() GTEST_FAIL()
+#endif
+
+// Generates a success with a generic message.
+#define GTEST_SUCCEED() GTEST_SUCCESS_("Succeeded")
+
+// Define this macro to 1 to omit the definition of SUCCEED(), which
+// is a generic name and clashes with some other libraries.
+#if !GTEST_DONT_DEFINE_SUCCEED
+# define SUCCEED() GTEST_SUCCEED()
+#endif
+
+// Macros for testing exceptions.
+//
+//    * {ASSERT|EXPECT}_THROW(statement, expected_exception):
+//         Tests that the statement throws the expected exception.
+//    * {ASSERT|EXPECT}_NO_THROW(statement):
+//         Tests that the statement doesn't throw any exception.
+//    * {ASSERT|EXPECT}_ANY_THROW(statement):
+//         Tests that the statement throws an exception.
+
+#define EXPECT_THROW(statement, expected_exception) \
+  GTEST_TEST_THROW_(statement, expected_exception, GTEST_NONFATAL_FAILURE_)
+#define EXPECT_NO_THROW(statement) \
+  GTEST_TEST_NO_THROW_(statement, GTEST_NONFATAL_FAILURE_)
+#define EXPECT_ANY_THROW(statement) \
+  GTEST_TEST_ANY_THROW_(statement, GTEST_NONFATAL_FAILURE_)
+#define ASSERT_THROW(statement, expected_exception) \
+  GTEST_TEST_THROW_(statement, expected_exception, GTEST_FATAL_FAILURE_)
+#define ASSERT_NO_THROW(statement) \
+  GTEST_TEST_NO_THROW_(statement, GTEST_FATAL_FAILURE_)
+#define ASSERT_ANY_THROW(statement) \
+  GTEST_TEST_ANY_THROW_(statement, GTEST_FATAL_FAILURE_)
+
+// Boolean assertions.  The condition can be either a Boolean expression or
+// an AssertionResult; see the comments on the AssertionResult class for how
+// to use it with these macros.  The *_FALSE forms test !(condition).
+#define EXPECT_TRUE(condition) \
+  GTEST_TEST_BOOLEAN_(condition, #condition, false, true, \
+                      GTEST_NONFATAL_FAILURE_)
+#define EXPECT_FALSE(condition) \
+  GTEST_TEST_BOOLEAN_(!(condition), #condition, true, false, \
+                      GTEST_NONFATAL_FAILURE_)
+#define ASSERT_TRUE(condition) \
+  GTEST_TEST_BOOLEAN_(condition, #condition, false, true, \
+                      GTEST_FATAL_FAILURE_)
+#define ASSERT_FALSE(condition) \
+  GTEST_TEST_BOOLEAN_(!(condition), #condition, true, false, \
+                      GTEST_FATAL_FAILURE_)
+
+// The auto-generated header gtest_pred_impl.h, which implements a family of
+// generic predicate assertion macros, is inlined below.
+// Copyright 2006, Google Inc.
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+//     * Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//     * Redistributions in binary form must reproduce the above
+// copyright notice, this list of conditions and the following disclaimer
+// in the documentation and/or other materials provided with the
+// distribution.
+//     * Neither the name of Google Inc. nor the names of its
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+// This file is AUTOMATICALLY GENERATED on 10/31/2011 by command
+// 'gen_gtest_pred_impl.py 5'.  DO NOT EDIT BY HAND!
+//
+// Implements a family of generic predicate assertion macros.
+
+#ifndef GTEST_INCLUDE_GTEST_GTEST_PRED_IMPL_H_
+#define GTEST_INCLUDE_GTEST_GTEST_PRED_IMPL_H_
+
+// Makes sure this header is not included before gtest.h.
+#ifndef GTEST_INCLUDE_GTEST_GTEST_H_
+# error Do not include gtest_pred_impl.h directly.  Include gtest.h instead.
+#endif  // GTEST_INCLUDE_GTEST_GTEST_H_
+
+// This header implements a family of generic predicate assertion
+// macros:
+//
+//   ASSERT_PRED_FORMAT1(pred_format, v1)
+//   ASSERT_PRED_FORMAT2(pred_format, v1, v2)
+//   ...
+//
+// where pred_format is a function or functor that takes n (in the
+// case of ASSERT_PRED_FORMATn) values and their source expression
+// text, and returns a testing::AssertionResult.  See the definition
+// of ASSERT_EQ in gtest.h for an example.
+//
+// If you don't care about formatting, you can use the more
+// restrictive version:
+//
+//   ASSERT_PRED1(pred, v1)
+//   ASSERT_PRED2(pred, v1, v2)
+//   ...
+//
+// where pred is an n-ary function or functor that returns bool,
+// and the values v1, v2, ..., must support the << operator for
+// streaming to std::ostream.
+//
+// We also define the EXPECT_* variations.
+//
+// For now we only support predicates whose arity is at most 5.
+// Please email googletestframework@googlegroups.com if you need
+// support for higher arities.
+
+// GTEST_ASSERT_ is the basic statement to which all of the assertions
+// in this file reduce: when expression (taken as an AssertionResult) tests
+// false, on_failure gets its failure_message().  Don't use this in your code.
+#define GTEST_ASSERT_(expression, on_failure) \
+  GTEST_AMBIGUOUS_ELSE_BLOCKER_ \
+  if (const ::testing::AssertionResult gtest_ar = (expression)) \
+    ; \
+  else \
+    on_failure(gtest_ar.failure_message())
+
+
+// Backs {EXPECT|ASSERT}_PRED1: succeeds when pred(v1) is true, otherwise
+// fails with the predicate and argument text plus v1.  Don't call directly.
+template <typename Pred, typename T1>
+AssertionResult AssertPred1Helper(const char* pred_text,
+                                  const char* e1,
+                                  Pred pred,
+                                  const T1& v1) {
+  return pred(v1)
+      ? AssertionSuccess()
+      : (AssertionFailure()
+             << pred_text << "("
+             << e1 << ") evaluates to false, where"
+             << "\n" << e1 << " evaluates to " << v1);
+}
+
+// Internal macro for implementing {EXPECT|ASSERT}_PRED_FORMAT1: passes the
+// stringified argument (#v1) and its value to pred_format.  Don't use this in your code.
+#define GTEST_PRED_FORMAT1_(pred_format, v1, on_failure)\
+  GTEST_ASSERT_(pred_format(#v1, v1), \
+                on_failure)
+
+// Internal macro for implementing {EXPECT|ASSERT}_PRED1: forwards the stringified
+// predicate and argument, then both themselves, to AssertPred1Helper.  Don't use this in your code.
+#define GTEST_PRED1_(pred, v1, on_failure)\
+  GTEST_ASSERT_(::testing::AssertPred1Helper(#pred, \
+                                             #v1, \
+                                             pred, \
+                                             v1), on_failure)
+
+// Unary predicate assertion macros (EXPECT_* are non-fatal, ASSERT_* are fatal).
+#define EXPECT_PRED_FORMAT1(pred_format, v1) \
+  GTEST_PRED_FORMAT1_(pred_format, v1, GTEST_NONFATAL_FAILURE_)
+#define EXPECT_PRED1(pred, v1) \
+  GTEST_PRED1_(pred, v1, GTEST_NONFATAL_FAILURE_)
+#define ASSERT_PRED_FORMAT1(pred_format, v1) \
+  GTEST_PRED_FORMAT1_(pred_format, v1, GTEST_FATAL_FAILURE_)
+#define ASSERT_PRED1(pred, v1) \
+  GTEST_PRED1_(pred, v1, GTEST_FATAL_FAILURE_)
+
+
+
+// Backs {EXPECT|ASSERT}_PRED2: succeeds when pred(v1, v2) is true,
+// otherwise fails with the predicate and argument text plus each value.
+// Don't call this directly.
+template <typename Pred, typename T1, typename T2>
+AssertionResult AssertPred2Helper(const char* pred_text,
+                                  const char* e1,
+                                  const char* e2,
+                                  Pred pred,
+                                  const T1& v1,
+                                  const T2& v2) {
+  return pred(v1, v2)
+      ? AssertionSuccess()
+      : (AssertionFailure()
+             << pred_text << "("
+             << e1 << ", "
+             << e2 << ") evaluates to false, where"
+             << "\n" << e1 << " evaluates to " << v1
+             << "\n" << e2 << " evaluates to " << v2);
+}
+
+// Internal macro for implementing {EXPECT|ASSERT}_PRED_FORMAT2: passes the
+// stringified arguments, then their values, to pred_format.  Don't use this in your code.
+#define GTEST_PRED_FORMAT2_(pred_format, v1, v2, on_failure)\
+  GTEST_ASSERT_(pred_format(#v1, #v2, v1, v2), \
+                on_failure)
+
+// Internal macro for implementing {EXPECT|ASSERT}_PRED2: forwards the stringified
+// predicate and arguments, then all themselves, to AssertPred2Helper.  Don't use this in your code.
+#define GTEST_PRED2_(pred, v1, v2, on_failure)\
+  GTEST_ASSERT_(::testing::AssertPred2Helper(#pred, \
+                                             #v1, \
+                                             #v2, \
+                                             pred, \
+                                             v1, \
+                                             v2), on_failure)
+
+// Binary predicate assertion macros (EXPECT_* are non-fatal, ASSERT_* are fatal).
+#define EXPECT_PRED_FORMAT2(pred_format, v1, v2) \
+  GTEST_PRED_FORMAT2_(pred_format, v1, v2, GTEST_NONFATAL_FAILURE_)
+#define EXPECT_PRED2(pred, v1, v2) \
+  GTEST_PRED2_(pred, v1, v2, GTEST_NONFATAL_FAILURE_)
+#define ASSERT_PRED_FORMAT2(pred_format, v1, v2) \
+  GTEST_PRED_FORMAT2_(pred_format, v1, v2, GTEST_FATAL_FAILURE_)
+#define ASSERT_PRED2(pred, v1, v2) \
+  GTEST_PRED2_(pred, v1, v2, GTEST_FATAL_FAILURE_)
+
+
+
+// Backs {EXPECT|ASSERT}_PRED3: succeeds when pred(v1, v2, v3) is true,
+// otherwise fails with the predicate and argument text plus each value.
+// Don't call this directly.
+template <typename Pred,
+          typename T1, typename T2, typename T3>
+AssertionResult AssertPred3Helper(const char* pred_text,
+                                  const char* e1,
+                                  const char* e2,
+                                  const char* e3,
+                                  Pred pred,
+                                  const T1& v1,
+                                  const T2& v2,
+                                  const T3& v3) {
+  return pred(v1, v2, v3)
+      ? AssertionSuccess()
+      : (AssertionFailure()
+             << pred_text << "("
+             << e1 << ", "
+             << e2 << ", "
+             << e3 << ") evaluates to false, where"
+             << "\n" << e1 << " evaluates to " << v1
+             << "\n" << e2 << " evaluates to " << v2
+             << "\n" << e3 << " evaluates to " << v3);
+}
+
+// Internal macro for implementing {EXPECT|ASSERT}_PRED_FORMAT3: passes the
+// stringified arguments, then their values, to pred_format.  Don't use this in your code.
+#define GTEST_PRED_FORMAT3_(pred_format, v1, v2, v3, on_failure)\
+  GTEST_ASSERT_(pred_format(#v1, #v2, #v3, v1, v2, v3), \
+                on_failure)
+
+// Internal macro for implementing {EXPECT|ASSERT}_PRED3: forwards the stringified
+// predicate and arguments, then all themselves, to AssertPred3Helper.  Don't use this in your code.
+#define GTEST_PRED3_(pred, v1, v2, v3, on_failure)\
+  GTEST_ASSERT_(::testing::AssertPred3Helper(#pred, \
+                                             #v1, \
+                                             #v2, \
+                                             #v3, \
+                                             pred, \
+                                             v1, \
+                                             v2, \
+                                             v3), on_failure)
+
+// Ternary predicate assertion macros (EXPECT_* are non-fatal, ASSERT_* are fatal).
+#define EXPECT_PRED_FORMAT3(pred_format, v1, v2, v3) \
+  GTEST_PRED_FORMAT3_(pred_format, v1, v2, v3, GTEST_NONFATAL_FAILURE_)
+#define EXPECT_PRED3(pred, v1, v2, v3) \
+  GTEST_PRED3_(pred, v1, v2, v3, GTEST_NONFATAL_FAILURE_)
+#define ASSERT_PRED_FORMAT3(pred_format, v1, v2, v3) \
+  GTEST_PRED_FORMAT3_(pred_format, v1, v2, v3, GTEST_FATAL_FAILURE_)
+#define ASSERT_PRED3(pred, v1, v2, v3) \
+  GTEST_PRED3_(pred, v1, v2, v3, GTEST_FATAL_FAILURE_)
+
+
+
+// Backs {EXPECT|ASSERT}_PRED4: succeeds when pred(v1, v2, v3, v4) is true,
+// otherwise fails with the predicate and argument text plus each value.
+// Don't call this directly.
+template <typename Pred,
+          typename T1, typename T2,
+          typename T3, typename T4>
+AssertionResult AssertPred4Helper(const char* pred_text,
+                                  const char* e1,
+                                  const char* e2,
+                                  const char* e3,
+                                  const char* e4,
+                                  Pred pred,
+                                  const T1& v1,
+                                  const T2& v2,
+                                  const T3& v3,
+                                  const T4& v4) {
+  return pred(v1, v2, v3, v4)
+      ? AssertionSuccess()
+      : (AssertionFailure()
+             << pred_text << "("
+             << e1 << ", "
+             << e2 << ", "
+             << e3 << ", "
+             << e4 << ") evaluates to false, where"
+             << "\n" << e1 << " evaluates to " << v1
+             << "\n" << e2 << " evaluates to " << v2
+             << "\n" << e3 << " evaluates to " << v3
+             << "\n" << e4 << " evaluates to " << v4);
+}
+
+// Internal macro for implementing {EXPECT|ASSERT}_PRED_FORMAT4: passes the
+// stringified arguments, then their values, to pred_format.  Don't use this in your code.
+#define GTEST_PRED_FORMAT4_(pred_format, v1, v2, v3, v4, on_failure)\
+  GTEST_ASSERT_(pred_format(#v1, #v2, #v3, #v4, v1, v2, v3, v4), \
+                on_failure)
+
+// Internal macro for implementing {EXPECT|ASSERT}_PRED4: forwards the stringified
+// predicate and arguments, then all themselves, to AssertPred4Helper.  Don't use this in your code.
+#define GTEST_PRED4_(pred, v1, v2, v3, v4, on_failure)\
+  GTEST_ASSERT_(::testing::AssertPred4Helper(#pred, \
+                                             #v1, \
+                                             #v2, \
+                                             #v3, \
+                                             #v4, \
+                                             pred, \
+                                             v1, \
+                                             v2, \
+                                             v3, \
+                                             v4), on_failure)
+
+// 4-ary predicate assertion macros (EXPECT_* are non-fatal, ASSERT_* are fatal).
+#define EXPECT_PRED_FORMAT4(pred_format, v1, v2, v3, v4) \
+  GTEST_PRED_FORMAT4_(pred_format, v1, v2, v3, v4, GTEST_NONFATAL_FAILURE_)
+#define EXPECT_PRED4(pred, v1, v2, v3, v4) \
+  GTEST_PRED4_(pred, v1, v2, v3, v4, GTEST_NONFATAL_FAILURE_)
+#define ASSERT_PRED_FORMAT4(pred_format, v1, v2, v3, v4) \
+  GTEST_PRED_FORMAT4_(pred_format, v1, v2, v3, v4, GTEST_FATAL_FAILURE_)
+#define ASSERT_PRED4(pred, v1, v2, v3, v4) \
+  GTEST_PRED4_(pred, v1, v2, v3, v4, GTEST_FATAL_FAILURE_)
+
+
+
+// Backs {EXPECT|ASSERT}_PRED5: succeeds when pred(v1, v2, v3, v4, v5) is
+// true, otherwise fails with a message that shows the predicate text, the
+// text of every argument expression, and the value each one evaluated to.
+// Don't call this directly.
+template <typename Pred,
+          typename T1, typename T2, typename T3,
+          typename T4, typename T5>
+AssertionResult AssertPred5Helper(const char* pred_text,
+                                  const char* e1,
+                                  const char* e2,
+                                  const char* e3,
+                                  const char* e4,
+                                  const char* e5,
+                                  Pred pred,
+                                  const T1& v1,
+                                  const T2& v2,
+                                  const T3& v3,
+                                  const T4& v4,
+                                  const T5& v5) {
+  return pred(v1, v2, v3, v4, v5)
+      ? AssertionSuccess()
+      : (AssertionFailure()
+             << pred_text << "("
+             << e1 << ", "
+             << e2 << ", "
+             << e3 << ", "
+             << e4 << ", "
+             << e5 << ") evaluates to false, where"
+             << "\n" << e1 << " evaluates to " << v1
+             << "\n" << e2 << " evaluates to " << v2
+             << "\n" << e3 << " evaluates to " << v3
+             << "\n" << e4 << " evaluates to " << v4
+             << "\n" << e5 << " evaluates to " << v5);
+}
+
+// Internal macro for implementing {EXPECT|ASSERT}_PRED_FORMAT5: passes the
+// stringified arguments, then their values, to pred_format.  Don't use this in your code.
+#define GTEST_PRED_FORMAT5_(pred_format, v1, v2, v3, v4, v5, on_failure)\
+  GTEST_ASSERT_(pred_format(#v1, #v2, #v3, #v4, #v5, v1, v2, v3, v4, v5), \
+                on_failure)
+
+// Internal macro for implementing {EXPECT|ASSERT}_PRED5: forwards the stringified
+// predicate and arguments, then all themselves, to AssertPred5Helper.  Don't use this in your code.
+#define GTEST_PRED5_(pred, v1, v2, v3, v4, v5, on_failure)\
+  GTEST_ASSERT_(::testing::AssertPred5Helper(#pred, \
+                                             #v1, \
+                                             #v2, \
+                                             #v3, \
+                                             #v4, \
+                                             #v5, \
+                                             pred, \
+                                             v1, \
+                                             v2, \
+                                             v3, \
+                                             v4, \
+                                             v5), on_failure)
+
+// 5-ary predicate assertion macros (EXPECT_* are non-fatal, ASSERT_* are fatal).
+#define EXPECT_PRED_FORMAT5(pred_format, v1, v2, v3, v4, v5) \
+  GTEST_PRED_FORMAT5_(pred_format, v1, v2, v3, v4, v5, GTEST_NONFATAL_FAILURE_)
+#define EXPECT_PRED5(pred, v1, v2, v3, v4, v5) \
+  GTEST_PRED5_(pred, v1, v2, v3, v4, v5, GTEST_NONFATAL_FAILURE_)
+#define ASSERT_PRED_FORMAT5(pred_format, v1, v2, v3, v4, v5) \
+  GTEST_PRED_FORMAT5_(pred_format, v1, v2, v3, v4, v5, GTEST_FATAL_FAILURE_)
+#define ASSERT_PRED5(pred, v1, v2, v3, v4, v5) \
+  GTEST_PRED5_(pred, v1, v2, v3, v4, v5, GTEST_FATAL_FAILURE_)
+
+
+
+#endif  // GTEST_INCLUDE_GTEST_GTEST_PRED_IMPL_H_
+
+// Macros for testing equalities and inequalities.
+//
+//    * {ASSERT|EXPECT}_EQ(expected, actual): Tests that expected == actual
+//    * {ASSERT|EXPECT}_NE(v1, v2):           Tests that v1 != v2
+//    * {ASSERT|EXPECT}_LT(v1, v2):           Tests that v1 < v2
+//    * {ASSERT|EXPECT}_LE(v1, v2):           Tests that v1 <= v2
+//    * {ASSERT|EXPECT}_GT(v1, v2):           Tests that v1 > v2
+//    * {ASSERT|EXPECT}_GE(v1, v2):           Tests that v1 >= v2
+//
+// When they are not, Google Test prints both the tested expressions and
+// their actual values.  The values must be compatible built-in types,
+// or you will get a compiler error.  By "compatible" we mean that the
+// values can be compared by the respective operator.
+//
+// Note:
+//
+//   1. It is possible to make a user-defined type work with
+//   {ASSERT|EXPECT}_??(), but that requires overloading the
+//   comparison operators and is thus discouraged by the Google C++
+//   Usage Guide.  Therefore, you are advised to use the
+//   {ASSERT|EXPECT}_TRUE() macro to assert that two objects are
+//   equal.
+//
+//   2. The {ASSERT|EXPECT}_??() macros do pointer comparisons on
+//   pointers (in particular, C strings).  Therefore, if you use it
+//   with two C strings, you are testing how their locations in memory
+//   are related, not how their content is related.  To compare two C
+//   strings by content, use {ASSERT|EXPECT}_STR*().
+//
+//   3. {ASSERT|EXPECT}_EQ(expected, actual) is preferred to
+//   {ASSERT|EXPECT}_TRUE(expected == actual), as the former tells you
+//   what the actual value is when it fails, and similarly for the
+//   other comparisons.
+//
+//   4. Do not depend on the order in which {ASSERT|EXPECT}_??()
+//   evaluate their arguments, which is undefined.
+//
+//   5. These macros evaluate their arguments exactly once.
+//
+// Examples:
+//
+//   EXPECT_NE(5, Foo());
+//   EXPECT_EQ(NULL, a_pointer);
+//   ASSERT_LT(i, array_size);
+//   ASSERT_GT(records.size(), 0) << "There is no record left.";
+
+#define EXPECT_EQ(expected, actual) \
+  EXPECT_PRED_FORMAT2(::testing::internal:: \
+                      EqHelper<GTEST_IS_NULL_LITERAL_(expected)>::Compare, \
+                      expected, actual)
+#define EXPECT_NE(expected, actual) \
+  EXPECT_PRED_FORMAT2(::testing::internal::CmpHelperNE, expected, actual)
+#define EXPECT_LE(val1, val2) \
+  EXPECT_PRED_FORMAT2(::testing::internal::CmpHelperLE, val1, val2)
+#define EXPECT_LT(val1, val2) \
+  EXPECT_PRED_FORMAT2(::testing::internal::CmpHelperLT, val1, val2)
+#define EXPECT_GE(val1, val2) \
+  EXPECT_PRED_FORMAT2(::testing::internal::CmpHelperGE, val1, val2)
+#define EXPECT_GT(val1, val2) \
+  EXPECT_PRED_FORMAT2(::testing::internal::CmpHelperGT, val1, val2)
+
+#define GTEST_ASSERT_EQ(expected, actual) \
+  ASSERT_PRED_FORMAT2(::testing::internal:: \
+                      EqHelper<GTEST_IS_NULL_LITERAL_(expected)>::Compare, \
+                      expected, actual)
+#define GTEST_ASSERT_NE(val1, val2) \
+  ASSERT_PRED_FORMAT2(::testing::internal::CmpHelperNE, val1, val2)
+#define GTEST_ASSERT_LE(val1, val2) \
+  ASSERT_PRED_FORMAT2(::testing::internal::CmpHelperLE, val1, val2)
+#define GTEST_ASSERT_LT(val1, val2) \
+  ASSERT_PRED_FORMAT2(::testing::internal::CmpHelperLT, val1, val2)
+#define GTEST_ASSERT_GE(val1, val2) \
+  ASSERT_PRED_FORMAT2(::testing::internal::CmpHelperGE, val1, val2)
+#define GTEST_ASSERT_GT(val1, val2) \
+  ASSERT_PRED_FORMAT2(::testing::internal::CmpHelperGT, val1, val2)
+
+// Define GTEST_DONT_DEFINE_ASSERT_XY (XY is EQ, NE, LE, LT, GE or GT) to 1 to omit
+// ASSERT_XY(), which clashes with some users' own code; GTEST_ASSERT_XY() stays defined.
+
+#if !GTEST_DONT_DEFINE_ASSERT_EQ
+# define ASSERT_EQ(val1, val2) GTEST_ASSERT_EQ(val1, val2)
+#endif  // !GTEST_DONT_DEFINE_ASSERT_EQ
+
+#if !GTEST_DONT_DEFINE_ASSERT_NE
+# define ASSERT_NE(val1, val2) GTEST_ASSERT_NE(val1, val2)
+#endif  // !GTEST_DONT_DEFINE_ASSERT_NE
+
+#if !GTEST_DONT_DEFINE_ASSERT_LE
+# define ASSERT_LE(val1, val2) GTEST_ASSERT_LE(val1, val2)
+#endif  // !GTEST_DONT_DEFINE_ASSERT_LE
+
+#if !GTEST_DONT_DEFINE_ASSERT_LT
+# define ASSERT_LT(val1, val2) GTEST_ASSERT_LT(val1, val2)
+#endif  // !GTEST_DONT_DEFINE_ASSERT_LT
+
+#if !GTEST_DONT_DEFINE_ASSERT_GE
+# define ASSERT_GE(val1, val2) GTEST_ASSERT_GE(val1, val2)
+#endif  // !GTEST_DONT_DEFINE_ASSERT_GE
+
+#if !GTEST_DONT_DEFINE_ASSERT_GT
+# define ASSERT_GT(val1, val2) GTEST_ASSERT_GT(val1, val2)
+#endif  // !GTEST_DONT_DEFINE_ASSERT_GT
+
+// C-string comparisons by content.  All tests treat NULL and any non-NULL
+// string as different.  Two NULLs are equal.
+//
+//    * {ASSERT|EXPECT}_STREQ(s1, s2):     Tests that s1 and s2 have the same content
+//    * {ASSERT|EXPECT}_STRNE(s1, s2):     Tests that s1 and s2 differ in content
+//    * {ASSERT|EXPECT}_STRCASEEQ(s1, s2): Same as STREQ, ignoring case
+//    * {ASSERT|EXPECT}_STRCASENE(s1, s2): Same as STRNE, ignoring case
+//
+// For wide or narrow string objects, you can use the
+// {ASSERT|EXPECT}_??() macros.
+//
+// Don't depend on the order in which the arguments are evaluated,
+// which is undefined.
+//
+// These macros evaluate their arguments exactly once.
+
+#define EXPECT_STREQ(expected, actual) \
+  EXPECT_PRED_FORMAT2(::testing::internal::CmpHelperSTREQ, expected, actual)
+#define EXPECT_STRNE(s1, s2) \
+  EXPECT_PRED_FORMAT2(::testing::internal::CmpHelperSTRNE, s1, s2)
+#define EXPECT_STRCASEEQ(expected, actual) \
+  EXPECT_PRED_FORMAT2(::testing::internal::CmpHelperSTRCASEEQ, expected, actual)
+#define EXPECT_STRCASENE(s1, s2)\
+  EXPECT_PRED_FORMAT2(::testing::internal::CmpHelperSTRCASENE, s1, s2)
+
+#define ASSERT_STREQ(expected, actual) \
+  ASSERT_PRED_FORMAT2(::testing::internal::CmpHelperSTREQ, expected, actual)
+#define ASSERT_STRNE(s1, s2) \
+  ASSERT_PRED_FORMAT2(::testing::internal::CmpHelperSTRNE, s1, s2)
+#define ASSERT_STRCASEEQ(expected, actual) \
+  ASSERT_PRED_FORMAT2(::testing::internal::CmpHelperSTRCASEEQ, expected, actual)
+#define ASSERT_STRCASENE(s1, s2)\
+  ASSERT_PRED_FORMAT2(::testing::internal::CmpHelperSTRCASENE, s1, s2)
+
+// Macros for comparing floating-point numbers.
+//
+//    * {ASSERT|EXPECT}_FLOAT_EQ(expected, actual):
+//         Tests that two float values are almost equal.
+//    * {ASSERT|EXPECT}_DOUBLE_EQ(expected, actual):
+//         Tests that two double values are almost equal.
+//    * {ASSERT|EXPECT}_NEAR(val1, val2, abs_error):
+//         Tests that val1 and val2 are within abs_error of each other.
+//
+// Google Test uses ULP-based comparison to automatically pick a default
+// error bound that is appropriate for the operands.  See the
+// FloatingPoint template class in gtest-internal.h if you are
+// interested in the implementation details.
+
+#define EXPECT_FLOAT_EQ(expected, actual)\
+  EXPECT_PRED_FORMAT2(::testing::internal::CmpHelperFloatingPointEQ<float>, \
+                      expected, actual)
+
+#define EXPECT_DOUBLE_EQ(expected, actual)\
+  EXPECT_PRED_FORMAT2(::testing::internal::CmpHelperFloatingPointEQ<double>, \
+                      expected, actual)
+
+#define ASSERT_FLOAT_EQ(expected, actual)\
+  ASSERT_PRED_FORMAT2(::testing::internal::CmpHelperFloatingPointEQ<float>, \
+                      expected, actual)
+
+#define ASSERT_DOUBLE_EQ(expected, actual)\
+  ASSERT_PRED_FORMAT2(::testing::internal::CmpHelperFloatingPointEQ<double>, \
+                      expected, actual)
+
+#define EXPECT_NEAR(val1, val2, abs_error)\
+  EXPECT_PRED_FORMAT3(::testing::internal::DoubleNearPredFormat, \
+                      val1, val2, abs_error)
+
+#define ASSERT_NEAR(val1, val2, abs_error)\
+  ASSERT_PRED_FORMAT3(::testing::internal::DoubleNearPredFormat, \
+                      val1, val2, abs_error)
+
+// These predicate format functions work on floating-point values (FloatLE
+// on float, DoubleLE on double) and can be used in {ASSERT|EXPECT}_PRED_FORMAT2(), e.g.
+//
+//   EXPECT_PRED_FORMAT2(testing::DoubleLE, Foo(), 5.0);
+
+// Asserts that val1 is less than, or almost equal to, val2.  Fails
+// otherwise.  In particular, it fails if either val1 or val2 is NaN.
+GTEST_API_ AssertionResult FloatLE(const char* expr1, const char* expr2,
+                                   float val1, float val2);
+GTEST_API_ AssertionResult DoubleLE(const char* expr1, const char* expr2,
+                                    double val1, double val2);
+
+
+#if GTEST_OS_WINDOWS
+
+// Macros that test for HRESULT failure and success.  They are only useful
+// on Windows and rely on Windows SDK macros and APIs to compile.
+//
+//    * {ASSERT|EXPECT}_HRESULT_{SUCCEEDED|FAILED}(expr)
+//
+// When expr unexpectedly fails or succeeds, Google Test prints the
+// expected result and the actual result, with a human-readable
+// string representation of the error (if available) as well as the
+// hex result code.
+# define EXPECT_HRESULT_SUCCEEDED(expr) \
+    EXPECT_PRED_FORMAT1(::testing::internal::IsHRESULTSuccess, (expr))
+
+# define ASSERT_HRESULT_SUCCEEDED(expr) \
+    ASSERT_PRED_FORMAT1(::testing::internal::IsHRESULTSuccess, (expr))
+
+# define EXPECT_HRESULT_FAILED(expr) \
+    EXPECT_PRED_FORMAT1(::testing::internal::IsHRESULTFailure, (expr))
+
+# define ASSERT_HRESULT_FAILED(expr) \
+    ASSERT_PRED_FORMAT1(::testing::internal::IsHRESULTFailure, (expr))
+
+#endif  // GTEST_OS_WINDOWS
+
+// Macros that execute a statement and check that it doesn't generate new
+// fatal failures in the current thread.
+//
+//   * {ASSERT|EXPECT}_NO_FATAL_FAILURE(statement);
+//
+// Examples:
+//
+//   EXPECT_NO_FATAL_FAILURE(Process());
+//   ASSERT_NO_FATAL_FAILURE(Process()) << "Process() failed";
+//
+#define ASSERT_NO_FATAL_FAILURE(statement) \
+    GTEST_TEST_NO_FATAL_FAILURE_(statement, GTEST_FATAL_FAILURE_)
+#define EXPECT_NO_FATAL_FAILURE(statement) \
+    GTEST_TEST_NO_FATAL_FAILURE_(statement, GTEST_NONFATAL_FAILURE_)
+
+// Causes a trace (including the source file path, the current line
+// number, and the given message) to be included in every test failure
+// message generated by code in the current scope.  The effect is
+// undone when control leaves the current scope.
+//
+// The message argument can be anything streamable to std::ostream.
+//
+// In the implementation, we include the current line number as part
+// of the dummy variable name, thus allowing multiple SCOPED_TRACE()s
+// to appear in the same block - as long as they are on different
+// lines.
+#define SCOPED_TRACE(message) \
+  ::testing::internal::ScopedTrace GTEST_CONCAT_TOKEN_(gtest_trace_, __LINE__)(\
+    __FILE__, __LINE__, ::testing::Message() << (message))
+
+// Compile-time assertion for type equality.
+// StaticAssertTypeEq<type1, type2>() compiles iff type1 and type2 are
+// the same type.  The value it returns is not interesting.
+//
+// Instead of making StaticAssertTypeEq a class template, we make it a
+// function template that invokes a helper class template.  This
+// prevents a user from misusing StaticAssertTypeEq<T1, T2> by
+// defining objects of that type.
+//
+// CAVEAT:
+//
+// When used inside a method of a class template,
+// StaticAssertTypeEq<T1, T2>() is effective ONLY IF the method is
+// instantiated.  For example, given:
+//
+//   template <typename T> class Foo {
+//    public:
+//     void Bar() { testing::StaticAssertTypeEq<int, T>(); }
+//   };
+//
+// the code:
+//
+//   void Test1() { Foo<bool> foo; }
+//
+// will NOT generate a compiler error, as Foo<bool>::Bar() is never
+// actually instantiated.  Instead, you need:
+//
+//   void Test2() { Foo<bool> foo; foo.Bar(); }
+//
+// to cause a compiler error.
+template <typename T1, typename T2>
+bool StaticAssertTypeEq() {
+  // Instantiating the helper fails to compile unless T1 and T2 match.
+  return static_cast<void>(internal::StaticAssertTypeEqHelper<T1, T2>()), true;
+}
+
+// Defines a test.
+//
+// The first parameter is the name of the test case, and the second
+// parameter is the name of the test within the test case.
+//
+// The convention is to end the test case name with "Test".  For
+// example, a test case for the Foo class can be named FooTest.
+//
+// The user should put their test code between braces after using this
+// macro.  Example:
+//
+//   TEST(FooTest, InitializesCorrectly) {
+//     Foo foo;
+//     EXPECT_TRUE(foo.StatusIsOK());
+//   }
+
+// Note that we call GetTestTypeId() instead of GetTypeId<
+// ::testing::Test>() here to get the type ID of testing::Test.  This
+// is to work around a suspected linker bug when using Google Test as
+// a framework on Mac OS X.  The bug causes GetTypeId<
+// ::testing::Test>() to return different values depending on whether
+// the call is from the Google Test framework itself or from user test
+// code.  GetTestTypeId() is guaranteed to always return the same
+// value, as it always calls GetTypeId<>() from the Google Test
+// framework.
+#define GTEST_TEST(test_case_name, test_name)\
+  GTEST_TEST_(test_case_name, test_name, \
+              ::testing::Test, ::testing::internal::GetTestTypeId())
+
+// Define GTEST_DONT_DEFINE_TEST to 1 to omit the definition of TEST(), which
+// is a generic name and clashes with some other libraries; GTEST_TEST() stays defined.
+#if !GTEST_DONT_DEFINE_TEST
+# define TEST(test_case_name, test_name) GTEST_TEST(test_case_name, test_name)
+#endif  // !GTEST_DONT_DEFINE_TEST
+
+// Defines a test that uses a test fixture.
+//
+// The first parameter is the name of the test fixture class, which
+// also doubles as the test case name.  The second parameter is the
+// name of the test within the test case.
+//
+// A test fixture class must be declared earlier.  The user should put
+// their test code between braces after using this macro.  Example:
+//
+//   class FooTest : public testing::Test {
+//    protected:
+//     virtual void SetUp() { b_.AddElement(3); }
+//
+//     Foo a_;
+//     Foo b_;
+//   };
+//
+//   TEST_F(FooTest, InitializesCorrectly) {
+//     EXPECT_TRUE(a_.StatusIsOK());
+//   }
+//
+//   TEST_F(FooTest, ReturnsElementCountCorrectly) {
+//     EXPECT_EQ(0, a_.size());
+//     EXPECT_EQ(1, b_.size());
+//   }
+
+#define TEST_F(test_fixture, test_name)\
+  GTEST_TEST_(test_fixture, test_name, test_fixture, \
+              ::testing::internal::GetTypeId<test_fixture>())
+
+}  // namespace testing
+
+// Use this function in main() to run all tests.  It returns 0 if all
+// tests are successful, or 1 otherwise.
+//
+// RUN_ALL_TESTS() should be invoked after the command line has been
+// parsed by InitGoogleTest().
+//
+// This function was formerly a macro; thus, it is in the global
+// namespace and has an all-caps name.
+int RUN_ALL_TESTS() GTEST_MUST_USE_RESULT_;
+inline int RUN_ALL_TESTS() {
+  ::testing::UnitTest* const unit_test = ::testing::UnitTest::GetInstance();
+  return unit_test->Run();
+}
+
+#endif  // GTEST_INCLUDE_GTEST_GTEST_H_